第五次课后作业 基于Python语言,使用回归分析,对白葡萄酒的品质进行预测,给出分析过程、源代码和结果。 数据来源:https://data.world/uci/wine-quality
回归分析是研究自变量和因变量之间关系的一种预测模型技术。 Input variables (based on physicochemical tests): 1 - fixed acidity 2 - volatile acidity 3 - citric acid 4 - residual sugar 5 - chlorides 6 - free sulfur dioxide 7 - total sulfur dioxide 8 - density 9 - pH 10 - sulphates 11 - alcohol Output variable (based on sensory data): 12 - quality (score between 0 and 10) 分别使用线性回归、Lasso回归、Ridge回归、Elastic Net四类回归算法构建模型,并比较这些回归算法的效果。 线性回归:回归线是线性的。 Ridge回归:当数据存在多重共线性时,可以使用岭回归。所谓多重共线性,简单地说就是自变量之间有高度相关关系。 Lasso回归:与岭回归不同的是,Lasso回归在惩罚项中使用的是系数的绝对值(L1范数),而岭回归使用的是系数的平方(L2范数)。 ElasticNet回归:是Lasso回归和岭回归的组合。
# Imports come first so that every name used below (notably ``pd``) is
# defined before the data is read.
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
# ConvergenceWarning and train_test_split are imported from their current
# public locations; the old ``sklearn.linear_model.coordinate_descent`` and
# ``sklearn.cross_validation`` modules are not available in current releases.
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Read the data; the CSV uses ';' as its field separator.
path1 = "datas/winequality-white.csv"
df = pd.read_csv(path1, sep=";")

# Names of the independent variables (feature columns).
names = [
    "fixed acidity", "volatile acidity", "citric acid",
    "residual sugar", "chlorides", "free sulfur dioxide",
    "total sulfur dioxide", "density", "pH", "sulphates",
    "alcohol",
]

# Name of the dependent variable (target column).
quality = "quality"
# Abnormal-data handling: treat '?' placeholders as missing values, then
# drop every row that has at least one missing column.
new_df = df.replace('?', np.nan)
datas = new_df.dropna(how='any')

# Feature matrix and target vector.
X = datas[names]
Y = datas[quality]

# One pipeline per regression algorithm, in this order: plain linear, Ridge,
# Lasso, ElasticNet. Every pipeline starts with a 'Poly' step so that the
# polynomial degree can later be changed via ``set_params(Poly__degree=...)``,
# and ends with a 'Linear' step holding the fitted estimator.
models = [
    Pipeline([
        ('Poly', PolynomialFeatures()),
        ('Linear', LinearRegression()),
    ]),
    Pipeline([
        ('Poly', PolynomialFeatures()),
        ('Linear', RidgeCV(alphas=np.logspace(-4, 2, 20))),
    ]),
    Pipeline([
        ('Poly', PolynomialFeatures()),
        ('Linear', LassoCV(alphas=np.logspace(-4, 2, 20))),
    ]),
    Pipeline([
        ('Poly', PolynomialFeatures()),
        ('Linear', ElasticNetCV(alphas=np.logspace(-4, 2, 20),
                                l1_ratio=np.linspace(0, 1, 5))),
    ]),
]
# Canvas shared by the four sub-plots (one per regression algorithm).
plt.figure(
    figsize=(16, 8),
    facecolor='w',
)

# Sub-plot titles, in the same order as the entries of ``models``.
titles = (
    u'线性回归预测',
    u'Ridge回归预测',
    u'Lasso回归预测',
    u'ElasticNet预测',
)

# Split the data into a training part and a test part; 1% of the rows are
# held out, with a fixed random seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.01,
    random_state=0,
)

# X-axis positions 0 .. len(X_test) - 1, one per test sample.
ln_x_test = range(len(X_test))

# Polynomial degrees to try (1, 2 and 3) and how many of them there are.
d_pool = np.arange(1, 4, 1)
m = len(d_pool)

# One line colour per polynomial degree.
clrs = [
    'black',
    'blue',
    'green',
]
# For every algorithm: draw the true test-set values, then fit the pipeline
# once per polynomial degree and overlay the predictions on the same sub-plot.
for t, (model, title) in enumerate(zip(models, titles)):
    plt.subplot(2, 2, t + 1)
    # Ground truth, drawn on top of the prediction curves (highest zorder).
    plt.plot(ln_x_test, Y_test, c='r', lw=2, alpha=0.75, zorder=10,
             label=u'真实值')

    for i, d in enumerate(d_pool):
        # Select the polynomial degree of the 'Poly' step.
        model.set_params(Poly__degree=d)

        # Train the model.
        model.fit(X_train, Y_train)

        # Predict on the test split and compute R^2.
        # NOTE(review): R^2 is computed on the TRAINING split while the
        # plotted curve shows TEST predictions - confirm this is intended.
        Y_pre = model.predict(X_test)
        R = model.score(X_train, Y_train)

        # Report the fitted estimator's parameters. The intercept is a float,
        # so it is formatted with %.3f instead of being truncated by %d.
        lin = model.named_steps['Linear']
        output = u"%s:%d阶, 截距:%.3f, 系数:" % (title, d, lin.intercept_)
        print(output, lin.coef_)

        # Overlay this degree's predictions.
        plt.plot(ln_x_test, Y_pre, c=clrs[i], lw=2, alpha=0.75, zorder=i,
                 label=u'%d阶预测值,$R^2$=%.3f' % (d, R))

    # Per-sub-plot decoration.
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.title(title, fontsize=18)
    plt.xlabel('X', fontsize=16)
    plt.ylabel('Y', fontsize=16)

# Figure-level title, then render everything.
plt.suptitle(u'葡萄酒质量预测', fontsize=22)
plt.show()