基于线性回归预测房价

来源：互联网 | 发布：人生老鼠知乎 | 编辑：程序博客网 | 时间：2024/04/28 12:57

预测结果:

样本数据:

 0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00
 0.02731   0.00   7.070  0  0.4690  6.4210  78.90  4.9671   2  242.0  17.80 396.90   9.14  21.60
 0.02729   0.00   7.070  0  0.4690  7.1850  61.10  4.9671   2  242.0  17.80 392.83   4.03  34.70
 0.03237   0.00   2.180  0  0.4580  6.9980  45.80  6.0622   3  222.0  18.70 394.63   2.94  33.40
 0.06905   0.00   2.180  0  0.4580  7.1470  54.20  6.0622   3  222.0  18.70 396.90   5.33  36.20
 0.02985   0.00   2.180  0  0.4580  6.4300  58.70  6.0622   3  222.0  18.70 394.12   5.21  28.70
 0.08829  12.50   7.870  0  0.5240  6.0120  66.60  5.5605   5  311.0  15.20 395.60  12.43  22.90
 0.14455  12.50   7.870  0  0.5240  6.1720  96.10  5.9505   5  311.0  15.20 396.90  19.15  27.10
 0.21124  12.50   7.870  0  0.5240  5.6310 100.00  6.0821   5  311.0  15.20 386.63  29.93  16.50
 0.17004  12.50   7.870  0  0.5240  6.0040  85.90  6.5921   5  311.0  15.20 386.71  17.10  18.90
 0.22489  12.50   7.870  0  0.5240  6.3770  94.30  6.3467   5  311.0  15.20 392.52  20.45  15.00
 0.11747  12.50   7.870  0  0.5240  6.0090  82.90  6.2267   5  311.0  15.20 396.90  13.27  18.90
 0.09378  12.50   7.870  0  0.5240  5.8890  39.00  5.4509   5  311.0  15.20 390.50  15.71  21.70
 0.62976   0.00   8.140  0  0.5380  5.9490  61.80  4.7075   4  307.0  21.00 396.90   8.26  20.40
 0.63796   0.00   8.140  0  0.5380  6.0960  84.50  4.4619   4  307.0  21.00 380.02  10.26  18.20

#coding:utf-8import numpy as npimport matplotlib as mplimport matplotlib.pyplot as pltimport pandas as pdfrom sklearn.model_selection import train_test_splitfrom sklearn.linear_model import ElasticNetCVimport sklearn.datasetsfrom pprint import pprintfrom sklearn.preprocessing import PolynomialFeatures,StandardScalerfrom sklearn.pipeline import Pipelinefrom sklearn.metrics import mean_squared_errorimport warningsimport exceptionsdef not_empty(s):    return s!=''#使用sklearn做单机特征工程#http://www.cnblogs.com/jasonfreak/p/5448385.htmlif __name__=="__main__":    #忽略警告信息    warnings.filterwarnings(action='ignore')    #设置打印选项 --suppress消除小的数字使用科学记数法    np.set_printoptions(suppress=True)    #header=None时，即指明原始文件数据没有列索引，这样read_csv为自动加上列索引，除非你给定列索引的名字    file_data=pd.read_csv('10.housing.data',header=None)    #根据读入的文件内容创建二维数组,第一个参数为行数,第二个参数为列数    data=np.empty((len(file_data),14))    #enumerate遍历列表    for i,d in enumerate(file_data.values):        #map()是 Python 内置的高阶函数，它接收一个函数 f 和一个 list，并通过把函数 f 依次作用在 list 的每个元素上，得到一个新的 list 并返回        #filter()函数接收一个函数 f 和一个list，这个函数 f 的作用是对每个元素进行判断，返回 True或 False，filter()根据判断结果自动过滤掉不符合条件的元素，返回由符合条件元素组成的新list        #首先过滤掉非空元素,然后将列表中的每个元素转换成float类型        d=map(float,filter(not_empty,d[0].split(' ')))        data[i]=d    #array是按照从左至右的顺序切分x是特征向量 1-13列  y是标签列 14列    x,y=np.split(data,(13,),axis=1)    print u'样本个数:%d,特征个数:%d' % x.shape    # print 'y='    # print y    # print 'x='    # print x    # print 'x.shape:',x.shape    # print 'y.shape:',y.shape    #构建训练集合测试集    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=0)    #Pipeline可以将许多算法模型串联起来，比如将特征提取、归一化、分类组织在一起形成一个典型的机器学习问题工作流。主要带来两点好处：    #1. 直接调用fit和predict方法来对pipeline中的所有算法模型进行训练和预测。    #2. 可以结合grid search对参数进行选择开始建模...    
model=Pipeline([        ('ss',StandardScaler()),        ('poly',PolynomialFeatures(degree=3,include_bias=True)),        ('linear',ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.99, 1],alphas=np.logspace(-3,2,5),                               fit_intercept=False,max_iter=1e3,cv=3))    ])    print u'开始建模...'    model.fit(x_train,y_train.ravel())    linear=model.get_params('linear')['linear']    print u'超参数:',linear.alpha_    print u'L1 ratio: ',linear.l1_ratio_    y_pred=model.predict(x_test)    r2=model.score(x_test,y_test)    mse=mean_squared_error(y_test,y_pred)    print 'R2:',r2    print u'均方误差:',mse    t=np.arange(len(y_pred))    mpl.rcParams['font.sans-serif']=[u'simHei']    mpl.rcParams['axes.unicode_minus']=False    plt.figure(facecolor='w')    plt.plot(t,y_test.ravel(),'r-',lw=2,label=u'真实值')    plt.plot(t,y_pred,'g-',lw=2,label=u'估计值')    plt.legend(loc='best')    plt.title(u'波士顿房价预测',fontsize=18)    plt.xlabel(u'样本编号',fontsize=15)    plt.ylabel(u'房屋价格',fontsize=15)    plt.grid()    plt.show()

完整代码和数据下载地址：http://download.csdn.net/detail/hb707934728/9810756

2 0