lgfj
来源:互联网 发布:数控镗床编程入门 编辑:程序博客网 时间:2024/05/16 01:10
数据来源于公司的 MongoDB 数据库;出于公司保密要求,代码中的服务器地址和端口已隐去,因此代码无法直接运行,仅供参考。下面利用四个小区的历史交易记录训练神经网络模型,并输出房价预测值。
"""Predict listing prices for four Pudong compounds with a small Keras network.

Pipeline: load sale listings from MongoDB, encode the categorical columns as
numbers, min-max scale, train on the first 95% of the rows and report the
percentage error on the remaining 5%.
"""
import os

import numpy as np
import pandas as pd

# The blog post redacted the server address ('192.168.xx.xx', port 2xxxx),
# which made the script unparseable.  Supply the real values through the
# environment; 27017 is only MongoDB's default port, not the author's value.
MONGO_HOST = os.environ.get('MONGO_HOST', '192.168.xx.xx')
MONGO_PORT = int(os.environ.get('MONGO_PORT', '27017'))

DISTRICT_NAMES = ['康桥半岛二期', '康桥半岛五期', '绿洲清水湾', '中邦城市']
QUERY = {
    'city': '上海',
    'cat': 'sell',
    'region': '浦东',
    'district_name': {'$in': DISTRICT_NAMES},
    'p_date': {'$gt': 20170508},
}

FEATURE_COLUMNS = [
    'total_price', 'area', 'height', 'room', 'direction',
    'hall', 'toilet', 'fitment', 'district_name', 'p_date',
]
TARGET_COLUMN = 'total_price'
# This version feeds every column except the target and p_date to the network.
INPUT_COLUMNS = FEATURE_COLUMNS[1:9]

DISTRICT_CODES = {name: code for code, name in enumerate(DISTRICT_NAMES)}
SOUTH_FACING = ('南', '南北')        # encoded as 1, anything else as 0
DECORATED = ('精装修', '中装修')     # encoded as 1, anything else as 0
MISSING_DEFAULTS = {'height': 18, 'room': 3, 'hall': 2, 'toilet': 2}

RANGE_PADDING = 0.2    # added to (max - min) so constant columns never divide by zero
TRAIN_FRACTION = 0.95  # the leading 95% of rows train the model, the rest test it

# (units, activation, dropout rate or None) for each hidden layer, in order.
HIDDEN_LAYERS = (
    (48, 'tanh', None),
    (100, 'relu', 0.2),
    (50, 'relu', 0.2),
    (36, 'relu', None),
    (12, 'relu', None),
    (12, 'relu', None),
)
# NOTE(review): the scaled target lies in [0, 1) and the original comment
# called the output layer "sigmoid", but the code uses tanh; kept as written.
OUTPUT_ACTIVATION = 'tanh'


def fetch_listings():
    """Return every listing matching QUERY as a DataFrame, one row per document."""
    # Imported here so the pure data helpers stay importable without a driver.
    from pymongo import MongoClient

    client = MongoClient(MONGO_HOST, MONGO_PORT)
    try:
        # The original asked for count - 1 documents and silently lost the
        # last match; iterating the cursor returns all of them.
        records = list(client.fangjia.seawater.find(QUERY))
    finally:
        client.close()
    return pd.DataFrame(records)


def encode_listings(listings):
    """Return the model columns of *listings* as an all-float DataFrame.

    District names become 0-3, south-facing flats and decorated flats become
    1 (otherwise 0), and missing height/room/hall/toilet values receive fixed
    defaults.  The input frame is not modified.

    Raises:
        ValueError: if a district name has no code in DISTRICT_CODES.
    """
    encoded = listings[FEATURE_COLUMNS].copy()

    unknown = set(encoded['district_name']) - set(DISTRICT_CODES)
    if unknown:
        raise ValueError(
            'unexpected district_name values: %s' % sorted(map(str, unknown)))

    # Whole-column assignment on a copy replaces the original per-cell chained
    # assignment (dc['district_name'][i] = ...), which wrote to a slice.
    encoded['district_name'] = encoded['district_name'].map(DISTRICT_CODES)
    encoded['direction'] = encoded['direction'].isin(SOUTH_FACING).astype(int)
    encoded['fitment'] = encoded['fitment'].isin(DECORATED).astype(int)
    encoded = encoded.fillna(MISSING_DEFAULTS)
    return encoded.astype(float)


def min_max_scale(frame, low, high):
    """Scale each column of *frame* with the per-column bounds *low* and *high*."""
    return (frame - low) / (high - low + RANGE_PADDING)


def build_model(n_inputs):
    """Build and compile the fully connected regression network.

    The model is compiled with mean squared error loss and the Adam optimizer.
    """
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation

    model = Sequential()
    for position, (units, activation, dropout) in enumerate(HIDDEN_LAYERS):
        if position == 0:
            model.add(Dense(input_dim=n_inputs, output_dim=units))
        else:
            # Only the first layer needs an input size; the original declared
            # sizes here that did not match the preceding layer.
            model.add(Dense(output_dim=units))
        model.add(Activation(activation))
        if dropout is not None:
            model.add(Dropout(dropout))
    model.add(Dense(output_dim=1))
    model.add(Activation(OUTPUT_ACTIVATION))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


def main():
    """Fetch the data, train the network and write the evaluation report."""
    listings = fetch_listings()
    print(len(listings))
    if listings.empty:
        raise SystemExit('the query returned no listings')
    listings.to_excel('data.xls')
    listings[FEATURE_COLUMNS].to_excel('dc.xls')

    # NOTE(review): the original discards the first record without saying
    # why; kept as is - confirm whether that is intended.
    samples = encode_listings(listings).iloc[1:]

    n_train = int(TRAIN_FRACTION * len(samples))
    train = samples.iloc[:n_train]
    test = samples.iloc[n_train:]

    # Bounds come from the training rows only, so nothing about the test rows
    # leaks into the scaling (the original used the whole data set).
    low, high = train.min(), train.max()
    scaled_train = min_max_scale(train, low, high)
    scaled_test = min_max_scale(test, low, high)

    model = build_model(len(INPUT_COLUMNS))
    model.fit(scaled_train[INPUT_COLUMNS].values,
              scaled_train[[TARGET_COLUMN]].values,
              nb_epoch=300, batch_size=5)
    model.save_weights('net.model')

    # Undo the scaling of the target to get prices back in original units.
    span = high[TARGET_COLUMN] - low[TARGET_COLUMN] + RANGE_PADDING
    predicted = model.predict(scaled_test[INPUT_COLUMNS].values) * span + low[TARGET_COLUMN]
    actual = test[[TARGET_COLUMN]].values
    error = np.abs((predicted - actual) / actual) * 100

    # Columns: predicted value, real value, absolute percentage error.
    report = np.column_stack((predicted, actual, error))
    pd.DataFrame(report).to_excel('geek.xls')
    print(report)
    print('平均计算误差:', '%.2f' % error.mean(), '%')


if __name__ == '__main__':
    main()
下面这一版输出的是小区均价(总价除以面积),并对时间做了连续化处理,即把日期转换成一个连续的数值。验证时从数据集中随机抽取约 5% 的样本作为测试集,训练集不包含这些样本。计算结果非常好,误差几乎是0。在这一点上,神经网络秒杀经典机器学习算法,秒杀xgboost。
# -*- coding: utf-8 -*-
"""Predict the price per unit area for four Songjiang compounds.

Created on Thu Aug 24 15:14:07 2017
@author: Administrator

Pipeline: load sale listings from MongoDB, turn the total price into a unit
price and the listing date into a day number, encode the categorical columns,
hold out a random 5% of the rows, train a small Keras network on the rest and
report the percentage error on the held-out rows.
"""
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd

# The blog post redacted the server address ('192.168.xx.xx', port 2xxxx),
# which made the script unparseable.  Supply the real values through the
# environment; 27017 is only MongoDB's default port, not the author's value.
MONGO_HOST = os.environ.get('MONGO_HOST', '192.168.xx.xx')
MONGO_PORT = int(os.environ.get('MONGO_PORT', '27017'))

DISTRICT_NAMES = ['绿洲比华利花园', '沿海丽水馨庭', '雅仕轩', '上海康城']
QUERY = {
    'city': '上海',
    'cat': 'sell',
    'region': '松江',
    'district_name': {'$in': DISTRICT_NAMES},
    'p_date': {'$gt': 20170508},
}

FEATURE_COLUMNS = [
    'total_price', 'area', 'height', 'room', 'direction',
    'hall', 'toilet', 'fitment', 'district_name', 'p_date',
]
# After prepare_listings() this column holds the price per unit area.
TARGET_COLUMN = 'total_price'
INPUT_COLUMNS = FEATURE_COLUMNS[1:]

DISTRICT_CODES = {name: code for code, name in enumerate(DISTRICT_NAMES)}
SOUTH_FACING = ('南', '南北')        # encoded as 1, anything else as 0
DECORATED = ('精装修', '中装修')     # encoded as 1, anything else as 0
MISSING_DEFAULTS = {'height': 18, 'room': 3, 'hall': 2, 'toilet': 2}

RANGE_PADDING = 0.2   # added to (max - min) so constant columns never divide by zero
TEST_FRACTION = 0.05  # share of rows held out for evaluation

# (units, activation, dropout rate or None) for each hidden layer, in order.
HIDDEN_LAYERS = (
    (48, 'tanh', None),
    (100, 'relu', 0.2),
    (50, 'relu', 0.2),
    (36, 'relu', None),
    (12, 'relu', None),
    (12, 'relu', None),
)
# NOTE(review): the scaled target lies in [0, 1) and the original comment
# called the output layer "sigmoid", but the code uses tanh; kept as written.
OUTPUT_ACTIVATION = 'tanh'


def fetch_listings():
    """Return every listing matching QUERY as a DataFrame, one row per document."""
    # Imported here so the pure data helpers stay importable without a driver.
    from pymongo import MongoClient

    client = MongoClient(MONGO_HOST, MONGO_PORT)
    try:
        # The original asked for count - 1 documents and silently lost the
        # last match; iterating the cursor returns all of them.
        records = list(client.fangjia.seawater.find(QUERY))
    finally:
        client.close()
    return pd.DataFrame(records)


def date_to_ordinal(value):
    """Convert a YYYYMMDD number such as 20170508 into a continuous day count.

    The result is the proleptic Gregorian ordinal as a float.  The original
    used dateutil's parse plus matplotlib's date2num; a different epoch only
    shifts the values by a constant, which min-max scaling cancels out.
    """
    return float(datetime.strptime(str(int(value)), '%Y%m%d').toordinal())


def prepare_listings(listings):
    """Return the model columns with a unit price and numeric dates.

    'total_price' is replaced by total_price / area and 'p_date' by a day
    number.  Every row is converted (the original loop skipped the first one)
    and the input frame is not modified.
    """
    prepared = listings[FEATURE_COLUMNS].copy()
    prepared['total_price'] = prepared['total_price'] / prepared['area']
    prepared['p_date'] = prepared['p_date'].map(date_to_ordinal)
    return prepared


def encode_listings(prepared):
    """Return *prepared* with categorical columns encoded, as an all-float frame.

    District names become 0-3, south-facing flats and decorated flats become
    1 (otherwise 0), and missing height/room/hall/toilet values receive fixed
    defaults.

    Raises:
        ValueError: if a district name has no code in DISTRICT_CODES.
    """
    encoded = prepared.copy()

    unknown = set(encoded['district_name']) - set(DISTRICT_CODES)
    if unknown:
        raise ValueError(
            'unexpected district_name values: %s' % sorted(map(str, unknown)))

    encoded['district_name'] = encoded['district_name'].map(DISTRICT_CODES)
    encoded['direction'] = encoded['direction'].isin(SOUTH_FACING).astype(int)
    encoded['fitment'] = encoded['fitment'].isin(DECORATED).astype(int)
    encoded = encoded.fillna(MISSING_DEFAULTS)
    return encoded.astype(float)


def split_holdout(samples, fraction):
    """Split *samples* into (train, test) with a random *fraction* held out.

    Test labels are drawn without replacement from the labels that actually
    exist.  The original drew random integers in 0..len(samples), which could
    repeat a label or pick one that is not in the index.
    """
    n_test = int(fraction * len(samples))
    test_labels = random.sample(list(samples.index), n_test)
    return samples.drop(test_labels, axis=0), samples.loc[test_labels, :]


def min_max_scale(frame, low, high):
    """Scale each column of *frame* with the per-column bounds *low* and *high*."""
    return (frame - low) / (high - low + RANGE_PADDING)


def build_model(n_inputs):
    """Build and compile the fully connected regression network.

    The model is compiled with mean squared error loss and the Adam optimizer.
    """
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation

    model = Sequential()
    for position, (units, activation, dropout) in enumerate(HIDDEN_LAYERS):
        if position == 0:
            model.add(Dense(input_dim=n_inputs, output_dim=units))
        else:
            # Only the first layer needs an input size; the original declared
            # sizes here that did not match the preceding layer.
            model.add(Dense(output_dim=units))
        model.add(Activation(activation))
        if dropout is not None:
            model.add(Dropout(dropout))
    model.add(Dense(output_dim=1))
    model.add(Activation(OUTPUT_ACTIVATION))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


def main():
    """Fetch the data, train the network and write the evaluation report."""
    listings = fetch_listings()
    print(len(listings))
    if listings.empty:
        raise SystemExit('the query returned no listings')
    listings.to_excel('data.xls')

    prepared = prepare_listings(listings)
    prepared.to_excel('dc.xls')

    # NOTE(review): the original discards the first record (its date loop
    # also skipped it); kept as is - confirm whether that is intended.
    samples = encode_listings(prepared).iloc[1:]
    train, test = split_holdout(samples, TEST_FRACTION)

    # Scaling bounds come from the training rows only.
    low, high = train.min(), train.max()
    scaled_train = min_max_scale(train, low, high)
    scaled_test = min_max_scale(test, low, high)

    model = build_model(len(INPUT_COLUMNS))
    model.fit(scaled_train[INPUT_COLUMNS].values,
              scaled_train[[TARGET_COLUMN]].values,
              nb_epoch=200, batch_size=3)
    model.save_weights('net.model')

    # Undo the scaling of the target to get prices back in original units.
    span = high[TARGET_COLUMN] - low[TARGET_COLUMN] + RANGE_PADDING
    predicted = model.predict(scaled_test[INPUT_COLUMNS].values) * span + low[TARGET_COLUMN]
    actual = test[[TARGET_COLUMN]].values
    error = np.abs((predicted - actual) / actual) * 100

    # Columns: predicted value, real value, absolute percentage error.
    report = np.column_stack((predicted, actual, error))
    pd.DataFrame(report).to_excel('geek.xls')
    print(report)
    print('平均计算误差:', '%.2f' % error.mean(), '%')


if __name__ == '__main__':
    main()
第三版:缺失值改用均值填充,并加入小区经纬度作为特征(2017.8.30)。
# -*- coding: utf-8 -*-
"""Predict the price per unit area using mean-filled features and coordinates.

Created on Thu Aug 24 15:14:07 2017
@author: Administrator

Pipeline: load compound coordinates and sale listings from two MongoDB
servers, attach longitude/latitude to every listing, turn the total price
into a unit price and the listing date into a day number, encode the
categorical columns, fill gaps with column means, hold out a random 5% of
the rows, train a small Keras network and report the percentage error.
"""
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd

# The blog post redacted both ports ("xxx" and "2xxxx"), which left the script
# unrunnable.  Supply the real values through the environment; 27017 is only
# MongoDB's default port, not the author's value.
SEAWEED_HOST = os.environ.get('SEAWEED_MONGO_HOST', '192.168.0.136')
SEAWEED_PORT = int(os.environ.get('SEAWEED_MONGO_PORT', '27017'))
SEAWATER_HOST = os.environ.get('SEAWATER_MONGO_HOST', '192.168.10.88')
SEAWATER_PORT = int(os.environ.get('SEAWATER_MONGO_PORT', '27017'))

DISTRICT_NAMES = ['康桥半岛二期', '康桥半岛三期', '绿洲清水湾', '中邦城市']
COORDINATE_QUERY = {
    'status': 0,
    'cat': 'district',
    'city': '上海',
    'region': '浦东',
    'name': {'$in': DISTRICT_NAMES},
}
COORDINATE_FIELDS = {
    'lat2': 1, 'lng2': 1, 'city': 1, 'region': 1, 'cat': 1, 'name': 1,
}
LISTING_QUERY = {
    'city': '上海',
    'cat': 'sell',
    'region': '浦东',
    'district_name': {'$in': DISTRICT_NAMES},
    'p_date': {'$gt': 20160508},
}

FEATURE_COLUMNS = [
    'total_price', 'area', 'height', 'room', 'direction',
    'hall', 'toilet', 'fitment', 'district_name', 'p_date',
]
# After prepare_listings() this column holds the price per unit area.
TARGET_COLUMN = 'total_price'
# Network inputs: everything except the target; district_name is replaced by
# the compound's coordinates.
INPUT_COLUMNS = [
    'area', 'height', 'room', 'direction', 'hall',
    'toilet', 'fitment', 'p_date', 'lng2', 'lat2',
]
MEAN_FILLED_COLUMNS = ['height', 'room', 'toilet', 'hall']

SOUTH_MARK = '南'                 # directions containing it encode as 0, others as 1
FINE_FITMENT_MARKS = ('豪', '精')  # fitments containing either encode as 0, others as 1
# Placeholders the original assigns when a compound has no coordinates.
DEFAULT_LNG = 0
DEFAULT_LAT = 1

RANGE_PADDING = 0.2   # added to (max - min) so constant columns never divide by zero
TEST_FRACTION = 0.05  # share of rows held out for evaluation

HIDDEN_UNITS = (48, 100, 50, 36, 12, 12)  # every hidden layer uses relu


def _find_all(host, port, collection_name, *find_args):
    """Run find(*find_args) on fangjia.<collection_name> and return the documents."""
    # Imported here so the pure data helpers stay importable without a driver.
    from pymongo import MongoClient

    client = MongoClient(host, port)
    try:
        return list(client.fangjia[collection_name].find(*find_args))
    finally:
        client.close()


def fetch_coordinates():
    """Return a DataFrame of lng2/lat2 indexed by compound name.

    The original bound the collection as ``seaweed1`` but queried ``seaweed``,
    which raised NameError before any data was read.
    """
    documents = _find_all(SEAWEED_HOST, SEAWEED_PORT, 'seaweed',
                          COORDINATE_QUERY, COORDINATE_FIELDS)
    if not documents:
        return pd.DataFrame(columns=['lng2', 'lat2'])
    return pd.DataFrame(documents).set_index('name')[['lng2', 'lat2']]


def fetch_listings():
    """Return every listing matching LISTING_QUERY, one row per document."""
    # The original asked for count - 1 documents and silently lost the last
    # match; reading the whole cursor returns all of them.
    return pd.DataFrame(
        _find_all(SEAWATER_HOST, SEAWATER_PORT, 'seawater', LISTING_QUERY))


def attach_coordinates(listings, coordinates):
    """Return the model columns of *listings* plus 'lng2' and 'lat2'.

    Coordinates are looked up by district name, so any number of compounds
    works (the original indexed exactly four rows by position).  Compounds
    without coordinates keep the original placeholders.  Writing whole float
    columns also avoids per-cell writes of floats into the integer
    placeholder columns.
    """
    frame = listings[FEATURE_COLUMNS].copy()
    lookup = coordinates[~coordinates.index.duplicated(keep='first')]
    frame['lng2'] = frame['district_name'].map(lookup['lng2']).fillna(DEFAULT_LNG)
    frame['lat2'] = frame['district_name'].map(lookup['lat2']).fillna(DEFAULT_LAT)
    return frame


def date_to_ordinal(value):
    """Convert a YYYYMMDD number such as 20170508 into a continuous day count.

    The result is the proleptic Gregorian ordinal as a float.  The original
    used dateutil's parse plus matplotlib's date2num; a different epoch only
    shifts the values by a constant, which min-max scaling cancels out.
    """
    return float(datetime.strptime(str(int(value)), '%Y%m%d').toordinal())


def prepare_listings(frame):
    """Return a copy of *frame* with a unit price and numeric dates.

    'total_price' is replaced by total_price / area and 'p_date' by a day
    number.  Every row is converted (the original loop skipped the first one).
    """
    prepared = frame.copy()
    prepared['total_price'] = prepared['total_price'] / prepared['area']
    prepared['p_date'] = prepared['p_date'].map(date_to_ordinal)
    return prepared


def _has_fine_fitment(value):
    """Return True when the fitment text mentions luxury or fine decoration."""
    text = str(value)
    # The original wrote ('豪' or '精') in text, which only ever tested '豪'.
    return any(mark in text for mark in FINE_FITMENT_MARKS)


def encode_listings(prepared):
    """Return the numeric training table built from *prepared*.

    Direction and fitment become 0/1 flags, missing height/room/toilet/hall
    values are replaced by the column mean, and 'district_name' is dropped
    because the coordinates already identify the compound.
    """
    frame = prepared.copy()
    frame['direction'] = frame['direction'].map(
        lambda value: 0 if SOUTH_MARK in str(value) else 1)
    frame['fitment'] = frame['fitment'].map(
        lambda value: 0 if _has_fine_fitment(value) else 1)
    frame[MEAN_FILLED_COLUMNS] = frame[MEAN_FILLED_COLUMNS].astype(float)
    frame = frame.fillna(frame[MEAN_FILLED_COLUMNS].mean().to_dict())
    return frame.drop('district_name', axis=1).astype(float)


def split_holdout(samples, fraction):
    """Split *samples* into (train, test) with a random *fraction* held out.

    Test labels are drawn without replacement; the original drew independent
    random integers, which could select the same row more than once.
    """
    n_test = int(fraction * len(samples))
    test_labels = random.sample(list(samples.index), n_test)
    return samples.drop(test_labels, axis=0), samples.loc[test_labels, :]


def min_max_scale(frame, low, high):
    """Scale each column of *frame* with the per-column bounds *low* and *high*."""
    return (frame - low) / (high - low + RANGE_PADDING)


def build_model(n_inputs):
    """Build and compile the fully connected regression network.

    All hidden layers use relu and the single output uses a sigmoid; the
    model is compiled with mean squared error loss and the Adam optimizer.
    """
    from keras.models import Sequential
    from keras.layers.core import Dense, Activation

    model = Sequential()
    previous_units = n_inputs
    for units in HIDDEN_UNITS:
        model.add(Dense(input_dim=previous_units, output_dim=units))
        model.add(Activation('relu'))
        previous_units = units
    model.add(Dense(input_dim=previous_units, output_dim=1))
    model.add(Activation('sigmoid'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


def main():
    """Fetch the data, train the network and write the evaluation report."""
    coordinates = fetch_coordinates()
    listings = fetch_listings()
    print(len(listings))
    if listings.empty:
        raise SystemExit('the query returned no listings')
    listings.to_excel('data.xls')

    prepared = prepare_listings(attach_coordinates(listings, coordinates))
    prepared.to_excel('dc.xls')

    # NOTE(review): the original discards the first record (its date loop
    # also skipped it); kept as is - confirm whether that is intended.
    samples = encode_listings(prepared).iloc[1:]
    train, test = split_holdout(samples, TEST_FRACTION)

    # Scaling bounds come from the training rows only.
    low, high = train.min(), train.max()
    scaled_train = min_max_scale(train, low, high)
    scaled_test = min_max_scale(test, low, high)

    model = build_model(len(INPUT_COLUMNS))
    model.fit(scaled_train[INPUT_COLUMNS].values,
              scaled_train[[TARGET_COLUMN]].values,
              nb_epoch=300, batch_size=2)
    model.save_weights('net.model')

    # Undo the scaling of the target to get prices back in original units.
    span = high[TARGET_COLUMN] - low[TARGET_COLUMN] + RANGE_PADDING
    predicted = model.predict(scaled_test[INPUT_COLUMNS].values) * span + low[TARGET_COLUMN]
    actual = test[[TARGET_COLUMN]].values
    error = np.abs((predicted - actual) / actual) * 100

    # Columns: predicted value, real value, absolute percentage error.
    report = np.column_stack((predicted, actual, error))
    pd.DataFrame(report).to_excel('geek.xls')
    print(report)
    print('平均计算误差:', '%.2f' % error.mean(), '%')


if __name__ == '__main__':
    main()
阅读全文
1 0
- lgfj
- 实现ListView的上拉刷新和下拉加载
- startUML破解注册方法
- JavaScript正在吞噬着这个世界
- 详解not in与not exists的区别与用法(not in的性能并不差!)
- 中兴新支点命令篇-文件管理命令(第三弹)
- lgfj
- hdu 5230 ZCC loves hacking
- 我与博客的故事(上)——博客花落知多少
- codility Triangle
- Javaweb核心之request
- 深度学习(五)——DRN, Bi-directional RNN, Attention, seq2seq, DMN
- delete以其他表为参照删除当前表的某些数据
- java中四种引用类型(对象的强、软、弱和虚引用)
- 在鼠标右键上加入使用notepad++打开编辑