多类分类回归

来源:互联网 发布:java web 用户重复登录 编辑:程序博客网 时间:2024/06/05 17:36
#-*- coding:utf-8 -*-
# Multi-class sentiment classification with TF-IDF features and logistic
# regression.
#
# Reads ``train.tsv.zip`` and ``test.tsv.zip`` from the working directory,
# fits the classifier on the training ``Phrase``/``Sentiment`` columns,
# predicts a label for every test phrase and writes ``Submission.csv`` with
# the columns ``PhraseId`` and ``Sentiment``.
import sys
import zipfile

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    # Older scikit-learn releases only ship the pre-``model_selection`` modules.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec, as the
    # original script did.  ``reload`` is not a builtin on Python 3, hence the
    # version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


def _read_zipped_tsv(path):
    """Return the first member of the zip archive at *path* as a DataFrame.

    The member is parsed as tab-separated text whose first row is the header.
    Both the archive and the member stream are closed before returning.
    """
    with zipfile.ZipFile(path) as archive:
        with archive.open(archive.namelist()[0]) as member:
            return pd.read_csv(member, header=0, delimiter='\t')


def _grid_search_best_params():
    """Grid-search the TF-IDF / logistic-regression pipeline and print the winner.

    This is the hyper-parameter search that was disabled (wrapped in a string
    literal) in the original script; it is kept as an uncalled helper.  It
    fits on a random half of ``train.tsv.zip`` and prints the best accuracy
    score and the best value found for each searched parameter.

    NOTE(review): the original snippet ran under ``if __name__ == '__main__'``,
    presumably because ``n_jobs=3`` starts worker processes -- keep such a
    guard around any call to this function.
    NOTE(review): the constants used by the script below look like the result
    of this search -- confirm before changing either.
    """
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression()),
    ])
    parameters = {
        # Terms whose document frequency exceeds the threshold are ignored.
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    df = _read_zipped_tsv('train.tsv.zip')
    x, y = df['Phrase'].values, df['Sentiment'].values
    # Only the training half is used; the held-out half is discarded.
    x_train, _, y_train, _ = train_test_split(x, y, train_size=0.5)
    # n_jobs is the number of parallel workers.
    gridsearch = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                              scoring='accuracy')
    gridsearch.fit(x_train, y_train)
    print('最佳效果%0.3f' % gridsearch.best_score_)
    print('最佳参数组合')
    for name in parameters:
        print('%s : %s' % (name, gridsearch.best_params_[name]))


vect = TfidfVectorizer(stop_words='english', max_df=0.25, ngram_range=(1, 2),
                       use_idf=False)
clf = LogisticRegression(C=10)

df = _read_zipped_tsv('train.tsv.zip')
dft = _read_zipped_tsv('test.tsv.zip')

# ``.values`` yields the same NumPy arrays as the removed ``as_matrix()``.
train_y = df['Sentiment'].values
train_x = vect.fit_transform(df['Phrase'].values)

xt = dft['Phrase'].values
xid = dft['PhraseId'].values
# Reuse the vocabulary learned on the training phrases.
test_x = vect.transform(xt)

clf.fit(train_x, train_y)
predictions = clf.predict(test_x)

# ``xid`` and ``predictions`` are row-aligned with the test frame, so the
# table can be built directly without a per-row Python loop.
pd_data = pd.DataFrame({'PhraseId': xid, 'Sentiment': predictions},
                       columns=['PhraseId', 'Sentiment'])
# index=False: write only the two declared columns, not the row index.
pd_data.to_csv('Submission.csv', index=False)
print('完成')