Python使用doc2vec和LR进行文本分类

来源:互联网 发布:mac os操作系统教程 编辑:程序博客网 时间:2024/05/29 06:53

(1)数据预处理
a.对文本数据进行贴标签处理,标签数据类似如下:

平素体质:健康状况:良,既往有“高血压病史”多年。#1

其中1表示患有高血压,0表示没有患有高血压。
然后将文本与标签拆分开:文本存储在一个文件,标签存储在另一个文件,两个文件的内容逐行一一对应。
b.对文本文件的内容进行分词。

import jieba

# Read the labelled corpus and split it into two files whose lines
# correspond one-to-one: the jieba-segmented text and the class label.
# Both outputs are written as UTF-8 explicitly because they are read back
# with encoding='utf-8' later on; `with` closes all three files even if
# segmentation fails part-way through.
with open(u'/home/ubuntu/file/数据平衡分类', encoding='utf-8') as file, \
        open(u'/home/ubuntu/file/数据平衡无分类', 'w', encoding='utf-8') as filenoclass, \
        open(u'/home/ubuntu/file/数据平衡分类结果', 'w', encoding='utf-8') as fileclass:
    for lines in file:
        # Each input line looks like "<text>#<label>". Split on the LAST '#'
        # so a '#' inside the text cannot be mistaken for the separator.
        text, sep, label = lines.strip().rpartition('#')
        if not sep:
            # No separator (blank or malformed line): skip it entirely so the
            # two output files stay aligned line by line.
            continue
        # One document per line, every token followed by a single space.
        filenoclass.write(''.join(seg + ' ' for seg in jieba.cut(text)))
        filenoclass.write('\n')
        fileclass.write(label + '\n')

(2)训练doc2vec得到文本向量

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Label file, one label per line. It is intentionally left open here:
# getData() further down reads the labels from this module-level handle.
fileclass = open(u'/home/ubuntu/file/数据平衡分类结果', encoding='utf-8')

# Hand TaggedLineDocument the path instead of an open handle so that no
# corpus file object is left unclosed; each line of the file becomes one
# document tagged with its line number.
documents = gensim.models.doc2vec.TaggedLineDocument(u'/home/ubuntu/file/数据平衡无分类')

# `vector_size` replaces the older `size` keyword, which gensim 4 no longer
# accepts. NOTE(review): min_count=100 drops every token seen fewer than 100
# times - confirm this suits the corpus size.
model = gensim.models.Doc2Vec(documents, vector_size=100, window=8, min_count=100, workers=8)

# Generate the document vectors; print the one for the second line as a check.
print(model.docvecs[1])

(3)准备进行分类的数据

def getData():
    """Build the train/test split of document vectors and their labels.

    Reads one label per line from the module-level ``fileclass`` handle and
    one vector per document from the module-level ``model`` (``model.docvecs``).
    Because the labels come from an open file handle, a second call finds the
    handle exhausted.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The X parts are
        DataFrames with one row per document (index ``'p0'``, ``'p1'``, ...)
        and one column per vector dimension; the y parts are Series named
        ``'class0'`` sharing that index.

    Raises:
        ValueError: if the number of labels differs from the number of
            document vectors.
    """
    labels = [line.strip() for line in fileclass]

    # One row per document, built directly instead of dict -> DataFrame -> T.
    doc_count = len(model.docvecs)
    features = pd.DataFrame(
        [model.docvecs[i] for i in range(doc_count)],
        index=['p' + str(i) for i in range(doc_count)],
    )

    if len(labels) != doc_count:
        raise ValueError(
            'label count (%d) does not match document count (%d)'
            % (len(labels), doc_count)
        )

    # Keep the labels OUT of the feature matrix: splitting a frame that still
    # carries the 'class0' column would hand the classifier the answer.
    target = pd.Series(labels, index=features.index, name='class0')

    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        features, target, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1

(4)准备测试方法

def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Predictions and labels are paired by position. Iterating (rather than
    indexing with ``testClass[i]``) keeps the pairing positional even when
    ``testClass`` is a pandas Series whose index is not 0..n-1, as is the
    case after a shuffled train/test split.

    Args:
        testPre: sized sequence of predicted labels.
        testClass: iterable of true labels in the same order.

    Returns:
        Matches divided by ``len(testPre)`` as a float; ``0.0`` when
        ``testPre`` is empty. If ``testClass`` is shorter than ``testPre``,
        the unpaired predictions count as misses.
    """
    testNum = len(testPre)
    if testNum == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    rightNum = sum(
        1 for predicted, actual in zip(testPre, testClass) if predicted == actual
    )
    return float(rightNum) / float(testNum)

(5)进行模型训练

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Hand TaggedLineDocument the path instead of an open handle so that no
# corpus file object is left unclosed; each line of the file becomes one
# document tagged with its line number.
documents = gensim.models.doc2vec.TaggedLineDocument(u'/home/ubuntu/file/数据平衡无分类')

# `vector_size` replaces the older `size` keyword, which gensim 4 no longer
# accepts. NOTE(review): min_count=100 drops every token seen fewer than 100
# times - confirm this suits the corpus size.
model = gensim.models.Doc2Vec(documents, vector_size=100, window=8, min_count=100, workers=8)

# Generate the document vectors; print the one for the second line as a check.
print(model.docvecs[1])


# Use logistic regression for prediction.
def LR():
    """Return an untrained LogisticRegression with default settings."""
    clf = LogisticRegression()
    return clf


def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Predictions and labels are paired by position. Iterating (rather than
    indexing with ``testClass[i]``) keeps the pairing positional even when
    ``testClass`` is a pandas Series whose index is not 0..n-1, as is the
    case after a shuffled train/test split.

    Args:
        testPre: sized sequence of predicted labels.
        testClass: iterable of true labels in the same order.

    Returns:
        Matches divided by ``len(testPre)`` as a float; ``0.0`` when
        ``testPre`` is empty. If ``testClass`` is shorter than ``testPre``,
        the unpaired predictions count as misses.
    """
    testNum = len(testPre)
    if testNum == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    rightNum = sum(
        1 for predicted, actual in zip(testPre, testClass) if predicted == actual
    )
    return float(rightNum) / float(testNum)


def getData():
    """Build the train/test split of document vectors and their labels.

    Reads one label per line from the module-level ``fileclass`` handle and
    one vector per document from the module-level ``model`` (``model.docvecs``).

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The X parts are
        DataFrames with one row per document (index ``'p0'``, ``'p1'``, ...)
        and one column per vector dimension; the y parts are Series named
        ``'class0'`` sharing that index.

    Raises:
        ValueError: if the number of labels differs from the number of
            document vectors.
    """
    labels = [line.strip() for line in fileclass]

    # One row per document, built directly instead of dict -> DataFrame -> T.
    doc_count = len(model.docvecs)
    features = pd.DataFrame(
        [model.docvecs[i] for i in range(doc_count)],
        index=['p' + str(i) for i in range(doc_count)],
    )

    if len(labels) != doc_count:
        raise ValueError(
            'label count (%d) does not match document count (%d)'
            % (len(labels), doc_count)
        )

    # Keep the labels OUT of the feature matrix: splitting a frame that still
    # carries the 'class0' column would hand the classifier the answer.
    target = pd.Series(labels, index=features.index, name='class0')

    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        features, target, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1


# getData() reads the labels from the module-level name `fileclass`; binding
# it with `with` guarantees the label file is closed once the data is built.
with open(u'/home/ubuntu/file/数据平衡分类结果', encoding='utf-8') as fileclass:
    T = getData()
trainMatrix, trainClass, testMatrix, testClass = T[0], T[1], T[2], T[3]

clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)
print('Logistic Regression recognition rate: ', getRecognitionRate(clf_LR.predict(testMatrix), testClass))
原创粉丝点击