从文本中构建词向量

来源:互联网 发布:淘宝店铺设置自动回复 编辑:程序博客网 时间:2024/05/31 04:03

词表到向量的转换函数:

def loadDataSet():
    """Return a small toy corpus and its class labels.

    Returns:
        A tuple ``(postingList, classVec)`` where ``postingList`` is a list of
        already-tokenized documents (each a list of words) and ``classVec``
        holds one label per document: 1 marks abusive text, 0 marks normal
        speech.
    """
    # Tokenized document collection.
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        # 'worthless' was misspelled 'wprthless' here, which split one token
        # into two distinct vocabulary entries.
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # 1 = abusive text, 0 = normal speech.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Build the list of unique words that occur in ``dataSet``.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once, in order of first
        appearance. An empty ``dataSet`` yields an empty list.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # vocabulary (and therefore the vector layout) is identical on every run;
    # iterating a set of strings is not, because of hash randomization.
    return list(dict.fromkeys(word for document in dataSet for word in document))


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: The vocabulary, as a list of words.
        inputSet: The words of the document to encode.

    Returns:
        A list of 0/1 integers as long as ``vocabList``; position ``i`` is 1
        when ``vocabList[i]`` occurs in ``inputSet``. Words that are not in
        the vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every input word.
    # setdefault keeps the first index, matching list.index() semantics if
    # the vocabulary happens to contain duplicates.
    position = {}
    for index, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, index)

    returnVec = [0] * len(vocabList)  # zero vector as long as the vocabulary
    for word in inputSet:
        index = position.get(word)
        if index is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[index] = 1  # mark the word as present
    return returnVec


# Demo: build the vocabulary from the toy corpus and encode the first post.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)
print(setOfWords2Vec(myVocabList, listOPosts[0]))
# Output:
# ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please', 'maybe', 'not', 'take', 'him', 'to', 'park', 'stupid', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'stop', 'posting', 'worthless', 'garbage', 'mr', 'licks', 'ate', 'steak', 'how', 'quit', 'buying', 'food']
# [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
原创粉丝点击