Python natural language processing in code (batch file reading, word segmentation, word vectorization)

# coding=utf-8
import jieba

# candidate encodings, for reference: ['ANSI','utf-8','gb18030','ISO-8859-2','gb2312','gbk']
## Read two novels as test texts; open() and read() go together, and read() can only consume the file once
df = open(r"D:\A仲敏2015\python_code\飞狐外传.txt", encoding='gb18030', errors='ignore')
df1 = df.read()
df_test = df1[1000:1100]    # take a 100-character slice as a test sample
cf = open(r"D:\A仲敏2015\python_code\天龙八部.txt", encoding='gb18030', errors='ignore')
cf1 = cf.read()
cf_test = cf1[1000:1100]

## Segment the test slices with jieba, then join the tokens with spaces
df_ceshi = jieba.cut(df_test)
cf_ceshi = jieba.cut(cf_test)
cf_r = " ".join(cf_ceshi)
df_r = " ".join(df_ceshi)

## Load the stop-word list
stop_words = open(r"D:\A仲敏2015\python_code\stop_words.txt")
stop_content = stop_words.read()
stop_list = stop_content.splitlines()    # turn the stop-word file into a list
stop_words.close()

## Vectorization with scikit-learn
## Feature processing: vectorization and the hash trick
## Method 1: TF-IDF feature extraction with scikit-learn's TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [cf_r, df_r]
vector = TfidfVectorizer(stop_words=stop_list)    # pass the stop words into the model
tfidf = vector.fit_transform(corpus)              # fit the model and vectorize the corpus

### The mapping between each word and its TF-IDF weight
wordlist = vector.get_feature_names()    # all words in the bag-of-words model (on scikit-learn >= 1.2 use get_feature_names_out())
weightlist = tfidf.toarray()             # extract the TF-IDF matrix; a[i][j] is the weight of word j in text i
for i in range(len(weightlist)):
    print("....... TF-IDF weights for text segment %s ........." % i)
    for j in range(len(wordlist)):
        print(wordlist[j], weightlist[i][j])

## Method 2: word-frequency counting plus TF-IDF with CountVectorizer and TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words=stop_list)    # converts the texts into a term-frequency matrix; a[i][j] is the frequency of word j in text i
transformer = TfidfTransformer()                      # computes a TF-IDF weight for each term
# the inner fit_transform builds the term-frequency matrix; the outer one computes TF-IDF from it
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()    # all words in the bag-of-words model
weight = tfidf.toarray()                 # TF-IDF matrix; a[i][j] is the weight of word j in text i
for i in range(len(weight)):
    print("....... TF-IDF weights for text segment %s ........." % i)
    for j in range(len(word)):
        print(word[j], weight[i][j])

## Check a file's encoding; chardet.detect() expects bytes, so read the file in binary mode first
import chardet
with open(r"D:\A仲敏2015\python_code\飞狐外传.txt", 'rb') as f:
    chardit1 = chardet.detect(f.read())
print(chardit1['encoding'])

## Batch-reading txt files with Python
import os
import glob

def open_allfile(path, filetype):
    """Read every file of the given type under path and return a list of their contents."""
    data = []
    read_files = glob.glob(path + '*' + filetype)
    for i in read_files:
        with open(i, encoding='gb18030', errors='ignore') as infile:
            data.append(infile.read())
    return data

def get_filename(path, filetype):
    """Collect the file names (extension stripped) of the given type under path."""
    name = []
    for root, dirs, files in os.walk(path):
        for i in files:
            if filetype in i:
                name.append(i.replace(filetype, ''))
    return name

# test
path1 = 'D:\\A仲敏2015\\python_code\\批量文本\\'
filetype2 = '.txt'
data1 = open_allfile(path1, filetype2)
name2 = get_filename(path1, filetype2)
print(name2)

## Build a list holding the segmentation result of each document
Rs2 = []
for i in range(len(data1)):
    result = []
    seg_list = jieba.cut(data1[i])
    for w in seg_list:    # collect each token of this document
        result.append(w)
    Rs2.append(result)

## Build the list of segmentation results, method 2
data = []
for da in data1:
    cut = jieba.cut(da)
    sen = ' '.join(cut)    # join one document's tokens into a space-separated string
    data.append(sen)       # note: list.append() mutates in place and returns None, so never reassign its result
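Putting the pieces above together: the sketch below is one way to chain the batch reader, jieba segmentation, and TfidfVectorizer into a single pipeline. The corpus_dir path is a placeholder, and gb18030 is assumed to match your files' encoding; adjust both for your own data.

import glob
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

corpus_dir = 'D:\\corpus\\'    # hypothetical directory of .txt files
docs = []
for fname in glob.glob(corpus_dir + '*.txt'):
    with open(fname, encoding='gb18030', errors='ignore') as f:
        docs.append(f.read())

# segment each document and rejoin with spaces, the whitespace-separated format TfidfVectorizer expects
corpus = [' '.join(jieba.cut(d)) for d in docs]

vector = TfidfVectorizer()
tfidf = vector.fit_transform(corpus)
print(tfidf.shape)    # (number of documents, vocabulary size)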
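Printing every word's weight, as the nested loops above do, gets unwieldy once the vocabulary is more than a few hundred terms. A minimal alternative sketch, assuming the wordlist and weightlist variables from method 1 above, shows only the top-weighted words per segment (numpy is already a scikit-learn dependency):

import numpy as np

top_n = 10
for i, row in enumerate(weightlist):
    # indices of the top_n largest TF-IDF weights in this segment, largest first
    top_idx = np.argsort(row)[::-1][:top_n]
    print("top words for segment %s:" % i,
          [(wordlist[j], round(float(row[j]), 4)) for j in top_idx])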
