bagofwords tf-idf word2vec特征实践
来源:互联网 发布:mac ios 配置 jenkins 编辑:程序博客网 时间:2024/06/03 10:20
1 bag-of-words + naive Bayes
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


# Data preparation
def data_prepare():
    """Load the labelled titles and split them into train and test frames.

    Reads "window regulator01.xlsx", separates the rows whose ``categories``
    column equals 0 from those where it equals 1, and takes the leading 70%
    of each class (in file order, without shuffling) for training. The
    remaining rows of both classes form the test set.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    split = 0.7
    table = pd.read_excel("window regulator01.xlsx")

    negatives = table[table.categories == 0]
    print("0样本数量 = %d" % len(negatives))
    positives = table[table.categories == 1]
    print("1样本数量 = %d" % len(positives))

    # Cut each class at the same fraction so both splits keep the class ratio.
    neg_cut = int(split * len(negatives))
    pos_cut = int(split * len(positives))
    d_train = pd.concat([negatives[:neg_cut], positives[:pos_cut]])
    d_test = pd.concat([negatives[neg_cut:], positives[pos_cut:]])

    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Fit a bag-of-words multinomial naive Bayes model and score the test set.

    Args:
        d_train: training frame; its ``title`` and ``categories`` columns are used.
        d_test: test frame; its ``title`` and ``categories`` columns are used.

    Returns:
        ``(y_true, predict)`` where ``y_true`` is ``d_test.categories`` and
        ``predict`` is the ``predict_proba`` output for the test titles.
    """
    # Bag-of-words feature extraction: the vocabulary is fitted on training titles only.
    bow = CountVectorizer()
    train_matrix = bow.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(train_matrix.shape))
    # print(bow.get_feature_names()[3000:3050])  # show a sample of feature names
    test_matrix = bow.transform(d_test.title)

    # Build the naive Bayes model
    classifier = MultinomialNB()
    print("训练中。。。")
    classifier.fit(train_matrix, d_train.categories)
    print("测试中。。。")
    predict = classifier.predict_proba(test_matrix)
    y_true = d_test.categories

    # Column 1 of predict_proba is used as the score; 0.5 is the decision threshold.
    scores = predict[:, 1]
    acc = accuracy_score(y_true, scores > 0.5)
    auc = roc_auc_score(y_true, scores)
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict


# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve and annotate it with accuracy and AUC.

    Args:
        y_true: true labels of the test samples.
        predict: ``predict_proba``-style array; column 1 is used as the score.
    """
    scores = predict[:, 1]
    acc = accuracy_score(y_true, scores > 0.5)
    auc = roc_auc_score(y_true, scores)
    fpr, tpr, _thresholds = roc_curve(y_true, scores)

    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
2 tf-idf + bayes
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


# Data preparation
def data_prepare():
    """Read the labelled titles and return a per-class 70/30 train/test split.

    The workbook "window regulator01.xlsx" is loaded, rows are grouped by the
    value of the ``categories`` column (0 or 1), and the first 70% of each
    group, in file order, goes to the training frame. Everything after that
    cut goes to the test frame. No shuffling is performed.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    split = 0.7
    sheet = pd.read_excel("window regulator01.xlsx")

    class0 = sheet[sheet.categories == 0]
    print("0样本数量 = %d" % len(class0))
    class1 = sheet[sheet.categories == 1]
    print("1样本数量 = %d" % len(class1))

    cut0 = int(split * len(class0))
    cut1 = int(split * len(class1))
    d_train = pd.concat([class0[:cut0], class1[:cut1]])
    d_test = pd.concat([class0[cut0:], class1[cut1:]])

    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Fit a TF-IDF multinomial naive Bayes model and score the test set.

    Args:
        d_train: training frame; its ``title`` and ``categories`` columns are used.
        d_test: test frame; its ``title`` and ``categories`` columns are used.

    Returns:
        ``(y_true, predict)`` where ``y_true`` is ``d_test.categories`` and
        ``predict`` is the ``predict_proba`` output for the test titles.
    """
    # TF-IDF feature extraction: vocabulary and IDF weights come from the training titles only.
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(train_matrix.shape))
    # print(tfidf.get_feature_names()[3000:3050])  # show a sample of feature names
    test_matrix = tfidf.transform(d_test.title)

    # Build the naive Bayes model
    classifier = MultinomialNB()
    print("训练中。。。")
    classifier.fit(train_matrix, d_train.categories)
    print("测试中。。。")
    predict = classifier.predict_proba(test_matrix)
    y_true = d_test.categories

    # Column 1 of predict_proba is used as the score; 0.5 is the decision threshold.
    scores = predict[:, 1]
    acc = accuracy_score(y_true, scores > 0.5)
    auc = roc_auc_score(y_true, scores)
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict


# Model evaluation
def performance(y_true, predict):
    """Draw the ROC curve with accuracy and AUC annotations and show the figure.

    Args:
        y_true: true labels of the test samples.
        predict: ``predict_proba``-style array; column 1 is used as the score.
    """
    scores = predict[:, 1]
    acc = accuracy_score(y_true, scores > 0.5)
    auc = roc_auc_score(y_true, scores)
    fpr, tpr, _thresholds = roc_curve(y_true, scores)

    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
3 tf-idf + bayes 参数优化
import csv
from itertools import product

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.naive_bayes import MultinomialNB


# Data preparation
def data_prepare():
    """Load the labelled titles and split them into train and test frames.

    Reads "window regulator01.xlsx", separates rows by the value of the
    ``categories`` column (0 or 1) and takes the first 70% of each class, in
    file order and without shuffling, for training. The rest is the test set.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))

    no_cut = int(split * len(Nodf))
    yes_cut = int(split * len(Yesdf))
    d_train = pd.concat([Nodf[:no_cut], Yesdf[:yes_cut]])
    d_test = pd.concat([Nodf[no_cut:], Yesdf[yes_cut:]])

    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Fit a default TF-IDF + multinomial naive Bayes model and score the test set.

    Args:
        d_train: training frame; its ``title`` and ``categories`` columns are used.
        d_test: test frame; its ``title`` and ``categories`` columns are used.

    Returns:
        ``(y_true, predict)`` where ``y_true`` is ``d_test.categories`` and
        ``predict`` is the ``predict_proba`` output for the test titles.
    """
    # Feature extraction: the vectorizer is fitted on training titles only.
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    # print(vectorizer.get_feature_names()[3000:3050])  # show a sample of feature names
    test_features = vectorizer.transform(d_test.title)

    # Build the naive Bayes model
    NBmodle = MultinomialNB()
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict


# Model with tunable parameters
def create_model_param(d_train, d_test, max_features=None, min_df=1, nb_alpha=1.0):
    """Train TF-IDF + naive Bayes with the given hyper-parameters and report AUC.

    Args:
        d_train: training frame; its ``title`` and ``categories`` columns are used.
        d_test: test frame; its ``title`` and ``categories`` columns are used.
        max_features: forwarded to ``TfidfVectorizer(max_features=...)``.
        min_df: forwarded to ``TfidfVectorizer(min_df=...)``.
        nb_alpha: forwarded to ``MultinomialNB(alpha=...)``.

    Returns:
        A dict with keys ``"max_feature"``, ``"min_df"``, ``"nb_alpha"`` and
        ``"AUC"``, matching the CSV columns written by the driver below.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df)
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    # print(vectorizer.get_feature_names()[3000:3050])  # show a sample of feature names
    test_features = vectorizer.transform(d_test.title)

    # Build the naive Bayes model
    NBmodle = MultinomialNB(alpha=nb_alpha)
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    auc = roc_auc_score(y_true, predict[:, 1])
    print("AUC = %0.2f" % auc)
    return {
        "max_feature": max_features,
        "min_df": min_df,
        "nb_alpha": nb_alpha,
        "AUC": auc,
    }


# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve and annotate it with accuracy and AUC.

    Args:
        y_true: true labels of the test samples.
        predict: ``predict_proba``-style array; column 1 is used as the score.
    """
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


# Parameter grid
param_values = {
    "max_feature": [1000, 2000, 3000, None],
    "min_df": [1, 2, 3],
    "nb_alpha": [0.01, 0.1, 1.0],
}

result = []
# The context manager closes (and therefore flushes) result.csv even when a
# training run raises part-way through the grid; the original never closed it.
with open("result.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["AUC", "max_feature", "min_df", "nb_alpha"])
    writer.writeheader()
    d_train, d_test = data_prepare()
    # Name each list explicitly so the tuple order does not depend on the
    # insertion order of ``param_values``.
    grid = product(
        param_values["max_feature"],
        param_values["min_df"],
        param_values["nb_alpha"],
    )
    for p in grid:
        print(p)
        max_features, min_df, nb_alpha = p
        res = create_model_param(
            d_train,
            d_test,
            max_features=max_features,
            min_df=min_df,
            nb_alpha=nb_alpha,
        )
        result.append(res)
        print(res)
        writer.writerow(res)
4 word2vec + random forest
import csv
from itertools import product

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.naive_bayes import MultinomialNB


# Data preparation
def data_prepare():
    """Load the labelled titles and split them into train and test frames.

    Reads "window regulator01.xlsx", separates rows by the value of the
    ``categories`` column (0 or 1) and takes the first 70% of each class, in
    file order and without shuffling, for training. The rest is the test set.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))

    no_cut = int(split * len(Nodf))
    yes_cut = int(split * len(Yesdf))
    d_train = pd.concat([Nodf[:no_cut], Yesdf[:yes_cut]])
    d_test = pd.concat([Nodf[no_cut:], Yesdf[yes_cut:]])

    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def data_process(doc):
    """Lower-case and tokenise every document.

    Word2Vec is fed a list of documents in which each document is itself a
    list of tokens, so the result is a list of token lists.

    Args:
        doc: iterable of document strings.

    Returns:
        A list with one ``nltk.word_tokenize`` token list per document.
    """
    return [nltk.word_tokenize(d.lower()) for d in doc]


def featurize_w2v(model, sentences):
    """Turn tokenised documents into fixed-length vectors by averaging word vectors.

    Each document vector is the sum of the vectors of its in-vocabulary
    tokens divided by the total token count of the document, so
    out-of-vocabulary tokens still count in the denominator. A document with
    no tokens keeps an all-zero row.

    Args:
        model: trained word-vector model exposing ``vector_size`` and word
            lookup by indexing, either on ``model.wv`` or on the model itself.
        sentences: list of token lists, as produced by ``data_process``.

    Returns:
        A ``(len(sentences), model.vector_size)`` float array.
    """
    # Look words up through ``model.wv`` when it exists; indexing the model
    # object directly is deprecated in gensim. Fall back to the model itself
    # for objects without a ``wv`` attribute.
    vectors = getattr(model, "wv", model)
    f = np.zeros((len(sentences), model.vector_size))
    for i, s in enumerate(sentences):
        if len(s) == 0:
            # An empty title would produce 0/0 = NaN, which the random forest
            # rejects; keep the zero row instead.
            continue
        for w in s:
            try:
                vec = vectors[w]
            except KeyError:
                # Word not seen while training the embedding: contributes nothing.
                continue
            f[i, :] = f[i, :] + vec
        f[i, :] = f[i, :] / len(s)
    return f


def create_model(d_train, d_test):
    """Train Word2Vec on the training titles, then a random forest on document vectors.

    Args:
        d_train: training frame; its ``title`` and ``categories`` columns are used.
        d_test: test frame; its ``title`` and ``categories`` columns are used.

    Returns:
        ``(d_test.categories, predict)`` where ``predict`` is the forest's
        ``predict_proba`` output for the test titles.
    """
    sentences = data_process(d_train.title)
    # NOTE(review): ``size=`` and ``init_sims`` look like the gensim < 4.0 API
    # (``vector_size=`` in 4.x) - confirm the installed version before upgrading.
    model = Word2Vec(sentences, size=300, window=1, min_count=1, sample=1e-3, workers=2)
    model.init_sims(replace=True)

    # word2vec feature extraction
    feature_train = featurize_w2v(model, sentences)
    RFCmodel = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    RFCmodel.fit(feature_train, d_train.categories)

    test_sentences = data_process(d_test.title)
    feature_test = featurize_w2v(model, test_sentences)
    predict = RFCmodel.predict_proba(feature_test)
    return d_test.categories, predict


# Model evaluation
def performance(y_true, predict):
    """Plot the ROC curve and annotate it with accuracy and AUC.

    Args:
        y_true: true labels of the test samples.
        predict: ``predict_proba``-style array; column 1 is used as the score.
    """
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    # plt.ion()  # enable interactive mode
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)
阅读全文
0 0
- bagofwords tf-idf word2vec特征实践
- TF IDF 特征选择
- SparkML中三种文本特征提取算法(TF-IDF/Word2Vec/CountVectorizer)
- 特征提取-计算tf-idf
- Spark特征提取---TF-IDF
- 三种文本特征提取(TF-IDF/Word2Vec/CountVectorizer)及Spark MLlib调用实例(Scala/Java/python)
- scala--三种文本特征提取(TF-IDF/Word2Vec/CountVectorizer)及Spark MLlib调用实例(Scala/Java/python)
- 特征选择方法之TF-IDF、DF
- Spark-特征抽取(TF-IDF)
- 特征选择方法之TF-IDF、DF
- 【Spark Mllib】TF-IDF&Word2Vec——文本相似度
- TF-IDF特征提取 用sklearn提取tfidf特征
- TF/IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- Groovy&Java动态编译执行
- OkHttp3 基本用法
- 三 无限轮播 有原点
- Java 编译器代码定义的 Token 保留字
- Java 编译器代码定义的 Java语言的类型 Types
- bagofwords tf-idf word2vec特征实践
- java代码执行过程简介
- python3 装饰器
- 第1章 JVM语言家族概览 《Kotin 编程思想·实战》
- 第2章 Kotlin简介 《Kotin 编程思想·实战》
- 第3章 快速开始:HelloWorld 《Kotin 编程思想·实战》
- 第4章 kotlin代码执行过程《Kotin 编程思想·实战》
- 服务器安装好tomcat之后,启动无法通过ip访问
- 除了清空购物车,阿里年会的技术也超霸气!