bag-of-words、tf-idf、word2vec 特征实践

来源:互联网 发布:mac ios 配置 jenkins 编辑:程序博客网 时间:2024/06/03 10:20

1 bag-of-words + bayes

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


def data_prepare():
    """Load the labelled spreadsheet and split it per class into train/test.

    Rows are read from "window regulator01.xlsx". For each value of the
    ``categories`` column (0 and 1) the first 70% of the rows go to the
    training frame and the rest to the test frame; no shuffling is done.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    frame = pd.read_excel("window regulator01.xlsx")
    split = 0.7

    negatives = frame[frame.categories == 0]
    print("0样本数量 = %d" % len(negatives))
    positives = frame[frame.categories == 1]
    print("1样本数量 = %d" % len(positives))

    # Cut each class at the same fraction so both sets keep the class ratio.
    neg_cut = int(split * len(negatives))
    pos_cut = int(split * len(positives))
    d_train = pd.concat([negatives[:neg_cut], positives[:pos_cut]])
    d_test = pd.concat([negatives[neg_cut:], positives[pos_cut:]])

    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Fit a multinomial naive Bayes model on bag-of-words title features.

    The vocabulary is learned from the training titles only and then applied
    to the test titles. Accuracy (at a 0.5 threshold) and AUC are printed.

    Args:
        d_train: DataFrame with ``title`` and ``categories`` columns.
        d_test: DataFrame with ``title`` and ``categories`` columns.

    Returns:
        A ``(y_true, predict)`` tuple: the test labels and the
        ``predict_proba`` output for the test set.
    """
    bow = CountVectorizer()  # bag-of-words feature extraction
    train_matrix = bow.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(train_matrix.shape))
    test_matrix = bow.transform(d_test.title)

    # Build the naive Bayes classifier.
    classifier = MultinomialNB()
    print("训练中。。。")
    classifier.fit(train_matrix, d_train.categories)
    print("测试中。。。")
    predict = classifier.predict_proba(test_matrix)

    y_true = d_test.categories
    positive_scores = predict[:, 1]
    print("acc = %0.2f" % accuracy_score(y_true, positive_scores > 0.5))
    print("AUC = %0.2f" % roc_auc_score(y_true, positive_scores))
    return y_true, predict


def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy and AUC.

    Args:
        y_true: True labels of the evaluated samples.
        predict: Probability array whose column 1 is the positive-class score.
    """
    positive_scores = predict[:, 1]
    acc = accuracy_score(y_true, positive_scores > 0.5)
    auc = roc_auc_score(y_true, positive_scores)
    fpr, tpr, _thresholds = roc_curve(y_true, positive_scores)

    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)


2 tf-idf + bayes

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt


def data_prepare():
    """Read the labelled spreadsheet and build a per-class 70/30 split.

    The data comes from "window regulator01.xlsx". Rows with
    ``categories == 0`` and ``categories == 1`` are split separately, taking
    the leading 70% of each for training and the remainder for testing, in
    file order.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    table = pd.read_excel("window regulator01.xlsx")
    split = 0.7

    class0 = table[table.categories == 0]
    print("0样本数量 = %d" % len(class0))
    class1 = table[table.categories == 1]
    print("1样本数量 = %d" % len(class1))

    # Same cut fraction for both classes.
    cut0 = int(split * len(class0))
    cut1 = int(split * len(class1))
    d_train = pd.concat([class0[:cut0], class1[:cut1]])
    d_test = pd.concat([class0[cut0:], class1[cut1:]])

    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Fit a multinomial naive Bayes model on TF-IDF title features.

    The TF-IDF vocabulary and weights are fitted on the training titles and
    reused to transform the test titles. Accuracy (threshold 0.5) and AUC on
    the test set are printed.

    Args:
        d_train: DataFrame with ``title`` and ``categories`` columns.
        d_test: DataFrame with ``title`` and ``categories`` columns.

    Returns:
        A ``(y_true, predict)`` tuple: the test labels and the
        ``predict_proba`` output for the test set.
    """
    tfidf = TfidfVectorizer()  # TF-IDF feature extraction
    train_matrix = tfidf.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(train_matrix.shape))
    test_matrix = tfidf.transform(d_test.title)

    # Build the naive Bayes classifier.
    classifier = MultinomialNB()
    print("训练中。。。")
    classifier.fit(train_matrix, d_train.categories)
    print("测试中。。。")
    predict = classifier.predict_proba(test_matrix)

    y_true = d_test.categories
    positive_scores = predict[:, 1]
    print("acc = %0.2f" % accuracy_score(y_true, positive_scores > 0.5))
    print("AUC = %0.2f" % roc_auc_score(y_true, positive_scores))
    return y_true, predict


def performance(y_true, predict):
    """Draw the ROC curve and annotate it with accuracy and AUC.

    Args:
        y_true: True labels of the evaluated samples.
        predict: Probability array whose column 1 is the positive-class score.
    """
    positive_scores = predict[:, 1]
    acc = accuracy_score(y_true, positive_scores > 0.5)
    auc = roc_auc_score(y_true, positive_scores)
    fpr, tpr, _thresholds = roc_curve(y_true, positive_scores)

    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)



3 tf-idf + bayes 参数优化

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from itertools import product
import csv


def data_prepare():
    """Load the labelled spreadsheet and split it per class into train/test.

    Rows are read from "window regulator01.xlsx". For each value of the
    ``categories`` column (0 and 1) the first 70% of the rows go to the
    training frame and the rest to the test frame; no shuffling is done.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def create_model(d_train, d_test):
    """Fit a default TF-IDF + multinomial naive Bayes model and score it.

    Prints test accuracy (threshold 0.5) and AUC.

    Args:
        d_train: DataFrame with ``title`` and ``categories`` columns.
        d_test: DataFrame with ``title`` and ``categories`` columns.

    Returns:
        A ``(y_true, predict)`` tuple: the test labels and the
        ``predict_proba`` output for the test set.
    """
    vectorizer = TfidfVectorizer()  # TF-IDF feature extraction
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    test_features = vectorizer.transform(d_test.title)
    # Build the naive Bayes classifier.
    NBmodle = MultinomialNB()
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    print("acc = %0.2f" % acc)
    print("AUC = %0.2f" % auc)
    return y_true, predict


def create_model_param(d_train, d_test, max_features=None, min_df=1, nb_alpha=1.0):
    """Train and score one TF-IDF + naive Bayes configuration.

    Args:
        d_train: DataFrame with ``title`` and ``categories`` columns.
        d_test: DataFrame with ``title`` and ``categories`` columns.
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.
        min_df: Passed to ``TfidfVectorizer(min_df=...)``.
        nb_alpha: Passed to ``MultinomialNB(alpha=...)``.

    Returns:
        A dict with keys ``"max_feature"``, ``"min_df"``, ``"nb_alpha"`` and
        ``"AUC"`` (the test-set ROC AUC of this configuration).
    """
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df)
    features = vectorizer.fit_transform(d_train.title)
    print("训练样本特征表长度为 " + str(features.shape))
    test_features = vectorizer.transform(d_test.title)
    # Build the naive Bayes classifier.
    NBmodle = MultinomialNB(alpha=nb_alpha)
    print("训练中。。。")
    NBmodle.fit(features, d_train.categories)
    print("测试中。。。")
    predict = NBmodle.predict_proba(test_features)
    y_true = d_test.categories
    auc = roc_auc_score(y_true, predict[:, 1])
    print("AUC = %0.2f" % auc)
    return {
        "max_feature": max_features,
        "min_df": min_df,
        "nb_alpha": nb_alpha,
        "AUC": auc,
    }


def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy and AUC.

    Args:
        y_true: True labels of the evaluated samples.
        predict: Probability array whose column 1 is the positive-class score.
    """
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


# Parameter grid to search.
param_values = {
    "max_feature": [1000, 2000, 3000, None],
    "min_df": [1, 2, 3],
    "nb_alpha": [0.01, 0.1, 1.0],
}
result = []

# Load the data before touching result.csv so a failed load does not
# truncate a previous results file.
d_train, d_test = data_prepare()

# The context manager guarantees the CSV is flushed and closed even if one
# of the training runs raises.
with open("result.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["AUC", "max_feature", "min_df", "nb_alpha"])
    writer.writeheader()
    # Name the three axes explicitly instead of relying on dict iteration
    # order (only guaranteed from Python 3.7) to line values up with
    # parameters.
    grid = product(
        param_values["max_feature"],
        param_values["min_df"],
        param_values["nb_alpha"],
    )
    for p in grid:
        print(p)
        res = create_model_param(
            d_train, d_test, max_features=p[0], min_df=p[1], nb_alpha=p[2]
        )
        result.append(res)
        print(res)
        writer.writerow(res)


4 word2vec + random forest

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from itertools import product
import csv
from gensim.models.word2vec import Word2Vec
import nltk
import numpy as np


def data_prepare():
    """Load the labelled spreadsheet and split it per class into train/test.

    Rows are read from "window regulator01.xlsx". For each value of the
    ``categories`` column (0 and 1) the first 70% of the rows go to the
    training frame and the rest to the test frame; no shuffling is done.

    Returns:
        A ``(d_train, d_test)`` tuple of DataFrames.
    """
    df = pd.read_excel("window regulator01.xlsx")
    split = 0.7
    Nodf = df[df.categories == 0]
    print("0样本数量 = %d" % len(Nodf))
    Yesdf = df[df.categories == 1]
    print("1样本数量 = %d" % len(Yesdf))
    d_train = Nodf[:int(split * len(Nodf))]
    d_train = pd.concat([d_train, Yesdf[:int(split * len(Yesdf))]])
    d_test = Nodf[int(split * len(Nodf)):]
    d_test = pd.concat([d_test, Yesdf[int(split * len(Yesdf)):]])
    print("训练样本:测试样本 = %.1f" % split)
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))
    return d_train, d_test


def data_process(doc):
    """Lower-case and tokenise every document.

    Word2Vec is fed a list of documents where each document is itself a list
    of tokens, so the result is a list of token lists.

    Args:
        doc: Iterable of document strings.

    Returns:
        A list with one ``nltk.word_tokenize`` token list per document.
    """
    return [nltk.word_tokenize(d.lower()) for d in doc]


def featurize_w2v(model, sentences):
    """Turn tokenised documents into fixed-size vectors.

    Each row is the sum of the vectors of the document's in-vocabulary
    tokens divided by the document's total token count; tokens the model
    does not know are skipped.

    Args:
        model: Trained word-vector model exposing ``vector_size`` and
            ``model[word]`` lookup.
        sentences: List of token lists, one per document.

    Returns:
        A ``(len(sentences), model.vector_size)`` float array. Documents
        with no tokens are left as all-zero rows.
    """
    f = np.zeros((len(sentences), model.vector_size))
    for i, s in enumerate(sentences):
        if not s:
            # An empty token list would divide by zero and leave a NaN row,
            # which the downstream classifier rejects; keep the zero vector.
            continue
        for w in s:
            try:
                vec = model[w]
            except KeyError:
                # Out-of-vocabulary token: contributes nothing to the sum.
                continue
            f[i, :] += vec
        f[i, :] /= len(s)
    return f


def create_model(d_train, d_test):
    """Train word2vec on the training titles and a random forest on top.

    Args:
        d_train: DataFrame with ``title`` and ``categories`` columns.
        d_test: DataFrame with ``title`` and ``categories`` columns.

    Returns:
        A ``(y_true, predict)`` tuple: the test labels and the
        ``predict_proba`` output for the test set.
    """
    sentences = data_process(d_train.title)
    # NOTE(review): `size=` and `init_sims` look like the older gensim
    # spelling; confirm against the installed gensim version.
    model = Word2Vec(sentences, size=300, window=1, min_count=1, sample=1e-3, workers=2)
    model.init_sims(replace=True)
    feature_train = featurize_w2v(model, sentences)  # word2vec document features
    RFCmodel = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    RFCmodel.fit(feature_train, d_train.categories)
    test_sentences = data_process(d_test.title)
    feature_test = featurize_w2v(model, test_sentences)
    predict = RFCmodel.predict_proba(feature_test)
    return d_test.categories, predict


def performance(y_true, predict):
    """Plot the ROC curve annotated with accuracy and AUC.

    Args:
        y_true: True labels of the evaluated samples.
        predict: Probability array whose column 1 is the positive-class score.
    """
    acc = accuracy_score(y_true, predict[:, 1] > 0.5)
    auc = roc_auc_score(y_true, predict[:, 1])
    fpr, tpr, thr = roc_curve(y_true, predict[:, 1])
    plt.plot(fpr, tpr)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.annotate("Acc : %0.2f" % acc, (0.2, 0.7), size=14)
    plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    plt.show()


d_train, d_test = data_prepare()
y_true, predict = create_model(d_train, d_test)
performance(y_true, predict)



原创粉丝点击