使用CountVectorizer和TfidfVectorizer对fetch_20newsgroups数据进行朴素贝叶斯分类,并对比是否去除停用词时的分类准确率

来源:互联网 发布:高维数据进行降维 编辑:程序博客网 时间:2024/06/06 01:39
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer the modern location and fall back only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split


# 1. Classify 20newsgroups with CountVectorizer features, stop words NOT removed.
# Load the corpus with subset='all' rather than a single predefined split.
news = fetch_20newsgroups(subset='all')

# Hold out 25% of the documents for evaluation; the fixed random_state keeps
# the split reproducible (later sections reuse these four variables).
X_train, X_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)

# Default CountVectorizer: fit the vocabulary on the training text only, then
# turn both splits into term-count feature matrices.
count_vec = CountVectorizer()
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

# Default multinomial naive Bayes classifier; fit() returns the estimator itself.
mnb_count = MultinomialNB().fit(X_count_train, y_train)

# Accuracy on the held-out documents.
# NOTE(review): the message contains the typo "Navie"; kept as-is to preserve output.
count_accuracy = mnb_count.score(X_count_test, y_test)
print('The accuracy of classifying 20newsgroup using Navie Bayes CountVectorizer without filtering stopwords:', count_accuracy)

# Keep the predicted labels and print the per-class precision/recall/F1 report.
y_count_predict = mnb_count.predict(X_count_test)
print(
    classification_report(
        y_test,
        y_count_predict,
        target_names=news.target_names,
    )
)


# 2. Classify 20newsgroups with TfidfVectorizer features, stop words NOT removed.
# Reuses X_train / X_test / y_train / y_test and `news` from section 1.

# Default TfidfVectorizer: fit on the training text only, then turn both
# splits into TF-IDF weighted feature matrices.
tfidf_vec = TfidfVectorizer()
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)

# Default multinomial naive Bayes classifier trained on the TF-IDF features.
# It gets its own name so the count-based model from section 1 (`mnb_count`)
# is no longer silently overwritten.
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train, y_train)

# Accuracy on the held-out documents. The label names TfidfVectorizer so this
# line can be told apart from the CountVectorizer result printed in section 1.
print('The accuracy of classifying 20newsgroup using Naive Bayes TfidfVectorizer without filtering stopwords:', mnb_tfidf.score(X_tfidf_test, y_test))

# Keep the predicted labels and print the per-class precision/recall/F1 report.
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
print(classification_report(y_test, y_tfidf_predict, target_names=news.target_names))


# 3. Repeat both experiments with English stop words filtered out, to compare
#    naive Bayes performance against the unfiltered runs.
# Reuses X_train / X_test / y_train / y_test and `news` from section 1.
# NOTE(review): the printed messages contain typos ("2newsgroup", "Bays");
# they are kept unchanged so the script's output stays the same.

# --- CountVectorizer with stop-word filtering ---
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)

# Default naive Bayes model on the filtered count features: fit, report
# accuracy, and keep the predictions for the report printed at the end.
mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
count_filter_accuracy = mnb_count_filter.score(X_count_filter_test, y_test)
print('The accuracy of classifying 20newsgroups using Naive Bayes(CountVectorizer by filter stopwords):', count_filter_accuracy)
y_count_filter_predict = mnb_count_filter.predict(X_count_filter_test)

# --- TfidfVectorizer with stop-word filtering ---
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')
X_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test = tfidf_filter_vec.transform(X_test)

# A second, independent default naive Bayes model on the filtered TF-IDF features.
mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train, y_train)
tfidf_filter_accuracy = mnb_tfidf_filter.score(X_tfidf_filter_test, y_test)
print('The accuracy of classifying 2newsgroup with Naive Bays(TfidfVectorizer by filtering stopwords):', tfidf_filter_accuracy)
y_tfidf_filter_predict = mnb_tfidf_filter.predict(X_tfidf_filter_test)

# Per-class reports: count-based model first, then the TF-IDF model.
for filtered_predictions in (y_count_filter_predict, y_tfidf_filter_predict):
    print(classification_report(y_test, filtered_predictions, target_names=news.target_names))
阅读全文
0 0