利用python,基于SVM实现文本分类
来源:互联网 发布:加工中心攻丝编程格式 编辑:程序博客网 时间:2024/05/22 10:29
算法介绍
由于原文采用 LaTeX 排版,暂时无暇在博客上重新排版,具体内容详见本人文档(下载链接)。
代码所用数据:下载链接
文件结构
├─doc_classification.py
├─train.data
├─train.label
├─train.map
├─test.data
├─test.label
├─test.map
├─stopwords.txt
└─vocabulary.txt
python代码
需要安装的库:
pandas, liblinearutil
# doc_classification.pyimport pandas as pdimport mathfrom liblinearutil import *import time# 读取数据def loadOriginData(src='train'): # train.data dataSrc = r'%s.data' % src # train.label labelSrc = r'%s.label' % src label = pd.read_table(labelSrc, sep=' ', names=['label']) # train.map mapSrc = r'%s.map' % src # 每个文档拥有的terms doc2term = {} # 每个term出现在哪些文档 term2doc = {} # 每个类别下有哪些docs cate2docs = {} # TF值 TF = {} with open(dataSrc, 'r') as f: for line in f: str_docIdx, str_wordIdx, str_cnt = line.split() docIdx = int(str_docIdx) wordIdx = int(str_wordIdx) cnt = int(str_cnt) # update 数据结构 doc2term.setdefault(docIdx, []).append(wordIdx) term2doc.setdefault(wordIdx, []).append(docIdx) TF.setdefault(docIdx, {})[wordIdx] = cnt # 统计每个类别下有哪些文档 with open(labelSrc, 'r') as f: for line_index, line in enumerate(f, 1): labelVal = int(line.strip()) cate2docs.setdefault(labelVal, []).append(line_index) return TF, doc2term, term2doc, cate2docs, label# 特征选择def featureSel(doc2term, term2doc, cate2docs): # CHI衡量的是特征项ti和类别Cj之间的关联程度, A,B, C, D是四个统计量 CHI_cat2term = {} # N:total number of documents N = len(doc2term) # A + B + C + D = N # A: term出现在某类别中的文档总数 A = {} # B: term出现在除某类别外的其他文档数 B = {} # C: 该类别中不包含term的文档总数 C = {} # D: 其他类别中不包含term的文档总数 D = {} DF = {} # 所有类别 categories = list(cate2docs.keys()) # 停用词词汇表 stopwords = {} stopwordsSrc = r'stopwords.txt' with open(stopwordsSrc) as f: for line in f: stopwords[line.strip()] = True # 训练数据数据词汇表 vocSrc = r'vocabulary.txt' voc = pd.read_table(vocSrc, names=['voc']) # 保存所有的特征 features = set() # 计算一个类别标签下各个词的CHI for category in categories: # 属于第category类的文档为docs docs = cate2docs[category] sumVal = 0 for term in term2doc: # 如果是停用词, 则将CHI置零 if stopwords.get(voc['voc'][term - 1], False): CHI_cat2term.setdefault(category, {})[term] = 0 continue # 属于某类且包含term AVal = len(set(term2doc[term]).intersection(set(docs))) # 不属于某类但包含term BVal = len(term2doc[term]) - AVal # 属于某类,但不包含term CVal = len(docs) - AVal # 不属于某类, 不包含term DVal = N - AVal - BVal - CVal 
CHIVal = N * (AVal * DVal - CVal * BVal)**2 / ((AVal + CVal) * (BVal + DVal) * (AVal + BVal) * (CVal + DVal)) # CHIVal = math.log(AVal * N / ((AVal + CVal) * (AVal + BVal))) A.setdefault((term, category), AVal) B.setdefault((term, category), BVal) C.setdefault((term, category), CVal) D.setdefault((term, category), DVal) CHI_cat2term.setdefault(category, {})[term] = CHIVal DF[term] = AVal + BVal sumVal += CHIVal # 选出类别中CHI高于平均值的词 terms = CHI_cat2term[category] meanVal = sumVal / len(terms) for term in terms: if CHI_cat2term[category][term] > meanVal: features.add(term) # for feature in features: # print(voc['voc'][feature]) print('There are %d features in VSM model.\n' % len(features)) return features, DFdef buildSVMData(TF, DF, features, N, label, cate2docs, doc2terms): isFeatures = dict(zip(features, [True] * len(features))) categories = list(cate2docs.keys()) # 如果是训练样本, 则计算归一化缩放因子,并返回 # y: label值 y = [0] * N # x: 稀疏矩阵 x = [] for i in range(N): x.append({}) for category in categories: for doc in cate2docs[category]: # 给y进行标记类别 y[doc - 1] = label.iat[doc - 1, 0] scale_factor = -100 for term in doc2terms[doc]: if isFeatures.get(term, False): # 如果term是特征 # TF值 TFVal = TF[doc].get(term, 0) # TF-IDF值 tf_idf = TFVal * math.log(N / DF[term]) x[doc - 1][term] = tf_idf # 更新特征最大值 if scale_factor < tf_idf: scale_factor = tf_idf alpha = 0 # 按一篇文档中特征词最大的tf-idf, 对该文档中的所有特征词进行归一化 for term in doc2terms[doc]: if isFeatures.get(term, False): # 如果term是特征 # x[doc - 1][term] = alpha + (1 - alpha) * x[doc - 1][term] / scale_factor x[doc - 1][term] /= scale_factor print("Data for SVM has been built.\n") return x, y# 计算DFdef getDF(doc2term, term2doc, cate2docs): DF = {} for term in term2doc: DF[term] = len(term2doc[term]) return DFif __name__ == '__main__': start = time.time() # # 主程序 TF, doc2term, term2doc, cate2docs, label = loadOriginData() # 特征选择 features, DF = featureSel(doc2term, term2doc, cate2docs) # 读取数据(train.data) TF, doc2term, term2doc, cate2docs, label = loadOriginData() # 
特征选择 features, DF = featureSel(doc2term, term2doc, cate2docs) # build SVM model x, y = buildSVMData(TF, DF, features, len(doc2term), label, cate2docs, doc2term) # 读取测试数据(test.data) TF_test, doc2term_test, term2doc_test, cate2docs_test, label_test = loadOriginData(src='test') DF_test = getDF(doc2term_test, term2doc_test, cate2docs_test) # TF, DF, features, len(doc2term), label, cate2docs, doc2term, scales) x_test, y_test = buildSVMData(TF_test, DF_test, features, len(doc2term_test), label_test, cate2docs_test, doc2term_test) print("处理数据使用了 %s s时间。\n" % (time.time() - start)) # # 调用 liblinear 库进行分类 prob = problem(y, x) param = parameter('-s 0 -c 4 -B 1') # 训练 m = train(prob, param) # 预测test.data p_label, p_acc, p_vals = predict(y_test, x_test, m, '-b 1') # 评价 ACC, MSE, SCC = evaluations(y_test, p_label) print('ACC:\n', ACC) print('MSE', MSE) print('SCC', SCC) # 统计每类中错误率 categoriesErrs = {} for doc_index, doc_label in enumerate(y_test): if doc_label != int(p_label[doc_index]): cateogory = label_test.iat[doc_index, 0] categoriesErrs.setdefault(cateogory, []).append(doc_index + 1) # with open('outcome.txt', 'wb') as f: print("错误分类的样本为:\n") for categoryErr in categoriesErrs: numOfErr = len(categoriesErrs[categoryErr]) print('第%d类共 %d样本, 被错分的个数为 %d, 比例为 %f %%.\n' % (categoryErr,len(cate2docs_test[categoryErr]), numOfErr, numOfErr/len(cate2docs_test[categoryErr]))) end = time.time() print("Total time cost is %s s.\n" % (end - start))
0 0
- 利用python,基于SVM实现文本分类
- 文本分类的python实现-基于SVM算法
- 文本分类的python实现-基于SVM算法
- 利用SVM 实现文本分类的实例
- 基于支持向量机SVM的文本分类的实现
- 基于LIBSVM实现文本分类(python)
- python进行文本分类,基于word2vec,sklearn-svm对微博性别分类
- 利用sklearn 实现SVM分类
- 基于SVM的中文文本分类方法
- python实现文本分类
- sklearn svm实现文本分类 入门
- 文本分类的python实现-基于Xgboost算法
- 文本分类的python实现-基于Xgboost算法
- 基于qt和opencv3实现机器学习之:利用svm(支持向量机)分类
- 基于svm的中文文本自动分类系统
- 基于svm的中文文本自动分类系统
- 基于KNN+层次SVM的文本自动分类技术
- 基于svm的中文文本自动分类系统
- 新手一枚,写了个大数相加的C程序
- Linux下SVN安装与配置
- Combining Sketch and Tone for Pencil Drawing Production的优化过程
- Group by 和Having 的组合用法
- vi 命令使用--AIX环境下总结
- 利用python,基于SVM实现文本分类
- JAVA HashMap 实现原理
- complex form
- java版斐波那契
- Java内存泄漏分析与解决方案
- 整数划分问题
- “The run destination iOS Device is not valid for running the scheme”
- android AsyncTask 源码剖析
- 删除指定目录下的文件和文件夹