NLTK07《Python自然语言处理》code06 学习分类文本
来源:互联网 发布:cms监控软件客户端下载 编辑:程序博客网 时间:2024/05/22 03:22
学习分类文本
# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# "Natural Language Processing with Python", Chapter 6: Learning to Classify Text
# pnlp06.py
# NOTE: several training steps below are slow (roughly 60+ minutes in total).

import math
import random

import nltk
from nltk.classify import apply_features
from nltk.corpus import brown, movie_reviews, names


# --- 6.1 Supervised classification ------------------------------------------

def gender_features(word):
    """Single feature: the last letter of the name."""
    return {'last_letter': word[-1]}


print(gender_features('Shrek'))  # {'last_letter': 'k'}

# Renamed from the original's `names` to avoid shadowing the imported corpus.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

featuresets = [(gender_features(n), g) for (n, g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(classifier.classify(gender_features('Neo')))      # male
print(classifier.classify(gender_features('Trinity')))  # female
print(nltk.classify.accuracy(classifier, test_set))     # ~0.73

classifier.show_most_informative_features(5)
# Most Informative Features
#   last_letter = 'a'  female : male = 38.1 : 1.0
#   last_letter = 'k'  male : female = 30.9 : 1.0
#   last_letter = 'f'  male : female = 17.4 : 1.0
#   last_letter = 'p'  male : female = 11.9 : 1.0
#   last_letter = 'v'  male : female = 10.6 : 1.0

# apply_features builds feature sets lazily, saving memory on large corpora.
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])


# Choosing the right features
def gender_features2(name):
    """Overfitting-prone feature set: first/last letter plus per-letter
    counts and presence flags."""
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    # BUGFIX: the original alphabet string had 'g' where 'q' belongs
    # ('...nopgrstu...'), so the letter 'q' was never counted.
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features


print(gender_features2('John'))
# {'firstletter': 'j', 'lastletter': 'n', 'count(a)': 0, ...

featuresets = [(gender_features2(n), g) for (n, g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # ~0.762

# Training set, dev-test set, test set
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))  # ~0.753

# Error analysis on the dev-test set.
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
# correct=female guess=male name=Abigael
# correct=female guess=male name=Adriaens
# ...


# BUGFIX: the original defined `gender_featuress` (typo) but then rebuilt the
# feature sets with the old one-letter `gender_features`, so the improved
# suffix features were never actually used and the printed 0.771 could not
# have come from them.  Redefine the feature extractor and use it.
def gender_features(word):
    """Improved features: one- and two-letter suffixes of the name."""
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}


train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))  # ~0.771


# --- Document classification -------------------------------------------------

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# BUGFIX: in nltk3, FreqDist.keys() is not frequency-ordered; use
# most_common() to actually get the 2000 most frequent words.
word_features = [w for (w, _) in all_words.most_common(2000)]


def document_features(document):
    """Binary 'contains(word)' features over the 2000 most frequent words."""
    document_words = set(document)  # set membership is O(1) per lookup
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
# {'contains(plot)': True, 'contains(:)': True, ...

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # ~0.79
classifier.show_most_informative_features(5)
# Most Informative Features
#   contains(martian) = True       neg : pos = 7.7 : 1.0
#   contains(atrocious) = True     neg : pos = 7.1 : 1.0
#   contains(unimaginative) = True neg : pos = 7.1 : 1.0
#   contains(turkey) = True        neg : pos = 6.8 : 1.0
#   contains(schumacher) = True    neg : pos = 6.7 : 1.0


# --- Part-of-speech tagging ---------------------------------------------------

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
# BUGFIX: take the 100 MOST COMMON suffixes, not an arbitrary .keys() slice.
common_suffixes = [s for (s, _) in suffix_fdist.most_common(100)]
print(common_suffixes)  # ['e', ',', '.', 's', 'd', 't', ...


def pos_features(word):
    """endswith(suffix) features over the 100 most common word suffixes."""
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features


tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
# SLOW: decision-tree training on the full news corpus takes a long time.
classifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # ~0.5689
print(classifier.classify(pos_features('cats')))     # NNS
print(classifier.pseudocode(depth=4))
"""
if endswith(the) == False:
  if endswith(,) == False:
    if endswith(s) == False:
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True:
      if endswith(was) == False: return 'PP$'
      if endswith(was) == True: return 'BEDZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'
"""


# --- Exploiting context -------------------------------------------------------

def pos_features(sentence, i):
    """Suffix features plus the previous word in the sentence."""
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
    return features


print(pos_features(brown.sents()[0], 8))
# {'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # ~0.7892


# --- Sequence classification --------------------------------------------------

def pos_features(sentence, i, history):
    """Suffix features plus the previous word and the previously
    PREDICTED tag (from `history`)."""
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        # BUGFIX: the original had the typo "<STRAT>".
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
        # BUGFIX: the original assigned sentence[i-1] (the previous WORD)
        # as prev-tag; the whole point of sequence classification is to
        # feed back the previously predicted TAG from `history`.
        features["prev-tag"] = history[i - 1]
    return features


class ConsecutivePosTagger(nltk.TaggerI):
    """Greedy left-to-right tagger that feeds its own predictions back in
    as the prev-tag feature."""

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)


tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))  # ~0.7966

# Other sequence classification methods (transformational, HMM, ...)


# --- 6.2 Further examples of supervised classification ------------------------

# Sentence segmentation
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)  # index of each sentence-final token


def punct_features(tokens, i):
    """Features of the punctuation token at index i and its neighbours."""
    return {'next-word-capitalized': tokens[i + 1][0].isupper(),
            'prevword': tokens[i - 1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i - 1]) == 1}


featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens) - 1)
               if tokens[i] in '.?!']
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # ~0.9360


def segment_sentences(words):
    """Split a flat token list into sentences using the punctuation
    classifier trained above."""
    start = 0
    sents = []
    # BUGFIX: the original iterated `for i, word in words:`, which tries to
    # unpack each token string; enumerate() is required to get indices.
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)):
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents


# Identifying dialogue act types
posts = nltk.corpus.nps_chat.xml_posts()[:10000]


def dialogue_act_features(post):
    """Bag-of-words features for a chat post (original name had a typo:
    'dialogure')."""
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features


featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # ~0.668


# Recognizing Textual Entailment (RTE)
def rte_features(rtepair):
    """Word / named-entity overlap features for a text-hypothesis pair."""
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features


rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)         # {'four', 'republics', 'association', ...
print(extractor.hyp_words)          # {'SCO.', 'China', 'member'}
print(extractor.overlap('word'))    # set()
print(extractor.overlap('ne'))      # {'China'}
print(extractor.hyp_extra('word'))  # {'member'}

# Scaling up to large datasets


# --- 6.3 Evaluation -----------------------------------------------------------

# Test sets: three splits, from easiest to most realistic.
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

# Accuracy
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print('Accuracy: %4.2f' % nltk.classify.accuracy(classifier, test_set))

# Precision: how many of the items we found are relevant.
# Recall: how many of the relevant items we found.
# F-measure (F-score): (2 * precision * recall) / (precision + recall)


# Confusion matrix
def tag_list(tagged_sents):
    """Flatten tagged sentences into a plain list of tags."""
    return [tag for sent in tagged_sents for (word, tag) in sent]


def apply_tagger(tagger, corpus):
    """Re-tag a gold corpus with `tagger` (stripping the gold tags first)."""
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]


sents = brown.tagged_sents(categories='editorial')
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(sents, backoff=t0)
t2 = nltk.BigramTagger(sents, backoff=t1)
gold = tag_list(sents)
test = tag_list(apply_tagger(t2, sents))
cm = nltk.ConfusionMatrix(gold, test)
print(cm)

# Cross-validation


# --- 6.4 Decision trees: entropy and information gain -------------------------

def entropy(labels):
    """Shannon entropy (base 2) of a list of labels."""
    freqdist = nltk.FreqDist(labels)
    # freq() only yields p > 0 for labels that occur, so log is safe.
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p, 2) for p in probs)


print(entropy(['male', 'male', 'male', 'male']))          # -0.0
print(entropy(['male', 'female', 'male', 'male']))        # 0.8113
print(entropy(['female', 'male', 'female', 'male']))      # 1.0
print(entropy(['female', 'female', 'male', 'female']))    # 0.8113
print(entropy(['female', 'female', 'female', 'female']))  # -0.0

# 6.5 Naive Bayes classifiers: underlying probabilistic model, zero counts
#     and smoothing, non-binary features, naive independence assumption,
#     the cause of double counting.
# 6.6 Maximum entropy classifiers.
# 6.7 Modeling linguistic patterns.
阅读全文
0 0
- NLTK07《Python自然语言处理》code06 学习分类文本
- Python自然语言处理 6 学习分类文本
- 自然语言处理之:文本分类
- 统计自然语言处理--文本分类
- 五、自然语言处理中的文本分类
- 自然语言处理-搭建文本分类器
- Python自然语言处理--处理原始文本
- Python自然语言处理 3 处理原始文本
- python自然语言处理02--搜索文本
- python自然语言处理-----计算文本相似度
- python 自然语言处理学习1
- 使用lingpipe自然语言处理包进行文本分类
- 使用lingpipe自然语言处理包进行文本分类
- 自然语言处理课程作业 中文文本情感分类
- 自然语言处理课程作业 中文文本情感分类
- 奋战聊天机器人(四)自然语言处理中的文本分类
- NLTK04《Python自然语言处理》code03 处理原始文本
- 【python】python自然语言处理-----计算中文文本相似度
- 使用struts2的输入校验
- 两点画一条线
- 层次聚类算法java实现
- 谈《与神对话》中,你最高的真实隐…
- 转:锁的优化和注意事项
- NLTK07《Python自然语言处理》code06 学习分类文本
- BZOJ 题目整理
- GCD收录3--dispatch_group线程调度组
- Scala中package与import实战详解
- 梯度下降法
- Android图片加载框架最全解析(五),Glide强大的图片变换功能
- 微信公众号支付开发全过程 --JAVA
- ffmpeg 中print_report中相关日志说明
- FSN (dp)