NLTK06 "Natural Language Processing with Python" code05: Categorizing and Tagging Words


Categorizing and Tagging Words

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# Natural Language Processing with Python, Chapter 5: Categorizing and Tagging Words
# pnlp05.py

# 5.1 Using a part-of-speech tagger (POS tagger)
import nltk

text = nltk.word_tokenize("And now for something completely different")
res = nltk.pos_tag(text)
print(res)
# [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]

text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
res = nltk.pos_tag(text)
print(res)
# [('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'),
#  ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]

text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
# man time day year car moment world house family child country boy
# state job place way war girl work word
text.similar('bought')
# made said done put had seen found given left heard was been brought
# set got that took in told felt
text.similar('over')
# in on to of and for with from at by that into as up out down through
# is all about

# 5.2 Tagged corpora
# Representing tagged tokens
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)     # ('fly', 'NN')
print(tagged_token[0])  # fly
print(tagged_token[1])  # NN

sent = """
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
"""
res = [nltk.tag.str2tuple(t) for t in sent.split()]
print(res)  # [('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ...

# Reading tagged corpora
res = nltk.corpus.brown.tagged_words()
print(res)  # [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
res = nltk.corpus.nps_chat.tagged_words()
print(res)  # [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]
res = nltk.corpus.conll2000.tagged_words()
print(res)  # [('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ...]
res = nltk.corpus.treebank.tagged_words()
print(res)  # [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]
res = nltk.corpus.sinica_treebank.tagged_words()
print(res)  # [('一', 'Neu'), ('友情', 'Nad'), ('嘉珍', 'Nba'), ...]
res = nltk.corpus.indian.tagged_words()
print(res)  # [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]
res = nltk.corpus.mac_morpho.tagged_words()
print(res)  # [('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ...]
res = nltk.corpus.conll2002.tagged_words()
print(res)  # [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]
res = nltk.corpus.cess_cat.tagged_words()
print(res)  # [('El', 'da0ms0'), ('Tribunal_Suprem', 'np0000o'), ...]

# The tagged_sents() method divides the tagged words into sentences
# (a usage sketch follows the tagset list below)

# The simplified part-of-speech tagset
# ADJ  adjective
# ADV  adverb
# CNJ  conjunction
# DET  determiner
# EX   existential
# FW   foreign word
# MOD  modal verb
# N    noun
# NP   proper noun
# NUM  number
# PRO  pronoun
# P    preposition
# TO   the word "to"
# UH   interjection
# V    verb
# VD   past tense
# VG   present participle
# VN   past participle
# WH   wh-determiner
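# A usage sketch (not from the original post) for the two points above:
# tagged_sents() groups the same tagged words into sentences, and newer NLTK
# releases map corpus tags onto a similar "universal" tagset when you pass
# tagset='universal' (this assumes the universal_tagset resource is installed).
res = nltk.corpus.brown.tagged_sents()[0]
print(res[:4])  # [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL')]
res = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')
print(res[:3])  # e.g. [('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN')]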
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
print(list(tag_fd.keys()))  # ['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'VBD', 'NR', 'NN', ...

# Nouns
word_tag_pairs = nltk.bigrams(brown_news_tagged)
word_tag_pairs = list(word_tag_pairs)
res = list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'NP'))
print(res)  # ['AT', 'NN-TL', 'NP', 'CS', ...

# Verbs
wsj = nltk.corpus.treebank.tagged_words()
word_tag_fd = nltk.FreqDist(wsj)
res = [word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith("V")]
print(res)  # ['join/VB', 'is/VBZ', 'publishing/VBG', ...

cfd1 = nltk.ConditionalFreqDist(wsj)
res = list(cfd1['yield'].keys())
print(res)  # ['NN', 'VB']
res = list(cfd1['cut'].keys())
print(res)  # ['VBD', 'VB', 'VBN', 'NN']

cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
print(list(cfd2))  # ['NNP', ',', 'CD', 'NNS', 'JJ', 'MD', 'VB', ...
res = list(cfd2['VBG'].keys())
print(res)  # ['publishing', 'causing', 'using', 'talking', 'having', ...

res = [w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]
print(res)  # ['named', 'used', 'caused', 'reported', 'said', ...
idx1 = wsj.index(('kicked', 'VBD'))
print(wsj[idx1-4:idx1+1])  # [('While', 'IN'), ('program', 'NN'), ('trades', 'NNS'), ('swiftly', 'RB'), ('kicked', 'VBD')]
idx2 = wsj.index(('kicked', 'VBN'))
print(wsj[idx2-4:idx2+1])  # [('head', 'NN'), ('of', 'IN'), ('state', 'NN'), ('has', 'VBZ'), ('kicked', 'VBN')]

# Adjectives and adverbs

# Unsimplified tags
def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    # [print(tag, cfd[tag].keys()) for tag in cfd.conditions()]
    return cfd

tagdict = findtags("NN", list(nltk.corpus.brown.tagged_words(categories='news')))
for tag in sorted(tagdict):
    print(tag, tagdict[tag])

# Exploring tagged corpora
from nltk.corpus import brown
brown_learned_text = brown.words(categories='learned')
res = sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))
print(res)  # [',', '.', 'accomplished', 'analytically', 'appear', 'apt', 'associated', ...

brown_lrnd_tagged = brown.tagged_words(categories='learned')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
# VBN  VB VBD  JJ  IN  QL   ,  CS  RB  AP VBG  RP VBZ QLP BEN WRB   .  TO  HV
#  15  10   8   5   4   3   3   3   3   1   1   1   1   1   1   1   1   1   1
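# The bigram trick above generalizes to any target word; tags_after is a
# hypothetical helper (not part of NLTK or the original post):
def tags_after(word, tagged_words):
    """Tabulate the distribution of tags that immediately follow `word`."""
    tags = [b[1] for (a, b) in nltk.bigrams(tagged_words) if a[0] == word]
    nltk.FreqDist(tags).tabulate()

tags_after('often', brown.tagged_words(categories='learned'))  # reproduces the table above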
# Using POS tags to find three-word phrases
from nltk.corpus import brown

def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print(w1, w2, w3)

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
# combined to achieve
# continue to place
# ...

from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')
data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
for word in data.conditions():
    if len(data[word]) > 3:
        tags = data[word].keys()
        print(word, ' '.join(tags))
# no AT RB AT-HL AT-TL
# that CS WPS DT QL WPO
# ...

# 5.3 Mapping words to properties using Python dictionaries
# Indexing lists vs. dictionaries

# Python dictionaries
pos = {}
print(pos)  # {}
pos['colorless'] = 'ADJ'
print(pos)  # {'colorless': 'ADJ'}
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'
print(pos)  # {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print(pos['ideas'])      # N
print(pos['colorless'])  # ADJ
# print(pos['green'])    # KeyError: 'green'
print(list(pos))    # ['colorless', 'ideas', 'sleep', 'furiously']
print(sorted(pos))  # ['colorless', 'furiously', 'ideas', 'sleep']
res = [w for w in pos if w.endswith('s')]
print(res)  # ['colorless', 'ideas']

for word in sorted(pos):
    print(word + ":", pos[word])
# colorless: ADJ
# furiously: ADV
# ideas: N
# sleep: V

print(list(pos.keys()))    # ['colorless', 'ideas', 'sleep', 'furiously']
print(list(pos.values()))  # ['ADJ', 'N', 'V', 'ADV']
print(list(pos.items()))   # [('colorless', 'ADJ'), ('ideas', 'N'), ('sleep', 'V'), ('furiously', 'ADV')]
for key, val in sorted(list(pos.items())):
    print(key + ":", val)
# colorless: ADJ
# furiously: ADV
# ideas: N
# sleep: V

pos['sleep'] = 'V'
print(pos['sleep'])  # V
pos['sleep'] = 'N'
print(pos['sleep'])  # N

# Defining dictionaries
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print(pos)   # {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos1 = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
print(pos1)  # {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}

# Default dictionaries
# (collections.defaultdict is used here; the book's nltk.defaultdict alias
# may be absent from newer NLTK releases)
from collections import defaultdict
frequency = defaultdict(int)
frequency['colorless'] = 4
print(frequency['ideas'])  # 0
pos = defaultdict(list)
pos['sleep'] = ['N', 'V']
print(pos['ideas'])  # []
pos = defaultdict(lambda: 'N')
pos['colorless'] = 'ADJ'
print(pos['blog'])  # N
print(list(pos.items()))  # [('colorless', 'ADJ'), ('blog', 'N')]

alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = list(vocab)[:1000]
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
alice2 = [mapping[v] for v in alice]
print(alice2[:100])  # ['[', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', ...
print(len(set(alice2)))  # 1001

# Incrementally updating a dictionary
counts = defaultdict(int)
from nltk.corpus import brown
for (word, tag) in brown.tagged_words(categories='news'):
    counts[tag] += 1
print(counts['NN'])  # 13162
print(list(counts))  # ['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'VBD', 'NR', ...

from operator import itemgetter
res = sorted(counts.items(), key=itemgetter(1), reverse=True)
print(res)  # [('NN', 13162), ('IN', 10616), ('AT', 8893), ...
res = [t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)]
print(res)  # ['NN', 'IN', 'AT', 'NP', ...
pair = ('NP', 8336)
print(pair[1])  # 8336
print(itemgetter(1)(pair))  # 8336
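# For comparison, the standard library's Counter does the same count-and-sort
# in one step (a sketch, not from the original post):
from collections import Counter
tag_counts = Counter(tag for (word, tag) in brown.tagged_words(categories='news'))
print(tag_counts.most_common(3))  # [('NN', 13162), ('IN', 10616), ('AT', 8893)]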
last_letters = defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    key = word[-2:]
    last_letters[key].append(word)
res = last_letters['ly']
print(res)  # ['abactinally', 'abandonedly', 'abasedly', 'abashedly', ...
print(last_letters['zy'])  # ['blazy', 'bleezy', 'blowzy', 'boozy', 'breezy', ...

anagrams = defaultdict(list)
for word in words:
    key = ''.join(sorted(word))
    anagrams[key].append(word)
print(anagrams['aeilnrt'])  # ['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
print(anagrams['aeilnrt'])  # ['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

# Complex keys and values
pos = defaultdict(lambda: defaultdict(int))
brown_news_tagged = brown.tagged_words(categories='news')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1
print(pos[('DT', 'library')])  # defaultdict(<class 'int'>, {'NN': 1})

# Inverting a dictionary
counts = defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
res = [key for (key, value) in counts.items() if value == 32]
print(res)  # ['mortal', 'Against', 'Him', 'There', 'brought', 'King', 'virtue', 'every', 'been', 'thine']

pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos2 = dict((value, key) for (key, value) in pos.items())
print(pos2['N'])  # ideas
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
pos2 = defaultdict(list)
for key, value in pos.items():
    pos2[value].append(key)
print(pos2['ADV'])  # ['furiously', 'peacefully']
pos2 = nltk.Index((value, key) for (key, value) in pos.items())
print(pos2['ADV'])  # ['furiously', 'peacefully']

# 5.4 Automatic tagging
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# The default tagger
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags).max())  # NN
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
res = default_tagger.tag(tokens)
print(res)  # [('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ...
res = default_tagger.evaluate(brown_tagged_sents)
print(res)  # 0.13089484257215028

# The regular expression tagger
# (patterns are tried in order; the first one that matches wins)
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)
res = regexp_tagger.tag(brown_sents[3])
print(res)  # [('``', 'NN'), ('Only', 'NN'), ('a', 'NN'), ...
print(regexp_tagger.evaluate(brown_tagged_sents))  # 0.20326391789486245
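# A data-driven alternative to hand-written suffix patterns is NLTK's
# AffixTagger, which learns a suffix-to-tag table from a tagged corpus;
# a minimal sketch (not from the original post, parameters are illustrative):
affix_tagger = nltk.AffixTagger(brown_tagged_sents, affix_length=-3,
                                backoff=nltk.DefaultTagger('NN'))
print(affix_tagger.tag(nltk.word_tokenize('The friendly waiters smiled')))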
# The lookup tagger
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
res = baseline_tagger.evaluate(brown_tagged_sents)
print(res)  # 0.45578495136941344
sent = brown.sents(categories='news')[3]
res = baseline_tagger.tag(sent)
print(res)  # [('``', '``'), ('Only', None), ('a', 'AT'), ...
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))

def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [w for (w, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()

# Evaluation

# 5.5 N-gram tagging
# Unigram tagging
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
res = unigram_tagger.tag(brown_sents[2007])
print(res)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ('apartments', 'NNS'), ...
res = unigram_tagger.evaluate(brown_tagged_sents)
print(res)  # 0.9349006503968017

# Separating the training and testing data
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
res = unigram_tagger.evaluate(test_sents)
print(res)  # 0.8121200039868434

# General N-gram tagging
bigram_tagger = nltk.BigramTagger(train_sents)
res = bigram_tagger.tag(brown_sents[2007])
print(res)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ...
unseen_sent = brown_sents[4203]
res = bigram_tagger.tag(unseen_sent)
print(res)  # [('The', 'AT'), ('population', 'NN'), ...
res = bigram_tagger.evaluate(test_sents)
print(res)  # 0.10206319146815508

# Combining taggers
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res = t2.evaluate(test_sents)
print(res)  # 0.8452108043456593

# Tagging unknown words
# (one common approach: replace words below a frequency cutoff with 'UNK'
# before training, as in the defaultdict mapping example in section 5.3)

# Storing taggers
from pickle import dump
output = open('t2.pkl', 'wb')
dump(t2, output, -1)
output.close()

from pickle import load
input = open('t2.pkl', 'rb')
tagger = load(input)
input.close()
text = """The board's action shows what free enterprise
is up against in our complex maze of regulatory laws."""
tokens = text.split()
res = tagger.tag(tokens)
print(res)  # [('The', 'AT'), ("board's", 'NN$'), ('action', 'NN'), ...

# Performance limitations
cfd = nltk.ConditionalFreqDist(
    ((x[1], y[1], z[0]), z[1])
    for sent in brown_tagged_sents
    for x, y, z in nltk.trigrams(sent))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
res = sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()
print(res)  # 0.049297702068029296

test_tags = [tag for sent in brown.sents(categories='editorial')
             for (word, tag) in t2.tag(sent)]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))

# Tagging across sentence boundaries
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res = t2.evaluate(test_sents)
print(res)  # 0.8452108043456593

# 5.6 Transformation-based tagging
res = nltk.tag.brill.nltkdemo18()
print(res)

# 5.7 How to determine the category of a word
# Morphological clues
# Syntactic clues
# Semantic clues
# New words
# Morphology in part-of-speech tagsets
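# A minimal sketch of the "Morphological clues" point above (not from the
# original post): a word's last letters are often strong evidence for its tag.
suffix_fd = nltk.ConditionalFreqDist(
    (word[-2:].lower(), tag)
    for (word, tag) in brown.tagged_words(categories='news') if len(word) > 3)
print(suffix_fd['ed'].max())  # likely VBN or VBD: past verb forms end in -ed
print(suffix_fd['ly'].max())  # likely RB: most -ly words are adverbs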