NLTK03 Natural Language Processing with Python, code02: Accessing Text Corpora and Lexical Resources


02 Accessing Text Corpora and Lexical Resources

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# Natural Language Processing with Python, 02 Accessing Text Corpora and Lexical Resources
# pnlp02.py

# 2.1 Accessing text corpora
# The Gutenberg Corpus
import nltk

gtb = nltk.corpus.gutenberg.fileids()
print(gtb)
'''
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt',
 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt',
 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt',
 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt',
 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']
'''

emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print(len(emma))  # 192427

# concordance() prints its matches and returns None, so call it directly
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")
'''
Displaying 25 of 37 matches:
er father , was sometimes taken by surprize at his being still able to pity `
...
g engaged !" Emma even jumped with surprize ;-- and , horror - struck , exclai
'''

from nltk.corpus import gutenberg
print(gutenberg.fileids())
# ['austen-emma.txt', 'austen-persuasion.txt', ..., 'whitman-leaves.txt']

# Average word length, average sentence length, and the average number of
# times each word appears in the text
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)
'''
4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt
'''

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth_sentences)
# [['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]
print(macbeth_sentences[1037])
# ['Good', 'night', ',', 'and', 'better', 'health', 'Attend', 'his', 'Maiesty']

longest_len = max([len(s) for s in macbeth_sentences])
l1 = [s for s in macbeth_sentences if (len(s) == longest_len)]
print(l1)
'''
[['Doubtfull', 'it', 'stood', ',', 'As', 'two', 'spent', 'Swimmers', ',', 'that', 'doe', 'cling', ...
'Head', 'vpon', 'our', 'Battlements']]
'''

# Web and chat text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
'''
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop] KING ARTHUR: Whoa there!  [clop ...
overheard.txt White guy: So, do you have any plans for this evening?Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine.
Polished leather and strawb ...
'''

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom)
# [['now', 'im', 'left', 'with', 'this', 'gay', 'name'], [':P'], ...]

# The Brown Corpus
from nltk.corpus import brown
print(brown.categories())
'''
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance',
 'science_fiction']
'''
print(brown.words(categories='news'))
# ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
print(brown.words(fileids=['cg22']))
# ['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]
print(brown.sents(categories=['news', 'editorial', 'reviews']))
'''
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation',
  'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence',
  "''", 'that', 'any', 'irregularities', 'took', 'place', '.'],
 ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the',
  'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of',
  'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of',
  'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the',
  'election', 'was', 'conducted', '.'], ...]
'''

# Count modal verbs in news text
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])
'''
can: 94
could: 87
may: 93
might: 38
must: 53
will: 389
'''

# Compare modal usage across genres; tabulate() prints the table itself
# and returns None, so it is not wrapped in print()
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
'''
                  can could   may might  must  will
           news    93    86    66    38    50   389
       religion    82    59    78    12    54    71
        hobbies   268    58   131    22    83   264
science_fiction    16    49     4    12     8    16
        romance    74   193    11    51    45    43
          humor    16    30     8     8     9    13
'''

# The Reuters Corpus
from nltk.corpus import reuters
print(reuters.fileids())
# ['test/14826', 'test/14828', 'test/14829', ..., 'training/9994', 'training/9995']
print(reuters.categories())
# ['acq', 'alum', 'barley', 'bop', ..., 'wpi', 'yen', 'zinc']
print(reuters.categories('training/9865'))
# ['barley', 'corn', 'grain', 'wheat']
print(reuters.categories(['training/9865', 'training/9880']))
# ['barley', 'corn', 'grain', 'money-fx', 'wheat']
print(reuters.fileids('barley'))
# ['test/15618', 'test/15649', 'test/15676', ..., 'training/9865', 'training/9958']
print(reuters.fileids(['barley', 'corn']))
# ['test/14832', 'test/14858', 'test/15033', ..., 'training/9958', 'training/9989']
print(reuters.words('training/9865')[:14])
'''
['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', 'BIDS', 'DETAILED', 'French',
 'operators', 'have', 'requested', 'licences', 'to', 'export']
'''
print(reuters.words(['training/9865', 'training/9880']))
# ['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]
print(reuters.words(categories='barley'))
# ['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]
print(reuters.words(categories=['barley', 'corn']))
# ['THAI', 'TRADE', 'DEFICIT', 'WIDENS', 'IN', 'FIRST', ...]

# The Inaugural Address Corpus
from nltk.corpus import inaugural
print(inaugural.fileids())
# ['1789-Washington.txt', '1793-Washington.txt', ..., '2005-Bush.txt', '2009-Obama.txt']
res = [fileid[:4] for fileid in inaugural.fileids()]
print(res)
# ['1789', '1793', '1797', ..., '2005', '2009']

# Plot how words beginning with 'america' or 'citizen' are used over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# Tagged text corpora
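# The original post leaves this heading empty. As a minimal sketch (not in the
# source post): many corpora also ship with part-of-speech annotations, exposed
# through tagged_words(); the tags below come from the Brown and Penn Treebank tagsets.
print(nltk.corpus.brown.tagged_words())
# [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
print(nltk.corpus.treebank.tagged_words())
# [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...]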
# Other corpora
print(nltk.corpus.cess_esp.words())
# ['El', 'grupo', 'estatal', 'Electricité_de_France', ...]
print(nltk.corpus.floresta.words())
# ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...]
print(nltk.corpus.indian.words('hindi.pos'))
# ['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', ...]
print(nltk.corpus.udhr.fileids())
# ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8', ..., 'Zhuang-Latin1', 'Zulu-Latin1']
print(nltk.corpus.udhr.words('Javanese-Latin1')[11:])
# ['Saben', 'umat', 'manungsa', 'lair', 'kanthi', 'hak', ...]

# Cumulative word-length distributions for selected UDHR languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# Text corpus structure
# help(nltk.corpus.reader)
raw = gutenberg.raw("burgess-busterbrown.txt")
print(raw[1:20])  # The Adventures of B
words = gutenberg.words("burgess-busterbrown.txt")
print(words[1:20])
# ['The', 'Adventures', 'of', 'Buster', ..., 'Bear']
sents = gutenberg.sents("burgess-busterbrown.txt")
print(sents[1:20])
# [['I'], ['BUSTER', 'BEAR', 'GOES', 'FISHING'], ..., 'for', 'breakfast', '.']]

# Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'D:/tmp/tensorflow/data'
wordlists = PlaintextCorpusReader(corpus_root, r'my.*\.txt')  # the post's pattern 'my*\.txt' would not match 'mya.txt'
print(wordlists.fileids())         # empty unless matching files exist under corpus_root
print(wordlists.readme())          # requires a README file in corpus_root
print(wordlists.words('mya.txt'))  # requires a file named mya.txt

# 2.2 Conditional frequency distributions: ConditionalFreqDist
# Conditions and events: each observation is a (condition, event) pair
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County')]
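# Not in the original post: a minimal demonstration that a ConditionalFreqDist
# can be built directly from a small list of (condition, event) pairs like the one above.
cfd_pairs = nltk.ConditionalFreqDist(pairs)
print(cfd_pairs.conditions())    # ['news']
print(cfd_pairs['news']['The'])  # 1
print(cfd_pairs['news'].N())     # 3 events observed under the 'news' condition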
# Counting words by genre
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(len(genre_word))  # 170576
print(genre_word[:4])   # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]
print(genre_word[-4:])  # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]

cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd)                      # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions())         # ['news', 'romance']
print(cfd['news'])              # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance'])           # <FreqDist with 8452 samples and 70022 outcomes>
print(list(cfd['romance']))     # ['They', 'neither', 'liked', ..., 'expect', 'episode']
print(cfd['romance']['could'])  # 193

# Plotting and tabulating distributions
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)
'''
                  0    1    2    3    4    5    6    7    8    9
       English    0  185  525  883  997 1166 1283 1440 1558 1638
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275
'''

# Generating random text with bigrams
sent = ['In', 'the', 'beginning', 'God', 'Created', 'the', 'heaven', 'and', 'the', 'earth', '.']
print(nltk.bigrams(sent))  # <generator object bigrams at 0x00000219653297D8>
print(list(nltk.bigrams(sent)))
'''
[('In', 'the'), ('the', 'beginning'), ('beginning', 'God'), ('God', 'Created'),
 ('Created', 'the'), ('the', 'heaven'), ('heaven', 'and'), ('and', 'the'),
 ('the', 'earth'), ('earth', '.')]
'''
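# The heading above promises random text generation, but the post stops at
# listing the bigrams. The sketch below follows the book's generate_model
# example: repeatedly emit the most likely successor of the current word.
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()  # greedy choice: most frequent successor

text = nltk.corpus.genesis.words('english-kjv.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
print(cfd['living'])
# FreqDist({'creature': 7, 'thing': 4, 'substance': 2, ',': 1, '.': 1, 'soul': 1})
generate_model(cfd, 'living')
# living creature that he said , and the land of the land of the land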
# 2.3 More on reusing Python code
# Creating programs with a text editor
# Functions
# Modules
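# The three headings above carry no code in the post. As a minimal sketch of
# the idea (assuming the function below is saved with a text editor as a file
# named textproc.py somewhere on Python's module search path):
def plural(word):
    # simplified English pluralization rules, as in the book's example
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'

print(plural('fairy'))  # fairies
print(plural('woman'))  # women
# From another program the module would then be imported with:
# from textproc import plural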
# 2.4 Lexical resources
# Wordlist corpora
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)

res = unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
print(res)
# ['abbeyland', 'abhorred', 'abilities', ..., 'yielded', 'youngest']
res = unusual_words(nltk.corpus.nps_chat.words())
print(res)
# ['aaaaaaaaaaaaaaaaa', 'aaahhhh', 'abortions', ..., 'zzzzzzzing', 'zzzzzzzz']

from nltk.corpus import stopwords
print(stopwords.words('english'))
# ['i', 'me', 'my', 'myself', 'we', ..., 'won', 'wouldn']

def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content)/len(text)

print(content_fraction(nltk.corpus.reuters.words()))  # 0.735240435097661

# Solve a word puzzle: words of 6+ letters built from 'egivrvonl' that contain 'r'
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = nltk.corpus.words.words()
res = [w for w in wordlist
       if len(w) >= 6 and obligatory in w and nltk.FreqDist(w) <= puzzle_letters]
print(res)
# ['glover', 'gorlin', 'govern', 'grovel', 'ignore', ..., 'violer', 'virole']

# Names that appear in both the male and female lists
names = nltk.corpus.names
print(names.fileids())  # ['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
res = [w for w in male_names if w in female_names]
print(res)
# ['Abbey', 'Abbie', 'Abby', ..., 'Winnie', 'Winny', 'Wynn']

# A pronouncing dictionary (the CMU Pronouncing Dictionary)
entries = nltk.corpus.cmudict.entries()
print(len(entries))  # 133737
for entry in entries[39943:39951]:
    print(entry)
'''
('explorer', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0'])
('explorers', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0', 'Z'])
('explores', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'Z'])
('exploring', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'IH0', 'NG'])
('explosion', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N'])
('explosions', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N', 'Z'])
('explosive', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V'])
('explosively', ['EH2', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V', 'L', 'IY0'])
'''

# Three-phone words that start with P and end with T
for word, pron in entries:
    if len(pron) == 3:
        ph1, ph2, ph3 = pron
        if ph1 == 'P' and ph3 == 'T':
            print(word, ph2)
'''
pait EY1
pat AE1
...
put UH1
putt AH1
'''

# Words whose pronunciation ends with the syllable 'N IH0 K S'
# (the post wrote 'IHO' with a letter O, which is why it printed [])
syllable = ['N', 'IH0', 'K', 'S']
res = [word for word, pron in entries if pron[-4:] == syllable]
print(res)
# ["atlantic's", 'audiotronics', 'avionics', 'beatniks', ...]

res = [w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']
print(res)
# ['autumn', 'column', 'condemn', 'damn', 'goddamn', 'hymn', 'solemn']
res = sorted(set(w[:2] for w, pron in entries if pron[0] == 'N' and w[0] != 'n'))
print(res)
# ['gn', 'kn', 'mn', 'pn']

# Extract the stress digits from a pronunciation
def stress(pron):
    return [char for phone in pron for char in phone if char.isdigit()]

res = [w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]
print(res)
# ['abbreviated', 'abbreviated', 'abbreviating', ..., 'vocabulary', 'voluntarism']
res = [w for w, pron in entries if stress(pron) == ['0', '2', '0', '1', '0']]
print(res)
# ['abbreviation', 'abbreviations', 'abomination', ..., 'wakabayashi', 'yekaterinburg']

# Minimally contrasting sets of three-phone words beginning with P
p3 = [(pron[0] + '-' + pron[2], word)
      for (word, pron) in entries
      if pron[0] == 'P' and len(pron) == 3]
cfd = nltk.ConditionalFreqDist(p3)
for template in cfd.conditions():
    if len(cfd[template]) > 10:
        words = cfd[template].keys()
        wordlist = ' '.join(words)
        print(template, wordlist[:70] + "...")
'''
P-P paap paape pap pape papp paup peep pep pip pipe pipp poop pop pope pop...
P-R paar pair par pare parr pear peer pier poor poore por pore porr pour...
P-K pac pack paek paik pak pake paque peak peake pech peck peek perc perk ...
P-S pace pass pasts peace pearse pease perce pers perse pesce piece piss p...
P-L pahl pail paille pal pale pall paul paule paull peal peale pearl pearl...
P-N paign pain paine pan pane pawn payne peine pen penh penn pin pine pinn...
P-Z pais paiz pao's pas pause paws pays paz peas pease pei's perz pez pies...
P-T pait pat pate patt peart peat peet peete pert pet pete pett piet piett...
P-CH patch pautsch peach perch petsch petsche piche piech pietsch pitch pit...
P-UW1 peru peugh pew plew plue prew pru prue prugh pshew pugh...
'''

# Dictionary-style access, and adding a missing word
prondict = nltk.corpus.cmudict.dict()
print(prondict['fire'])  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]
# print(prondict['blog'])  # KeyError: 'blog'
prondict['blog'] = [['B', 'L', 'AA1', 'G']]
print(prondict['blog'])  # [['B', 'L', 'AA1', 'G']]

text = ['natural', 'language', 'processing']
res = [ph for w in text for ph in prondict[w][0]]
print(res)
'''
['N', 'AE1', 'CH', 'ER0', 'AH0', 'L', 'L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH',
 'P', 'R', 'AA1', 'S', 'EH0', 'S', 'IH0', 'NG']
'''

# Comparative wordlists (Swadesh)
from nltk.corpus import swadesh
print(swadesh.fileids())
'''
['be', 'bg', 'bs', 'ca', 'cs', 'cu', 'de', 'en', 'es', 'fr', 'hr', 'it', 'la',
 'mk', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sw', 'uk']
'''
print(swadesh.words('en'))
# ['I', 'you (singular), thou', ..., 'if', 'because', 'name']
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
# [('je', 'I'), ('tu, vous', 'you (singular), thou'), ..., ('parce que', 'because'), ('nom', 'name')]
translate = dict(fr2en)
print(translate['chien'])  # dog
print(translate['jeter'])  # throw

de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund'])   # dog
print(translate['perro'])  # dog

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
'''
('say', 'sagen', 'zeggen', 'decir', 'dire', 'dizer', 'dicere')
('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere')
('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere')
('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare')
'''

# Lexical tools: Toolbox and Shoebox
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic'))
# [('kaa', [('ps', 'V'), ('pt', 'A'), ..., ('tkp', 'laplap'), ('dt', '28/Jul/2004')])]

# 2.5 WordNet
# Senses and synonyms
from nltk.corpus import wordnet as wn
print(wn.synsets('motorcar'))  # [Synset('car.n.01')]
print(wn.synset('car.n.01').lemma_names())  # ['car', 'auto', 'automobile', 'machine', 'motorcar']
print(wn.synset('car.n.01').definition())
# a motor vehicle with four wheels; usually propelled by an internal combustion engine
print(wn.synset('car.n.01').examples())  # ['he needs a car to get to work']
print(wn.synset('car.n.01').lemmas())
'''
[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
'''
print(wn.lemma('car.n.01.automobile'))           # Lemma('car.n.01.automobile')
print(wn.lemma('car.n.01.automobile').synset())  # Synset('car.n.01')
print(wn.lemma('car.n.01.automobile').name())    # automobile
print(wn.synsets('car'))
'''
[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'),
 Synset('cable_car.n.01')]
'''
for synset in wn.synsets('car'):
    print(synset.lemma_names())
'''
['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']
'''
print(wn.lemmas('car'))
'''
[Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'),
 Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]
'''

# The WordNet hierarchy
# (the post showed the wn.lemmas('car') output here by mistake; it has been
# moved up to the call it belongs to)
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(types_of_motorcar[26])  # one hyponym synset; which one depends on the WordNet version
res = sorted([lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()])
print(res)
# ['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', ..., 'used-car', 'waggon', 'wagon']
print(motorcar.hypernyms())  # [Synset('motor_vehicle.n.01')]
paths = motorcar.hypernym_paths()
print(len(paths))  # 2
res = [synset.name() for synset in paths[0]]
print(res)
'''
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01',
 'instrumentality.n.03', 'container.n.01', 'wheeled_vehicle.n.01',
 'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
'''
res = [synset.name() for synset in paths[1]]
print(res)
'''
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01',
 'instrumentality.n.03', 'conveyance.n.03', 'vehicle.n.01', 'wheeled_vehicle.n.01',
 'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
'''
print(motorcar.root_hypernyms())  # [Synset('entity.n.01')]

# More lexical relations
print(wn.synset('tree.n.01').part_meronyms())
'''
[Synset('burl.n.02'), Synset('crown.n.07'), Synset('limb.n.02'), Synset('stump.n.01'),
 Synset('trunk.n.01')]
'''
print(wn.synset('tree.n.01').substance_meronyms())
# [Synset('heartwood.n.01'), Synset('sapwood.n.01')]
print(wn.synset('tree.n.01').member_holonyms())
# [Synset('forest.n.01')]

for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())
'''
batch.n.02: (often followed by `of') a large number or amount or extent
mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers
mint.n.03: any member of the mint family of plants
mint.n.04: the leaves of a mint plant used fresh or candied
mint.n.05: a candy that is flavored with a mint oil
mint.n.06: a plant where money is coined by authority of the government
'''
print(wn.synset('mint.n.04').part_holonyms())       # [Synset('mint.n.02')]
print(wn.synset('mint.n.04').substance_holonyms())  # [Synset('mint.n.05')]

print(wn.synset('walk.v.01').entailments())   # [Synset('step.v.01')]
print(wn.synset('eat.v.01').entailments())    # [Synset('chew.v.01'), Synset('swallow.v.01')]
print(wn.synset('tease.v.03').entailments())  # [Synset('arouse.v.07'), Synset('disappoint.v.01')]

print(wn.lemma('supply.n.02.supply').antonyms())  # [Lemma('demand.n.02.demand')]
print(wn.lemma('rush.v.01.rush').antonyms())      # [Lemma('linger.v.04.linger')]
print(wn.lemma('horizontal.a.01.horizontal').antonyms())
# [Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]
print(wn.lemma('staccato.r.01.staccato').antonyms())  # [Lemma('legato.r.01.legato')]

# Semantic similarity
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')
print(right.lowest_common_hypernyms(minke))     # [Synset('baleen_whale.n.01')]
print(right.lowest_common_hypernyms(orca))      # [Synset('whale.n.02')]
print(right.lowest_common_hypernyms(tortoise))  # [Synset('vertebrate.n.01')]
print(right.lowest_common_hypernyms(novel))     # [Synset('entity.n.01')]

# More general synsets sit higher in the hierarchy (smaller min_depth)
print(wn.synset('baleen_whale.n.01').min_depth())  # 14
print(wn.synset('whale.n.02').min_depth())         # 13
print(wn.synset('vertebrate.n.01').min_depth())    # 8
print(wn.synset('entity.n.01').min_depth())        # 0

# path_similarity ranges over (0, 1]; higher means semantically closer
print(right.path_similarity(minke))     # 0.25
print(right.path_similarity(orca))      # 0.16666666666666666
print(right.path_similarity(tortoise))  # 0.07692307692307693
print(right.path_similarity(novel))     # 0.043478260869565216