NLTK09《Python自然语言处理》code08 分析句子结构
来源:互联网 发布:网站后台软件 编辑:程序博客网 时间:2024/06/01 09:56
分析句子结构
# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# "Natural Language Processing with Python", chapter 08: Analyzing Sentence Structure
# pnlp08.py
#
# NOTE: the expected-output comments below are copied from the original
# listing, where runs of whitespace had been collapsed; the column alignment
# of real output may therefore differ from what is shown here.

from collections import defaultdict

import nltk
from nltk.corpus import treebank

# 8.1 Some Grammatical Dilemmas
# Linguistic data and unlimited possibilities
# Ubiquitous ambiguity
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)
# Expected output:
# (S
#   (NP I)
#   (VP
#     (VP (V shot) (NP (Det an) (N elephant)))
#     (PP (P in) (NP (Det my) (N pajamas)))))
# (S
#   (NP I)
#   (VP
#     (V shot)
#     (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))

# 8.2 What's the Use of Syntax?

# 8.3 Context-Free Grammar
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
# Expected output:
# (S (NP Mary) (VP (V saw) (NP Bob)))

# Writing your own grammars
# NOTE: this loads 'mygrammar.cfg' through a 'file:' URL, so that file must be
# reachable from where the script is run (presumably the working directory).
grammar1 = nltk.data.load('file:mygrammar.cfg')
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
# Expected output:
# (S (NP Mary) (VP (V saw) (NP Bob)))

# Example 8-2: a recursive context-free grammar
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")


# 8.4 Parsing with Context-Free Grammar
# Recursive descent parsing
# Shift-reduce parsing
# The left-corner parser
# Well-formed substring tables
# Example 8-3: acceptor using a well-formed substring table
def init_wfst(tokens, grammar):
    """Build a WFST whose diagonal holds one lexical category per token.

    The table is a (len(tokens)+1) x (len(tokens)+1) list of lists filled
    with None; cell [i][i+1] receives the left-hand side of the first
    production that ``grammar.productions(rhs=token)`` returns for token i.

    Raises:
        ValueError: if the grammar returns no production for some token
            (previously this surfaced as a bare IndexError).
    """
    numtokens = len(tokens)
    wfst = [[None for _ in range(numtokens + 1)]
            for _ in range(numtokens + 1)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        if not productions:
            raise ValueError(
                "grammar has no production for token %r" % (token,))
        # Only the first matching production is used for the cell.
        wfst[i][i + 1] = productions[0].lhs()
    return wfst


def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill the cells of ``wfst`` for spans of two or more tokens, in place.

    For every span [start, end) and every split point ``mid``, if the pair of
    categories found in [start, mid) and [mid, end) is the right-hand side of
    a production, the production's left-hand side is stored in
    ``wfst[start][end]``. With ``trace=True`` each such combination is
    printed. The same ``wfst`` object that was passed in is returned.
    """
    # Maps a right-hand-side tuple to a left-hand side; when several
    # productions share a right-hand side, the last one listed wins.
    index = {p.rhs(): p.lhs() for p in grammar.productions()}
    numtokens = len(tokens)
    for span in range(2, numtokens + 1):
        # Every start position for which start + span stays inside the table.
        for start in range(numtokens - span + 1):
            end = start + span
            for mid in range(start + 1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if nt1 and nt2 and (nt1, nt2) in index:
                    wfst[start][end] = index[(nt1, nt2)]
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" %
                              (start, nt1, mid, nt2, end,
                               start, index[(nt1, nt2)], end))
    return wfst


def display(wfst, tokens):
    """Print ``wfst`` as a grid; empty cells are shown as '.'.

    ``tokens`` is accepted but not used by the body.
    """
    print('\nWFST ' + ' '.join([("%-4d" % i) for i in range(1, len(wfst))]))
    for i in range(len(wfst) - 1):
        print("%d " % i, end="")
        for j in range(1, len(wfst)):
            print("%-4s" % (wfst[i][j] or '.'), end="")
        print("")


tokens = "I shot an elephant in my pajamas".split()
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)
# Expected output:
# WFST 1 2 3 4 5 6 7
# 0 NP . . . . . .
# 1 . V . . . . .
# 2 . . Det . . . .
# 3 . . . N . . .
# 4 . . . . P . .
# 5 . . . . . Det .
# 6 . . . . . . N

# complete_wfst returns the table it was given, so wfst1 and wfst0 refer to
# the same (now completed) list of lists.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)
# Expected output:
# WFST 1 2 3 4 5 6 7
# 0 NP . . S . . S
# 1 . V . VP . . VP
# 2 . . Det NP . . .
# 3 . . . N . . .
# 4 . . . . P . PP
# 5 . . . . . Det NP
# 6 . . . . . . N

wfst1 = complete_wfst(wfst0, tokens, groucho_grammar, trace=True)
# Expected output:
# [2] Det [3] N [4] ==> [2] NP [4]
# [5] Det [6] N [7] ==> [5] NP [7]
# [1] V [2] NP [4] ==> [1] VP [4]
# [4] P [5] NP [7] ==> [4] PP [7]
# [0] NP [1] VP [4] ==> [0] S [4]
# [1] VP [4] PP [7] ==> [1] VP [7]
# [0] NP [1] VP [7] ==> [0] S [7]

# 8.5 Dependencies and Dependency Grammar
# (The original listing also kept a commented-out call to
# nltk.parse_dependency_grammar; the live code uses
# DependencyGrammar.fromstring.)
groucho_dep_grammar = nltk.grammar.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
print(groucho_dep_grammar)
# Expected output:
# Dependency grammar with 7 productions
#   'shot' -> 'I'
#   'shot' -> 'elephant'
#   'shot' -> 'in'
#   'elephant' -> 'an'
#   'elephant' -> 'in'
#   'in' -> 'pajamas'
#   'pajamas' -> 'my'

pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = 'I shot an elephant in my pajamas'.split()
trees = pdp.parse(sent)
for tree in trees:
    print(tree)
# Expected output:
# (shot I (elephant an (in (pajamas my))))
# (shot I (elephant an) (in (pajamas my)))

# Valency and the lexicon
# Scaling up

# 8.6 Grammar Development
# Treebanks and grammars
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)
# Expected output:
# (S
#   (NP-SBJ
#     (NP (NNP Pierre) (NNP Vinken))
#     (, ,)
#     (ADJP (NP (CD 61) (NNS years)) (JJ old))
#     (, ,))
#   (VP
#     (MD will)
#     (VP
#       (VB join)
#       (NP (DT the) (NN board))
#       (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
#       (NP-TMP (NNP Nov.) (CD 29))))
#   (. .))


# Example 8-4: searching a treebank to find sentential complements
def filter(tree):
    """Return True when ``tree`` is labelled 'VP' and has a child labelled 'S'.

    NOTE: this name shadows the built-in ``filter`` for the rest of the
    script; it is kept unchanged because it is passed by name below.
    """
    child_nodes = [child.label() for child in tree
                   if isinstance(child, nltk.Tree)]
    return (tree.label() == 'VP') and ('S' in child_nodes)


res = [subtree for tree in treebank.parsed_sents()
       for subtree in tree.subtrees(filter)]
print(res)
# Expected output:
# [Tree('VP', [Tree('VBN', ['named']), ...

# Group the verbs of the ppattach training entries by their
# noun1-prep-noun2 key and by attachment label, then list the keys that
# occur with more than one attachment label.
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = entry.noun1 + '-' + entry.prep + '-' + entry.noun2
    table[key][entry.attachment].add(entry.verb)
for key in sorted(table):
    if len(table[key]) > 1:
        print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))
# Expected output:
# %-below-level N: ['left'] V: ['be']
# %-from-year N: ['was'] V: ['declined', 'dropped', 'fell', 'grew', 'increased', 'plunged', 'rose', 'was']
# ...

# Draw parse number 3450 of the Sinica Treebank.
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()

# Pernicious ambiguity
grammar = nltk.CFG.fromstring("""
S -> NP V NP
NP -> NP Sbar
Sbar -> NP V
NP -> 'fish'
V -> 'fish'
""")
tokens = ["fish"] * 5
cp = nltk.ChartParser(grammar)
for tree in cp.parse(tokens):
    print(tree)
# Expected output:
# (S
#   (NP fish)
#   (V fish)
#   (NP (NP fish) (Sbar (NP fish) (V fish))))
# (S
#   (NP (NP fish) (Sbar (NP fish) (V fish)))
#   (V fish)
#   (NP fish))


# Weighted grammar
# Example 8-5: usage of "give" and "gave" in the Penn Treebank sample
def give(t):
    """Return True for a VP headed by give/gave with an NP and a third child.

    The subtree must be labelled 'VP', have more than two children, have an
    'NP' as second child, a 'PP-DTV' or 'NP' as third child, and contain
    'give' or 'gave' among the leaves of its first child.
    """
    return (t.label() == 'VP' and len(t) > 2 and t[1].label() == 'NP'
            and (t[2].label() == 'PP-DTV' or t[2].label() == 'NP')
            and ('give' in t[0].leaves() or 'gave' in t[0].leaves()))


def sent(t):
    """Join the leaves of ``t`` with spaces, skipping those starting with '*', '-' or '0'.

    NOTE: from this point on the name ``sent`` refers to this function, not
    to the token lists bound to it earlier in the script.
    """
    return ' '.join(token for token in t.leaves() if token[0] not in '*-0')


def print_node(t, width):
    """Print a one-line summary of the first three children of ``t``.

    The line is cut to ``width`` characters and suffixed with '...' when it
    is longer than ``width``.
    """
    output = "%s %s: %s / %s: %s" %\
        (sent(t[0]), t[1].label(), sent(t[1]), t[2].label(), sent(t[2]))
    if len(output) > width:
        output = output[:width] + "..."
    print(output)


for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)
# Expected output:
# gave NP: the chefs / NP: a standing ovation
# give NP: advertisers / NP: discounts for maintaining or increasing ad sp...
# give NP: it / PP-DTV: to the politicians
# gave NP: them / NP: similar help
# give NP: them / NP:
# give NP: only French history questions / PP-DTV: to students in a Europe...
# give NP: federal judges / NP: a raise
# give NP: consumers / NP: the straight scoop on the U.S. waste crisis
# gave NP: Mitsui / NP: access to a high-tech medical product
# give NP: Mitsubishi / NP: a window on the U.S. glass industry
# give NP: much thought / PP-DTV: to the rates she was receiving , nor to ...
# give NP: your Foster Savings Institution / NP: the gift of hope and free...
# give NP: market operators / NP: the authority to suspend trading in futu...
# gave NP: quick approval / PP-DTV: to $ 3.18 billion in supplemental appr...
# give NP: the Transportation Department / NP: up to 50 days to review any...
# give NP: the president / NP: such power
# give NP: me / NP: the heebie-jeebies
# give NP: holders / NP: the right , but not the obligation , to buy a cal...
# gave NP: Mr. Thomas / NP: only a `` qualified '' rating , rather than ``...
# give NP: the president / NP: line-item veto power

# Probabilistic context-free grammar
# Example 8-6: defining a probabilistic context-free grammar (PCFG)
grammar = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
""")
print(grammar)
# Expected output:
# Grammar with 9 productions (start state = S)
#     S -> NP VP [1.0]
#     VP -> TV NP [0.4]
#     VP -> IV [0.3]
#     VP -> DatV NP NP [0.3]
#     TV -> 'saw' [1.0]
#     IV -> 'ate' [1.0]
#     DatV -> 'gave' [1.0]
#     NP -> 'telescopes' [0.8]
#     NP -> 'Jack' [0.2]

viterbi_parser = nltk.ViterbiParser(grammar)
trees = viterbi_parser.parse(['Jack', 'saw', 'telescopes'])
for tree in trees:
    print(tree)
# Expected output:
# (S (NP Jack) (VP (TV saw) (NP telescopes))) (p=0.064)
阅读全文
0 0
- NLTK09《Python自然语言处理》code08 分析句子结构
- Python自然语言处理 8 分析句子结构
- 分析句子结构
- 分析句子结构文法
- python自然语言处理学习笔记-信息提取结构
- NLTK05《Python自然语言处理》code04 编写结构化程序
- Python自然语言处理 4 编写结构化程序
- Python 自然语言处理 一
- 《python自然语言处理》笔记
- 《Python自然语言处理》
- PYTHON 自然语言处理
- python与自然语言处理
- Python自然语言处理
- python自然语言处理-WordNet
- python自然语言处理
- python 自然语言处理
- Python 自然语言处理 第一章
- python自然语言处理 第一章
- 适配器模式和外观模式
- 微信红包随机金额算法
- HTTP请求报头中各个字段的含义
- 大一
- Logback配置解析
- NLTK09《Python自然语言处理》code08 分析句子结构
- Android开发笔记: 按2次返回键,退出应用
- 得分UVa1585
- navicat11全系列破解教程
- 雷蛇设备占用80端口
- STL(八)list双向链表容器
- LowPoly
- Oracle数据泵expdp的compression压缩测试
- Spark安装及部署