[Python开发] python使用nltk进行Chart Parsing

来源：互联网发布：知乎live可以回看吗编辑：程序博客网时间：2024/05/23 01:20

python用自然语言处理工具nltk（Natural Language Toolkit, http://www.nltk.org/ ）完成计算语言学中的Chart Parsing。

安装

使用pip安装
http://www.nltk.org/install.html
```
pip install nltk
```
或下载安装包安装（ http://pypi.python.org/pypi/nltk ）
```
python setup.py install
```
需要安装NLTK Data（ http://www.nltk.org/data.html ）。在python中输入以下代码：
```
import nltknltk.download()
```

Chart Parsing

参考
http://www.nltk.org/api/nltk.parse.html
http://www.ling.helsinki.fi/kit/2008s/clt231/nltk-0.9.5/doc/en/ch08.html
http://www.nltk.org/book/ch08-extras.html

demo - chart parser

tree

Parse a sentence John ate the cat

import nltkgrammar = nltk.CFG.fromstring("""    S -> NP VP    VP -> V NP | V    NP -> NAME | ART N    NAME -> 'John'    V -> 'ate'    ART -> 'the'    N -> 'cat'    """)tokens = ['John', 'ate', 'the', 'cat']parser = nltk.ChartParser(grammar, trace=1)for tree in parser.parse(tokens):    print(tree)    tree.draw()parser = nltk.parse.chart.BottomUpChartParser(grammar, trace=1)for tree in parser.parse(tokens):    print(tree)parser = nltk.parse.earleychart.EarleyChartParser(grammar, trace=1)for tree in parser.parse(tokens):    print(tree)

output

|.   John  .   ate   .   the   .   cat   .||[---------]         .         .         .| [0:1] 'John'|.         [---------]         .         .| [1:2] 'ate'|.         .         [---------]         .| [2:3] 'the'|.         .         .         [---------]| [3:4] 'cat'|[---------]         .         .         .| [0:1] NAME -> 'John' *|[---------]         .         .         .| [0:1] NP -> NAME *|[--------->         .         .         .| [0:1] S  -> NP * VP|.         [---------]         .         .| [1:2] V  -> 'ate' *|.         [--------->         .         .| [1:2] VP -> V * NP|.         [---------]         .         .| [1:2] VP -> V *|[-------------------]         .         .| [0:2] S  -> NP VP *|.         .         [---------]         .| [2:3] ART -> 'the' *|.         .         [--------->         .| [2:3] NP -> ART * N|.         .         .         [---------]| [3:4] N  -> 'cat' *|.         .         [-------------------]| [2:4] NP -> ART N *|.         .         [------------------->| [2:4] S  -> NP * VP|.         [-----------------------------]| [1:4] VP -> V NP *|[=======================================]| [0:4] S  -> NP VP *(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))|.   John  .   ate   .   the   .   cat   .||[---------]         .         .         .| [0:1] 'John'|.         [---------]         .         .| [1:2] 'ate'|.         .         [---------]         .| [2:3] 'the'|.         .         .         [---------]| [3:4] 'cat'|>         .         .         .         .| [0:0] NAME -> * 'John'|[---------]         .         .         .| [0:1] NAME -> 'John' *|>         .         .         .         .| [0:0] NP -> * NAME|[---------]         .         .         .| [0:1] NP -> NAME *|>         .         .         .         .| [0:0] S  -> * NP VP|[--------->         .         .         .| [0:1] S  -> NP * VP|.         >         .         .         .| [1:1] V  -> * 'ate'|.         [---------]         .         .| [1:2] V  -> 'ate' *|.         >         .         .         .| [1:1] VP -> * V NP|.         >         .         .         .| [1:1] VP -> * V|.         [--------->         .         .| [1:2] VP -> V * NP|.         [---------]         .         .| [1:2] VP -> V *|[-------------------]         .         .| [0:2] S  -> NP VP *|.         .         >         .         .| [2:2] ART -> * 'the'|.         .         [---------]         .| [2:3] ART -> 'the' *|.         .         >         .         .| [2:2] NP -> * ART N|.         .         [--------->         .| [2:3] NP -> ART * N|.         .         .         >         .| [3:3] N  -> * 'cat'|.         .         .         [---------]| [3:4] N  -> 'cat' *|.         .         [-------------------]| [2:4] NP -> ART N *|.         .         >         .         .| [2:2] S  -> * NP VP|.         [-----------------------------]| [1:4] VP -> V NP *|.         .         [------------------->| [2:4] S  -> NP * VP|[=======================================]| [0:4] S  -> NP VP *(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))|.   John  .   ate   .   the   .   cat   .||[---------]         .         .         .| [0:1] 'John'|.         [---------]         .         .| [1:2] 'ate'|.         .         [---------]         .| [2:3] 'the'|.         .         .         [---------]| [3:4] 'cat'|>         .         .         .         .| [0:0] S  -> * NP VP|>         .         .         .         .| [0:0] NP -> * NAME|>         .         .         .         .| [0:0] NP -> * ART N|>         .         .         .         .| [0:0] NAME -> * 'John'|[---------]         .         .         .| [0:1] NAME -> 'John' *|[---------]         .         .         .| [0:1] NP -> NAME *|[--------->         .         .         .| [0:1] S  -> NP * VP|.         >         .         .         .| [1:1] VP -> * V NP|.         >         .         .         .| [1:1] VP -> * V|.         >         .         .         .| [1:1] V  -> * 'ate'|.         [---------]         .         .| [1:2] V  -> 'ate' *|.         [--------->         .         .| [1:2] VP -> V * NP|.         [---------]         .         .| [1:2] VP -> V *|[-------------------]         .         .| [0:2] S  -> NP VP *|.         .         >         .         .| [2:2] NP -> * NAME|.         .         >         .         .| [2:2] NP -> * ART N|.         .         >         .         .| [2:2] ART -> * 'the'|.         .         [---------]         .| [2:3] ART -> 'the' *|.         .         [--------->         .| [2:3] NP -> ART * N|.         .         .         >         .| [3:3] N  -> * 'cat'|.         .         .         [---------]| [3:4] N  -> 'cat' *|.         .         [-------------------]| [2:4] NP -> ART N *|.         [-----------------------------]| [1:4] VP -> V NP *|[=======================================]| [0:4] S  -> NP VP *(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

Well-Formed Substring Tables

Parse a sentence John ate the cat

import nltkgrammar = nltk.CFG.fromstring("""    S -> NP VP    VP -> V NP | V    NP -> NAME | ART N    NAME -> 'John'    V -> 'ate'    ART -> 'the'    N -> 'cat'    """)tokens = ['John', 'ate', 'the', 'cat']def init_wfst(tokens, grammar):    numtokens = len(tokens)    wfst = [['.' for i in range(numtokens+1)] for j in range(numtokens+1)]    for i in range(numtokens):        productions = grammar.productions(rhs=tokens[i])        wfst[i][i+1] = productions[0].lhs()    return wfstdef display(wfst, tokens):    print('\nWFST ' + ' '.join([("%-4d" % i) for i in range(1, len(wfst))]))    for i in range(len(wfst)-1):        print_string = ("%d   " % i)        for j in range(1, len(wfst)):            print_string += (" %-4s"% wfst[i][j])        print(print_string)wfst0 = init_wfst(tokens, grammar)display(wfst0, tokens)

output

WFST 1    2    3    40    NAME .    .    .1    .    V    .    .2    .    .    ART  .3    .    .    .    N

Charts

Parse a sentence John ate the cat

import nltkgrammar = nltk.CFG.fromstring("""    S -> NP VP    VP -> V NP | V    NP -> NAME | ART N    NAME -> 'John'    V -> 'ate'    ART -> 'the'    N -> 'cat'    """)tokens = ['John', 'ate', 'the', 'cat']def complete_wfst(wfst, tokens, trace=False):    index = {}    for prod in grammar.productions():        index[prod.rhs()] = prod.lhs()    numtokens = len(tokens)    for span in range(2, numtokens+1):        for start in range(numtokens+1-span):            end = start + span            for mid in range(start+1, end):                nt1, nt2 = wfst[start][mid], wfst[mid][end]                if (nt1,nt2) in index:                    if trace:                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" % \                        (start, nt1, mid, nt2, end, start, index[(nt1,nt2)], end))                    wfst[start][end] = index[(nt1,nt2)]    return wfstwfst1 = complete_wfst(wfst0, tokens, trace=True)

output

[2] ART [3]   N [4] ==> [2]  NP [4][1]   V [2]  NP [4] ==> [1]  VP [4]

earleychart demo

import nltknltk.parse.earleychart.demo()

Complex grammar and tokens

grammar = nltk.CFG.fromstring("""    S -> NP VP    PP -> P NP    NP -> Det N | Det N PP | 'I'    VP -> V NP | VP PP    Det -> 'an' | 'my'    N -> 'elephant' | 'pajamas'    V -> 'shot'    P -> 'in'    """)tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']grammar = nltk.CFG.fromstring("""    S -> NP VP    NP -> ART ADJ N | ART N | ADJ N    VP -> AUX VP | V NP    ART -> 'the'    ADJ -> 'large'    N -> 'can' | 'hold' | 'water'    AUX -> 'can'    V -> 'can' | 'hold' | 'water'    """)tokens = ['the', 'large', 'can', 'can', 'hold', 'the', 'water']

0 0