Natural Language Processing with Python — Chapter 4: Basic Syntax

# Study notes on Chapter 4 of Natural Language Processing with Python (the NLTK book).
# The examples below use Python 2 syntax (print statements, integer division).

# Assignment: assigning a list copies only the reference, not the list itself
foo = 'Monty'
bar = foo
foo = 'Python'
bar
# 'Monty'

foo = ['Monty', 'Python']
bar = foo
foo[1] = 'Bodkin'
bar
# ['Monty', 'Bodkin']

empty = []
nested = [empty, empty, empty]
nested
# [[], [], []]
nested[1].append('Python')
nested
# [['Python'], ['Python'], ['Python']]

nested = [[]] * 3
nested[1].append('aaa')
nested
# [['aaa'], ['aaa'], ['aaa']]
nested[1] = ['Monty']
nested
# [['aaa'], ['Monty'], ['aaa']]

# Equality (==) versus identity (is)
size = 5
python = ['Python']
snake_nest = [python] * size
snake_nest
# [['Python'], ['Python'], ['Python'], ['Python'], ['Python']]
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]
# True
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]
# True

import random
position = random.choice(range(size))
snake_nest[position] = ['Python']
snake_nest
# [['Python'], ['Python'], ['Python'], ['Python'], ['Python']]
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]
# True
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]
# False
[id(snake) for snake in snake_nest]
# [212336032, 212336032, 212128032, 212336032, 212336032]

# Conditionals: if ... elif ...; once the if branch is true, the elif branch is not evaluated
mixed = ['cat', '', ['dog'], []]
for element in mixed:
    if element:
        print element

animals = ['cat', 'dog']
if 'rabbit' in animals:
    print 1
elif 'dog' in animals:
    print 2

sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
all(len(w) > 4 for w in sent)
# False -- all() asks whether every item satisfies the condition
any(len(w) > 4 for w in sent)
# True -- any() asks whether at least one item satisfies it

# Sequences: tuples
t = 'walk', 'fem', 3
t
# ('walk', 'fem', 3)
t[0]
# 'walk'
t[1:]
# ('fem', 3)
len(t)
# 3

# Comparing strings, lists and tuples
raw = 'I turned off the spectroroute'
text = ['I', 'turned', 'off', 'the', 'spectroroute']
pair = (6, 'turned')
raw[2], text[3], pair[1]
# ('t', 'the', 'turned')
raw[-3:], text[-3:], pair[-3:]
# ('ute', ['off', 'the', 'spectroroute'], (6, 'turned'))
len(raw), len(text), len(pair)
# (29, 5, 2)

import nltk
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry'
text = nltk.word_tokenize(raw)
fdist = nltk.FreqDist(text)
list(fdist)
# ['yellow', 'red', 'lorry', 'Red', ',']
for key in fdist:
    print fdist[key],
# 2 1 4 1 3 -- two 'yellow', four 'lorry'

# Swapping list items with tuple assignment (no temporary variable needed)
words = ['I', 'turned', 'off', 'the', 'spectroroute']
words[2], words[3], words[4] = words[3], words[4], words[2]
words
# ['I', 'turned', 'the', 'spectroroute', 'off']

# The same rotation written with an explicit temporary variable
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp
words
# ['I', 'turned', 'spectroroute', 'off', 'the']

# zip() takes the items of two or more sequences and "zips" them into a single list of pairs
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
zip(words, tags)
# [('I', 'noun'),
#  ('turned', 'verb'),
#  ('off', 'prep'),
#  ('the', 'det'),
#  ('spectroroute', 'noun')]

# Splitting data into training and test sets
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
text == training_data + test_data
# True
len(training_data) / len(test_data)
# 9 (integer division in Python 2)

# Combining different sequence types
words = 'I turned off the spectroroute'.split()
wordlens = [(len(word), word) for word in words]
wordlens
# [(1, 'I'), (6, 'turned'), (3, 'off'), (3, 'the'), (12, 'spectroroute')]
wordlens.sort()
' '.join(w for (_, w) in wordlens)
# 'I off the turned spectroroute'

# Generator expressions
text = '''"When I use a word," Humpty Dumpty said in rather a scornful tone,
"it means just what I choose it to mean - neither more nor less."'''
[w.lower() for w in nltk.word_tokenize(text)]
# (How would this work for Chinese word segmentation? Not clear to me!)
max([w.lower() for w in nltk.word_tokenize(text)])
min([w.lower() for w in nltk.word_tokenize(text)])
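# Editor's sketch (not from the original notes): the generator-expression form of
# the max()/min() calls above. Dropping the square brackets streams the lowercased
# tokens to max()/min() one at a time instead of building an intermediate list.
# Assumes nltk is importable and `text` is the Humpty Dumpty string defined above.
max(w.lower() for w in nltk.word_tokenize(text))
min(w.lower() for w in nltk.word_tokenize(text))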
# Generate all permutations of a list of words (a recursive generator function)
def permutations(seq):
    if len(seq) <= 1:
        yield seq
    else:
        for perm in permutations(seq[1:]):
            for i in range(len(perm) + 1):
                yield perm[:i] + seq[0:1] + perm[i:]

list(permutations(['police', 'fish', 'buffalo']))

# Check whether a word belongs to an open class of content words
def is_content_word(word):
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']

sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will',
        'take', 'care', 'of', 'themselves', '.']
# filter() and the equivalent list comprehension keep only the content words
filter(is_content_word, sent)
[w for w in sent if is_content_word(w)]

# Where a module's source file lives on disk
nltk.metrics.distance.__file__

# Recursion: factorial defined in terms of itself
def factorial2(n):
    if n == 1:
        return 1
    else:
        return n * factorial2(n - 1)

factorial2(5)
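# Editor's sketch (not from the original notes): standard-library counterparts of
# the two hand-rolled functions above, usable as a quick sanity check.
# itertools.permutations yields the orderings as tuples; math.factorial computes n!.
import itertools
import math

words = ['police', 'fish', 'buffalo']
set(itertools.permutations(words)) == {tuple(p) for p in permutations(words)}
# True -- both produce the same 3! = 6 orderings
math.factorial(5) == factorial2(5)
# True -- both give 120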

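# Editor's sketch (not from the original notes): the `nested = [[]] * 3` and
# `snake_nest = [python] * size` examples near the top of these notes put the
# same inner list object at every position. A list comprehension builds an
# independent inner list for each position, so mutating one element no longer
# changes the others.
nested = [[] for _ in range(3)]
nested[1].append('Python')
nested
# [[], ['Python'], []]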