Python 自然语言处理学习笔记:第二章(第二部分)

来源:互联网 发布:守望先锋网络延迟查询 编辑:程序博客网 时间:2024/05/04 16:54

>>> import nltk
>>> from nltk.corpus import brown
>>> cfd = nltk.ConditionalFreqDist((genre, word)
...           for genre in brown.categories()
...           for word in brown.words(categories=genre))

>>> cfd.items()
[('mystery', <FreqDist with 6982 samples and 57169 outcomes>), ('belles_lettres', <FreqDist with 18421 samples and 173096 outcomes>), ('humor', <FreqDist with 5017 samples and 21695 outcomes>), ('government', <FreqDist with 8181 samples and 70117 outcomes>), ('fiction', <FreqDist with 9302 samples and 68488 outcomes>), ('reviews', <FreqDist with 8626 samples and 40704 outcomes>), ('religion', <FreqDist with 6373 samples and 39399 outcomes>), ('romance', <FreqDist with 8452 samples and 70022 outcomes>), ('science_fiction', <FreqDist with 3233 samples and 14470 outcomes>), ('adventure', <FreqDist with 8874 samples and 69342 outcomes>), ('editorial', <FreqDist with 9890 samples and 61604 outcomes>), ('hobbies', <FreqDist with 11935 samples and 82345 outcomes>), ('lore', <FreqDist with 14503 samples and 110299 outcomes>), ('news', <FreqDist with 14394 samples and 100554 outcomes>), ('learned', <FreqDist with 16859 samples and 181888 outcomes>)]

 

 

 

from __future__ import division

def lexical_diversity(text):
    """Return the average number of occurrences per distinct item in *text*.

    The value is ``len(text) / len(set(text))``: the total number of items
    divided by the number of distinct items.  The items must be hashable
    because they are collected into a ``set``.

    An empty *text* returns ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    total = len(text)
    if total == 0:
        # No items means no distinct items either; avoid dividing by zero.
        return 0.0
    return total / len(set(text))

def lexical_diversity(my_text_data):
    """Return the ratio of total items to distinct items in *my_text_data*.

    Args:
        my_text_data: A sized collection of hashable items (for example a
            list of word strings, or a string of characters).

    Returns:
        ``len(my_text_data) / len(set(my_text_data))``, or ``0.0`` when
        *my_text_data* is empty (the ratio would otherwise divide by zero).
    """
    word_count = len(my_text_data)
    if word_count == 0:
        # An empty input has an empty vocabulary; return early rather than
        # raising ZeroDivisionError below.
        return 0.0
    vocab_size = len(set(my_text_data))
    return word_count / vocab_size

def plural(word):
    """Return a heuristic English plural of *word*.

    The rules are tried in order and the first match wins:

    * ends in ``y``                         -> drop the ``y`` and add ``ies``
    * ends in ``s``, ``x``, ``sh`` or ``ch`` -> add ``es``
    * ends in ``an``                        -> replace ``an`` with ``en``
    * anything else                         -> add ``s``

    The rules look only at the suffix, so irregular nouns and words such
    as ``boy`` (-> ``boies``) or ``fan`` (-> ``fen``) come out wrong.
    An empty string yields ``'s'``.
    """
    if word.endswith('y'):
        return word[:-1] + 'ies'
    # Tuple-form endswith() matches any listed suffix and, unlike indexing
    # with word[-1], does not raise on the empty string.
    if word.endswith(('s', 'x', 'sh', 'ch')):
        return word + 'es'
    if word.endswith('an'):
        return word[:-2] + 'en'
    return word + 's'

原创粉丝点击