[python] A Concise NLTK Tutorial

NLTK is an NLP toolkit for Python that provides a rich set of text-processing and text-mining APIs.

Installation

Installing NLTK is straightforward; on Linux, simply run sudo pip install -U nltk.
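A quick way to confirm the installation worked is to import the package and print its version; this is just a sanity check, and the version string will depend on your environment.

import nltk

# Print the installed NLTK version to confirm the installation succeeded.
print(nltk.__version__)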

Downloading Corpora

import nltk

# Download NLTK's bundled English corpora to a specified directory.
# If you do not use the default path, add an environment variable first:
#   vim ~/.profile
#   append NLTK_DATA="full/path" at the end of the file
#   source ~/.profile
nltk.download(download_dir='./data/nltk/')
# Pick the corpora you want in the GUI window that pops up.
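If you prefer to skip the GUI (for example on a headless server), nltk.download can also fetch individual packages by name. The sketch below is one possible setup, assuming you want the brown corpus and the punkt tokenizer models under ./data/nltk/; appending that directory to nltk.data.path is an in-process alternative to setting the NLTK_DATA environment variable.

import nltk

# Download specific packages by name, without opening the GUI.
nltk.download('brown', download_dir='./data/nltk/')
nltk.download('punkt', download_dir='./data/nltk/')

# Tell NLTK where to look for data in this session.
nltk.data.path.append('./data/nltk/')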

Testing the Downloaded Corpora

from nltk.corpus import brown

print(brown.words()[0:10])         # print the first 10 words
print(brown.tagged_words()[0:10])  # print the first 10 tagged words
print(len(brown.words()))          # total number of words
print(dir(brown))
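The Brown corpus is organised into categories, and the corpus reader can be restricted to a single one of them; the following short illustration uses the 'news' category (one of the names returned by brown.categories()).

from nltk.corpus import brown

print(brown.categories())                   # category names, e.g. 'news', 'fiction', ...
print(brown.words(categories='news')[:10])  # first 10 words of the 'news' category
print(brown.sents(categories='news')[0])    # first sentence of the 'news' category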

Testing the Downloaded Books

from nltk.book import *
# *** Introductory Examples for the NLTK Book ***
# Loading text1, ..., text9 and sent1, ..., sent9
# Type the name of the text or sentence to view it.
# Type: 'texts()' or 'sents()' to list the materials.
# text1: Moby Dick by Herman Melville 1851
# text2: Sense and Sensibility by Jane Austen 1811
# text3: The Book of Genesis
# text4: Inaugural Address Corpus
# text5: Chat Corpus
# text6: Monty Python and the Holy Grail
# text7: Wall Street Journal
# text8: Personals Corpus
# text9: The Man Who Was Thursday by G . K . Chesterton 1908

print(text1.name)                                       # title of the book
print(text1.concordance(word="love"))                   # concordance (word in context)
print(text1.similar(word="very"))                       # words used in similar contexts
print(text1.common_contexts(words=["pretty", "very"]))  # shared contexts
text4.dispersion_plot(words=['citizens', 'freedom', 'democracy'])  # dispersion plot for the US inaugural addresses
print(text1.collocations())                             # collocations
print(type(text1))
print(len(text1))                                       # length of the text
print(len(set(text1)))                                  # vocabulary size
fword = FreqDist(text1)
print(text1.name)                                       # title of the book
print(fword)
voc = fword.most_common(50)                             # 50 most frequent tokens
fword.plot(50, cumulative=True)                         # cumulative frequency plot
print(fword.hapaxes())                                  # hapaxes (words occurring only once)

Word and Sentence Tokenization

from nltk.tokenize import word_tokenize, sent_tokenize

# Word tokenization (see also TreebankWordTokenizer, PunktTokenizer)
print(word_tokenize(text="All work and no play makes jack a dull boy, all work and no play",
                    language="english"))

# Sentence tokenization
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

# Remove stopwords
from nltk.corpus import stopwords
print(type(stopwords.words('english')))
print([w for w in word_tokenize(text="All work and no play makes jack a dull boy, all work and no play",
                                language="english")
       if w not in stopwords.words('english')])
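word_tokenize is a convenience wrapper; the tokenizer classes mentioned in the comment above can also be used directly. A minimal sketch with TreebankWordTokenizer:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("All work and no play makes jack a dull boy."))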

Tense and Plural Forms (Stemming and Lemmatization)

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Stemming with the Porter stemmer
data = word_tokenize(text="All work and no play makes jack a dull boy, all work and no play,playing,played",
                     language="english")
ps = PorterStemmer()
for w in data:
    print(w, ":", ps.stem(word=w))

# Stemming with the Snowball stemmer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
print(snowball_stemmer.stem('presumably'))  # 'presum'

# Lemmatization with WordNet
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('dogs'))  # 'dog'
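Note that a stemmer only chops off suffixes, while WordNetLemmatizer maps words to dictionary forms and by default treats every word as a noun. Passing a part-of-speech hint usually gives better results, as in this small illustration:

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('playing'))           # 'playing' (treated as a noun by default)
print(wordnet_lemmatizer.lemmatize('playing', pos='v'))  # 'play'    (treated as a verb)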

Part-of-Speech Tagging

sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good.""" tokens = nltk.word_tokenize(sentence)print(tokens)#['At', 'eight', "o'clock", 'on', 'Thursday', 'morning',# 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']nltk.help.upenn_tagset(‘NNP’)#输出NNP的含义tagged = nltk.pos_tag(tokens) nltk.batch_pos_tag([[‘this’, ‘is’, ‘batch’, ‘tag’, ‘test’], [‘nltk’, ‘is’, ‘text’, ‘analysis’, ‘tool’]])#批量标注print(tagged)# [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),# ('Thursday', 'NNP'), ('morning', 'NN')]

Appendix:

Classifiers

The classifiers that ship with NLTK are listed below.

from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.megam import config_megam, call_megam
from nltk.classify.weka import WekaClassifier, config_weka
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.rte_classify import rte_classifier, rte_features, RTEFeatureExtractor
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
                                  TypedMaxentFeatureEncoding, ConditionalExponentialClassifier)
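As one illustration of the list above, SklearnClassifier wraps any scikit-learn estimator behind the same train/classify interface used elsewhere in this tutorial. This is only a sketch: it assumes scikit-learn is installed, and the tiny training set is made up for demonstration.

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

# The same (feature dict, label) pairs that NLTK classifiers use everywhere.
train_set = [({'love': True, 'great': True}, 'pos'),
             ({'hate': True, 'awful': True}, 'neg')]

classifier = SklearnClassifier(BernoulliNB()).train(train_set)
print(classifier.classify({'love': True}))  # expected: 'pos'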

Application 1: Predicting Gender from a Name

import nltk
from nltk.corpus import names

# The feature is simply the last letter of the name.
def gender_features(word):
    return {'last_letter': word[-1]}

# Prepare the data
name = [(n, 'male') for n in names.words('male.txt')] + \
       [(n, 'female') for n in names.words('female.txt')]
print(len(name))

# Extract features and train the model
features = [(gender_features(n), g) for (n, g) in name]
classifier = nltk.NaiveBayesClassifier.train(features[:6000])

# Test
print(classifier.classify(gender_features('Frank')))
from nltk import classify
print(classify.accuracy(classifier, features[6000:]))
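NaiveBayesClassifier can also report which features it found most discriminative, which is a quick way to sanity-check the model trained above:

# Show the five most informative features (uses the classifier trained above).
classifier.show_most_informative_features(5)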

Application 2: Sentiment Analysis

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

def word_feats(words):
    return dict([(word, True) for word in words])

# Prepare the data
positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
neutral_vocab = ['movie', 'the', 'sound', 'was', 'is', 'actors', 'did', 'know', 'words', 'not']

# Extract features
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
train_set = negative_features + positive_features + neutral_features

# Train
classifier = NaiveBayesClassifier.train(train_set)

# Test
neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify(word_feats(word))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

print('Positive: ' + str(float(pos) / len(words)))
print('Negative: ' + str(float(neg) / len(words)))
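The nltk.classify.util module imported at the top also provides an accuracy helper. A quick check against a tiny, made-up test set (reusing word_feats and classifier from the block above) might look like this:

import nltk.classify.util

# A tiny illustrative test set in the same (features, label) format as train_set.
test_set = [(word_feats('good'), 'pos'),
            (word_feats('terrible'), 'neg'),
            (word_feats('movie'), 'neu')]

print(nltk.classify.util.accuracy(classifier, test_set))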

These are just some simple NLTK applications; for more complex use cases you will need to read the source code and the official documentation.