Python 3.x 如何从互联网获取想要的文章，并转化为 NLTK 可以处理的文本

来源：互联网 | 发布：板绘用什么软件 | 编辑：程序博客网 | 时间：2024/06/04 00:32

from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk import word_tokenize
import nltk
# Two ways of extracting an article's text from a downloaded HTML page,
# followed by conversion of the tokens into an nltk.Text for further analysis.
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Use the response as a context manager so the connection is closed
# deterministically instead of being left to the garbage collector.
with urlopen(url) as response:
    html = response.read().decode('utf8')
print(html[:60])

# Parse the page once; both approaches below read from this same tree.
# If bs4.FeatureNotFound is raised, the lxml parser is not installed:
# run `pip install lxml`.
bs = BeautifulSoup(html, 'lxml')

# Approach 1: take all the text of the page, then use find/rfind to locate
# where the article starts and ends so it can be cut out with a slice.
raw = bs.get_text()

tokens = word_tokenize(raw)
print(tokens[:10])  # quick look at the text extracted from the HTML
print(raw.find("Blondes 'to die out in 200 years'"))
print(raw.rfind("The frequency of blondes may drop but they won't disappear."))
# Slicing `raw` between the two offsets printed above yields the whole report.

# Approach 2: let BeautifulSoup match the element that holds the article body
# and print its text. The headline is not inside this element; it would have
# to be looked up separately if needed.
print(bs.find("div", class_='bodytext').get_text())

# Drop the tokens that are not part of the article.
# NOTE(review): the 110/390 bounds look hand-picked for this page - confirm
# before reusing them with another URL.
tokens = tokens[110:390]
text = nltk.Text(tokens)  # wrap the tokens as an NLTK text for later processing
# concordance() prints its matches itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the results).
text.concordance('gene')

阅读全文

0 0