使用 Python BeautifulSoup 抓取 Python 中文开发者社区中的所有高级教程

来源:互联网 发布:中国电信四个重构 网络 编辑:程序博客网 时间:2024/06/05 22:31

话不多说,直接上代码:

# coding=utf-8
"""Download every article listed in pythontab.com's "pythonhexinbiancheng" category.

The script runs in two passes:

1. Walk the category index pages and collect the title and link of every
   article found under ``#catlist > li > a``.
2. Fetch each article, extract the paragraphs under
   ``#Article > div.content > p`` and write their UTF-8 encoded text to
   ``<title>.txt`` in the current working directory.

Requires the third-party packages ``bs4`` and ``lxml``.
"""
from contextlib import closing

from bs4 import BeautifulSoup

try:
    import urllib2
except ImportError:
    # Python 3 has no urllib2 module; urllib.request provides the same
    # urlopen() call, so alias it under the old name.
    import urllib.request as urllib2

url = 'http://www.pythontab.com/html/pythonhexinbiancheng/index.html'
# The first index page is index.html; the remaining ones are 2.html .. 18.html.
url_list = [url]
for i in range(2, 19):
    url_list.append('http://www.pythontab.com/html/pythonhexinbiancheng/%s.html' % i)


def _fetch_soup(page_url):
    """Fetch ``page_url`` and return its content parsed with the lxml parser."""
    # closing() guarantees the connection is released even if read() raises,
    # and does not require the response object to be a context manager itself.
    with closing(urllib2.urlopen(page_url)) as response:
        html = response.read()
    return BeautifulSoup(html, 'lxml')


# Pass 1: collect the title and link of every article on every index page.
source_list = []
for j in url_list:
    # Each anchor carries both the title (its text) and the link (its href),
    # so a single select() is enough.
    for anchor in _fetch_soup(j).select('#catlist > li > a'):
        href = anchor.get('href')
        if not href:
            # An anchor without an href cannot be downloaded; skip it.
            continue
        source_list.append({
            "title": anchor.get_text(),
            "link": href,
        })

# Pass 2: download each article exactly once. This loop deliberately runs
# after the index loop has finished: nesting it inside that loop would
# re-download every previously collected article for each index page.
for l in source_list:
    text_p = _fetch_soup(l['link']).select('#Article > div.content > p')
    print(text_p)
    text = [t.get_text().encode('utf-8') for t in text_p]

    # Replace characters that are not allowed (or are awkward) in file names.
    title_text = l['title']
    title_text = title_text.replace('*', '').replace('/', 'or').replace('"', ' ').replace('?', 'wenhao').replace(':', ' ')

    with open('%s.txt' % title_text, 'wb') as f:
        f.write(b''.join(text))


0 0