A simple Python 3.5 crawler for Baidu Baike (based on the imooc hands-on course)


Approach:
1. Seed the crawler with a single entry URL.
2. Parse the HTML at that URL, extract the title and summary, and pull out the links to other Baike entries the page contains. Two sets, one of new URLs and one of already-crawled ones, are maintained so the same page is never fetched twice (see the sketch after this list).
3. Because each page yields more than one Baike link, a counter caps the crawl and breaks out of the loop.
4. The collected content is saved to an HTML file.
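
The two-set bookkeeping in step 2 can also be factored out into a small helper, which makes the dedup rule explicit. A minimal sketch, assuming nothing beyond the two sets described above (the class name UrlManager and its method names are illustrative, not part of the original code):

class UrlManager:
    """Tracks unseen and already-crawled URLs to avoid revisiting pages."""
    def __init__(self):
        self.new_urls = set()   # URLs discovered but not yet crawled
        self.old_urls = set()   # URLs already crawled

    def add(self, url):
        # Only queue a URL that has never been seen in either set
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_next(self):
        return len(self.new_urls) != 0

    def pop(self):
        # Hand out the next URL and remember it as crawled
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

The full program below inlines the same logic with two module-level sets instead of a class.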

# coding=utf-8
'''
Created on 2016-06-23
@author: Administrator
'''
from bs4 import BeautifulSoup
import re
import urllib.parse
import urllib.request

new_urls = set()   # URLs discovered but not yet crawled
old_urls = set()   # URLs already crawled
datas = []         # one dict per page: url, title, summary


def crawler(root_url):
    tot = 1      # pages crawled so far
    count = 1    # failures so far
    new_urls.add(root_url)
    while len(new_urls) != 0:
        try:
            new_url = new_urls.pop()
            old_urls.add(new_url)
            print('Crawling page %d: %s' % (tot, new_url))
            if new_url is None:
                continue
            response = urllib.request.urlopen(new_url)
            if response.getcode() != 200:
                continue
            html_cont = response.read()
            soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
            # Collect links to other Baike entries (they look like /view/<id>.htm)
            links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
            for link in links:
                new_urllrh = link['href']
                new_full_url = urllib.parse.urljoin(new_url, new_urllrh)
                # Skip URLs that are already queued or already crawled
                if new_full_url not in new_urls and new_full_url not in old_urls:
                    new_urls.add(new_full_url)
            # Extract the entry title and summary from the page
            new_data = {}
            new_data['url'] = new_url
            title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
            new_data['title'] = title_node.get_text()
            summary_node = soup.find('div', class_="lemma-summary")
            new_data['summary'] = summary_node.get_text()
            datas.append(new_data)
            if tot == 30:    # stop after 30 pages
                break
            tot = tot + 1
        except Exception:
            print('Attempt %d failed' % count)
            if count == 3:   # give up after 3 failures
                break
            count += 1
    # Dump everything collected into a simple HTML table
    fout = open('output.html', 'w', encoding="utf-8")
    fout.write("<html>")
    fout.write("<head><meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\"></head>")
    fout.write("<body>")
    fout.write("<table>")
    for data in datas:
        fout.write("<tr>")
        fout.write("<td>%s</td>" % data['url'])
        fout.write("<td>%s</td>" % data['title'])
        fout.write("<td>%s</td>" % data['summary'])
        fout.write("</tr>")
    fout.write("</table>")
    fout.write("</body>")
    fout.write("</html>")
    fout.close()


if __name__ == '__main__':
    root_url = "http://baike.baidu.com/view/21087.htm"
    crawler(root_url)
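
One caveat: some sites reject urllib's default 'Python-urllib/x.y' user agent, and the broad except in the loop would silently swallow such failures. If every fetch fails, sending a browser-like User-Agent header usually helps. A hedged sketch (the fetch helper and the header value are illustrative, not part of the original code):

import urllib.request

def fetch(url):
    # Send a browser-like User-Agent instead of urllib's default
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        if response.getcode() != 200:
            return None
        return response.read()

Inside crawler(), this helper would replace the bare urllib.request.urlopen(new_url) call.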

Corrections are welcome if you spot any mistakes.
