A Web Crawler for Baidu Baike


This article is based on an imooc course (http://www.imooc.com/learn/563). The instructor is excellent: he explains the architecture behind a web crawler in a simple, easy-to-follow way and finishes by building a crawler that scrapes content from Baidu Baike.


The code targets Python 2.7, and the IDE used is PyCharm (https://www.jetbrains.com/pycharm/).

Goal: use the Baidu Baike "Python" entry as the root_url (http://baike.baidu.com/view/21087.htm), treat every other entry link found on each page as a successor, and crawl 1,000 entries in total, fetching each one's title and summary in a loop. The results are finally written to an HTML file.

Analysis:

root_url : http://baike.baidu.com/view/21087.htm

Successor links look like: /view/21087.htm

The title markup looks like:

<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
The summary markup looks like:

<div class="lemma-summary" label-module="lemmaSummary">......</div>



There are five classes in total, one per module; the code for each is shown below:

spider_main.py: the main class, the crawler scheduler

# coding:utf8
import url_manager
import html_parser
import html_downloader
import html_outputer

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.outputer = html_outputer.HtmlOutputer()
        self.parser = html_parser.HtmlParser()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except Exception, e:
                print 'craw failed:', e
        self.outputer.output_html()

if __name__ == '__main__':
    print 'begin'
    root_url = 'http://baike.baidu.com/view/21087.htm'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
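Running spider_main.py directly kicks off the crawl from root_url. The first lines of output follow the print statements above; the URLs after the first entry depend on whatever links the parser happens to find, so the second line here is only illustrative:

$ python spider_main.py
begin
craw 1 : http://baike.baidu.com/view/21087.htm
craw 2 : http://baike.baidu.com/view/...
...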


url_manager.py: the URL manager, which keeps track of URLs already crawled and URLs still waiting to be crawled

# coding:utf8

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, new_url):
        if new_url is None:
            return
        if new_url not in self.new_urls and new_url not in self.old_urls:
            self.new_urls.add(new_url)

    def add_new_urls(self, new_urls):
        if new_urls is None or len(new_urls) == 0:
            return
        for new_url in new_urls:
            self.add_new_url(new_url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)   # mark it as crawled
        return new_url
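A quick, hypothetical check of the manager's behaviour (not part of the course code): because new_urls and old_urls are sets, the same URL is never queued twice, and once popped it will not be crawled again.

# coding:utf8
# Hypothetical quick test of UrlManager (not in the course code).
from url_manager import UrlManager

urls = UrlManager()
urls.add_new_urls(['http://baike.baidu.com/view/21087.htm',
                   'http://baike.baidu.com/view/21087.htm'])   # duplicate collapses
print urls.has_new_url()    # True
print urls.get_new_url()    # http://baike.baidu.com/view/21087.htm
print urls.has_new_url()    # False: the duplicate was never queued a second time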


html_downloader.py: downloads the HTML pages

# coding:utf8
import urllib2

class HtmlDownloader(object):
    def download(self, new_url):
        if new_url is None or len(new_url) == 0:
            return None
        request = urllib2.Request(new_url)
        response = urllib2.urlopen(request)
        if response.getcode() != 200:
            return None
        return response.read()
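The downloader sends the default urllib2 headers. If Baidu ever rejects such requests, a browser-like User-Agent can be passed through the Request object; this is a hypothetical tweak, an assumption about the site's behaviour rather than something the course requires:

# coding:utf8
# Hypothetical variant: send a browser-like User-Agent with each request
# (an assumption about Baidu's behaviour, not part of the course code).
import urllib2

def download_with_ua(url):
    if url is None or len(url) == 0:
        return None
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(request)
    if response.getcode() != 200:
        return None
    return response.read()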


html_parser.py: the HTML parser, which parses each downloaded page and extracts the successor URLs plus the title and summary

# coding:utf8
from bs4 import BeautifulSoup
import urlparse
import re

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        # successor links look like /view/123.htm
        new_full_urls = set()
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_full_urls.add(new_full_url)
        return new_full_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">...</div>
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        data = self._get_new_data(page_url, soup)
        return new_urls, data
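The urlparse.urljoin call is what turns the relative /view/... links into absolute URLs against the page currently being parsed; a one-line sanity check (hypothetical snippet, not from the course code):

# coding:utf8
# Hypothetical check of how urljoin resolves the relative entry links.
import urlparse

print urlparse.urljoin('http://baike.baidu.com/view/21087.htm', '/view/123.htm')
# -> http://baike.baidu.com/view/123.htm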


html_outputer.py: stores the title and summary collected for each URL and writes the accumulated data out to an HTML file

# coding:utf8

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        try:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table>')
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
                fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
        finally:
            fout.close()
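The generated output.html has no <head>, so there is no charset declaration and some browsers may guess the encoding wrong for the UTF-8 titles and summaries. A small hypothetical tweak (not in the course code) is to write a meta tag before the body; the row-writing loop stays exactly as in output_html above:

# coding:utf8
# Hypothetical tweak (not from the course code): declare UTF-8 in the page
# head so browsers render the Chinese titles and summaries correctly.
fout = open('output.html', 'w')
try:
    fout.write('<html>')
    fout.write('<head><meta charset="utf-8"></head>')
    fout.write('<body><table>')
    # ... write the same <tr>/<td> rows as in output_html above ...
    fout.write('</table></body></html>')
finally:
    fout.close()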

