python实现网络爬虫

来源：互联网　发布：网络写手的收入　编辑：程序博客网　时间：2024/04/29 05:37
http://blog.csdn.net/lianxiang_biancheng/article/details/7674844

一.简介

该爬虫程序包含 2 个类：一个管理整个爬取（crawling）过程（Crawler），另一个下载并解析每一个 web 页面（Retriever）。程序基于 Python 2 编写（使用了 print 语句以及 htmllib、urlparse 等 Python 2 模块）。
二.程序

[python] view plaincopy
#!/usr/bin/env python  
  
from sys import argv  
from os import makedirs,unlink,sep  
from os.path import dirname,exists,isdir,splitext  
from string import replace,find,lower  
from htmllib import HTMLParser  
from urllib import urlretrieve  
from urlparse import urlparse,urljoin  
from formatter import DumbWriter,AbstractFormatter  
from cStringIO import StringIO  
  
class Retriever(object):
    """Download a single web page and extract the links it contains.

    Attributes:
        url:    the URL this instance is responsible for.
        file:   local path the page is saved to, derived from the URL.
        parser: the HTML parser used by the last parseAndGetLinks() call.
    """

    def __init__(self, url):
        self.url = url
        # NOTE: filename() also creates the local directory as a side effect.
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        """Map *url* to a local file path and make sure its directory exists.

        The path is the URL's network location followed by its path
        component.  When the URL path names no file (it has no extension),
        *deffile* is appended as the file name.

        Returns the local path, always written with '/' separators.
        """
        parsedurl = urlparse(url, 'http:', 0)
        urlpath = parsedurl[2]
        path = parsedurl[1] + urlpath

        # Look for an extension in the URL path only.  Checking the combined
        # host + path would mistake the ".com" of "http://example.com" for a
        # file extension and leave the page without a directory to live in.
        if splitext(urlpath)[1] == '':
            if path.endswith('/'):
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)  # local directory
        if sep != '/':  # convert to the OS-specific separator
            ldir = ldir.replace('/', sep)
        # An empty ldir means "current directory": nothing to create.
        if ldir and not isdir(ldir):
            if exists(ldir):  # a plain file is in the way of the directory
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch self.url into self.file.

        Returns urlretrieve()'s result (a tuple whose first item is the
        local file name) on success, or a 1-tuple holding a message that
        starts with '*** ERROR' when the retrieval raises IOError.
        """
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        # Return on both paths; returning only from the except branch made
        # every successful download yield None.
        return retval

    def parseAndGetLinks(self):
        """Parse the downloaded file and return the list of anchors found."""
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        page = open(self.file)
        try:
            self.parser.feed(page.read())
        finally:
            page.close()  # do not leak one file handle per crawled page
        self.parser.close()
        return self.parser.anchorlist
  
      
class Crawler(object):
    """Manage the whole crawl: a queue of URLs to fetch and the URLs done.

    Attributes:
        q:    list of URLs still to be fetched.
        seen: list of URLs that were downloaded successfully.
        dom:  network location of the start URL; links that do not contain
              it are not followed.
    """

    count = 0  # class-wide counter of successfully downloaded pages

    def __init__(self, url):
        self.q = [url]
        self.seen = []  # URLs already processed
        self.dom = urlparse(url)[1]

    @staticmethod
    def _isError(retval):
        """Tell whether a Retriever.download() result reports a failure.

        On failure the first item is a message starting with '*** ERROR';
        otherwise it is the local file name.
        """
        return retval[0].startswith('***')

    def getPage(self, url):
        """Download *url*, report it, and queue the new in-domain links."""
        r = Retriever(url)
        retval = r.download()
        # retval[0] is the whole message or file name, never the single
        # character '*', so comparing it to '*' could not detect a failure.
        if self._isError(retval):
            print('%s ... skipping parse' % (retval,))
            return
        Crawler.count += 1
        print('\n( %s )' % Crawler.count)
        print('URL: %s' % url)
        print('FILE: %s' % retval[0])
        self.seen.append(url)

        for eachLink in r.parseAndGetLinks():
            # Resolve relative links against the page they were found on.
            if eachLink[:4] != 'http' and '://' not in eachLink:
                eachLink = urljoin(url, eachLink)

            if 'mailto:' in eachLink.lower():
                status = '... discarded,mailto link'
            elif eachLink in self.seen:
                status = '... discarded, already processed'
            elif self.dom not in eachLink:
                status = '... discarded, not in domain'
            elif eachLink in self.q:
                status = '... discarded, already in Q'
            else:
                self.q.append(eachLink)
                status = '... new, added to Q'
            # One report line per link: the link followed by its verdict.
            print('*  %s %s' % (eachLink, status))

    def go(self):
        """Process queued URLs until the queue is empty.

        URLs are taken from the end of the queue, so the most recently
        queued link is fetched first.
        """
        while self.q:
            url = self.q.pop()
            self.getPage(url)
  
  
def main():
    """Start a crawl from the URL given on the command line or typed in.

    The first command-line argument is used when present; otherwise the
    user is prompted.  An empty answer, Ctrl-C or end-of-input exits
    without crawling.
    """
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL:')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    # Start the crawl for both input paths; when this was nested inside the
    # else branch a URL passed on the command line was silently ignored.
    robot = Crawler(url)
    robot.go()
  
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()