Implementing a Web Crawler in Python


1. Introduction

        The crawler program consists of two classes: one that manages the entire crawling process (Crawler), and one that retrieves and parses each downloaded web page (Retriever).

2. Program

#!/usr/bin/env python
# Python 2 crawler: Retriever downloads and parses single pages,
# Crawler drives the overall process from a queue of URLs.

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):            # download web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                         # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):                 # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):              # manage entire crawling process

    count = 0                       # static downloaded page counter

    def __init__(self, url):
        self.q = [url]              # queue of URLs to download
        self.seen = []              # URLs already processed
        self.dom = urlparse(url)[1]  # restrict crawl to this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':        # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                   # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
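
        To run the program, pass the starting URL as the first command-line argument, or type it at the prompt. Note that it targets Python 2: htmllib, urlparse, formatter, cStringIO and the string-module functions it imports were removed or renamed in Python 3, and it relies on print statements and raw_input. For readers on Python 3, the following is a minimal sketch of the same Retriever/Crawler idea using only the standard library; the class name LinkParser, the method names, and the example start URL are illustrative choices, not part of the original program.

from html.parser import HTMLParser
from urllib.request import urlretrieve
from urllib.parse import urlparse, urljoin


class LinkParser(HTMLParser):
    """Collect href values from <a> tags (stands in for htmllib's anchorlist)."""

    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)


class Crawler:
    def __init__(self, url):
        self.q = [url]                      # queue of URLs to visit
        self.seen = set()                   # URLs already processed
        self.dom = urlparse(url).netloc     # restrict crawl to this domain

    def get_page(self, url):
        try:
            filename, _ = urlretrieve(url)  # download to a temporary file
        except (OSError, ValueError):
            print('*** ERROR: invalid URL "%s"' % url)
            return
        self.seen.add(url)
        parser = LinkParser()
        with open(filename, encoding='utf-8', errors='ignore') as f:
            parser.feed(f.read())
        for link in parser.anchorlist:
            link = urljoin(url, link)       # resolve relative links
            if link.startswith('mailto:'):
                continue                    # skip mailto links
            if (urlparse(link).netloc == self.dom
                    and link not in self.seen and link not in self.q):
                self.q.append(link)         # new in-domain link, add to queue

    def go(self):
        while self.q:
            self.get_page(self.q.pop())


if __name__ == '__main__':
    Crawler('http://example.com/').go()     # example start URL

        As in the original, the design is a simple work queue plus a "seen" collection: each page is downloaded once, its links are resolved against the page URL, and only links inside the starting domain are queued for further crawling.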

