python实现网络爬虫
来源:互联网 发布:淘宝怎么设置运费0.01 编辑:程序博客网 时间:2024/04/29 04:26
一.简介
该爬虫程序包含2个类:一个类管理整个抓取(crawling)进程(Crawler),另一个类负责下载并解析每一个 Web 页面(Retriever)。
二.程序
#!/usr/bin/env pythonfrom sys import argvfrom os import makedirs,unlink,sepfrom os.path import dirname,exists,isdir,splitextfrom string import replace,find,lowerfrom htmllib import HTMLParserfrom urllib import urlretrievefrom urlparse import urlparse,urljoinfrom formatter import DumbWriter,AbstractFormatterfrom cStringIO import StringIOclass Retriever(object): #download web pages def __init__(self,url): self.url = url self.file = self.filename(url) def filename(self,url,deffile='index.htm'): parsedurl = urlparse(url,'http:',0) ## parse path path = parsedurl[1] + parsedurl[2] ext = splitext(path) if ext[1] == '' : #no file,use default if path[-1] == '/': path += deffile else: path += '/' + deffile ldir = dirname(path) #local directory if sep != '/': # os-indep. path separator ldir = replace(ldir,'/',sep) if not isdir(ldir): # create archive dir if nec. if exists(ldir): unlink(ldir) makedirs(ldir) return path def download(self): #download Web page try: retval = urlretrieve(self.url,self.file) except IOError: retval = ('*** ERROR: invalid URL "%s"' % \ self.url,) return retval def parseAndGetLinks(self): #parse HTML,save links self.parser = HTMLParser(AbstractFormatter(\ DumbWriter(StringIO()))) self.parser.feed(open(self.file).read()) self.parse.close() return self.parser.anchorlist class Crawler(object): #manage entire crawling process count = 0 #static downloaded page counter def __init__(self,url): self.q = [url] self.seen = [] #have seen the url self.dom = urlparse(url)[1] def getPage(self,url): r = Retriever(url) retval = r.download() if retval[0] == '*': # error situation,do not parse print retval,'... 
skipping parse' return Crawler.count += 1 print '\n(',Crawler.count,')' print 'URL:',url print 'FILE:',retval[0] self.seen.append(url) links = r.parseAndGetLinks() #get and process links for eachLink in links: if eachLink[:4] != 'http' and \ find(eachLink,'://') == -1: eachLink = urljoin(url,eachLink) print '* ',eachLink, if find(lower(eachLink),'mailto:') != -1: print '... discarded,mailto link' continue if eachLink not in self.seen: if find(eachLink,self.dom) == -1: print '... discarded, not in domain' else: if eachLink not in self.q: self.q.append(eachLink) print '... new, added to Q' else: print '... discarded, already in Q' else: print '... discarded, already processed' def go(self): # process links in queue while self.q: url = self.q.pop() self.getPage(url)def main(): if len(argv) > 1: url = argv[1] else: try: url = raw_input('Enter starting URL:') except(KeyboardInterrupt,EOFError): url = '' if not url: return robot = Crawler(url) robot.go()if __name__ == '__main__': main()
- Python实现网络爬虫
- Python实现网络爬虫
- Python实现网络爬虫
- python实现网络爬虫
- python实现网络爬虫
- [Python] 实现网络爬虫
- python实现网络爬虫
- Python实现网络爬虫
- [Python] 实现网络爬虫
- Python实现网络爬虫
- Python 实现网络爬虫
- Python实现网络爬虫
- python实现网络爬虫
- 用python实现网络爬虫
- python 实现简单网络爬虫
- python实现简易网络爬虫
- 用Python实现网络爬虫
- python实现简单网络爬虫
- 将树转换成二叉树
- Sample for effective C++
- JAVA方法中的参数用final来修饰的原因
- 【shell脚本学习】1.linux shell基础
- Javascript中的陷阱大集合
- python实现网络爬虫
- 学习单片机第二天
- uIP中的结构体ui_conn
- 区域生长法的编程实现——程序员数字图像处理第一步
- C++新标准: C++ 0x
- Pull 解析xml
- Servlet的生命周期
- Matlab7.0 Windows7 32出现Runtime Error错误解决方法
- C++ primer 根据家族名查找所以的家族成员的名和生日