python实现网络爬虫
来源:互联网 发布:gxh-305分析仪的数据 编辑:程序博客网 时间:2024/04/29 15:48
一. 简介
该爬虫程序包含2个类,一个管理整个crawling进程(Crawler),一个检索并解析每一个下载的web页面(Retriever)。
二.程序
- #!/usr/bin/env python
- from sys import argv
- from os import makedirs,unlink,sep
- from os.path import dirname,exists,isdir,splitext
- from string import replace,find,lower
- from htmllib import HTMLParser
- from urllib import urlretrieve
- from urlparse import urlparse,urljoin
- from formatter import DumbWriter,AbstractFormatter
- from cStringIO import StringIO
class Retriever(object):
    """Download a single Web page and extract the links it contains.

    The page is saved under a local file path derived from its URL,
    mirroring the remote directory structure.
    """

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)   # local save path for this URL

    def filename(self, url, deffile='index.htm'):
        """Map *url* to a local file path, creating directories as needed.

        When the URL path has no file extension, *deffile* is appended so
        the page is saved as a regular file. Returns the local path.
        """
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]      # netloc + path component
        ext = splitext(path)
        if ext[1] == '':                        # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                    # local directory
        if sep != '/':                          # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)                    # a plain file blocks makedirs
            makedirs(ldir)
        return path

    def download(self):
        """Download the Web page.

        Returns urlretrieve's (filename, headers) tuple on success, or a
        1-tuple whose string starts with '*** ERROR' on failure.
        """
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % \
                self.url,)
        return retval

    def parseAndGetLinks(self):
        """Parse the saved HTML and return its list of anchor URLs."""
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        # BUG FIX: original called self.parse.close() — no such attribute,
        # which raised AttributeError on every successfully downloaded page.
        self.parser.close()
        return self.parser.anchorlist
- class Crawler(object): #manage entire crawling process
- count = 0 #static downloaded page counter
- def __init__(self,url):
- self.q = [url]
- self.seen = [] #have seen the url
- self.dom = urlparse(url)[1]
- def getPage(self,url):
- r = Retriever(url)
- retval = r.download()
- if retval[0] == '*': # error situation,do not parse
- print retval,'... skipping parse'
- return
- Crawler.count += 1
- print '\n(',Crawler.count,')'
- print 'URL:',url
- print 'FILE:',retval[0]
- self.seen.append(url)
- links = r.parseAndGetLinks() #get and process links
- for eachLink in links:
- if eachLink[:4] != 'http' and \
- find(eachLink,'://') == -1:
- eachLink = urljoin(url,eachLink)
- print '* ',eachLink,
- if find(lower(eachLink),'mailto:') != -1:
- print '... discarded,mailto link'
- continue
- if eachLink not in self.seen:
- if find(eachLink,self.dom) == -1:
- print '... discarded, not in domain'
- else:
- if eachLink not in self.q:
- self.q.append(eachLink)
- print '... new, added to Q'
- else:
- print '... discarded, already in Q'
- else:
- print '... discarded, already processed'
- def go(self): # process links in queue
- while self.q:
- url = self.q.pop()
- self.getPage(url)
def main():
    """Entry point: take the seed URL from argv or an interactive prompt."""
    if len(argv) > 1:
        seed = argv[1]
    else:
        try:
            seed = raw_input('Enter starting URL:')
        except (KeyboardInterrupt, EOFError):
            seed = ''
    if not seed:
        return
    Crawler(seed).go()

if __name__ == '__main__':
    main()
- Python实现网络爬虫
- Python实现网络爬虫
- Python实现网络爬虫
- python实现网络爬虫
- python实现网络爬虫
- [Python] 实现网络爬虫
- python实现网络爬虫
- Python实现网络爬虫
- [Python] 实现网络爬虫
- Python实现网络爬虫
- Python 实现网络爬虫
- Python实现网络爬虫
- python实现网络爬虫
- 用python实现网络爬虫
- python 实现简单网络爬虫
- python实现简易网络爬虫
- 用Python实现网络爬虫
- python实现简单网络爬虫
- C#反转字符串
- python中线程的使用
- 谈谈proxy的安全问题
- C程序设计课程 第十三堂课后作业
- 在 VC6 中使用 GdiPlus-使用
- python实现网络爬虫
- win7 64位安装pomelo 惊魂记
- DirectX 3D_实践之DirectX3D中网格的使用
- GDB 命令详细解释【转】
- Markdown and Pando
- UBI文件系统制作
- 【Android 开发】:UI控件之 ScrollView垂直滚动控件 和 HorizontalScrollView水平滚动控件的使用
- 在Linux中开机自动运行普通用户脚本程序
- test