A simple crawler program, with request headers. The Python 2 script below starts from a seed URL, fetches each page with a custom User-Agent header, mirrors it to disk, extracts the links on the page, and keeps following links that stay within the same domain.
import urlparse
from os import sep, unlink, makedirs, rmdir
from os.path import splitext, dirname, isdir, exists
import urllib
import urllib2
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
from string import replace, find, lower, index
from sys import argv
import shutil


class Retrieve(object):
    """Downloads one URL (sending a User-Agent header) and parses its links."""

    def __init__(self, url):
        self.url = url
        self.fileName = self.getFileName(url)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'

    def getFileName(self, url, defaultName='index.html'):
        # Map the URL to a local file path, creating directories as needed.
        parseurl = urlparse.urlparse(url, 'http', False)
        path = parseurl[1] + parseurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += defaultName
            else:
                path += '/' + defaultName
        ldir = dirname(path)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            totalDir = ''
            while True:
                try:
                    sepIndex = index(ldir, '/')
                    totalDir += ldir[0:sepIndex]
                    if not isdir(totalDir):
                        if exists(totalDir):
                            unlink(totalDir)
                        makedirs(totalDir)
                    totalDir += '/'
                    ldir = ldir[sepIndex + 1:]
                except ValueError:
                    # No '/' left: create the last directory and stop.
                    totalDir += ldir
                    makedirs(totalDir)
                    break
        return path

    def download(self):
        # Fetch the page with an explicit User-Agent request header
        # and mirror it to the local file computed above.
        try:
            headers = {'User-Agent': self.user_agent}
            req = urllib2.Request(self.url, headers=headers)
            response = urllib2.urlopen(req)
            retval = response.readlines()
            f = open(self.fileName, 'w')
            for line in retval:
                f.write(line)
            f.close()
        except IOError:
            retval = '***'
        return retval

    def parseAndGetLinks(self):
        # Return every anchor found in the saved page.
        self.htmlParse = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.htmlParse.feed(open(self.fileName).read())
        self.htmlParse.close()
        return self.htmlParse.anchorlist


class Crawler(object):
    """Manages the URL queue and keeps the crawl inside one domain."""

    def __init__(self, url):
        self.url = url
        self.urlQueue = [url]
        self.urlSeenQueue = []
        self.domain = urlparse.urlparse(url)[1]
        if isdir(self.domain):
            shutil.rmtree(self.domain)

    def getPage(self, url):
        r = Retrieve(url)
        retVal = r.download()
        if retVal[0] == '*':          # download failed
            return
        urls = r.parseAndGetLinks()
        for urlOne in urls:
            if urlOne[:4] != 'http' and find(urlOne, '://') == -1:
                urlOne = urlparse.urljoin(url, urlOne)
            if find(lower(urlOne), 'mailto:') != -1:
                continue
            if urlOne not in self.urlSeenQueue:
                if find(urlOne, self.domain) == -1:
                    continue
                if find(urlOne, '#comments') != -1:
                    continue
                if find(urlOne, 'li2818') == -1:
                    continue
                if urlOne not in self.urlQueue and urlOne not in self.urlSeenQueue:
                    self.urlQueue.append(urlOne)
        self.urlSeenQueue.append(url)

    def testUseful(self, url):
        fUrl = urllib.urlopen(url)
        hCode = fUrl.getcode()
        if hCode != 200:
            return False
        return True

    def go(self):
        while self.urlQueue:
            url = self.urlQueue.pop()
            #if self.testUseful(url) == False:
            #    continue
            s = 'seen url ' + url
            print s
            self.getPage(url)

    def printSeen(self):
        f = open('already_seen_url', 'w')
        while self.urlSeenQueue:
            f.write(self.urlSeenQueue.pop() + '\n')
        f.close()


def main():
    #if len(argv) > 1:
    #    url = argv[1]
    #else:
    #    try:
    #        url = raw_input('start with one url: ')
    #    except (KeyboardInterrupt, EOFError):
    #        url = ''
    #if not url:
    #    return
    #crawler = Crawler(url)
    crawler = Crawler('http://blog.csdn.net/li2818')
    #crawler = Crawler('http://www.hao123.com')
    #crawler = Crawler('http://blog.csdn.net')
    crawler.go()
    crawler.printSeen()
    print 'done!'


if __name__ == '__main__':
    main()
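The listing above is Python 2 code: urllib2, htmllib, cStringIO and the print statement no longer exist in Python 3. As a rough illustration of the same two building blocks, a request carrying a User-Agent header and link extraction, here is a minimal Python 3 sketch using only the standard library; the names fetch, extract_links and LinkParser are illustrative and not part of the original program.

import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin

USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'


class LinkParser(HTMLParser):
    # Collects the href of every <a> tag, playing the role of
    # htmllib's anchorlist in the Python 2 code above.
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchors.append(value)


def fetch(url):
    # Send the request with an explicit User-Agent header, as download() does.
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', errors='replace')


def extract_links(base_url, html):
    parser = LinkParser()
    parser.feed(html)
    # Resolve relative links against the page URL, as getPage() does.
    return [urljoin(base_url, href) for href in parser.anchors]


if __name__ == '__main__':
    start = 'http://blog.csdn.net/li2818'
    for link in extract_links(start, fetch(start)):
        print(link)

The crawl loop itself (the URL queue, the seen list, and the same-domain filter) would sit on top of these two helpers exactly as in the Crawler class above.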