A simple crawler program that attaches a request header (User-Agent) to each download.

import urlparse
from os import sep, unlink, makedirs, rmdir
from os.path import splitext, dirname, isdir, exists
import urllib
import urllib2
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
from string import replace, find, lower, index
from sys import argv
import shutil


class Retrieve(object):
    """Download a single URL to a local file and extract its links."""

    def __init__(self, url):
        self.url = url
        self.fileName = self.getFileName(url)
        # Request header sent with every download; some sites reject
        # requests that carry no User-Agent.
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'

    def getFileName(self, url, defaultName='index.html'):
        # Map the URL to a local path (host + path) and create the
        # directories needed to store the downloaded page.
        parseurl = urlparse.urlparse(url, 'http', False)
        path = parseurl[1] + parseurl[2]
        ext = splitext(path)
        if ext[1] == '':                      # no file extension: treat as a directory
            if path[-1] == '/':
                path += defaultName
            else:
                path += '/' + defaultName
        ldir = dirname(path)
        if not isdir(ldir):
            if exists(ldir):                  # a plain file occupies the directory name
                unlink(ldir)
            # Build the directory tree one path component at a time.
            totalDir = ''
            while True:
                try:
                    sepIndex = index(ldir, '/')
                    totalDir += ldir[0:sepIndex]
                    if not isdir(totalDir):
                        if exists(totalDir):
                            unlink(totalDir)
                        makedirs(totalDir)
                    totalDir += '/'
                    ldir = ldir[sepIndex + 1:]
                except ValueError:            # no more '/': last component
                    totalDir += ldir
                    makedirs(totalDir)
                    break
        return path

    def download(self):
        # Fetch the URL with the User-Agent header and save the body to disk.
        # On failure the return value starts with '*', which the caller checks.
        try:
            headers = {'User-Agent': self.user_agent}
            req = urllib2.Request(self.url, headers=headers)
            response = urllib2.urlopen(req)
            retval = response.readlines()
            f = open(self.fileName, 'w')
            for line in retval:
                f.write(line)
            f.close()
        except IOError:
            retval = '***'
        return retval

    def parseAndGetLinks(self):
        # Parse the saved file and return the list of anchor targets.
        self.htmlParse = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.htmlParse.feed(open(self.fileName).read())
        self.htmlParse.close()
        return self.htmlParse.anchorlist


class Crawler(object):
    """Crawler that stays on the start URL's domain."""

    def __init__(self, url):
        self.url = url
        self.urlQueue = [url]        # URLs waiting to be fetched
        self.urlSeenQueue = []       # URLs already fetched
        self.domain = urlparse.urlparse(url)[1]
        if isdir(self.domain):       # start from a clean local mirror
            shutil.rmtree(self.domain)

    def getPage(self, url):
        r = Retrieve(url)
        retVal = r.download()
        if retVal[0] == '*':         # download failed; skip link extraction
            return
        urls = r.parseAndGetLinks()
        for urlOne in urls:
            # Turn relative links into absolute ones.
            if urlOne[:4] != 'http' and find(urlOne, '://') == -1:
                urlOne = urlparse.urljoin(url, urlOne)
            if find(lower(urlOne), 'mailto:') != -1:
                continue
            if urlOne not in self.urlSeenQueue:
                if find(urlOne, self.domain) == -1:    # stay on the same domain
                    continue
                if find(urlOne, '#comments') != -1:    # skip comment anchors
                    continue
                if find(urlOne, 'li2818') == -1:       # only follow this blog's pages
                    continue
                if urlOne not in self.urlQueue and urlOne not in self.urlSeenQueue:
                    self.urlQueue.append(urlOne)
        self.urlSeenQueue.append(url)

    def testUseful(self, url):
        # Optional pre-check: only URLs answering 200 are worth downloading.
        fUrl = urllib.urlopen(url)
        hCode = fUrl.getcode()
        if hCode != 200:
            return False
        return True

    def go(self):
        while self.urlQueue:
            url = self.urlQueue.pop()
            #if self.testUseful(url) == False:
            #    continue
            s = 'seen url ' + url
            print s
            self.getPage(url)

    def printSeen(self):
        # Dump every crawled URL to a file.
        f = open('already_seen_url', 'w')
        while self.urlSeenQueue:
            f.write(self.urlSeenQueue.pop() + '\n')
        f.close()


def main():
    #if len(argv) > 1:
    #    url = argv[1]
    #else:
    #    try:
    #        url = raw_input('start with one url: ')
    #    except (KeyboardInterrupt, EOFError):
    #        url = ''
    #if not url:
    #    return
    #crawler = Crawler(url)
    crawler = Crawler('http://blog.csdn.net/li2818')
    #crawler = Crawler('http://www.hao123.com')
    #crawler = Crawler('http://blog.csdn.net')
    crawler.go()
    crawler.printSeen()
    print 'done!'


if __name__ == '__main__':
    main()
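
The script above is Python 2 only (urllib2, htmllib, and cStringIO were removed in Python 3). As a reference, a minimal sketch of the same request-header idea on Python 3 might look like the code below; urllib.request and html.parser are standard-library modules, but the LinkParser and fetch names are illustrative helpers, not part of the original script.

import urllib.request
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    """Collects href values from <a> tags, similar to htmllib's anchorlist."""
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchors.append(value)

def fetch(url, user_agent='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'):
    # Same idea as Retrieve.download(): attach a User-Agent header to the request.
    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8', errors='replace')

if __name__ == '__main__':
    html = fetch('http://blog.csdn.net/li2818')   # start URL taken from the original script
    parser = LinkParser()
    parser.feed(html)
    print(parser.anchors[:10])                     # first few extracted links

This sketch only covers fetching one page and extracting its links; the queueing, domain filtering, and local mirroring logic would still follow the Crawler class above.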