A WAN-Based Master-Slave Distributed Crawler System
Goal: build a WAN-based master-slave distributed crawler system, using the asp300 site as the example. The master gathers the task URLs; each time a worker finishes a task, it sends a task request to the master, and the master assigns it the next one.
Principle:
The crawler system consists of two parts: a master and its workers. The number of workers is not fixed in principle; it depends on how much load the master can bear. Normally a single master is enough, but once it reaches its performance ceiling, the master caps the throughput of the whole crawler system. This is the classic "barrel effect": the shortest stave determines how much the barrel holds.
How to raise the master's efficiency is not studied in this article and is left for readers to explore (though a minimal threaded sketch follows the master code below).
Implementation tools:
Language: Python (the code below targets Python 2)
Site: ASP300.COM
Environment: Tencent Cloud student server (1 GHz single core, 1 GB RAM)
Code examples:
Master code:
#coding=gb2312
import urllib2
import random
import socket

# pool of User-Agent strings used to disguise the crawler's identity
user_agent = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
              "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
              "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
              "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
              "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
              "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
              "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"]

def iteration():
    url_list = []
    for i in range(1, 100):
        url = 'http://www.asp300.com/SoftList/27/27_%d.html' % i
        # fake the client identity with a randomly picked User-Agent
        headers = {'User-Agent': user_agent[random.randint(0, 6)]}
        req = urllib2.Request(url=url, headers=headers)
        # skip pages that no longer exist (404)
        try:
            response = urllib2.urlopen(req)
        except urllib2.URLError, e:
            if hasattr(e, 'code') and e.code == 404:
                continue
        url_list.append(url)
    return url_list

def server(url_list):
    HOST = ''
    PORT = 21567
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    # create the listening socket
    tcpSerSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpSerSock.bind(ADDR)
    tcpSerSock.listen(5)
    while True:  # loop forever so the master keeps accepting worker connections
        print 'waiting for connection......'
        tcpCliSock, addr = tcpSerSock.accept()
        print '...connected from:', addr
        while True:
            data = tcpCliSock.recv(BUFSIZ)
            if not data:
                break
            # send a task to the worker
            tcpCliSock.send('%s' % url_list.pop())
        tcpCliSock.close()
    tcpSerSock.close()

if __name__ == '__main__':
    url_list = iteration()
    server(url_list)
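A design note on the server: the accept loop handles one worker connection at a time, so while one worker is being served every other worker waits, which is exactly the master-side bottleneck described above. Below is a minimal sketch, in the same Python 2 style, of a threaded variant that serves each connection from its own thread and draws tasks from a thread-safe queue. It is my own illustration, not part of the original system; the handle_worker and threaded_server names and the 'NO_TASK' sentinel are assumptions.

import socket
import threading
import Queue

def handle_worker(sock, task_queue):
    # serve one worker connection: answer each request with one task
    try:
        while True:
            data = sock.recv(1024)
            if not data:
                break
            try:
                sock.send(task_queue.get_nowait())  # hand out the next task
            except Queue.Empty:
                sock.send('NO_TASK')  # assumed sentinel: no tasks left
                break
    finally:
        sock.close()

def threaded_server(url_list, port=21567):
    task_queue = Queue.Queue()  # Queue.Queue is thread-safe, so no extra locking is needed
    for url in url_list:
        task_queue.put(url)
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind(('', port))
    srv.listen(5)
    while True:  # each accepted worker gets its own thread
        sock, addr = srv.accept()
        print '...connected from:', addr
        t = threading.Thread(target=handle_worker, args=(sock, task_queue))
        t.daemon = True  # do not keep the process alive just for worker threads
        t.start()

With this variant a slow or stalled worker only ties up its own thread, and the remaining workers keep receiving tasks.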
Worker code:
#coding=gb2312
import urllib2
import cookielib
import re
import socket
import time
from PIL import Image
from bs4 import BeautifulSoup

class MyError(Exception):
    # custom exception, raised on unrecoverable download errors
    pass

class IgnoError(Exception):
    # custom exception, raised for pages that should simply be skipped
    pass

class RedirctHandler(urllib2.HTTPRedirectHandler):
    # rework 301/302 handling: return the redirect target instead of following it
    def http_error_301(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl

    def http_error_302(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl

def Download(url, headers, num_retries=9):
    # download the page the url points to, retrying on 5xx errors and timeouts
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req, timeout=60)
        the_page = response.read()
        response.close()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return Download(url, headers, num_retries - 1)
        elif hasattr(e, 'code') and (e.code == 404):
            raise IgnoError
        else:
            print 'Download Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return Download(url, headers, num_retries - 1)
        raise socket.timeout
    return the_page

def resolve(html):
    # extract title, size, summary and image URLs from the page source
    image_url = []
    soup = BeautifulSoup(html, "lxml")
    title = unicode(soup.head.title.string)
    title = re.search('(.*?)_ASP300', title).groups()[0]
    size = soup.find('div', class_='box').find('div', class_='box_1')\
        .find('div', id='goodsInfo').find('div', class_='textInfo')\
        .ul.find_all('li')[2].dd
    size = unicode(size)
    size = float(re.search(u'软件大小:(.*?)\D', size).groups()[0])
    summary_tag = soup.find('div', class_='s')
    summary_content = unicode(summary_tag).strip()
    summary_content = summary_content.split('<br/>')
    summary_content[0] = summary_content[0][15:]
    del summary_content[len(summary_content) - 1]
    # drop bare newline entries
    summary_content = [line for line in summary_content if line != '\n']
    summary_cahe = u''
    for c in summary_content:
        summary_cahe += (c + u'<br/>')
    summary_content = summary_cahe
    # collect the image download addresses; elements of image_url are str, not unicode
    for i in summary_tag.p.find_all('img'):
        image_url.append('http://www.asp300.com' + i['src'])
    # title and summary_content are unicode, size is float, image_url holds str
    return title, size, summary_content, image_url

def download_image(name, url, headers, num_tries=9):
    # download a single image, retrying on 5xx errors and timeouts
    req = urllib2.Request(url=url, headers=headers)
    try:
        f = urllib2.urlopen(req, timeout=60)
    except urllib2.URLError, e:
        if num_tries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_image(name, url, headers, num_tries - 1)
        else:
            print 'Image download error:', e.reason
            raise MyError
    except socket.timeout:
        if num_tries > 0:
            return download_image(name, url, headers, num_tries - 1)
        raise socket.timeout
    image = open(name, 'wb')
    image.write(f.read())
    f.close()
    image.close()

def screenshot(name, change, format):
    # remove the watermark by cropping `change` pixels off the bottom
    im = Image.open(name)
    w, h = im.size
    box = (0, 0, w, h - change)
    region = im.crop(box)
    region.save(name, format)

def soft_url(url, headers, num_retries=9):
    # resolve the real download address of the software
    id = int(re.search('SoftView_(.*?).html', url).groups()[0])
    url1 = 'http://www.asp300.com/2012dll/Down.jsp?CodeID=%d&id=1' % id
    # step 1: visit the download page to obtain the product's cookie
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener1 = urllib2.build_opener(handler)
    req1 = urllib2.Request(url=url1, headers=headers)
    try:
        opener1.open(req1, timeout=60)
        print '%s: download cookie obtained' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL1 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    # cookie obtained
    # step 2: with that cookie, request the redirect that carries the real download address
    # debug_handler = urllib2.HTTPHandler(debuglevel=1)
    opener2 = urllib2.build_opener(RedirctHandler, handler)
    url2 = 'http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=%d' % id
    req2 = urllib2.Request(url=url2, headers=headers)
    try:
        html = opener2.open(req2, timeout=60)
        print '%s: download address obtained' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL2 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    return html

def clicent():
    HOST = ''  # fill in the master's IP address here
    PORT = 21567  # fill in the master's port here
    BUFSIZ = 1024
    ADDR = (HOST, PORT)
    # connect to the master and ask for a task
    tcpCliSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    data = "ask for task"
    tcpCliSock.send(data)
    data = tcpCliSock.recv(BUFSIZ)
    return data

def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'}
    url = clicent()
    html = Download(url, headers)
    print 'page downloaded'
    title, size, summary_content, image_url = resolve(html)
    print 'information extracted'
    id = 0
    for i in image_url:
        name = './image/image_cache%d.jpg' % id
        download_image(name, i, headers)
        print 'image %d downloaded' % id
        screenshot(name, 52, 'jpeg')
        print 'image converted'
        id += 1
    download_url = soft_url(url, headers)
    print title
    print size
    # strip u'\u200b' (zero-width space): it cannot be encoded as gb2312
    summary_content = summary_content.replace(u'\u200b', u'')
    print summary_content, type(summary_content)
    print summary_content.encode('gb2312')
    print image_url
    print download_url

if __name__ == '__main__':
    main()
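One gap worth noting: main() requests a single task, processes it, and returns, while the goal stated at the beginning is for a worker to request a new task each time it finishes one. A thin wrapper loop along the following lines would match that behaviour; this is a sketch under my own assumptions (the worker_loop name and its exit conditions are not from the original code):

def worker_loop():
    # repeat the single-task cycle in main() until the master stops handing out work
    while True:
        try:
            main()  # one full task: get a URL from the master, crawl, extract, download
        except (MyError, IgnoError):
            continue  # this page failed or vanished; ask for the next task
        except (socket.error, ValueError):
            break  # master unreachable, or it sent an empty/invalid task; stop

if __name__ == '__main__':
    worker_loop()  # in place of the plain main() call above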