Sharing a Python crawler for checking website availability
Talk is cheap; here's my code. Criticism welcome.
Code 1. Queue server.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# queue server.
import socket
from base64 import decodestring as b64decode
from Queue import Queue

visited = set()     # every URL ever queued, to avoid re-crawling
prequeue = Queue()  # URLs waiting to be fetched

def rqs_process(ss):
    rqs = ss.recv(1024).split()
    if rqs[0] == 'rqaddr':
        # a worker requests an address to spider.
        try:
            url = prequeue.get(timeout=1)
            ss.send(url)
        except:
            ss.send('wait')
    elif rqs[0] == 'response':
        # a worker submits newly found links.
        dsize = int(rqs[1])
        ss.send('start')
        data = ''
        while len(data) < dsize:
            data += ss.recv(1024)
        addrs = eval(b64decode(data))
        for addr in addrs:
            if addr not in visited:
                visited.add(addr)
                prequeue.put(addr)
        print 'Queue size:', prequeue.qsize()

if __name__ == '__main__':
    # init
    root = 'http://m.sohu.com/'
    prequeue.put(root)
    visited.add(root)
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('127.0.0.1', 52000))
    s.listen(5)
    while True:
        ss, addr = s.accept()
        rqs_process(ss)
        ss.close()
    s.close()
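The protocol between the queue server and its workers is tiny: a worker sends 'rqaddr' to pull one URL (the server answers 'wait' if the queue is empty), or sends 'response <size>' to announce a base64-encoded Python list of new links, waits for 'start', then streams the payload. A minimal hand-rolled client to exercise both paths might look like this (my own sketch, not part of the original post; it assumes the server above is already listening on 127.0.0.1:52000):

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Minimal test client for the queue server above (illustration only).
# It asks for one URL, then submits two fake links back, exercising
# both branches of rqs_process().
import socket
from base64 import encodestring as b64encode

QUEUE_ADDR = ('127.0.0.1', 52000)  # must match the server's bind address

def request_url():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(QUEUE_ADDR)
    s.send('rqaddr')
    url = s.recv(1024)  # a URL, or 'wait' if the queue is empty
    s.close()
    return url

def submit_links(links):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(QUEUE_ADDR)
    data = b64encode(str(links))           # server eval()s this back into a list
    s.send('response %06d ' % len(data))   # announce payload size
    if s.recv(1024) == 'start':            # wait for the go-ahead
        s.send(data)
    s.close()

if __name__ == '__main__':
    print 'got:', request_url()
    submit_links(['http://m.sohu.com/a/', 'http://m.sohu.com/b/'])

Note that the server eval()s the decoded payload, so this scheme is only safe between machines you trust.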
Code 2. Worker. Multiple machines can run workers in parallel, and you can supply your own header-setup and page-analysis functions (see the usage sketch after the code).
#!/usr/bin/python
# -*- coding: utf-8 -*-
# worker.
from base64 import encodestring as b64encode
from time import sleep
import re
import socket
from threading import Thread, Lock
import requests
from datetime import datetime as dt
from BeautifulSoup import BeautifulSoup, SoupStrainer
import MySQLdb
import urlparse as urlp

running, queue_addr = None, None
mutex = Lock()

class PseudoRequest():
    # stand-in response object for URLs that never answered
    status_code = None
    reason = None
    url = None

class ErrorRecoder():
    # logs failed URLs into a MySQL table
    host = None
    user = None
    passwd = None
    db = None

    def __init__(self):
        self.db = MySQLdb.connect(
            host=self.host, user=self.user,
            passwd=self.passwd, db=self.db, charset="utf8")
        self.cr = self.db.cursor()

    def save(self, url, status_code, reason):
        self.cr.execute(
            "insert into "
            "errors (url, create_time, status_code, reason) "
            "values (%s,%s,%s,%s)",
            (url, dt.now().isoformat(), status_code, reason))
        self.db.commit()
        print 'RECORD:', url, status_code, reason

    def __del__(self):
        self.cr.close()
        self.db.close()

def get_url():
    # pull one URL from the queue server
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect(queue_addr)
        s.send('rqaddr')
        data = s.recv(1024)
        s.close()
    except:
        data = 'wait'
    return data

def get_req(url, timeout=5, header=None):
    if not header:
        header = {
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept': '*/*',
            'User-Agent': 'WebChk0.0001'
        }
    for time in xrange(3):
        try:
            r = requests.get(url, timeout=timeout, headers=header)
            return r
        except:
            pass
    # create a PseudoRequest if all three attempts timed out
    r = PseudoRequest()
    r.url = url
    r.status_code = 999
    r.reason = 'Server Unreachable'
    return r

def submit_data(addrs):
    # push newly found links back to the queue server
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(queue_addr)
    data = b64encode(str(addrs))
    s.send('response ' + '%06d ' % (len(data)))
    verify = s.recv(1024)
    if verify == 'start':
        s.send(data)
    s.close()

def worker_t(href_get, head_mker, timeout):
    rcd = ErrorRecoder()
    while True:
        if running == False:
            break
        mutex.acquire(1)
        proc_url = get_url()
        mutex.release()
        if proc_url == 'wait':
            sleep(2)
            print 'server problem. keep trying...'
            continue
        r = get_req(proc_url, timeout, head_mker(proc_url))
        if r.status_code != 200:
            rcd.save(proc_url, r.status_code, r.reason)
        elif r.text.find(u'<header class="ns">403:页面没有找到。</header>') > 0:
            # Sohu serves this soft error page with HTTP 200; record it anyway.
            rcd.save(proc_url, '403', 'Normal Bad Request')
        else:
            hrefs = href_get(r)
            mutex.acquire(1)
            submit_data(hrefs)
            mutex.release()

def href_get(r):
    # default page-analysis function: extract same-site links
    def parse_href(href):
        x = urlp.urljoin(r.url, href)
        x = urlp.urlparse(x)
        x = list(x[:3]) + [''] * 3  # drop params, query and fragment
        x = urlp.urlunparse(x)
        return x
    if not re.search('m\.sohu\.com', r.url):
        return []
    try:
        soup = BeautifulSoup(r.text, parseOnlyThese=SoupStrainer('a'))
    except:
        return []
    hrefs = [parse_href(l['href']) for l in soup if l.has_key('href')]
    #print 'links:', len(hrefs)
    return hrefs

def start_work(workers=2,
               qaddr=('127.0.0.1', 52000),
               dbargs=('localhost', 'root', '', 'webchk'),
               href_fliter_func=href_get,
               header_set_func=lambda x: None,
               request_time_out=5):
    # globals
    global running, queue_addr
    running = True
    queue_addr = qaddr
    ER = ErrorRecoder
    ER.host, ER.user, ER.passwd, ER.db = dbargs
    # mk threads
    args = (href_fliter_func, header_set_func, request_time_out)
    t = [Thread(target=worker_t, args=args) for i in range(workers)]
    for i in t:
        i.start()
    # waiting for halt
    raw_input('Press ENTER to stop threads...\n')
    running = False
    print 'Workers dead.'

if __name__ == '__main__':
    start_work(50)
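For reference, a usage sketch showing how custom header-setup and page-analysis functions plug into start_work(). Everything here is my own illustration, not from the original post: the module name worker, the target site example.com, and the functions my_headers / my_href_get are assumptions.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Usage sketch (illustration only): assumes the worker code above is
# saved as worker.py; my_headers and my_href_get are made-up names.
import re
import urlparse as urlp
from BeautifulSoup import BeautifulSoup, SoupStrainer
from worker import start_work

def my_headers(url):
    # Called once per URL by worker_t(); return the header dict that
    # get_req() should send (return None to fall back to its defaults).
    return {
        'Accept': '*/*',
        'User-Agent': 'WebChk0.0001',
    }

def my_href_get(r):
    # Page-analysis function: given a requests response, return the list
    # of absolute links to feed back to the queue server. Here we only
    # follow links that stay on example.com.
    if not re.search(r'example\.com', r.url):
        return []
    try:
        soup = BeautifulSoup(r.text, parseOnlyThese=SoupStrainer('a'))
    except:
        return []
    return [urlp.urljoin(r.url, a['href']) for a in soup if a.has_key('href')]

if __name__ == '__main__':
    start_work(workers=10,
               qaddr=('192.168.1.10', 52000),  # the queue server's box
               dbargs=('localhost', 'root', '', 'webchk'),
               href_fliter_func=my_href_get,
               header_set_func=my_headers,
               request_time_out=5)

worker_t() calls header_set_func once per URL and hands the result to requests.get(), while href_fliter_func receives the whole response object and decides which links flow back to the queue, so each box in the cluster can apply its own crawl policy.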