Sharing a Python crawler for checking website availability


Talk is cheap, so here is my code. Comments and criticism are welcome.


Code 1. Queue server. It keeps the visited set and the pending queue in memory and talks to workers over a simple TCP protocol: 'rqaddr' hands out the next URL to crawl, and 'response' accepts newly discovered links.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# queue server.
import socket
from base64 import decodestring as b64decode
from Queue import Queue

visited = set()     # URLs that have already been handed out
prequeue = Queue()  # URLs waiting to be crawled

def rqs_process(ss):
    rqs = ss.recv(1024).split()
    if rqs[0] == 'rqaddr':  # worker requests an address to crawl
        try:
            url = prequeue.get(timeout=1)
            ss.send(url)
        except:
            ss.send('wait')  # queue is empty; tell the worker to retry later
    elif rqs[0] == 'response':  # worker submits newly discovered links
        dsize = int(rqs[1])
        ss.send('start')
        data = ''
        while len(data) < dsize:
            data += ss.recv(1024)
        addrs = eval(b64decode(data))
        for addr in addrs:
            if addr not in visited:
                visited.add(addr)
                prequeue.put(addr)
    print 'Queue size:', prequeue.qsize()

if __name__ == '__main__':
    # init: seed the queue with the root URL
    root = 'http://m.sohu.com/'
    prequeue.put(root)
    visited.add(root)
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('127.0.0.1', 52000))
    s.listen(5)
    while True:
        ss, addr = s.accept()
        rqs_process(ss)
        ss.close()
    s.close()
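
The protocol is small enough to exercise by hand. The following is a minimal test client, not part of the original post, assuming the server above is running on 127.0.0.1:52000. It asks for one URL with 'rqaddr' and then reports a made-up link back with 'response':

# Minimal test client for the queue server above (a sketch, not from the
# original post). The example link reported back is purely illustrative.
import socket
from base64 import encodestring as b64encode

QUEUE_ADDR = ('127.0.0.1', 52000)   # must match the address the server binds

def fetch_one_url():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(QUEUE_ADDR)
    s.send('rqaddr')
    url = s.recv(1024)              # either a URL or 'wait' if the queue is empty
    s.close()
    return url

def report_links(links):
    data = b64encode(str(links))    # server decodes with eval(b64decode(...))
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(QUEUE_ADDR)
    s.send('response ' + '%06d ' % len(data))
    if s.recv(1024) == 'start':     # server acknowledges, then expects the payload
        s.send(data)
    s.close()

if __name__ == '__main__':
    print fetch_one_url()           # prints 'http://m.sohu.com/' on the first call
    report_links(['http://m.sohu.com/news/'])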

Code 2. Worker. Multiple workers can run in parallel, even across machines, and you can plug in your own header-setting function and page-parsing function (see the usage sketch after the code).

#!/usr/bin/python
# -*- coding: utf-8 -*-
# worker.
from base64 import encodestring as b64encode
from time import sleep
import re
import socket
from threading import Thread, Lock
import requests
from datetime import datetime as dt
from BeautifulSoup import BeautifulSoup, SoupStrainer
import MySQLdb
import urlparse as urlp

running, queue_addr = None, None
mutex = Lock()

class PseudoRequest():
    # stand-in for a requests.Response when the target is unreachable
    status_code = None
    reason = None
    url = None

class ErrorRecoder():
    # connection parameters are filled in as class attributes by start_work()
    host = None
    user = None
    passwd = None
    db = None

    def __init__(self):
        self.db = MySQLdb.connect(
                host=self.host, user=self.user,
                passwd=self.passwd, db=self.db, charset="utf8")
        self.cr = self.db.cursor()

    def save(self, url, status_code, reason):
        self.cr.execute(
            "insert into "
            "errors (url, create_time, status_code, reason) "
            "values (%s,%s,%s,%s)",
            (url, dt.now().isoformat(), status_code, reason)
        )
        self.db.commit()
        print 'RECORD:', url, status_code, reason

    def __del__(self):
        self.cr.close()
        self.db.close()

def get_url():
    # ask the queue server for the next URL; returns 'wait' on failure
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect(queue_addr)
        s.send('rqaddr')
        data = s.recv(1024)
        s.close()
    except:
        data = 'wait'
    return data

def get_req(url, timeout=5, header=None):
    if not header:
        header = {
                'Accept-Encoding': 'gzip, deflate, compress',
                'Accept': '*/*',
                'User-Agent': 'WebChk0.0001'
        }
    for time in xrange(3):  # retry up to three times
        try:
            r = requests.get(url, timeout=timeout, headers=header)
            return r
        except:
            pass
    # all retries failed: return a PseudoRequest marking the server unreachable
    r = PseudoRequest()
    r.url = url
    r.status_code = 999
    r.reason = 'Server Unreachable'
    return r

def submit_data(addrs):
    # send newly discovered links back to the queue server
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(queue_addr)
    data = b64encode(str(addrs))
    s.send('response ' + '%06d ' % (len(data)))
    verify = s.recv(1024)
    if verify == 'start':
        s.send(data)
    s.close()

def worker_t(href_get, head_mker, timeout):
    rcd = ErrorRecoder()
    while True:
        if running == False:
            break
        mutex.acquire(1)
        proc_url = get_url()
        mutex.release()
        if proc_url == 'wait':
            sleep(2)
            print 'server problem. keep trying...'
            continue
        r = get_req(proc_url, timeout, head_mker(proc_url))
        if r.status_code != 200:
            rcd.save(proc_url, r.status_code, r.reason)
        elif r.text.find(u'<header class="ns">403:页面没有找到。</header>') > 0:
            # the site returns 200 with a custom "page not found" body
            rcd.save(proc_url, '403', 'Normal Bad Request')
        else:
            hrefs = href_get(r)
            mutex.acquire(1)
            submit_data(hrefs)
            mutex.release()

def href_get(r):
    # default page-parsing function: extract and normalize all <a href> links
    def parse_href(href):
        x = urlp.urljoin(r.url, href)
        x = urlp.urlparse(x)
        x = list(x[:3]) + [''] * 3  # drop params, query and fragment
        x = urlp.urlunparse(x)
        return x
    if not re.search('m\.sohu\.com', r.url):
        return []
    try:
        soup = BeautifulSoup(r.text, parseOnlyThese=SoupStrainer('a'))
    except:
        return []
    hrefs = [parse_href(l['href']) for l in soup if l.has_key('href')]
    #print 'links:', len(hrefs)
    return hrefs

def start_work(workers=2,
               qaddr=('127.0.0.1', 52000),
               dbargs=('localhost', 'root', '', 'webchk'),
               href_fliter_func=href_get,
               header_set_func=lambda x: None,
               request_time_out=5):
    # globals
    global running, queue_addr
    running = True
    queue_addr = qaddr
    ER = ErrorRecoder
    ER.host, ER.user, ER.passwd, ER.db = dbargs
    # start worker threads
    args = (href_fliter_func, header_set_func, request_time_out)
    t = [Thread(target=worker_t, args=args) for i in range(workers)]
    for i in t:
        i.start()
    # wait for the user to halt
    raw_input('Press ENTER to stop threads...\n')
    running = False
    print 'Workers dead.'

if __name__ == '__main__':
    start_work(50)
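
To actually run the worker you need the MySQL table that ErrorRecoder writes to, and optionally your own header function. The sketch below is not from the original post: the table schema is inferred from the columns used in ErrorRecoder.save(), the header values are placeholders, and it assumes the worker code above is saved as worker.py.

# Example setup and invocation (a sketch under the assumptions stated above).
import MySQLdb
from worker import start_work  # assumes the worker script is saved as worker.py

def setup_table():
    # ErrorRecoder expects an `errors` table in the `webchk` database;
    # this schema is a guess that matches the insert in ErrorRecoder.save().
    db = MySQLdb.connect(host='localhost', user='root', passwd='', db='webchk',
                         charset='utf8')
    cr = db.cursor()
    cr.execute(
        "CREATE TABLE IF NOT EXISTS errors ("
        " id INT AUTO_INCREMENT PRIMARY KEY,"
        " url VARCHAR(1024),"
        " create_time VARCHAR(32),"
        " status_code VARCHAR(8),"
        " reason VARCHAR(255))"
    )
    db.commit()
    cr.close()
    db.close()

def my_headers(url):
    # Called once per URL; returning None makes get_req fall back to its defaults.
    return {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, compress',
        'User-Agent': 'WebChk0.0001',
    }

if __name__ == '__main__':
    setup_table()
    start_work(workers=10,
               qaddr=('127.0.0.1', 52000),
               dbargs=('localhost', 'root', '', 'webchk'),
               header_set_func=my_headers,
               request_time_out=5)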
