第2.2章 scrapy之多进程检测代理ip的有效性

来源:互联网 发布:网络测线仪品牌 编辑:程序博客网 时间:2024/06/10 20:56

1 multiprocessing
Python多进程multiprocessing使用示例
multiprocessing的作用是能够像管理线程一样管理进程,在多核CPU上的利用率比threading要好得多。
2 从数据库中读取爬到的代理进行验证
下面的代码参考了qiyeboy/IPProxyPool

# -*- coding: utf-8 -*-
'''
Created on 2017-06-14

Check whether proxy IPs are still usable.

@author: dzm
'''
import sys
# Python 2 idiom: force the interpreter-wide default codec to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
from eie.service.EieIpService import EieIpService
import multiprocessing
from multiprocessing import Process
from eie.middlewares import udf_config
from gevent import monkey
import gevent
# NOTE: the position of this call relative to the surrounding imports is kept
# exactly as in the original script; modules imported below are imported after
# the standard library has been patched.
monkey.patch_all()
import os
from eie.middlewares.random_user_agent import RandomUserAgent
import json
import time
import requests

logger = udf_config.logger
eieIpService = EieIpService()


class CheckIpProxyService(object):
    '''
    Validate proxies against httpbin and delete the ones that fail.
    '''

    def __init__(self):
        '''
        Configure the httpbin endpoints used for validation.

        Every validation request times out after 5 seconds.
        '''
        self.http_timeout = 5
        self.target_url = 'http://httpbin.org/get'
        self.target_url_https = 'https://httpbin.org/get'
        self.target_url_ip = 'http://httpbin.org/ip'
        # Filled in by get_my_ip().
        self.my_ip = None
        # Maximum number of processes
        self.max_check_process = multiprocessing.cpu_count()
        # Maximum concurrency per process (batch size used by run())
        self.max_check_construct_per_process = 30
        # Task queue size
        self.task_queue_size = 50
        # Wait time when the number of processes has reached the limit
        self.check_wati_time = 1

    def detect_proxy(self, proxy):
        '''
        Check a single proxy record and delete it when the check fails.

        @param proxy: mapping providing the keys 'ip', 'port' and 'types'
        '''
        ip = proxy['ip']
        port = proxy['port']
        # The same proxy endpoint is used for both http and https targets.
        proxy_url = "http://%s:%s" % (ip, port)
        proxies = {"http": proxy_url, "https": proxy_url}
        is_valid, _types, _speed = self.check_proxy(proxies, proxy['types'])
        if not is_valid:
            eieIpService.delete(ip, port)

    def check_proxy(self, proxies, types):
        '''
        Check a proxy against the http target when types is 'HTTP',
        otherwise against the https target.

        @param proxies: proxies mapping in the format accepted by requests
        @param types: protocol label of the proxy; only 'HTTP' selects the
                      plain-http target
        @return: tuple (ok, anonymity_type, speed) as produced by
                 _checkHttpProxy
        '''
        return self._checkHttpProxy(proxies, types == 'HTTP')

    def _checkHttpProxy(self, proxies, is_http=True):
        '''
        Request httpbin through the proxy and classify the response.

        @param proxies: proxies mapping in the format accepted by requests
        @param is_http: True to hit the http target, False for the https one
        @return: tuple (ok, types, speed). types and speed stay at -1 until
                 they have been computed; speed is the elapsed time in
                 seconds rounded to two decimals.
        '''
        types = -1
        speed = -1
        test_url = self.target_url if is_http else self.target_url_https
        try:
            start = time.time()
            randomUserAgent = RandomUserAgent()
            r = requests.get(url=test_url,
                             headers=randomUserAgent.get_headers(),
                             timeout=self.http_timeout,
                             proxies=proxies)
            logger.debug('请求结果为%s' % r)
            if not r.ok:
                return False, types, speed
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            proxy_connection = headers.get('Proxy-Connection', None)
            # NOTE(review): presumably 2 = transparent (origin lists several
            # addresses), 1 = anonymous (Proxy-Connection header present),
            # 0 = high anonymity, following qiyeboy/IPProxyPool - confirm.
            if ',' in ip:
                types = 2
            elif proxy_connection:
                types = 1
            else:
                types = 0
            logger.debug('%s 代理有效' % proxies)
            return True, types, speed
        except Exception:
            # Any failure (connection error, timeout, malformed body) marks
            # the proxy as invalid; this is a deliberate best-effort check.
            logger.debug('%s 代理无效' % proxies)
            return False, types, speed

    def get_my_ip(self):
        '''
        Look up this machine's own IP address and store it in self.my_ip.

        @raise Exception: when the lookup request or its parsing fails
        '''
        try:
            randomUserAgent = RandomUserAgent()
            r = requests.get(url=self.target_url_ip,
                             headers=randomUserAgent.get_headers(),
                             timeout=self.http_timeout)
            ip = json.loads(r.text)
            self.my_ip = ip['origin']
        except Exception:
            raise Exception('访问 %s 失败,请检查网络连接' % self.target_url_ip)

    def run(self):
        '''
        Check every proxy returned by eieIpService.select() using gevent
        greenlets, in batches of at most max_check_construct_per_process.

        @see: http://www.cnblogs.com/tkqasn/p/5705338.html
        '''
        proxy_list = eieIpService.select()
        spawns = []
        for proxy in proxy_list:
            spawns.append(gevent.spawn(self.detect_proxy, proxy))
            if len(spawns) >= self.max_check_construct_per_process:
                # Wait for the current batch before starting the next one.
                gevent.joinall(spawns)
                spawns = []
        # Wait for the last, possibly partial, batch.
        if spawns:
            gevent.joinall(spawns)


if __name__ == '__main__':
    # Sample proxy for a one-off manual check; uncomment the call below to
    # test it directly instead of going through run().
    ip = '59.37.17.202'
    port = '808'
    proxies = {"http": "http://%s:%s" % (ip, port)}
    c = CheckIpProxyService()
#     c.check_proxy(proxies, 'HTTP')
    p = Process(target=c.run)
    p.start()
    p.join()

3 gevent
待续…