[Scrapy IP proxy pool] How to deal with getting your IP blocked


1. First, write a script, proxies.py, that automatically fetches proxy IPs

# -*- coding: utf-8 -*-
import random
import time
from multiprocessing import Process, Queue

import requests
from bs4 import BeautifulSoup


class Proxies(object):
    """Scrape free proxies from xicidaili.com and verify that they work."""

    def __init__(self, page=3):
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        self.get_proxies()
        self.get_proxies_nn()

    def get_proxies(self):
        # scrape a few random pages of the site's /nt/ proxy listing
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nt/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join(
                    [x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def get_proxies_nn(self):
        # same as above, but for the site's /nn/ proxy listing
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nn/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join(
                    [x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def verify_proxies(self):
        # proxies that have not been verified yet
        old_queue = Queue()
        # proxies that passed verification
        new_queue = Queue()
        print('verify proxy........')
        works = []
        for _ in range(15):
            works.append(Process(target=self.verify_one_proxy,
                                 args=(old_queue, new_queue)))
        for work in works:
            work.start()
        for proxy in self.proxies:
            old_queue.put(proxy)
        for work in works:
            old_queue.put(0)   # one sentinel per worker
        for work in works:
            work.join()
        self.proxies = []
        while 1:
            try:
                self.proxies.append(new_queue.get(timeout=1))
            except Exception:
                break
        print('verify_proxies done!')

    def verify_one_proxy(self, old_queue, new_queue):
        while 1:
            proxy = old_queue.get()
            if proxy == 0:
                break
            protocol = 'https' if 'https' in proxy else 'http'
            proxies = {protocol: proxy}
            try:
                if requests.get('http://www.baidu.com', proxies=proxies,
                                timeout=2).status_code == 200:
                    print('success %s' % proxy)
                    new_queue.put(proxy)
            except Exception:
                print('fail %s' % proxy)


if __name__ == '__main__':
    a = Proxies()
    a.verify_proxies()
    print(a.proxies)
    proxies = a.proxies
    with open('proxies.txt', 'a') as f:
        for proxy in proxies:
            f.write(proxy + '\n')
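Running the script appends the proxies that passed verification to proxies.txt, one per line in the form protocol://host:port. Because free proxies die quickly, it is worth regenerating the file on a schedule. Below is a minimal sketch of such a refresh loop; the file name refresh_proxies.py, the 30-minute interval, and overwriting the file instead of appending are all assumptions on top of the original script:

# refresh_proxies.py -- a minimal sketch of rebuilding proxies.txt on a timer
# (the interval and the overwrite-instead-of-append behaviour are assumptions,
# not part of the original proxies.py)
import time

from proxies import Proxies

if __name__ == '__main__':   # required on Windows, since Proxies uses multiprocessing
    while True:
        pool = Proxies()                      # scrape a fresh batch of candidates
        pool.verify_proxies()                 # keep only the proxies that respond
        with open('proxies.txt', 'w') as f:   # overwrite so dead proxies drop out
            for proxy in pool.proxies:
                f.write(proxy + '\n')
        print('wrote %d proxies' % len(pool.proxies))
        time.sleep(30 * 60)                   # refresh every 30 minutes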

2. Next, change the contents of the proxy middleware file middlewares.py to the following:

import random
import time


class ProxyMiddleWare(object):
    """Downloader middleware that assigns a random proxy to every request."""

    def process_request(self, request, spider):
        '''Attach a proxy to the outgoing request.'''
        proxy = self.get_random_proxy()
        print("this is request ip:" + proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        '''Handle the response that came back.'''
        # If the response status is not 200, retry the current request
        # with a different proxy.
        if response.status != 200:
            proxy = self.get_random_proxy()
            print("this is response ip:" + proxy)
            # attach the new proxy to the current request
            request.meta['proxy'] = proxy
            return request
        return response

    def get_random_proxy(self):
        '''Read a random proxy from the file written by proxies.py.'''
        while 1:
            with open('G:\\Scrapy_work\\myproxies\\myproxies\\proxies.txt', 'r') as f:
                proxies = f.readlines()
            if proxies:
                break
            else:
                time.sleep(1)
        proxy = random.choice(proxies).strip()
        return proxy
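Dead proxies often show up as connection errors or timeouts rather than non-200 responses, so the same middleware can also switch proxies when the download itself fails. This is an optional extension, not part of the original post; a minimal sketch of the extra method, to be added inside the ProxyMiddleWare class above:

    def process_exception(self, request, exception, spider):
        '''Swap in a new proxy when the download raises an exception.'''
        proxy = self.get_random_proxy()
        print("request failed (%s), retrying with %s" % (exception, proxy))
        request.meta['proxy'] = proxy
        # returning the request re-schedules it through the downloader
        return request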

3. Finally, tweak the settings file:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': None,
    'youx.middlewares.ProxyMiddleWare': 125,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None
}

or simply:

DOWNLOADER_MIDDLEWARES = {
    'haodf.middlewares.ProxyMiddleWare': 400
}

For the proxies to take effect, this DOWNLOADER_MIDDLEWARES entry must be added to the settings file, for example:

DOWNLOADER_MIDDLEWARES = {
    'demo.proxy.proxMiddleware': 400
}

where demo is the project name, proxy is the name of the .py file, and proxMiddleware is the name of the class.
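For the project layout used in steps 1 and 2, the settings entry might look like the sketch below; the project name myproxies is an assumption inferred from the proxies.txt path in the middleware, so substitute your own project and module names. Note that on Scrapy 1.0 and later the built-in proxy middleware is importable as scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware rather than the older scrapy.contrib path shown above.

# settings.py -- a minimal sketch, assuming the project is called "myproxies"
# and ProxyMiddleWare lives in myproxies/middlewares.py (hypothetical names)
DOWNLOADER_MIDDLEWARES = {
    # disable the stock proxy middleware (modern import path; older Scrapy
    # versions use the scrapy.contrib path shown earlier)
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    # enable the custom middleware from step 2
    'myproxies.middlewares.ProxyMiddleWare': 125,
}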