Fetching and validating proxy IPs


import re
import threading
from queue import Queue, Empty

import requests
from lxml import etree


class ProxyIp:
    @classmethod
    def get_proxy_ip(cls):
        url1 = "http://www.youdaili.net/"
        href_queue = Queue()  # article links on the site that list proxy IPs
        ip_queue = Queue()    # raw ip:port strings scraped from those pages
        ip_valid = []         # proxies that pass validation
        url1_resp = requests.get(url1)
        content = url1_resp.content.decode()
        html = etree.HTML(content)
        box = html.xpath("//div[@class='m_box2']/ul/li/a")
        for i in range(min(24, len(box))):  # guard in case fewer than 24 links exist
            href_queue.put(box[i].get("href"))  # enqueue each article link
        while not href_queue.empty():  # fetch every proxy-list page
            ip_href = href_queue.get()
            try:
                resp = requests.get(ip_href)
                content = resp.text
                html = etree.HTML(content)
                ips = html.xpath("//div[@class='cont_font']/p/span[1]")
                for ip in ips[0].itertext():
                    ip_queue.put(ip)
            except Exception as e:
                print(e)
                continue  # skip a broken page instead of aborting the whole crawl
        checkth = []
        print("Starting Checking...")
        for i in range(30):  # validate with 30 worker threads
            checkth.append(CheckThreads(ip_queue, ip_valid))
        for th in checkth:
            th.start()
        for th in checkth:
            th.join()
        ip_valid = list(set(ip_valid))  # deduplicate
        return ip_valid


class CheckThreads(threading.Thread):
    mutex = threading.Lock()  # guards ip_valid, which is shared by all workers
    ip_re = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5})')

    def __init__(self, ip_queue, ip_valid):
        super(CheckThreads, self).__init__()
        self.ip_queue = ip_queue
        self.ip_valid = ip_valid

    def run(self):
        while True:
            try:
                # non-blocking get: a plain get() could hang a worker forever
                # once the queue drains between the empty() check and the get
                ip = self.ip_queue.get_nowait()
            except Empty:
                return
            match = self.ip_re.search(ip)
            if match is None:  # skip scraped text that is not ip:port
                continue
            ip_proxy = match.group()
            try:
                proxies = {"http": "http://" + ip_proxy}
                headers = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    "Host": "www.1356789.com",
                    "Pragma": "no-cache",
                    "Referer": "http://m.ip138.com/",
                }
                resp = requests.get(url="http://www.1356789.com/", proxies=proxies,
                                    headers=headers, timeout=10)
                if resp.status_code == 200:
                    page = resp.text
                    # the target page echoes the client IP it saw
                    ip_url = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', page).group()
                    ip_now = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', ip_proxy).group()
                    if ip_url == ip_now:  # the site saw the proxy's IP: anonymous proxy
                        with self.mutex:
                            self.ip_valid.append(ip_proxy + " anonymous proxy")
                        print(ip_proxy + " anonymous proxy")
                    else:  # the site saw some other IP (e.g. ours): transparent proxy
                        with self.mutex:
                            self.ip_valid.append(ip_proxy + " transparent proxy")
                        print(ip_proxy + " transparent proxy")
            except Exception:
                print(ip_proxy + " proxy is unreachable or too slow!")


if __name__ == "__main__":
    ips = ProxyIp.get_proxy_ip()
    with open("ip.txt", "w") as f:
        for i in ips:
            f.write(i + "\n")
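As a quick usage sketch (my own addition, assuming the ip.txt written above, where each line looks like "1.2.3.4:8080 anonymous proxy"), a validated proxy can be plugged straight into requests:

import random
import requests

# Hypothetical consumer of ip.txt: pick a random validated proxy and use it.
with open("ip.txt") as f:
    # keep only the ip:port part of each line, dropping the label suffix
    proxy_pool = [line.split()[0] for line in f if line.strip()]

proxy = random.choice(proxy_pool)  # assumes the pool is non-empty
resp = requests.get("http://www.youdaili.net/",
                    proxies={"http": "http://" + proxy},
                    timeout=10)
print(proxy, resp.status_code)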


This crawler scrapes proxy IPs from 24 article pages on the site, then filters the scraped IPs in two passes: first checking whether each proxy is usable at all, and then classifying the working ones as anonymous or transparent. A minimal sketch of that check follows.
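The availability-plus-anonymity check boils down to one idea: route a request through the proxy to a service that echoes the client IP it saw. The sketch below uses the public httpbin.org/ip endpoint as an assumed stand-in for the echo site hardcoded in the script above; any IP-echo service would do.

import requests

def check_proxy(ip_port, timeout=10):
    """Return 'anonymous', 'transparent', or None (dead or too slow).

    ip_port is an "ip:port" string. httpbin.org/ip returns the caller's
    IP as JSON under the "origin" key (possibly several comma-separated
    IPs when intermediaries add their own).
    """
    proxy_ip = ip_port.split(":")[0]
    try:
        resp = requests.get("http://httpbin.org/ip",
                            proxies={"http": "http://" + ip_port},
                            timeout=timeout)
        seen_ip = resp.json()["origin"]
        # If the echo service saw the proxy's own IP, our address was hidden.
        return "anonymous" if proxy_ip in seen_ip else "transparent"
    except requests.RequestException:
        return None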



Source: http://tor1024.com/spider/4218ZDsqRItW57236333
