Python采集代理IP、检测是否可用并定时更新

来源:互联网 发布:watershed算法原理 编辑:程序博客网 时间:2024/06/06 17:01

网上有很多免费的代理IP,但并不是每一个都可用,手动收集并逐个验证太麻烦。这里通过Python自动批量抓取,检测代理端口能否连通,并把可用的结果缓存到本地的ip.json文件中,以便定时更新。代码如下:

# -*- coding: utf-8 -*-
"""Scrape free proxy addresses, keep the reachable ones, and cache them on disk."""
import contextlib
import json
import os
import re
import socket
import time

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen API
    import urllib.request as urllib2


class ProxyIp(object):
    """Fetch proxies from a public listing page and cache them as JSON.

    The cache file ``ip.json`` lives next to this source file. It is reused
    while it is younger than ``MAX_AGE_SECONDS`` and rebuilt otherwise.
    """

    SOURCE_URL = 'http://www.ip3366.net/free/'
    CACHE_NAME = 'ip.json'
    MAX_AGE_SECONDS = 60 * 60 * 4  # 4 hours
    TIMEOUT_SECONDS = 5

    # One table row: <td>ip</td> <td>port</td> <td>(ignored)</td> <td>HTTP|HTTPS</td>.
    # The dots are escaped so that only dotted-quad addresses match.
    _ROW_PATTERN = re.compile(
        r'(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>\s+<td>.*?</td>\s+<td>(HTTPS?)</td>',
        re.I
    )

    def __init__(self):
        # Directory containing this file; the cache is stored alongside it.
        self.path = os.path.dirname(os.path.realpath(__file__))

    def _cache_file(self):
        """Return the full path of the JSON cache file."""
        return os.path.join(self.path, self.CACHE_NAME)

    @classmethod
    def parse_proxies(cls, html):
        """Extract ``(ip, port, protocol)`` string tuples from listing HTML.

        ``html`` may be text or raw bytes. Bytes are decoded as UTF-8 with
        undecodable sequences replaced, which leaves the ASCII table cells
        this method looks for intact.
        """
        if isinstance(html, bytes):
            html = html.decode('utf-8', 'replace')
        return cls._ROW_PATTERN.findall(html)

    # Get latest proxy ip and download to json
    def update_ip(self):
        """Download the listing, keep reachable proxies, and rewrite the cache.

        Returns:
            A list of ``{'ip': ..., 'port': ..., 'protocol': ...}`` dicts,
            one per proxy that accepted a TCP connection.

        Raises:
            Whatever ``urlopen`` raises when the listing page cannot be
            fetched, and ``IOError``/``OSError`` if the cache cannot be written.
        """
        print('Update Ip')
        request = urllib2.Request(self.SOURCE_URL)
        # closing() works on both Python 2 and 3 response objects.
        with contextlib.closing(
                urllib2.urlopen(request, timeout=self.TIMEOUT_SECONDS)) as response:
            html = response.read()

        proxies = [
            {'ip': ip, 'port': port, 'protocol': protocol}
            for ip, port, protocol in self.parse_proxies(html)
            if self.is_open(ip, port)
        ]
        with open(self._cache_file(), 'w') as f:
            json.dump(proxies, f)
        return proxies

    # whether the ips is last or old.
    def is_last(self):
        """Return True when the cache exists and is younger than MAX_AGE_SECONDS.

        A missing or unreadable cache file counts as "not latest", so the
        caller refreshes it instead of crashing on the first run.
        """
        try:
            modified = os.path.getmtime(self._cache_file())
        except OSError:
            return False
        return (time.time() - modified) < self.MAX_AGE_SECONDS

    @staticmethod
    def is_open(ip, port, timeout=3):
        """Return True if a TCP connection to ``ip:port`` can be established.

        Args:
            ip: Host address as a string.
            port: Port number as an int or a numeric string.
            timeout: Seconds to wait for the connection before giving up.
        """
        try:
            # The address must be passed as a single (host, port) tuple.
            conn = socket.create_connection((ip, int(port)), timeout)
        except (socket.error, ValueError, TypeError, OverflowError):
            # Unreachable host, refused/timed-out connection, or a bad port value.
            print('Faild IP: %s:%s' % (ip, port))
            return False
        conn.close()
        return True

    def get_proxy_ips(self):
        """Return the cached proxies, refreshing the cache first if it is stale."""
        if not self.is_last():
            return self.update_ip()
        with open(self._cache_file(), 'r') as f:
            return json.load(f)
原创粉丝点击