抓proxyhttp.net代理脚本
来源:互联网 发布:centos 7 mount ntfs 编辑:程序博客网 时间:2024/05/17 23:58
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape proxy server addresses from http://proxyhttp.net.

file:   proxys.py
author: darkbull
date:   2011-08-01

Each proxy is reported as the string ``"ip:port,country,scheme"`` where
``scheme`` is ``"http"`` or ``"https"``.
"""

import contextlib
import re
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same Request/urlopen API under urllib.request.
    import urllib.request as urllib2

# Newest entry returned by the previous scrape; '' until a scrape succeeds.
_LASTEST_PROXY = ''

_URL_PATTERN = 'http://proxyhttp.net/free-list/anonymous-server-hide-ip-address/%d#proxylist'
_DEFAULT_PAGES = 2  # number of listing pages fetched per scrape

_TABLE_OPEN = '<table class="proxytbl" cellSpacing="1">'
_TABLE_CLOSE = '</table>'

# Captures the integer the page's script adds to every port character code.
_SEED_RE = re.compile(r'String\.fromCharCode\((.+)\+parseInt\(a\[i\]\)\)')
# Captures (ip, encoded port, country, https flag) for one table row.
_ROW_RE = re.compile(
    r'<td class="t_ip">(.+?)</td>\s*<td class="t_port">(.+?)</td>\s*<td class="t_country"><img[^>]*/>(\w+)</td>[\s\S]+?<td class="t_https">(.*?)</td>'
)


def _get_html(url):
    """Download *url* and return its body as text, or '' on any failure.

    The request is sent with browser-like headers and a 10 second timeout.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        # Left disabled: the body is consumed as-is, without decompression.
        # "Accept-Encoding": "gzip,deflate,sdch",
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'proxyhttp.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
        'Referer': '',
    }
    try:
        request = urllib2.Request(url, headers=headers)
        # closing() releases the connection even when read() raises.
        with contextlib.closing(urllib2.urlopen(request, timeout=10)) as conn:
            body = conn.read()
    except Exception:
        # Deliberate best-effort boundary: any fetch error means "no page".
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        return ''
    if not isinstance(body, str):
        # Python 3 returns bytes; the parser works on text.
        body = body.decode('utf-8', 'replace')
    return body


def _decode_port(encoded, seed):
    """Turn an obfuscated port such as '52,44' into its text form ('80').

    Each comma-separated number is a character code lowered by *seed*.
    Raises ValueError when a number is malformed or out of range.
    """
    return ''.join([chr(int(code) + seed) for code in encoded.split(',')])


def _parse_html(html):
    """Extract the proxy entries from one listing page.

    The site obfuscates ports: port 80 is published as "52,44", where
    52 == ord('8') - 4 and 44 == ord('0') - 4.  The offset (4 here) is the
    "seed" and is read from the page's own script.

    Returns a list of "ip:port,country,scheme" strings.  The list is empty
    when the seed or the proxy table cannot be found; a row whose port
    cannot be decoded is skipped without discarding the rest of the page.
    """
    if not html:
        return []

    seed_match = _SEED_RE.search(html)
    if not seed_match:
        return []
    try:
        seed = int(seed_match.group(1))
    except ValueError:
        return []

    start = html.find(_TABLE_OPEN)
    if start < 0:
        return []
    start += len(_TABLE_OPEN)
    stop = html.find(_TABLE_CLOSE, start)
    if stop < 0:
        return []

    entries = []
    for ip, port, country, https in _ROW_RE.findall(html[start:stop]):
        try:
            port = _decode_port(port, seed)
        except (ValueError, OverflowError):
            continue
        scheme = 'http' if https == '-' else 'https'
        entries.append('%s:%s,%s,%s' % (ip, port, country, scheme))
    return entries


def _iter_pages(max_pages):
    """Yield the parsed entry list of listing pages 1..max_pages, in order.

    Pages are fetched lazily, so a consumer that stops iterating early
    avoids the remaining downloads.  A page that cannot be downloaded is
    skipped (and reported on stdout unless Python runs with -O).
    """
    for page_no in range(1, max_pages + 1):
        url = _URL_PATTERN % page_no
        html = _get_html(url)
        if html:
            yield _parse_html(html)
        elif __debug__:
            print('no html at: %s' % url)


def _collect_new(pages, known):
    """Concatenate entries from *pages* up to, but excluding, *known*.

    pages -- iterable of entry lists, newest page first.
    known -- entry that marks the start of already-seen data; a falsy
             value disables the cut-off so everything is collected.

    Iteration stops at the first page containing *known*.
    """
    fresh = []
    for entries in pages:
        if known and known in entries:
            fresh.extend(entries[:entries.index(known)])
            break
        fresh.extend(entries)
    return fresh


def get_all_proxys(max_pages=_DEFAULT_PAGES):
    """Return every proxy listed on the first *max_pages* pages.

    Note: http://proxyhttp.net only keeps the most recent 8 pages.

    Side effect: the first entry of a non-empty result is remembered as
    the marker used by get_lastest_proxys().
    """
    global _LASTEST_PROXY
    found = _collect_new(_iter_pages(max_pages), '')
    if found:
        _LASTEST_PROXY = found[0]
    return found


def get_lastest_proxys(max_pages=_DEFAULT_PAGES):
    """Return only the proxies added since the previous call.

    The first call (no marker yet) behaves like get_all_proxys().  Later
    calls return the entries that precede the remembered marker; when the
    marker is no longer present on any fetched page, every fetched entry
    is returned.

    Whenever the result is non-empty its first entry becomes the new
    marker, so entries are not reported again on the next call.
    """
    global _LASTEST_PROXY
    if not _LASTEST_PROXY:
        return get_all_proxys(max_pages)
    fresh = _collect_new(_iter_pages(max_pages), _LASTEST_PROXY)
    if fresh:
        # Always remember the newest entry overall, whichever page held
        # the old marker (or even if it was not found at all).
        _LASTEST_PROXY = fresh[0]
    return fresh


def _main():
    """Poll the site every 10 seconds, printing new proxies and their count."""
    while True:
        found = get_lastest_proxys()
        for entry in found:
            print(entry)
        print(len(found))
        time.sleep(10)


if __name__ == '__main__':
    try:
        _main()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the polling loop.
        pass