抓proxyhttp.net代理脚本

来源：互联网 发布：centos 7 mount ntfs 编辑：程序博客网 时间：2024/05/17 23:58
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    file: proxys.py
    author: darkbull
    date: 2011-08-01
    desc:
        Scrape proxy server addresses from http://proxyhttp.net

    Every proxy is reported as the string 'ip:port,country,scheme', where
    scheme is 'http' or 'https'.
"""

import re
import time

try:
    import urllib2
except ImportError:
    # Python 3: Request/urlopen with the same call shape live here.
    import urllib.request as urllib2

# Newest entry returned by the previous fetch; '' until a fetch succeeds.
_LASTEST_PROXY = ''

# Listing pages are addressed by a 1-based page number.
_LIST_URL = 'http://proxyhttp.net/free-list/anonymous-server-hide-ip-address/%d#proxylist'
# Only pages 1 and 2 are fetched.
_PAGES = range(1, 3)

# The page's script rebuilds each port with
# String.fromCharCode(<seed>+parseInt(a[i])); the captured group is <seed>.
_SEED_RE = re.compile(r'String\.fromCharCode\((.+)\+parseInt\(a\[i\]\)\)')
_TABLE_OPEN = '<table class="proxytbl" cellSpacing="1">'
# One table row: ip, obfuscated port, country code, https flag.
_ROW_RE = re.compile(
    r'<td class="t_ip">(.+?)</td>\s*'
    r'<td class="t_port">(.+?)</td>\s*'
    r'<td class="t_country"><img[^>]*/>(\w+)</td>[\s\S]+?'
    r'<td class="t_https">(.*?)</td>'
)


def _get_html(url):
    """Fetch url and return the response body as text.

    Returns '' on any failure (network error, timeout, bad URL) so callers
    can simply test the result for truthiness.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        # 'Accept-Encoding' is left out on purpose: the body is used exactly
        # as read, nothing here decompresses gzip/deflate responses.
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'proxyhttp.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
        'Referer': '',
    }
    try:
        req = urllib2.Request(url, headers=headers)
        conn = urllib2.urlopen(req, timeout=10)
        try:
            html = conn.read()
        finally:
            # Release the socket even when read() fails part-way.
            conn.close()
    except Exception:
        # Best-effort fetch. Unlike a bare "except:", this lets
        # KeyboardInterrupt/SystemExit through so the polling loop can be
        # stopped with Ctrl-C.
        return ''
    if not isinstance(html, str):
        # Python 3 hands back bytes; the parser works on text.
        html = html.decode('utf-8', 'replace')
    return html


def _decode_port(encoded, seed):
    """Turn an obfuscated port such as '52,44' back into '80'.

    Each comma-separated number is a character code shifted down by seed:
    52 == ord('8') - 4 and 44 == ord('0') - 4, so with seed 4 the text
    '52,44' stands for port 80.
    """
    return ''.join(chr(int(code) + seed) for code in encoded.split(','))


def _parse_html(html):
    """Extract the proxies listed in one page.

    Returns a list of 'ip:port,country,scheme' strings in page order, or []
    when the page does not have the expected layout (no seed expression, no
    proxy table, or a malformed port).
    """
    try:
        seed_match = _SEED_RE.search(html)
        if not seed_match:
            return []
        seed = int(seed_match.group(1))

        # Restrict row matching to the body of the proxy table.
        start = html.index(_TABLE_OPEN) + len(_TABLE_OPEN)
        table = html[start:html.index('</table>', start)]

        entries = []
        for ip, port, country, https in _ROW_RE.findall(table):
            scheme = 'http' if https == '-' else 'https'
            entries.append('%s:%s,%s,%s' % (ip, _decode_port(port, seed), country, scheme))
        return entries
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric seed/port, missing table tag, bad char code.
        # TypeError: html is not text.  OverflowError: absurdly large code.
        return []


def _iter_pages():
    """Yield the parsed proxy list of each listing page, page 1 first.

    Pages are fetched lazily, so a consumer that stops early avoids the
    remaining requests. Pages that could not be downloaded are skipped.
    """
    for page in _PAGES:
        url = _LIST_URL % page
        html = _get_html(url)
        if html:
            yield _parse_html(html)
        elif __debug__:
            print('no html at: %s' % url)


def _collect_new(pages, marker):
    """Gather the entries that appear before marker.

    pages is an iterable of entry lists, newest first; it is consumed only
    up to and including the list that contains marker.

    Returns (fresh, new_marker): fresh holds every entry preceding marker
    (everything, if marker never shows up) and new_marker is the newest of
    them, or marker itself when nothing new was found.
    """
    fresh = []
    for entries in pages:
        if marker in entries:
            fresh.extend(entries[:entries.index(marker)])
            break
        fresh.extend(entries)
    return fresh, (fresh[0] if fresh else marker)


def get_all_proxys():
    """Return every proxy currently listed on the fetched pages.

    Note: the original author states that http://proxyhttp.net keeps only
    the most recent 8 pages of data; this function reads pages 1 and 2.

    The first entry becomes the reference point for get_lastest_proxys().
    """
    global _LASTEST_PROXY
    ret = []
    for entries in _iter_pages():
        ret.extend(entries)
    if ret:
        _LASTEST_PROXY = ret[0]
    return ret


def get_lastest_proxys():
    """Return only the proxies added since the previous call.

    The first call (or any call made before a fetch has succeeded) behaves
    like get_all_proxys(). Later calls return the entries listed ahead of
    the newest entry seen last time, and always move that reference point
    forward to the newest entry returned, including when the old reference
    entry is found on a later page or not found at all, so the same
    entries are not reported again on the next poll.
    """
    global _LASTEST_PROXY
    if not _LASTEST_PROXY:
        return get_all_proxys()
    ret, _LASTEST_PROXY = _collect_new(_iter_pages(), _LASTEST_PROXY)
    return ret


def main():
    """Poll every 10 seconds, printing new proxies and then their count."""
    while True:
        ret = get_lastest_proxys()
        for entry in ret:
            print(entry)
        print(len(ret))
        time.sleep(10)


if __name__ == '__main__':
    main()