验证最新可用代理 For http://www.5uproxy.net 多线程版

来源:互联网 发布:c语言逻辑运算符号 编辑:程序博客网 时间:2024/06/06 01:18




# coding:gbk
"""Check which proxies listed on http://www.5uproxy.net are reachable.

Python 2 script (it relies on ``urllib2``), originally dated 2010.12.05.

Steps performed when the script runs:
  1. download each proxy list page in ``proxy_urls``;
  2. extract every (host, port, country) row with a regular expression;
  3. open a TCP connection to each proxy from a pool of worker threads and
     record how long the connect took;
  4. write the proxies that accepted the connection, sorted by connect
     time, to ``proxy_<timestamp>.txt`` and echo them on stdout.
"""
import re
import socket
import sys
import threading
import time
import urllib
import urllib2
from urllib2 import URLError, HTTPError

# Python 2 hides sys.setdefaultencoding after start-up; reloading the module
# exposes it again so implicit str/unicode conversions use GBK.
reload(sys)
sys.setdefaultencoding('gbk')

DEBUG = True

# Number of worker threads used to probe proxies.
THREAD_COUNT = 20
# Timeout, in seconds, for the TCP connect to a proxy.
CONNECT_TIMEOUT = 10

# One table row of a list page: index cell, then host, port and country.
PROXY_ROW_RE = re.compile(
    r'''<tr .*?>[\s\S]*?<td .*?>\d+?</td>[\s\S]*?<td>(\S+?)</td>[\s\S]*?<td .*?>(\S+?)</td>[\s\S]*?<td .*?>(\S+?)</td>[\s\S]*?</tr>''',
    re.DOTALL)


def getHtml(url, post_data=None, cookie=None):
    """Download *url* and return the response body.

    url - URL to fetch
    post_data - data to url-encode and send as the POST entity; a plain
                GET is issued when it is empty or None
    cookie - value for the Cookie header, if any

    Returns the body, or '' when the body is empty or the request fails
    for any reason (failures are reported on stdout only when DEBUG is set).
    """
    if DEBUG:
        print("getHtml:  %s" % (url,))

    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        # The HTTP header is spelled "Referer"; "Referrer" is not a
        # standard header name.
        request.add_header('Referer', url)
        if cookie:
            request.add_header('Cookie', cookie)

        opener = urllib2.build_opener()
        if post_data:
            response = opener.open(request, urllib.urlencode(post_data))
        else:
            response = opener.open(request)

        # Close the response even when read() fails part-way through.
        try:
            return response.read() or ''
        finally:
            response.close()
    except HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        if DEBUG:
            print('Error retrieving data: %s' % (e,))
            print('Server error document follows:\n')
        return ''
    except URLError as e:
        if DEBUG:
            if hasattr(e, 'reason'):
                print('Failed to reach a server.')
                print('Reason:  %s' % (e.reason,))
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code:  %s' % (e.code,))
        return ''
    except Exception as e:
        if DEBUG:
            print(e)
        return ''


# List pages to scrape, each tagged with the proxy type it lists.
proxy_urls = [
    {'url': 'http://www.5uproxy.net/http_fast.html', 'type': 'http_fast'},
    {'url': 'http://www.5uproxy.net/http_anonymous.html', 'type': 'http_anonymous'},
    {'url': 'http://www.5uproxy.net/http_non_anonymous.html', 'type': 'http_transparent'},
    {'url': 'http://www.5uproxy.net/socks5.html', 'type': 'socks5'},
]

# Proxies that passed verification (filled in by the worker threads).
result = []

# Lock guarding the shared ``proxies`` and ``result`` lists.
lock = threading.Lock()


def synchronous(f):
    """Decorator: run *f* while holding the module-level ``lock``."""
    def call(*args, **kwargs):
        with lock:
            return f(*args, **kwargs)
    return call


# Collect every proxy to verify before any worker thread starts.
proxies = []
_seen = set()
for proxy_url in proxy_urls:
    html = getHtml(proxy_url['url'])
    for domain, port, state in PROXY_ROW_RE.findall(html):
        # Skip exact duplicates (same host, port, country and type).
        key = (domain, port, state, proxy_url['type'])
        if key in _seen:
            continue
        _seen.add(key)
        proxies.append({
            'domain': domain,            # proxy host
            'port': port,                # proxy port (still a string)
            'state': state,              # country column of the list page
            'type': proxy_url['type'],   # proxy type of the source page
            'time': 0,                   # connect time in ms, set by verify()
        })


@synchronous
def getproxy():
    """Pop and return one pending proxy, or '' when none are left."""
    global proxies
    if proxies:
        return proxies.pop()
    return ''


@synchronous
def saveresult(proxy):
    """Append *proxy* to ``result`` unless an equal entry is already there."""
    global result
    if proxy not in result:
        result.append(proxy)


def verify():
    """Worker loop: probe pending proxies until the queue is empty.

    A proxy passes when a TCP connection to its host/port succeeds within
    CONNECT_TIMEOUT seconds; the elapsed connect time is stored in
    ``proxy['time']`` in milliseconds.
    """
    while True:
        proxy = getproxy()
        # Every proxy has been handed out.
        if not proxy:
            return

        print("正在验证:%s,%s" % (proxy['domain'], proxy['port']))

        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(CONNECT_TIMEOUT)
        try:
            # Wall-clock time: time.clock() reports CPU time on Unix, which
            # stays near zero while the thread is blocked in connect().
            start = time.time()
            sock.connect((proxy['domain'], int(proxy['port'])))
            proxy['time'] = int((time.time() - start) * 1000)

            saveresult(proxy)
            print("%s,%s 验证通过,响应时间:%d ms." % (proxy['domain'], proxy['port'], proxy['time']))
        except Exception as e:
            if DEBUG:
                print(e)
            print("%s,%s 验证失败." % (proxy['domain'], proxy['port']))
        finally:
            # Release the socket on failure as well as on success.
            sock.close()


# Create the worker threads, start them all, then wait for them to finish.
thread_pool = [threading.Thread(target=verify, args=()) for _ in range(THREAD_COUNT)]
for thread in thread_pool:
    thread.start()
for thread in thread_pool:
    thread.join()

# Fastest proxies first.
result.sort(key=lambda item: item['time'])

fname = 'proxy_' + time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.txt'
# The file stays open for the whole loop and is closed once, afterwards.
with open(fname, 'w') as out:
    print("验证结果如下:")
    for item in result:
        line = '%s,%s,%s,%s,%d' % (item['type'], item['domain'], item['port'], item['state'], item['time'])
        print(line)
        out.write(line + '\n')

print("所有代理已验证完毕,共计%d个验证通过。验证通过的代理已存入%s" % (len(result), fname))


原创粉丝点击