Python Crawler: Scraping Proxy IPs && Quick Verification


This work is licensed under a Creative Commons Attribution 4.0 International License.

The IP resources come from the Xici free proxy list (西刺免费代理IP), and the approach follows the post "python爬虫-爬取代理IP并通过多线程快速验证" (scraping proxy IPs and quickly verifying them with multiple threads).

CODE:
Fetching the IPs

import requests
from bs4 import BeautifulSoup

user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
headers = {'user-agent': user_agent}

# Walk the first 49 pages of the high-anonymity ("nn") proxy list.
for page in range(1, 50):
    url = 'http://www.xicidaili.com/nn/%s' % page
    r = requests.get(url, headers=headers)
    print('opening %s\n' % url)
    soup = BeautifulSoup(r.content, 'lxml')
    # The proxies sit in the table with id="ip_list"; the first row
    # is the header, so skip it.
    trs = soup.find('table', id='ip_list').findAll('tr')
    for tr in trs[1:]:
        tds = tr.findAll('td')
        ip = tds[1].text.strip()    # second column: IP address
        port = tds[2].text.strip()  # third column: port
        with open('proxies.txt', 'a') as f:
            f.write('http://%s:%s\n' % (ip, port))
        print('Adding http://%s:%s to table' % (ip, port))
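The column indexes above (tds[1] for the IP, tds[2] for the port) follow the layout of Xici's table, whose first cell holds a country flag image. A minimal, self-contained sketch with a made-up row shows the same extraction logic; the markup here is a hypothetical stand-in, and the real page has more columns (location, anonymity, type, and so on):

from bs4 import BeautifulSoup

# Hypothetical stand-in for one row of the "ip_list" table.
sample = '''
<table id="ip_list">
  <tr><th></th><th>IP</th><th>Port</th></tr>
  <tr><td><img alt="Cn"></td><td>121.232.148.97</td><td>9000</td></tr>
</table>
'''

soup = BeautifulSoup(sample, 'lxml')
for tr in soup.find('table', id='ip_list').findAll('tr')[1:]:
    tds = tr.findAll('td')
    print(tds[1].text.strip(), tds[2].text.strip())  # 121.232.148.97 9000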

Verification

import requests
import threading

srcfile = open('proxies.txt', 'r')
outfile = open('verified.txt', 'w')
url = 'http://pv.sohu.com/cityjson?ie=utf-8'
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"
headers = {'user-agent': user_agent}
mutex = threading.Lock()

def verify():
    try:
        # Each thread claims the next proxy from the source file;
        # the shared file handles are guarded with a lock.
        with mutex:
            proxy = srcfile.readline().strip()
        print(proxy)
        proxies = {'http': proxy}
        # The Sohu endpoint echoes the caller's IP, so any response
        # inside the 5-second timeout counts as a working proxy.
        r = requests.get(url, proxies=proxies, headers=headers, timeout=5)
        print(r.content)
        with mutex:
            outfile.write('%s\n' % proxy)
    except requests.RequestException as e:
        print(e)

# Spawn one thread per proxy line (the list holds about 4900 entries).
childthread = []
for i in range(4900):
    t = threading.Thread(target=verify)
    childthread.append(t)
    t.start()
for t in childthread:
    t.join()

srcfile.close()
outfile.close()
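Two things are worth tightening in this verifier: one thread per proxy (about 4,900 threads) is heavier than necessary, and any response counts as success even if the request was not actually routed through the proxy. Below is a sketch of a stricter variant, assuming the Sohu endpoint returns the caller's IP in a "cip" field (as it historically did via var returnCitySN = {"cip": ...}); it uses a bounded thread pool and keeps a proxy only when the echoed IP matches the proxy host:

import re
import requests
from concurrent.futures import ThreadPoolExecutor

URL = 'http://pv.sohu.com/cityjson?ie=utf-8'
HEADERS = {'user-agent': 'Mozilla/5.0'}

def verify(proxy):
    # proxy looks like 'http://121.232.148.97:9000'
    host = proxy.split('//')[1].split(':')[0]
    try:
        r = requests.get(URL, proxies={'http': proxy},
                         headers=HEADERS, timeout=5)
        # The endpoint historically answered with something like
        #   var returnCitySN = {"cip": "121.232.148.97", ...};
        # (field name assumed here). If the echoed IP matches the
        # proxy host, the request really went through the proxy.
        m = re.search(r'"cip"\s*:\s*"([^"]+)"', r.text)
        if m and m.group(1) == host:
            return proxy
    except requests.RequestException:
        pass
    return None

with open('proxies.txt') as f:
    proxies = [line.strip() for line in f if line.strip()]

# A bounded pool instead of ~4900 threads; 50 workers is arbitrary.
with ThreadPoolExecutor(max_workers=50) as pool:
    results = list(pool.map(verify, proxies))

with open('verified.txt', 'w') as out:
    for p in results:
        if p:
            out.write(p + '\n')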