A Python script for maintaining a proxy pool


Design approach:

1. Find a website that publishes free proxies and scrape the proxy IPs and related details;

2. Verify that each proxy IP actually works (a standalone check is sketched below);

3. Store the working proxies in a database.
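Step 2 boils down to routing one test request through the candidate proxy and keeping it only if the request succeeds. A minimal sketch of that check, using the same test URL as the full script below (the helper name and the timeout default are illustrative assumptions, not part of the original script):

import requests

def is_alive(ip, port, timeout=5):
    # Route one request through the candidate proxy; any network
    # error or non-200 response marks the proxy as dead.
    proxy = "http://%s:%s" % (ip, port)
    proxies = {"http": proxy, "https": proxy}
    try:
        r = requests.get("http://www.webkaka.com/",
                         proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False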

The complete script, which ties the three steps together:

import requests
import MySQLdb
from bs4 import BeautifulSoup

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
}

db = MySQLdb.connect("localhost", "root", "liao1234", "liao")
cursor = db.cursor()

# One row per working proxy: address, location, anonymity,
# type, live time and verification time as listed on the page.
cursor.execute("""create table if not exists proxies(
    ip char(20) not null,
    port char(20),
    area char(20),
    nm char(20),
    type char(20),
    livetime char(20),
    ytime char(20))""")

# Step 1: scrape the free-proxy listing page.
r = requests.get("http://www.xicidaili.com/", headers=headers)
soup = BeautifulSoup(r.text, "html.parser")

for tag in soup.find("table", id="ip_list").find_all("tr"):
    # Collect the text of every non-empty cell in this row.
    ss = []
    for aa in tag.find_all("td"):
        if aa.string is None:
            continue
        print(aa.string)
        ss.append(aa.string)
    if len(ss) == 0:
        continue  # header row or row with no text cells

    # Step 2: verify the proxy by routing a test request through it.
    domain = "http://%s:%s" % (ss[0], ss[1])
    proxies = {"http": domain, "https": domain}
    try:
        r1 = requests.get("http://www.webkaka.com/",
                          proxies=proxies, headers=headers, timeout=5)
        print(r1.status_code)
        if r1.status_code == 200:
            # Step 3: store the working proxy; a parameterized query
            # avoids quoting problems in the scraped fields.
            cursor.execute(
                "insert into proxies(ip,port,area,nm,type,livetime,ytime) "
                "values(%s,%s,%s,%s,%s,%s,%s)", tuple(ss[:7]))
            db.commit()
        else:
            print("code is not 200")
    except requests.RequestException:
        print("this ip is dropped!")

db.close()
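The script above only fills the pool. Maintaining it also means periodically re-testing the stored proxies and evicting the dead ones. A minimal sketch of that pass, reusing the database credentials, schema, and test URL from the script above (the function name and the idea of running it on a schedule are assumptions, not from the original post):

import requests
import MySQLdb

def prune_dead_proxies():
    # Re-test every stored proxy and delete the ones that no longer work.
    db = MySQLdb.connect("localhost", "root", "liao1234", "liao")
    cursor = db.cursor()
    cursor.execute("select ip, port from proxies")
    for ip, port in cursor.fetchall():
        proxy = "http://%s:%s" % (ip, port)
        try:
            r = requests.get("http://www.webkaka.com/",
                             proxies={"http": proxy, "https": proxy},
                             timeout=5)
            alive = (r.status_code == 200)
        except requests.RequestException:
            alive = False
        if not alive:
            cursor.execute("delete from proxies where ip=%s and port=%s",
                           (ip, port))
            db.commit()
    db.close()

Running this regularly (for example from cron) keeps the proxies table trimmed to entries that still respond.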

Scan results (the original post showed a screenshot of the console output here: each scraped proxy field, followed by the status code of the test request).