Python豆瓣爬虫

来源:互联网 发布:des算法 java 编辑:程序博客网 时间:2024/05/17 02:41

豆瓣爬虫代码

这里是爬取豆瓣图书的,修改一下爬什么电影/音乐应该很easy

架构:

1、urllib.request使用,浏览器header伪装(随机从不同header选择),汉字代码解码

我认为本人做得比较好的

2、ip地址爬取,豆瓣查ip访问统计的,同一ip访问次数多就封掉,因此,我们使用代理。为了不被封,我们使用不同的代理ip,但是如何获得?我们去爬ip,用爬到的ip再去爬ip。如此循环,完美。

3、re表达式使用,结合BeautifulSoup4更佳。

4、统计与数据挖掘。这个就看大家的需要了,筛选什么的。

# encoding=UTF-8
"""Douban book scraper (educational use only).

Crawls Douban book-search result pages through rotating HTTP proxies,
extracts title / rating / vote count / intro for each book, and appends
one '%'-separated record per book to raw.txt (all books) and true.txt
(rated books only).  The proxy addresses are themselves scraped from
kuaidaili.com, bootstrapping from a hard-coded seed list.
"""
import os
import random
import re
import time
import urllib.request

# Desktop-browser User-Agent strings; one is picked at random per request
# so the traffic looks less like a single automated client.
_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/44.0',
]

# Seed proxies used to bootstrap / recover the proxy-scraping loop.
_FALLBACK_IPS = ['123.59.86.206:80', '171.217.112.209:9999', '123.59.86.194:80',
                 '163.125.73.182:9999', '113.122.10.185:808']

_KUAIDAILI_URL = "http://www.kuaidaili.com/"


def url_open_help(url, iplist):
    """Install a random proxy from *iplist* as the global http opener and
    return a Request for *url* carrying a random browser User-Agent.

    BUG FIX: the original built the Request but never returned it, so the
    spoofed User-Agent was silently discarded — url_open() fetched the
    bare URL with the default urllib agent.
    """
    ip = random.choice(iplist)
    proxy_support = urllib.request.ProxyHandler({'http': ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url)
    req.add_header('User-Agent', random.choice(_AGENTS))
    return req


def url_open(url, iplist):
    """Fetch *url* through a random proxy; on any failure wait 30 s,
    rotate the proxy/agent, and retry once.  Returns the raw bytes."""
    req = url_open_help(url, iplist)
    try:
        respond = urllib.request.urlopen(req)
    except Exception:  # urllib raises URLError/HTTPError/OSError variants
        print("请等待30s")
        time.sleep(30)
        req = url_open_help(url, iplist)
        respond = urllib.request.urlopen(req)
    return respond.read()


def calculate(url1, iplist):
    """Return the total result count parsed from the first search page."""
    cal_html = url_open(url1, iplist).decode('utf-8')
    q = re.findall("索结果1-15 &nbsp; 共(.*?)</div>", cal_html)
    return int(q[0])


def save_raw(strr, author):
    """Append one record line to raw.txt (all scraped books).

    BUG FIX: the original wrote to a hard-coded absolute path
    ("D:/PHP/wamp/www/...") while download() creates raw.txt in the
    current directory (it chdirs into the author folder first), so the
    file download() prepared was never written to.  'a' mode replaces
    the fragile open('r+') + seek(0, 2) append emulation, and `with`
    guarantees the handle is closed.  *author* is kept for interface
    compatibility; the cwd is already the author's directory.
    """
    with open("raw.txt", 'a', encoding='utf-8') as f:
        f.write(strr + '\n')


def save_true(strr, author):
    """Append one record line to true.txt (books that have a rating).
    See save_raw() for the path/mode fix; same reasoning applies."""
    with open("true.txt", 'a', encoding='utf-8') as f:
        f.write(strr + '\n')


def calculate_tongji(url2, iplist, s, author):
    """Scrape every book detail page linked from one search-result page.

    For each book extracts title, average rating, vote count and intro,
    builds a '%'-separated record prefixed with "<page>%<item>", saves it
    via save_raw() (and save_true() when the book has a rating), and
    prints it.  Returns a random proxy ip picked up front, which
    download() feeds back into find_ip() when rotating the proxy pool.
    """
    ip2 = random.choice(iplist)
    tongji_html = url_open(url2, iplist).decode('utf-8')
    # Unique detail-page links: hrefs ending in a numeric subject id.
    links = set(re.findall(r"href=\"(.*?[0-9])\/\"", tongji_html))
    p = 0
    for each in links:
        text = url_open(each, iplist).decode('utf-8')
        str_1 = re.findall("property=\"v:itemreviewed\">(.*?)</span>", text)
        str_2 = re.findall("property=\"v:average\">(.*?)</strong>", text)
        # A 2-character value (blank padding) means the book has no score.
        if len(str_2[0]) == 2:
            str_2[0] = "无评分"
        str_3 = re.findall("property=\"v:votes\">(.*?)</span>", text)
        if len(str_3) == 0:
            str_3.append("评价人数不足")
        str_4s = re.findall("class=\"intro\">(.*?)</div>", text, re.S | re.M)
        if len(str_4s) == 0:
            str_4s.append("<p>无内容简介</p>")
        str_4 = re.findall("<p>(.*?)</p>", str_4s[0], re.S | re.M)
        p = p + 1
        str_num = str(s) + "%" + str(p)
        strr = str_num + "%" + str_1[0] + "%" + str_2[0] + "%" + str_3[0] + "%" + str_4[0]
        # Round-trip through GBK to drop characters a GBK console/file
        # cannot represent (original behavior, kept).
        strr = strr.encode("GBK", 'ignore').decode("GBK", 'ignore')
        save_raw(strr, author)
        if "无评分" not in strr:
            save_true(strr, author)
        print(strr)
        # Polite randomized delay between detail-page fetches.
        time.sleep(random.randint(1, 2))
    return ip2


def find_ip(find_time, ip2):
    """Thin wrapper around get_ip(); returns a fresh proxy-ip list."""
    return get_ip(find_time, ip2)


def _install_proxy(ip):
    """Install a global https proxy opener for the given 'ip:port'."""
    proxy_support = urllib.request.ProxyHandler({'https': ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)


def _fetch_proxy_page():
    """Fetch the kuaidaili front page and return its decoded HTML.

    BUG FIX: the original built a Request with a User-Agent header but
    then called urlopen(url) with the bare URL, discarding the header.
    """
    req = urllib.request.Request(_KUAIDAILI_URL)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0')
    return urllib.request.urlopen(req).read().decode("UTF-8", 'ignore')


def get_ip(find_time, ip2):
    """Scrape a fresh list of non-HTTP (i.e. https-capable) proxies.

    find_time == 1: bootstrap through one of the hard-coded seed proxies.
    find_time == 2: route through *ip2*, a proxy calculate_tongji() chose.
    Falls back to the seed list and retries once when the fetch fails or
    the page comes back without the expected <tbody> proxy table.

    (The original repeated the same 8-line install-and-refetch blob three
    times and shadowed the builtin `str`; both cleaned up here.)
    """
    iplist = []
    if find_time == 1:
        seed = ['221.182.132.30:8000', '112.95.17.95:8118', '113.242.174.241:8118',
                '103.59.178.17:80', '116.30.153.22:9797']
        _install_proxy(random.choice(seed))
    elif find_time == 2:
        _install_proxy(ip2)
    try:
        ip_html = _fetch_proxy_page()
    except Exception:
        print("请等待50s")
        time.sleep(50)
        _install_proxy(random.choice(_FALLBACK_IPS))
        ip_html = _fetch_proxy_page()
    tbody = re.findall("<tbody>(.*?)</tbody>", ip_html, re.S | re.M)
    if not tbody:
        # Page came back without the proxy table — rotate and retry once.
        print("微微卡")
        _install_proxy(random.choice(_FALLBACK_IPS))
        ip_html = _fetch_proxy_page()
        tbody = re.findall("<tbody>(.*?)</tbody>", ip_html, re.S | re.M)
    for row in re.findall("<tr>(.*?)</tr>", tbody[0], re.S | re.M):
        flag = re.findall("<td data-title=\"类型\">(.*?)</td>", row, re.S | re.M)
        addr = re.findall("<td data-title=\"IP\">(.*?)</td>", row, re.S | re.M)
        port = re.findall("<td data-title=\"PORT\">(.*?)</td>", row, re.S | re.M)
        if flag[0] != "HTTP":  # keep only HTTPS-capable proxies
            iplist.append(addr[0] + ':' + port[0])
    return iplist


def download():
    """Entry point: prepare output files for an author directory and crawl
    every search-result page of the hard-coded query (村上春树 / Murakami),
    refreshing the proxy pool every other page."""
    iplist = find_ip(1, "0")
    author = input("请输入作者")
    os.mkdir(author)
    os.chdir(author)
    # Create/truncate the two output files the save_* helpers append to.
    # BUG FIX: original had `l.close` (attribute access, not a call), so
    # true.txt was never closed.
    open("raw.txt", 'w').close()
    open("true.txt", 'w').close()
    url1 = ('https://book.douban.com/subject_search?'
            'search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001')
    books_num = calculate(url1, iplist)
    pages = books_num // 15
    print(pages)
    for s in range(pages):
        url2 = ('https://book.douban.com/subject_search?start=' + str(15 * s) +
                '&search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001')
        ip2 = calculate_tongji(url2, iplist, s, author)
        # Refresh the proxy pool every second page, seeding with ip2.
        if s != 0 and s % 2 == 0:
            iplist = find_ip(2, ip2)


if __name__ == "__main__":
    # Guarded so importing this module no longer kicks off a crawl.
    download()

仅作为测试交流,如果有非法用途,与本人无关。

谢谢慧慧,我的天使。

1 0
原创粉丝点击