Python豆瓣爬虫
来源:互联网 发布:des算法 java 编辑:程序博客网 时间:2024/05/17 02:41
豆瓣爬虫代码
这里是爬取豆瓣图书的,修改一下爬什么电影/音乐应该很easy
架构:
1、urllib.request使用,浏览器header伪装(随机从不同header选择),汉字编码解码
我认为本人做得比较好的
2、ip地址爬取,豆瓣查ip访问统计的,同一ip访问次数多就封掉,因此,我们使用代理。为了不被封,我们使用不同的代理ip,但是如何获得?我们去爬ip,用爬到的ip再去爬ip。如此循环,完美。
3、re正则表达式使用,结合beautifulsoup4更佳。
4、统计与数据挖掘。这个就看大家的需要了,筛选什么的。
# encoding=UTF-8
"""Douban book scraper.

Walks the Douban book-search result pages for a (hard-coded) query,
follows every book link, extracts title / rating / vote count / intro,
and appends each record to per-author text files.  All requests go
through rotating HTTP proxies; the proxy list itself is scraped from
kuaidaili.com, bootstrapped from a hard-coded seed list.
"""
import urllib.request
import urllib.error
import urllib
import os
import re
import random
import time
from _codecs import encode  # kept from original source; appears unused
import sys  # kept from original source; appears unused

# User-Agent pool; a random one is attached to every request so traffic
# does not look like a single client.
_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/44.0',
]

# Seed proxies used to bootstrap / recover the proxy-scraping loop.
_BOOT_PROXIES = ['221.182.132.30:8000', '112.95.17.95:8118',
                 '113.242.174.241:8118', '103.59.178.17:80',
                 '116.30.153.22:9797']
_RETRY_PROXIES = ['123.59.86.206:80', '171.217.112.209:9999',
                  '123.59.86.194:80', '163.125.73.182:9999',
                  '113.122.10.185:808']

_KUAIDAILI_URL = "http://www.kuaidaili.com/"

# NOTE(review): save_raw/save_true write under this absolute root while
# download() also creates raw.txt/true.txt in the cwd — confirm the two
# locations are meant to coincide.
_OUTPUT_ROOT = "D:/PHP/wamp/www/eee/eee/eee/"


def url_open_help(url, iplist):
    """Install a random proxy from *iplist* as the global http proxy and
    return a Request for *url* carrying a random User-Agent header.

    BUG FIX: the original never returned the Request, so the User-Agent
    header it prepared was silently dropped by the caller.
    """
    proxy = random.choice(iplist)
    handler = urllib.request.ProxyHandler({'http': proxy})
    urllib.request.install_opener(urllib.request.build_opener(handler))
    req = urllib.request.Request(url)
    req.add_header('User-Agent', random.choice(_AGENTS))
    return req


def url_open(url, iplist):
    """Fetch *url* through a random proxy; on failure wait 30 s and retry
    once with a fresh proxy.  Returns the raw response bytes.
    """
    req = url_open_help(url, iplist)
    try:
        # BUG FIX: pass the prepared Request (with its header), not the bare URL.
        respond = urllib.request.urlopen(req)
    except (urllib.error.URLError, OSError):
        print("请等待30s")
        time.sleep(30)
        req = url_open_help(url, iplist)
        respond = urllib.request.urlopen(req)
    return respond.read()


def calculate(url1, iplist):
    """Return the total result count parsed off the first search page."""
    cal_html = url_open(url1, iplist).decode('utf-8')
    q = re.findall("索结果1-15 共(.*?)</div>", cal_html)
    return int(q[0])


def _append_line(filename, strr):
    # Append mode creates the file when missing — simpler and safer than
    # the original 'r+' + seek-to-end, which crashed on a missing file.
    with open(filename, 'a') as fh:
        fh.write(strr + '\n')


def save_raw(strr, author):
    """Append one record line to the author's raw.txt (all records)."""
    _append_line(_OUTPUT_ROOT + author + "/raw.txt", strr)


def save_true(strr, author):
    """Append one record line to the author's true.txt (rated books only)."""
    _append_line(_OUTPUT_ROOT + author + "/true.txt", strr)


def calculate_tongji(url2, iplist, s, author):
    """Scrape one search-result page *url2* (page index *s*): follow every
    book link on it, extract title / rating / votes / intro and persist the
    record.  Returns the proxy chosen at entry, which callers reuse to seed
    the next proxy refresh.
    """
    ip2 = random.choice(iplist)
    tongji_html = url_open(url2, iplist).decode('utf-8')
    links = set(re.findall(r"href=\"(.*?[0-9])/\"", tongji_html))
    p = 0
    for each in links:
        text = url_open(each, iplist).decode('utf-8')
        str_1 = re.findall("property=\"v:itemreviewed\">(.*?)</span>", text)
        str_2 = re.findall("property=\"v:average\">(.*?)</strong>", text)
        # A 2-char rating field is Douban's blank placeholder — no rating.
        if len(str_2[0]) == 2:
            str_2[0] = "无评分"
        str_3 = re.findall("property=\"v:votes\">(.*?)</span>", text)
        if not str_3:
            str_3.append("评价人数不足")
        str_4s = re.findall("class=\"intro\">(.*?)</div>", text, re.S | re.M)
        if not str_4s:
            str_4s.append("<p>无内容简介</p>")
        str_4 = re.findall("<p>(.*?)</p>", str_4s[0], re.S | re.M)
        p += 1
        strr = "%".join([str(s), str(p), str_1[0], str_2[0], str_3[0], str_4[0]])
        # Round-trip through GBK with 'ignore' drops characters the target
        # console/file encoding cannot represent.
        strr = strr.encode("GBK", 'ignore').decode("GBK", 'ignore')
        save_raw(strr, author)
        # Only rated books go to true.txt.
        if "无评分" not in strr:
            save_true(strr, author)
        print(strr)
        time.sleep(random.randint(1, 2))
    return ip2


def _install_https_proxy(proxy):
    """Install *proxy* ("host:port") as the global https proxy."""
    handler = urllib.request.ProxyHandler({'https': proxy})
    urllib.request.install_opener(urllib.request.build_opener(handler))


def _fetch_kuaidaili():
    """Fetch the kuaidaili front page through the installed proxy and
    return it decoded as UTF-8 (undecodable bytes ignored).
    """
    req = urllib.request.Request(_KUAIDAILI_URL)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/47.0')
    # BUG FIX: pass the Request so the User-Agent header is actually sent.
    return urllib.request.urlopen(req).read().decode("UTF-8", 'ignore')


def get_ip(find_time, ip2):
    """Scrape a fresh proxy list from kuaidaili.com.

    find_time == 1: bootstrap through a hard-coded seed proxy.
    find_time == 2: go through *ip2*, a proxy scraped on a previous round.
    Returns a list of "host:port" strings for the non-HTTP (HTTPS) rows.
    """
    if find_time == 1:
        _install_https_proxy(random.choice(_BOOT_PROXIES))
    elif find_time == 2:
        _install_https_proxy(ip2)

    try:
        ip_html = _fetch_kuaidaili()
    except (urllib.error.URLError, OSError):
        print("请等待50s")
        time.sleep(50)
        _install_https_proxy(random.choice(_RETRY_PROXIES))
        ip_html = _fetch_kuaidaili()

    body = re.findall("<tbody>(.*?)</tbody>", ip_html, re.S | re.M)
    if not body:
        # Page came back without the proxy table (blocked / throttled):
        # retry once through a different seed proxy.
        print("微微卡")
        _install_https_proxy(random.choice(_RETRY_PROXIES))
        ip_html = _fetch_kuaidaili()
        body = re.findall("<tbody>(.*?)</tbody>", ip_html, re.S | re.M)

    iplist = []
    for row in re.findall("<tr>(.*?)</tr>", body[0], re.S | re.M):
        flag = re.findall("<td data-title=\"类型\">(.*?)</td>", row, re.S | re.M)
        addr = re.findall("<td data-title=\"IP\">(.*?)</td>", row, re.S | re.M)
        port = re.findall("<td data-title=\"PORT\">(.*?)</td>", row, re.S | re.M)
        if flag[0] != "HTTP":
            iplist.append(addr[0] + ':' + port[0])
    return iplist


def find_ip(find_time, ip2):
    """Thin wrapper around get_ip, kept for API compatibility."""
    return get_ip(find_time, ip2)


def download():
    """Interactive entry point: ask for an author name, create an output
    directory, then scrape every result page of the hard-coded query.
    """
    iplist = find_ip(1, "0")
    author = input("请输入作者")
    os.mkdir(author)
    os.chdir(author)
    # Truncate/create the two output files.
    open("raw.txt", 'w').close()
    # BUG FIX: the original wrote `l.close` (no parentheses) — a no-op
    # attribute access that left the handle open.
    open("true.txt", 'w').close()
    url1 = ('https://book.douban.com/subject_search?'
            'search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001')
    books_num = calculate(url1, iplist)
    pages = books_num // 15  # 15 results per page
    print(pages)
    for s in range(pages):
        url2 = ('https://book.douban.com/subject_search?start=' + str(15 * s) +
                '&search_text=%E6%9D%91%E4%B8%8A%E6%98%A5%E6%A0%91&cat=1001')
        ip2 = calculate_tongji(url2, iplist, s, author)
        # Refresh the proxy pool every other page to dodge the rate limiter.
        if s != 0 and s % 2 == 0:
            iplist = find_ip(2, ip2)


# BUG FIX: guard the entry point so importing this module no longer
# immediately starts crawling.
if __name__ == "__main__":
    download()
仅作为测试交流,如果有非法用途,与本人无关。
谢谢慧慧,我的天使。
1 0
- python爬虫 豆瓣电影
- python豆瓣电影爬虫
- Python豆瓣爬虫
- python 爬虫 豆瓣韩国电影
- Python爬虫豆瓣电影top250
- Python爬虫模拟登陆豆瓣
- python爬虫(豆瓣影评)
- python爬虫之登录豆瓣
- python爬虫 登陆豆瓣 爬豆瓣电影短评
- <Python爬虫>爬取豆瓣图书/豆瓣电影系列
- Python 网络爬虫获取豆瓣信息
- python爬虫,爬豆瓣top250电影
- 用python分布式地爬虫豆瓣/Twitter
- Python爬虫 爬取豆瓣读书
- Python伪装浏览器请求爬虫豆瓣小组
- python第一只爬虫:爬豆瓣top250
- python爬虫之豆瓣图书信息几行字
- python爬虫之豆瓣电影评分
- Servlet 3异步+Html5 SSE示例
- Codeforces Round #324 (Div. 2)D
- shell变量替换总结
- 九度 oj 题目1085:求root(N, k)
- 多态性——虚函数
- Python豆瓣爬虫
- Java编舟录一----简单介绍
- Mybatis工作机制源码分析—缓存机制及事务机制
- LINUX免锁线程池C++
- Realm数据库读取数据时的处理:每次加载数据时,都将原有的数据删除,重新读取模型中的数据
- org.codehaus.jackson.map包下的ObjectMapper类源码
- Java Thread(Android Nougat源码)
- 微信小程序入门之tabBar
- 学习笔记--mysql索引(四) 多列索引