spider for douban

来源:互联网 发布:淘宝店铺运营公司 编辑:程序博客网 时间:2024/06/07 04:45
<span style="font-size:18px;">获取豆瓣排行榜内容,新片榜,口碑榜,北美榜</span>
#!/usr/bin/env python3
# coding: utf-8
"""Scrape the Douban movie chart page.

Collects three rankings from https://movie.douban.com/chart -- the new
releases list, the weekly word-of-mouth list and the North America box office
list -- writes them to a text report, echoes them to stdout and downloads the
poster images of the new releases into the current directory.
"""
import re
import urllib.request
from typing import Iterable, List, Tuple

CHART_URL = "https://movie.douban.com/chart"
OUTPUT_FILE = "豆瓣排行.txt"

# New releases: subject id taken from the poster link, title from the poster alt text.
NEW_ID_RE = re.compile(r'<a class="nbg" href="https://movie.douban.com/subject/([0-9]+)/"')
NEW_TITLE_RE = re.compile(r'.jpg" alt="(.*?)" class=""/>')
# Weekly word-of-mouth list and North America box office list: subject ids.
WEEKLY_ID_RE = re.compile(r'mv_week.*?([0-9]+)/" class="">')
US_ID_RE = re.compile(r'mv_us_week.*?([0-9]+)/" class="">')
# Titles for both of those lists; DOTALL because the link text may span lines.
BOARD_TITLE_RE = re.compile(
    r'href="https://movie.douban.com/subject/.*?/" class="">(.*?)</a>', re.S
)
# Poster image URLs (the address the image is actually served from).
POSTER_RE = re.compile(
    r'<img src="(https://img.\.doubanio.com/view/movie_poster_cover/ipst/public/.*?\.jpg)" alt='
)


def fetch_chart_html(url: str = CHART_URL) -> str:
    """Download *url* and return the decoded page text.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx response, so an error page is never parsed as a chart.
    """
    # Imported here so the parsing helpers can be imported (and tested)
    # without the third-party dependency or network access.
    import requests

    with requests.Session() as session:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        return response.text


def parse_new_releases(html: str) -> List[Tuple[str, str]]:
    """Return ``(subject_id, title)`` pairs for the new releases list.

    Ids and titles are paired positionally. If the page yields a different
    number of each, the surplus is dropped instead of indexing past the end
    with a fixed offset.
    """
    return list(zip(NEW_ID_RE.findall(html), NEW_TITLE_RE.findall(html)))


def parse_board_entries(html: str) -> List[Tuple[str, str]]:
    """Return ``(subject_id, title)`` pairs for the weekly and North America lists.

    The weekly ids come first, followed by the North America ids; titles are
    matched to them in page order and stripped of surrounding whitespace.
    """
    subject_ids = WEEKLY_ID_RE.findall(html) + US_ID_RE.findall(html)
    titles = (title.strip() for title in BOARD_TITLE_RE.findall(html))
    return list(zip(subject_ids, titles))


def parse_poster_urls(html: str) -> List[str]:
    """Return the poster image URLs found in *html*, in page order."""
    return POSTER_RE.findall(html)


def format_report(
    new_releases: Iterable[Tuple[str, str]],
    board_entries: Iterable[Tuple[str, str]],
) -> str:
    """Render both rankings as the text written to the report file.

    Each entry is one line made of the subject id immediately followed by the
    title, under the same section headings the script has always written.
    """
    parts = ["豆瓣新片榜:\n"]
    parts.extend(f"{subject_id}{title}\n" for subject_id, title in new_releases)
    parts.append("\n本周豆瓣口碑榜,北美榜:\n")
    parts.extend(f"{subject_id}{title}\n" for subject_id, title in board_entries)
    return "".join(parts)


def download_posters(urls: Iterable[str]) -> None:
    """Save each poster as ``<index>.jpg`` in the current directory."""
    for index, poster_url in enumerate(urls):
        urllib.request.urlretrieve(poster_url, f"{index}.jpg")


def main() -> None:
    """Fetch the chart, write the report, echo the entries, download posters."""
    html = fetch_chart_html()
    new_releases = parse_new_releases(html)
    board_entries = parse_board_entries(html)

    # "w" creates the file if it is missing and truncates it otherwise; the
    # context manager guarantees it is flushed and closed.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as report:
        report.write(format_report(new_releases, board_entries))

    for subject_id, title in new_releases + board_entries:
        print(subject_id, title)

    download_posters(parse_poster_urls(html))


if __name__ == "__main__":
    main()


0 0
原创粉丝点击