豆瓣最受欢迎的影评爬虫(第一个爬虫撒花!)

来源:互联网 发布:卡洛斯实况巅峰数据 编辑:程序博客网 时间:2024/04/30 11:21

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import urllib
import codecs
import re
import json

# Crawl Douban's "most popular reviews" listing and store every review as a
# small JSON file. The code is kept ASCII-only and avoids version-specific
# syntax so it runs on both Python 2 and Python 3.

LISTING_URL = 'https://movie.douban.com/review/best/?start=%d'
OUTPUT_PATTERN = 'yingping/%d.json'
PAGE_SIZE = 10        # the listing offset advances by this much per page
TOTAL_REVIEWS = 60    # crawl offsets 0, 10, ..., 50
REQUEST_TIMEOUT = 10  # seconds; requests never times out unless told to


def page_url(start):
    """Return the listing URL for the page beginning at offset *start*."""
    return LISTING_URL % (start,)


def fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    Raises requests.HTTPError when the server answers with an error status,
    so an error page is never mistaken for real content.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Hand over the raw bytes so BeautifulSoup detects the page encoding
    # itself, and name the parser so results do not depend on which
    # optional parser libraries happen to be installed.
    return BeautifulSoup(response.content, 'html.parser')


def extract_review(soup):
    """Return (title, content) text from a review page, or None.

    None is returned when either the title span or the body div is absent
    from the page.
    """
    title_node = soup.find("span", property="v:summary")
    body_node = soup.find("div", property="v:description")
    if title_node is None or body_node is None:
        return None
    return title_node.get_text(), body_node.get_text()


def review_to_json(title, content):
    """Serialize one review to a JSON document (a text string).

    ensure_ascii=False is the important part: non-ASCII characters (the
    Chinese review text) are written as-is instead of \\uXXXX escapes.
    """
    record = {'title': title, 'content': content}
    return json.dumps(record, ensure_ascii=False)


def save_review(index, payload):
    """Write the JSON text *payload* to the output file numbered *index*."""
    # Encode exactly once, at the file boundary; the context manager closes
    # the file even if the write fails.
    with codecs.open(OUTPUT_PATTERN % index, 'w', encoding='utf-8') as output:
        output.write(payload)


def main():
    """Crawl every listing page and save each linked review to disk.

    Output files are numbered by the review's position in the overall
    listing (1-based); a review that cannot be parsed is skipped and leaves
    a gap in the numbering.
    """
    import os  # local import: only needed here to prepare the output folder

    out_dir = os.path.dirname(OUTPUT_PATTERN)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for start in range(0, TOTAL_REVIEWS, PAGE_SIZE):
        listing_url = page_url(start)
        print('listing page: %s' % listing_url)
        listing = fetch_soup(listing_url)
        links = listing.find_all("a", class_="j a_unfolder")
        for index, link in enumerate(links, start + 1):
            review_url = link.get('href')
            if not review_url:
                continue
            review = extract_review(fetch_soup(review_url))
            if review is None:
                print('skipping %s: title or body not found' % review_url)
                continue
            save_review(index, review_to_json(*review))


if __name__ == '__main__':
    main()

0 0
原创粉丝点击