爬取豆瓣影评数据
来源:互联网 发布:timeline js 编辑:程序博客网 时间:2024/05/15 18:39
import csv
import re

import requests
from bs4 import BeautifulSoup

# Characters removed by F_data: tab, newline, space and double quote.
_STRIP_TABLE = str.maketrans('', '', '\t\n "')


def F_data(data):
    """Return ``data`` with tabs, newlines, spaces and double quotes removed.

    ``data`` is a single string. (Original author's note: declaring the
    parameter as ``*data`` would pass a tuple instead.)
    """
    return data.translate(_STRIP_TABLE)


def _text_of(node):
    """Return ``node.text``, or an empty string when the element is missing."""
    return node.text if node is not None else ''


def getlink(baseurl):
    """Collect review links from five listing pages.

    Requests ``baseurl`` with the offsets 0, 20, 40, 60 and 80 appended,
    parses each response and gathers the ``href`` of every
    ``<a class="title-link">`` element, printing each one as it is found.

    Returns the list of collected links.
    """
    links = []
    # Original author's note: while crawling, pages such as 10, 30 and 40
    # turned out not to display anything.
    for i in range(0, 5):
        url = baseurl + str(i * 20)
        req = requests.get(url)
        req.encoding = 'utf-8'
        soup = BeautifulSoup(req.text, 'html.parser')
        for result in soup.find_all('a', {'class': 'title-link'}):
            print(result['href'])
            links.append(result['href'])
    # len() returns an int, so it must be converted before concatenation.
    print('共获取到' + str(len(links)) + '条影评')
    return links


def getdata(link):
    """Fetch one review page and append its fields to the CSV file.

    Extracts the reviewer name, film title, rating, review date, useful and
    useless counts and the review body, then hands them to ``setdata``.
    A field whose element is not present on the page is written as an empty
    string instead of aborting the crawl.
    """
    req = requests.get(link)
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'html.parser')

    name = _text_of(soup.find('span', {'property': 'v:reviewer'}))
    film = _text_of(soup.find(
        'a', {'href': re.compile(r'https://movie\.douban\.com/subject/[0-9]+/')}))

    # The '.' in 'allstar.0' is a deliberate wildcard for the rating digit.
    rate_node = soup.find(
        'span', {'class': re.compile('allstar.0 main-title-rating')})
    rate = rate_node.get('title', '') if rate_node is not None else ''

    viewdt = _text_of(soup.find('span', {'property': 'v:dtreviewed'})).strip()
    favor = F_data(_text_of(soup.find(
        'button', {'class': re.compile('btn useful_count .+ j a_show_login')})))
    disfavor = F_data(_text_of(soup.find(
        'button', {'class': re.compile('btn useless_count .+ j a_show_login')})))
    # Match on the attribute alone; the tag name is not constrained.
    content = F_data(_text_of(soup.find(attrs={'property': 'v:description'})))

    setdata(name, film, rate, viewdt, favor, disfavor, content)


def setdata(name, film, rate, viewdt, favor, disfavor, content):
    """Append one review as a row to ``mydata.csv`` (UTF-8)."""
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line follows every row on Windows.
    with open('mydata.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow((name, film, rate, viewdt, favor, disfavor, content))


if __name__ == '__main__':
    baseurl = 'https://movie.douban.com/review/best/?start='
    links = getlink(baseurl)
    for link in links:
        getdata(link)
0 0
- 爬取豆瓣影评数据
- 爬取豆瓣影评TOP250Demo
- 使用python爬取《长城》豆瓣影评
- 使用requests爬取豆瓣《长城》影评
- (8)Python爬虫——爬取豆瓣影评数据
- 豆瓣影评数据抓取
- 爬虫实践---Scrapy-豆瓣电影影评&深度爬取
- 使用scrapy爬取豆瓣上面《战狼2》影评
- 爬取豆瓣的战狼影评(cookies 云词)
- python爬虫爬取豆瓣top250电影影评
- 爬取豆瓣电影数据
- 豆瓣影评数据抓取与简要分析
- [转载]Python爬取豆瓣影评并生成词云图代码
- python爬取豆瓣上面<战狼2>的20w影评
- Python3网络爬虫:requests+mongodb+wordcloud 爬取豆瓣影评并生成词云
- Python爬取豆瓣电影Top250数据
- 豆瓣影评 盲山
- 豆瓣影评 高地战
- # ajax #
- Building Apps with Location & Maps
- try-catch-finally结构体的简要说明
- iOS之类方法load和initialize
- 使用SLM方法降低PAPR
- 爬取豆瓣影评数据
- java学习(14)
- 使用Dom4j解析xml文件
- itchat4j -- 用Java扩展个人微信号的能力
- pointPolygonTest多边形检测
- 面试用算法复杂度总结
- SIFT算法详解与应用
- Netty简介
- solr hard soft提交均衡设置,并取消手动提交