python爬取豆瓣上面<战狼2>的20w影评
来源:互联网 发布:优化mysql数据库的方法 编辑:程序博客网 时间:2024/05/16 11:01
import string
import urllib
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
import pymongo as pm
import time
from PIL import Image
from bs4 import BeautifulSoup
import requests
from pytesseract import pytesseract
from requests import TooManyRedirects
# Scrape all short reviews of "Wolf Warrior 2" and store them in MongoDB.
COUNT = 0            # running count of comments scraped so far
HEADER = dict()      # HTTP request headers, filled in by __getTargetData
urlset = set()       # comment texts seen so far, to avoid re-scraping
MONGONDB = pm.MongoClient('localhost', 27017)  # local MongoDB connection
DOUB = MONGONDB.douban                         # handle to the "douban" database
loginUrl = 'http://accounts.douban.com/login'
# Log in to Douban in a browser, inspect the cookies with Fiddler (or a
# similar tool), and paste the resulting "ue" and "dbcl2" values below.
cookies = {'ue="用户名"; dbcl2="XXXX"': 1}
cookie = {}
# BUG FIX: the original split the whole 'ue="...; dbcl2="..."' key once on
# '=', producing a single malformed cookie entry {'ue': '"..."; dbcl2="..."'}.
# Split on ';' first so each "name=value" pair becomes its own cookie.
for line in cookies:
    for pair in line.split(';'):
        key, value = pair.strip().split('=', 1)
        cookie[key] = value
def __getTitle(bs):
    """Return the text of the page's <body><h1> tag, or None when absent."""
    try:
        return bs.body.h1.get_text()
    except AttributeError:
        # The parsed document has no <body>/<h1> (or bs is not a soup object).
        print('不存在此标签')
        return None
# Simulated login.
def __login(url):
    """Build the form payload for a simulated Douban login.

    BUG FIX: the original dict literal was missing commas after the
    form_email / form_password entries and referenced the undefined names
    账号 / 密码, which made the whole file a SyntaxError.  Credentials are now
    placeholder strings for the user to fill in.

    The actual POST (captcha fetch + second POST) was commented out in the
    original; until it is restored this function only builds the payload and
    returns None.
    """
    requestData = {
        "source": None,
        "redir": "https://www.douban.com/doumail/",
        "form_email": "账号",      # TODO: fill in your Douban account
        "form_password": "密码",   # TODO: fill in your password
        "login": u'登录',
    }
    # Reference code kept from the original (disabled):
    # headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
    #                          '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'}
    # requ = requests.post(url, verify=False)
    # soup = BeautifulSoup(requ.text, "html.parser")
    # captchaAddr = soup.find('img', id='captcha_image')['src']
    # catchId = re.findall(".*id=(.*)&.*", captchaAddr)[0]
    # requestData['captcha-solution'] = 'sharp'
    # requestData['captcha-id'] = catchId
    # time.sleep(1)
    # r = requests.post(url, data=requestData, headers=headers, verify=False)
    return None
# Fetch all target data on the current url and return the next page's url.
# urlset is used to avoid scraping duplicates.
def __getTargetData(url, database):
    """Fetch one page of comments from `url`, store each comment in MongoDB
    and return the URL of the next page, or None when scraping should stop.

    BUG FIX: the original only caught urllib's HTTPError/URLError and the
    builtin TimeoutError, none of which requests.get() actually raises, so
    any real network failure crashed the crawler; it also fell through with
    `data` undefined after TooManyRedirects.  All requests failures are now
    caught via requests.exceptions and each handler returns None.
    """
    global COUNT
    global HEADER
    # Browser-like headers so Douban serves the normal comment page.
    HEADER['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    HEADER['Accept-Encoding'] = 'gzip, deflate, sdch, br'
    HEADER['Accept-Language'] = 'zh-CN,zh;q=0.8,zh-TW;q=0.6'
    HEADER['Connection'] = 'keep-alive'
    HEADER['Host'] = 'movie.douban.com'
    HEADER['Referer'] = 'https://www.douban.com/accounts/login?source=movie'
    HEADER['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) '
                            'Chrome/54.0.2840.99 Safari/537.36')
    try:
        if COUNT == 7:
            # Douban starts demanding a login after a few anonymous pages.
            __login(loginUrl)
        data = requests.get(url, timeout=20, headers=HEADER, cookies=cookie).text
    except TooManyRedirects:
        # A redirect loop usually means Douban is forcing a captcha check.
        print("人机验证开始...")
        return None
    except requests.exceptions.Timeout:
        print("连接失败")
        return None
    except requests.exceptions.RequestException:
        print("无法连接网站")
        return None
    try:
        bs = BeautifulSoup(data, "lxml")
    except NameError:
        print("无法解析")
        return None
    try:
        print("第一个有效url影评:")
        # Each comment lives in a <div class="comment-item"> element.
        nameList = bs.find_all('div', 'comment-item')
        for i in nameList:
            COUNT = COUNT + 1
            # The comment text sits at a fixed position inside each item.
            targetData = i.contents[3].contents[3].contents[0]
            print("第" + str(COUNT) + "条" + "数据:" + str(targetData) + '\n')
            # NOTE(review): Collection.insert is removed in pymongo 4.x —
            # migrate to insert_one when upgrading the driver.
            database.col.insert({'索引': COUNT, '评论': targetData})
            urlset.add(targetData)
        # Build the URL of the next page from the paginator's "next" link.
        nextUrl = ("https://movie.douban.com/subject/26363254/comments"
                   + str(bs.find(id='paginator').find("a", 'next')['href']))
        print("新一条url:" + str(nextUrl))
        return nextUrl
    except AttributeError as e:
        # No paginator / no "next" link: last page reached (or layout change).
        print(e)
        print('不存在此标签')
        return None
def __main():
    """Drive the crawl: fetch comment pages one after another until
    __getTargetData reports there is no next page."""
    global DOUB
    counts = 0
    print("战狼<2>豆瓣影评:")
    nextUrls = __getTargetData(
        "https://movie.douban.com/subject/26363254/comments?start=0&limit=20&sort=new_score&status=P&percent_type=", DOUB)
    while True:
        # __getTargetData returns None on error / last page (and, defensively,
        # "" is treated the same way).
        if not nextUrls:
            print("爬取结束")
            return False
        counts = counts + 1
        print("开始爬取第" + str(counts) + "页")
        nextUrls = __getTargetData(nextUrls, DOUB)
        print("暂停0.5s...")
        # BUG FIX: the message above promised a 0.5s pause but the original
        # never slept, hammering Douban and triggering its rate limiting.
        time.sleep(0.5)
# Run the crawler and report how long the whole crawl took.
start_time = time.time()
__main()
print("共耗时==========" + str(time.time() - start_time))
import urllib
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
import pymongo as pm
import time
from PIL import Image
from bs4 import BeautifulSoup
import requests
from pytesseract import pytesseract
from requests import TooManyRedirects
# Scrape all short reviews of "Wolf Warrior 2" and store them in MongoDB.
COUNT = 0            # running count of comments scraped so far
HEADER = dict()      # HTTP request headers, filled in by __getTargetData
urlset = set()       # comment texts seen so far, to avoid re-scraping
MONGONDB = pm.MongoClient('localhost', 27017)  # local MongoDB connection
DOUB = MONGONDB.douban                         # handle to the "douban" database
loginUrl = 'http://accounts.douban.com/login'
# Log in to Douban in a browser, inspect the cookies with Fiddler (or a
# similar tool), and paste the resulting "ue" and "dbcl2" values below.
cookies = {'ue="用户名"; dbcl2="XXXX"': 1}
cookie = {}
# BUG FIX: the original split the whole 'ue="...; dbcl2="..."' key once on
# '=', producing a single malformed cookie entry.  Split on ';' first so each
# "name=value" pair becomes its own cookie.
for line in cookies:
    for pair in line.split(';'):
        key, value = pair.strip().split('=', 1)
        cookie[key] = value
def __getTitle(bs):
    """Extract the page's <body><h1> heading text; None when the tag is absent."""
    try:
        heading = bs.body.h1
        text = heading.get_text()
    except AttributeError:
        # Missing <body>/<h1>, or `heading` is None so get_text() fails.
        print('不存在此标签')
        return None
    return text
# Simulated login.
def __login(url):
    """Build the form payload for a simulated Douban login.

    BUG FIX: the original dict literal was missing commas after the
    form_email / form_password entries and referenced the undefined names
    账号 / 密码, which made the whole file a SyntaxError.  Credentials are now
    placeholder strings for the user to fill in.

    The actual POST (captcha fetch + second POST) was commented out in the
    original; until it is restored this function only builds the payload and
    returns None.
    """
    requestData = {
        "source": None,
        "redir": "https://www.douban.com/doumail/",
        "form_email": "账号",      # TODO: fill in your Douban account
        "form_password": "密码",   # TODO: fill in your password
        "login": u'登录',
    }
    # Reference code kept from the original (disabled):
    # headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
    #                          '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'}
    # requ = requests.post(url, verify=False)
    # soup = BeautifulSoup(requ.text, "html.parser")
    # captchaAddr = soup.find('img', id='captcha_image')['src']
    # catchId = re.findall(".*id=(.*)&.*", captchaAddr)[0]
    # requestData['captcha-solution'] = 'sharp'
    # requestData['captcha-id'] = catchId
    # time.sleep(1)
    # r = requests.post(url, data=requestData, headers=headers, verify=False)
    return None
# Fetch all target data on the current url and return the next page's url.
# urlset is used to avoid scraping duplicates.
def __getTargetData(url, database):
    """Fetch one page of comments from `url`, store each comment in MongoDB
    and return the URL of the next page, or None when scraping should stop.

    BUG FIX: the original only caught urllib's HTTPError/URLError and the
    builtin TimeoutError, none of which requests.get() actually raises, so
    any real network failure crashed the crawler; it also fell through with
    `data` undefined after TooManyRedirects.  All requests failures are now
    caught via requests.exceptions and each handler returns None.
    """
    global COUNT
    global HEADER
    # Browser-like headers so Douban serves the normal comment page.
    HEADER['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    HEADER['Accept-Encoding'] = 'gzip, deflate, sdch, br'
    HEADER['Accept-Language'] = 'zh-CN,zh;q=0.8,zh-TW;q=0.6'
    HEADER['Connection'] = 'keep-alive'
    HEADER['Host'] = 'movie.douban.com'
    HEADER['Referer'] = 'https://www.douban.com/accounts/login?source=movie'
    HEADER['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) '
                            'Chrome/54.0.2840.99 Safari/537.36')
    try:
        if COUNT == 7:
            # Douban starts demanding a login after a few anonymous pages.
            __login(loginUrl)
        data = requests.get(url, timeout=20, headers=HEADER, cookies=cookie).text
    except TooManyRedirects:
        # A redirect loop usually means Douban is forcing a captcha check.
        print("人机验证开始...")
        return None
    except requests.exceptions.Timeout:
        print("连接失败")
        return None
    except requests.exceptions.RequestException:
        print("无法连接网站")
        return None
    try:
        bs = BeautifulSoup(data, "lxml")
    except NameError:
        print("无法解析")
        return None
    try:
        print("第一个有效url影评:")
        # Each comment lives in a <div class="comment-item"> element.
        nameList = bs.find_all('div', 'comment-item')
        for i in nameList:
            COUNT = COUNT + 1
            # The comment text sits at a fixed position inside each item.
            targetData = i.contents[3].contents[3].contents[0]
            print("第" + str(COUNT) + "条" + "数据:" + str(targetData) + '\n')
            # NOTE(review): Collection.insert is removed in pymongo 4.x —
            # migrate to insert_one when upgrading the driver.
            database.col.insert({'索引': COUNT, '评论': targetData})
            urlset.add(targetData)
        # Build the URL of the next page from the paginator's "next" link.
        nextUrl = ("https://movie.douban.com/subject/26363254/comments"
                   + str(bs.find(id='paginator').find("a", 'next')['href']))
        print("新一条url:" + str(nextUrl))
        return nextUrl
    except AttributeError as e:
        # No paginator / no "next" link: last page reached (or layout change).
        print(e)
        print('不存在此标签')
        return None
def __main():
    """Drive the crawl: fetch comment pages one after another until
    __getTargetData reports there is no next page."""
    global DOUB
    page_no = 0
    print("战狼<2>豆瓣影评:")
    next_url = __getTargetData(
        "https://movie.douban.com/subject/26363254/comments?start=0&limit=20&sort=new_score&status=P&percent_type=", DOUB)
    while True:
        # None (error / last page) and "" both mean the crawl is finished.
        if next_url == "" or next_url is None:
            print("爬取结束")
            return False
        page_no += 1
        print("开始爬取第" + str(page_no) + "页")
        next_url = __getTargetData(next_url, DOUB)
        print("暂停0.5s...")
        # Short pause between pages to stay under Douban's rate limits.
        time.sleep(0.5)
# Captcha verification with pytesseract -- only the simplest implementation;
# it can recognise upright captchas (skewed ones fail).
# NOTE(review): reconstructed from the original post, where this entire
# function had been flattened onto a single '#' comment line (dead code).
def checkCaptcha(url, cookie):
    """Fetch the captcha form at `url`, OCR the captcha image and POST the
    solution back; retries after 10s when OCR produced nothing usable.

    BUG FIXES vs. the original:
    * `captchasolution is not None or captchasolution is not ""` was always
      true (identity tests against literals), so it recursed unconditionally
      and never submitted; replaced with a truthiness check.
    * `div.contents[4]` indexed the ResultSet instead of the current form
      element `divs`.
    * `"填写验证码:" + data` concatenated str + Response; wrapped in str().
    """
    datas = requests.get(url, cookies=cookie).text
    print("开始解析验证码======")
    bs = BeautifulSoup(datas, "lxml")
    for divs in bs.find_all('form'):
        # Hidden "ck" token and captcha <img> are at fixed positions in the form.
        ck = divs.contents[0].contents[0]['value']
        print("ck===========" + str(ck))
        img = divs.contents[2]['src']
        print("img===========" + str(img))
        # Save the captcha image locally so PIL / pytesseract can read it.
        urllib.request.urlretrieve(img, 'D:/%s.jpg' % COUNT)
        imgg = Image.open('D:/%s.jpg' % COUNT)
        imgg.show()
        gray = imgg.convert('L')                              # greyscale
        bw = gray.point(lambda x: 0 if x < 1 else 255, '1')   # binarise
        word = pytesseract.image_to_string(bw)
        print("word======" + str(word))
        # Keep only printable ASCII and lowercase it.
        captchasolution = ''.join(c for c in word if c in string.printable).lower()
        print("captchasolution======" + str(captchasolution))
        if not captchasolution:
            # OCR failed -- wait and fetch a fresh captcha.
            print("重新获取验证码")
            time.sleep(10)
            checkCaptcha(url, cookie)
            return False
        captchaid = divs.contents[4]['value']
        parameter = {'ck': ck, 'captcha-solution': captchasolution,
                     'captcha-id': captchaid, 'original-url': url}
        data = requests.post(url, timeout=20, headers=HEADER,
                             cookies=cookie, data=parameter)
        print("填写验证码:" + str(data))
# Run the crawler and report how long the whole crawl took.
start_time = time.time()
__main()
print("共耗时==========" + str(time.time() - start_time))
阅读全文
0 0
- python爬取豆瓣上面<战狼2>的20w影评
- 使用scrapy爬取豆瓣上面《战狼2》影评
- 使用python爬取《长城》豆瓣影评
- 爬取豆瓣影评数据
- 爬取豆瓣影评TOP250Demo
- python爬虫爬取豆瓣top250电影影评
- 爬取豆瓣的战狼影评(cookies 云词)
- 使用requests爬取豆瓣《长城》影评
- Python 爬虫实践:《战狼2》豆瓣影评分析
- Python 爬虫实践:《战狼2》豆瓣影评分析
- python爬虫爬取《战狼Ⅱ》影评
- (8)Python爬虫——爬取豆瓣影评数据
- [转载]Python爬取豆瓣影评并生成词云图代码
- Python 豆瓣影评抓取
- 爬取6.6w+豆瓣电影之后的分析故事
- 爬虫实践---Scrapy-豆瓣电影影评&深度爬取
- python爬虫(豆瓣影评)
- python爬虫实战:分析豆瓣中最新电影的影评
- HTTP status code is not handled or not allowed的解决方法
- 算法的离线评估
- 51nod1076(边双联通分量)
- 网络常用设备及介绍
- 图论 应用篇
- python爬取豆瓣上面<战狼2>的20w影评
- 转载:Struts2+Jquery实现ajax并返回json类型数据
- Android中Shape的属性说明
- 编程初始之路
- TCP SYNACK定时器梳理
- React 实现井字棋游戏 (tic-tac-toe) 教程 (1) <译自官方文档>
- 【原创】基于禅道的敏捷开发管理实践
- window更新dns缓存,阻止dns更新,开启dns缓存更新
- 作业3