# python爬取豆瓣上面<战狼2>的20w影评
# (Scrape the ~200k Douban reviews of "Wolf Warrior 2" with Python.)
# 来源:互联网 发布:优化mysql数据库的方法 编辑:程序博客网 时间:2024/05/16 11:01
import string
import urllib


from urllib.error import HTTPError, URLError
from urllib.request import urlopen
import pymongo as pm


import time
from PIL import Image
from bs4 import BeautifulSoup
import requests


from pytesseract import pytesseract
from requests import TooManyRedirects
#爬取战狼2所有影评并将其放入MongoDB


COUNT = 0          # running index of comments scraped so far (shared across pages)
HEADER = dict()    # HTTP request headers, populated by __getTargetData


urlset = set()     # comment texts already seen, guards against duplicates
MONGONDB = pm.MongoClient('localhost', 27017)  # local MongoDB server
DOUB = MONGONDB.douban                         # target database


loginUrl = 'http://accounts.douban.com/login'
# 使用用户名和密码登录豆瓣,通过fiddle或者其他工具查看产生的cookies,将下面两个字段添加保存下来
# (Log in to Douban, inspect the cookies with Fiddler or similar, and paste the
# raw "ue" / "dbcl2" cookie string below.)
cookies = 'ue="用户名"; dbcl2="XXXX"'
cookie = {}
# Parse the raw "k1=v1; k2=v2" cookie header into a dict for requests.
# BUG FIX: the original stored the whole string as the single key of a dict
# and iterated over that dict, so the split produced a malformed mapping
# ({'ue': '"用户名"; dbcl2="XXXX"'}) instead of one entry per cookie.
for line in cookies.split(';'):
    key, value = line.strip().split('=', 1)
    cookie[key] = value


def __getTitle(bs):
    """Return the text of the page heading (<body><h1>) from a parsed page.

    Prints a notice and returns None when the tag hierarchy is missing.
    """
    try:
        return bs.body.h1.get_text()
    except AttributeError:
        print('不存在此标签')
        return None


#模拟登陆 (simulated login)
def __login(url):
    """Build the Douban login form payload for *url*.

    The actual POST (and the captcha round-trip a real login needs) is left
    commented out below; fill in real credentials before enabling it.

    BUG FIX: the original dict literal was missing the commas between items
    and used the bare, undefined names 账号/密码 — a SyntaxError that made
    the whole module unimportable.  Placeholder strings are used instead.
    """
    requestData = {
        "source": None,
        "redir": "https://www.douban.com/doumail/",
        "form_email": "账号",      # TODO: replace with the real account
        "form_password": "密码",   # TODO: replace with the real password
        "login": u'登录',
    }

    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1)\
 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'}

    # requ=requests.post(url,verify=False)
    # print("requ========"+str(requ))
    #获取验证码图片 (fetch the captcha image)
    # soup = BeautifulSoup(requ.text,"html.parser")
    # captchaAddr = soup.find('img',id='captcha_image')['src']
    # pattern = re.findall(".*id=(.*)&.*", captchaAddr)
    # catchId = pattern[0]
    # requestData['captcha-solution']='sharp'
    # requestData['captcha-id']=catchId
    # time.sleep(1)
    # r = requests.post(url, data=requestData, headers=headers,verify=False)
    # page = r.text
    return requestData, headers






# 获取当前url下所有目标数据,并返回新的url
# 使用set防止重复爬取
def __getTargetData(url, database):
    """Scrape one page of comments into MongoDB and return the next page URL.

    Parameters
    ----------
    url : str
        The comments-page URL to fetch.
    database : pymongo database
        Comments are inserted into its ``col`` collection.

    Returns
    -------
    str or None
        Absolute URL of the next page, or None when the fetch/parse failed
        or the paginator has no "next" link (end of the comment list).
    """
    global COUNT
    global HEADER
    # Browser-like headers; Douban blocks the default requests user agent.
    HEADER['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    HEADER['Accept-Encoding'] = 'gzip, deflate, sdch, br'
    HEADER['Accept-Language'] = 'zh-CN,zh;q=0.8,zh-TW;q=0.6'
    HEADER['Connection'] = 'keep-alive'
    HEADER['Host'] = 'movie.douban.com'
    HEADER['Referer'] = 'https://www.douban.com/accounts/login?source=movie'
    HEADER[
        'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

    try:
        # Douban starts demanding a login after a few anonymous pages;
        # presumably COUNT==7 is where that kicked in — TODO confirm.
        if COUNT == 7:
            __login(loginUrl)

        data = requests.get(url, timeout=20, headers=HEADER, cookies=cookie).text
    #捕获输入验证码异常 (redirect loop => Douban wants human verification)
    except TooManyRedirects as e:
        print("人机验证开始...")
        # BUG FIX: the original fell through here with `data` undefined,
        # which surfaced later as a misleading "无法解析" NameError path.
        return None
    except HTTPError as e:
        print("无法连接网站")
        return None
    except TimeoutError as e:
        print("连接失败")
        return None
    except URLError as e:
        print("超时")
        return None

    try:
        bs = BeautifulSoup(data, "lxml")
    except NameError as e:
        print("无法解析")
        return None
    try:
        print("第一个有效url影评:")

        # Each comment lives in a <div class="comment-item">.
        nameList = bs.find_all('div', 'comment-item')
        for i in nameList:
            COUNT = COUNT + 1
            # Drill through the fixed child-tag structure to the comment text.
            targetData = i.contents[3].contents[3].contents[0]
            print("第" + str(COUNT) + "条" + "数据:" + str(targetData) + '\n')
            # BUG FIX: Collection.insert() was removed in PyMongo 3+;
            # insert_one() is the supported API.
            database.col.insert_one({'索引': COUNT, '评论': targetData})
            urlset.add(targetData)

        # 获取下一页的url — build the absolute URL from the paginator link.
        nextUrl = "https://movie.douban.com/subject/26363254/comments" + str(
            bs.find(id='paginator').find("a", 'next')['href'])
        print("新一条url:" + str(nextUrl))
        return nextUrl
    except AttributeError as e:
        # Missing paginator / comment markup: treat as end of crawl.
        print(e)
        print('不存在此标签')
        return None
def __main():
    """Crawl comment pages starting at page 0 until no "next" link remains."""
    global DOUB
    counts = 0
    print("战狼<2>豆瓣影评:")
    # BUG FIX: this assignment carried one stray leading space in the
    # original — an IndentationError that stopped the script from running.
    nextUrls = __getTargetData(
        "https://movie.douban.com/subject/26363254/comments?start=0&limit=20&sort=new_score&status=P&percent_type=", DOUB)

    while True:
        # "" and None both mean the scraper found no next page.
        if not nextUrls:
            print("爬取结束")
            return False
        counts = counts + 1
        print("开始爬取第" + str(counts) + "页")
        nextUrls = __getTargetData(nextUrls, DOUB)
        # Brief pause between pages to avoid being rate-limited/blocked.
        print("暂停0.5s...")
        time.sleep(0.5)

#使用pytesseract进行验证码的校验,仅提供最简单的实现。。能识别端正的验证码(歪的不行。。。),求大神指教

#校验验证码 — commented-out captcha checker, reflowed from the original
# one-line paste for readability (behavior unchanged, still disabled):
#
# def checkCaptcha(url, cookie):
#     datas = requests.get(url, cookies=cookie).text
#     print("开始解析验证码======")
#     # 获取验证码图片
#     bs = BeautifulSoup(datas, "lxml")
#     # print("bs======" + str(bs))
#     div = bs.find_all('form')
#     for divs in div:
#         # print("div===========" + str(divs))
#         ck = divs.contents[0].contents[0]['value']
#         print("ck===========" + str(ck))
#         img = divs.contents[2]['src']
#         print("img===========" + str(img))
#         # 保存图片到本地
#         urllib.request.urlretrieve(img, 'D:/%s.jpg' % COUNT)
#         # 获取验证码
#         # imggg=urlopen(img,'utf-8').read()
#         imgg = Image.open('D:/%s.jpg' % COUNT)
#         imgg.show()
#         gray = imgg.convert('L')
#         print("gray======" + str(gray))
#         bw = gray.point(lambda x: 0 if x < 1 else 255, '1')
#         print("bw======" + str(bw))
#         word = pytesseract.image_to_string(bw)
#         words = pytesseract.image_to_string(imgg)
#         print("word======" + str(word))
#         print("words======" + str(words))
#         captchasolution = ''.join(c for c in word if c in string.printable).lower()
#         print("captchasolution======" + str(captchasolution))
#         if (captchasolution is not None or captchasolution is not ""):
#             print("重新获取验证码")
#             time.sleep(10)
#             checkCaptcha(url, cookie)
#             return False
#         else:
#             captchaid = div.contents[4]['value']
#             parameter = {'ck': ck, 'captcha-solution': captchasolution, 'captcha-id': captchaid, 'original-url': url}
#             data = requests.post(url, timeout=20, headers=HEADER, cookies=cookie, data=parameter)
#             print("填写验证码:"+data)


if __name__ == '__main__':
    # Time the whole crawl end to end.
    started = time.time()
    __main()
    print("共耗时==========" + str(time.time() - started))
# 原创粉丝点击 (scraped-page footer, not code)