python模拟登录豆瓣

来源:互联网 发布:杜兰特里约奥运会数据 编辑:程序博客网 时间:2024/05/19 17:27

python模拟登录豆瓣

原理:

——–模拟浏览器登录

环境:

——–开发软件:pycharm
——–运行环境:mac;python2.7.10;
——–requests 用于http请求;HTMLParser用于解析html数据

注意:

——–如果登录次数过多,豆瓣会要求输入验证码;这个时候程序会自动将验证码图片下载到当前目录下(文件名为 picture.jpg),需要手动打开该图片,并在程序提示"验证码:"时输入图片中的验证码进行验证!

代码块

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Simulate a browser login to douban.com.

The script fetches the login page, looks for a captcha challenge in the
returned HTML, and submits the login form through a ``requests`` session.
When a captcha is present, its image is saved to ``picture.jpg`` in the
current directory and the user is asked to type the solution.

The code runs on both Python 2.7 and Python 3: every ``print`` call takes a
single argument (valid in both), and the renamed ``HTMLParser`` module and
``raw_input`` builtin are resolved with fallbacks below.
"""
import requests

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

LOGIN_URL = "https://www.douban.com/accounts/login"
CAPTCHA_FILE = "picture.jpg"
# Seconds to wait for each HTTP request; without a timeout, requests can
# block forever on an unresponsive server.
REQUEST_TIMEOUT = 10


def _first_attr(attrs, name):
    """Return the value of the first attribute called *name*, or None."""
    return next((value for key, value in attrs if key == name), None)


class DouBanClient(object):
    """Minimal Douban client that logs in through a persistent session."""

    def __init__(self):
        """Create the HTTP session and install browser-like headers."""
        self.session = requests.session()
        self.session.headers.update({
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36",
            'Referer': "https://www.douban.com/",
            'Host': "www.douban.com",
        })

    def login(self, form_email, form_password, source="index_nav"):
        """Submit the login form and print the server's reply.

        Args:
            form_email: Account e-mail, sent as the ``form_email`` field.
            form_password: Account password, sent as ``form_password``.
            source: Value of the form's ``source`` field.

        Returns:
            The ``requests`` response object of the login POST.
        """
        login_page = self.session.get(LOGIN_URL, timeout=REQUEST_TIMEOUT)

        # Scan the login page for a captcha image and its hidden id field.
        parser = MyParser()
        parser.feed(login_page.text)
        parser.close()

        post_data = {
            'source': source,
            'form_email': form_email,
            'form_password': form_password,
        }

        if parser.captcha_url:
            # Save the captcha image locally so the user can open it and
            # read the code.
            image = self.session.get(parser.captcha_url,
                                     timeout=REQUEST_TIMEOUT)
            with open(CAPTCHA_FILE, 'wb') as image_file:
                image_file.write(image.content)
            print(parser.captcha_id)
            post_data['captcha-solution'] = read_line("验证码:")
            post_data['captcha-id'] = parser.captcha_id

        # post_data is deliberately not printed: it holds the plaintext
        # password.
        out = self.session.post(LOGIN_URL, data=post_data,
                                timeout=REQUEST_TIMEOUT)
        # On Python 2 (where str is bytes) print the raw bytes as before; on
        # Python 3 print the decoded text rather than a b'...' repr.
        print(out.content if str is bytes else out.text)
        return out


class MyParser(HTMLParser):
    """HTML parser that records the captcha image URL and captcha id."""

    def __init__(self):
        """Initialise the parser with no captcha found yet."""
        HTMLParser.__init__(self)
        self.captcha_url = None
        self.captcha_id = None

    def handle_starttag(self, tag, attrs):
        """Inspect each start tag for the captcha image or its id field.

        Overrides the HTMLParser hook, which is called for every start tag
        with *attrs* as a list of ``(name, value)`` pairs.
        """
        if tag == 'img' and _first_attr(attrs, 'id') == 'captcha_image':
            self.captcha_url = _first_attr(attrs, 'src')
        if tag == 'input' and _first_attr(attrs, 'name') == 'captcha-id':
            self.captcha_id = _first_attr(attrs, 'value')


if __name__ == "__main__":
    print("test")
    client = DouBanClient()
    email = read_line("邮箱:")
    pwd = read_line("密码:")
    client.login(email, pwd)
原创粉丝点击