Python 爬虫(登录豆瓣并修改签名)

来源:互联网 发布:侠客风云传优化工具 编辑:程序博客网 时间:2024/06/09 15:44


代码:

  1 #coding: utf-8  2   3 import requests  4 from HTMLParser import HTMLParser  5   6   7 class DoubanClient(object):  8     def __init__(self):  9         object.__init__(self) 10  11         myheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840    .71 Safari/537.36','Origin': 'https://accounts.douban.com'}        #浏览器开发者:Request Headers 12  13         self.session = requests.session()       #requests包 创建session 14         self.session.headers.update(myheaders)  #将定制的header加入session 15  16  17     def login(self, username, password,source='None',redir='https://www.douban.com/',login='登录'): 18         #浏览器开发者:Form Data 19  20         url = 'https://accounts.douban.com/login'       #网页URL 21         r = self.session.get(url)       #用session访问该网页 22         (captcha_id, captcha_url) = _get_captcha(r.content)     #调用get_captchar()解析网页中的内容,获取验证码的id和url 23  24         #如果得到了验证码的id和url,提示用户打开url并输入其中的验证码 25         if captcha_id: 26             captcha_solution = raw_input('please input solution for captcha [%s]:' % captcha_url) 27  28         url = 'https://accounts.douban.com/login' 29         mydata = {'form_email': username, 30                 'form_password': password, 31                 'source': source, 32                 'redir': redir, 33                 'login': login} 34         myheaders = {'referer': 'https://acocunts.douban.com/login', 35                    'host': 'accounts.douban.com'} 36         #浏览器开发者:Request Headers 37  38         #将验证码的id和用户输入的验证码 加入post的data中 39         if captcha_id: 40             mydata['captcha-id'] = captcha_id 41             mydata['captcha-solution'] = captcha_solution 42  43         self.session.post(url, data=mydata, headers=myheaders)  #post发出请求 44         print(self.session.cookies.items()) 45  46     #更改签名 47     def edit_signature(self, username, signature): 48         url = 'https://www.douban.com/people/%s/' % username    #网页URL 49         r = 
self.session.get(url)       #用session访问该网页 50         mydata = {'ck': _get_ck(r.content), 51                 'signature': signature} 52         myurl = 'https://www.douban.com/j/people/%s/edit_signature' % username 53         myheaders = {'referer': url, 54                    'host': 'www.douban.com', 55                    'x-requested-with': 'XMLHttpRequest'} 56         r = self.session.post(myurl, data=mydata, headers=myheaders)    #post 57         print(r.content) 58  59  60 def _attr(attrs, attrname): 61     for attr in attrs: 62         if attr[0] == attrname: 63             return attr[1] 64     return None 65  66  67 def _get_captcha(content): 68     #获取验证码的id和url         69     class CaptchaParser(HTMLParser):    #继承父类HTMLParser 70         def __init__(self): 71             HTMLParser.__init__(self) 72             self.captcha_id = None      #默认值设为None 73             self.captcha_url = None 74  75         def handle_starttag(self, tag, attrs): 76             if tag == 'img' and _attr(attrs, 'id') == 'captcha_image' and _attr(attrs, 'class') == 'captcha_image': 77                 #根据网页框架进行条件限定,定位至验证码图片 78                 self.captcha_url = _attr(attrs, 'src')  #得到验证码图片的url 79  80             if tag == 'input' and _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'captcha-id': 81                 #条件限定,定位至验证码id 82                 self.captcha_id = _attr(attrs, 'value') #得到验证码的id value 83  84     p = CaptchaParser() 85     p.feed(content)     #feed()向解析器喂数据 86     return p.captcha_id, p.captcha_url 87  88  89 def _get_ck(content): 90  91     class CKParser(HTMLParser): 92         def __init__(self): 93             HTMLParser.__init__(self) 94             self.ck = None 95  96         def handle_starttag(self, tag, attrs): 97             if tag == 'input' and _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'ck': 98                 #条件限定,定位至签名框 99                 self.ck = _attr(attrs, 'value')100 101     p = CKParser()      #实例化类102     
p.feed(content)     #feed()向解析器喂数据103     return p.ck104 105 106 if __name__ == '__main__':107     c = DoubanClient()108     c.login('791368726@qq.com', '**此处为密码**')109     c.edit_signature('162101126', '**此处为签名**')

执行:





Google Chrome:




原创粉丝点击