新浪微博数据挖掘食谱之一: 登录篇 (API)

来源:互联网 发布:重生网络女主播txt 编辑:程序博客网 时间:2024/04/28 07:12

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2014-12-28
@author: beyondzhou
@name: login.py

Log in to Sina Weibo through the SSO + OAuth2 "authorize" flow and return an
authenticated ``weibo.APIClient``.

This is Python 2 code (urllib2 / cookielib).  Two ways of obtaining the OAuth2
authorization code are provided by ``SinaAPI``:

* ``get_code_Security`` -- RSA-encrypted SSO login, then authorize by ticket.
* ``get_code_NS``       -- post the plain user id / password to the
                           authorize endpoint.
'''
import base64
import binascii
import cookielib
import json
import re
import urllib
import urllib2

import rsa  # RSA encryption of the password
import urllib3
from weibo import APIClient

# Browser identity sent to the OAuth2 authorize endpoint.
_USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; "
               "EIE10;ZHCNMSE; rv:11.0) like Gecko")
_AUTHORIZE_URL = 'https://api.weibo.com/oauth2/authorize'
# The code is the 32 characters following "code=" in the final redirect URL;
# the "=" may arrive percent-encoded once (%3D) or twice (%253D).
_CODE_RE = re.compile(
    r"(?<=code%3D).{32}|(?<=code=).{32}|(?<=code%253D).{32}")


class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that records the redirect status on the response."""

    # urllib2 invokes these hooks as handler.http_error_30x(req, fp, code,
    # msg, headers); the signature must match exactly or every redirect
    # fails with a TypeError.
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result


def get_cookie():
    """Return a cookie processor backed by a fresh in-memory cookie jar."""
    cookies = cookielib.CookieJar()
    return urllib2.HTTPCookieProcessor(cookies)


def get_opener(proxy=False):
    """Build an opener with cookie support and redirect tracking.

    ``proxy`` is accepted for compatibility with existing callers but is
    not used.
    """
    rv = urllib2.build_opener(get_cookie(), SmartRedirectHandler())
    rv.addheaders = [('User-agent',
                      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)')]
    return rv


class SinaAPI(object):
    """Obtain an OAuth2 authorization code for a Weibo application.

    CALLBACK_URL -- authorize URL of the application (sent as Referer)
    APP_KEY      -- application key (OAuth2 client_id)
    REDIRECT_URL -- redirect URI registered for the application
    USER_ID      -- login name of the Weibo account
    USER_PSWD    -- password of the Weibo account
    """

    def __init__(self, CALLBACK_URL, APP_KEY, REDIRECT_URL, USER_ID, USER_PSWD):
        self.CALLBACK_URL = CALLBACK_URL
        self.APP_KEY = APP_KEY
        self.REDIRECT_URL = REDIRECT_URL
        self.USER_ID = USER_ID
        self.USER_PSWD = USER_PSWD
        self.http = urllib3.PoolManager()

    def get_username(self, USER_ID):
        """Return the ``su`` value: base64 of the URL-quoted user name.

        Mirrors ssologin.js: ah.su=sinaSSOEncoder.base64.encode(m(aj));
        """
        # Quote first so characters such as "@" and "&" survive.
        quoted = urllib.quote(USER_ID)
        # b64encode emits no line breaks, unlike encodestring which wraps
        # every 76 characters and would corrupt long user names.
        return base64.b64encode(quoted)

    def get_password_rsa(self, USER_PSWD, PUBKEY, servertime, nonce):
        """Return the hex-encoded RSA ciphertext used when pwencode="rsa2"."""
        modulus = int(PUBKEY, 16)          # public key arrives as hex
        exponent = int('10001', 16)        # 0x10001 == 65537
        key = rsa.PublicKey(modulus, exponent)
        message = str(servertime) + "\t" + str(nonce) + "\n" + str(USER_PSWD)
        passwd = rsa.encrypt(message, key)
        return binascii.b2a_hex(passwd)

    def get_parameter(self):
        """Call the prelogin endpoint and derive the login parameters.

        Returns the tuple (pcid, servertime, nonce, rsakv, sp, su).
        Raises ValueError if the response carries no "(...)" payload.
        """
        su = self.get_username(self.USER_ID)
        # su is base64 and may contain "+", "/" or "=", so it is quoted
        # before being placed in the query string.
        url = ("https://login.sina.com.cn/sso/prelogin.php?entry=openapi"
               "&callback=sinaSSOController.preloginCallBack"
               "&su=" + urllib.quote(su, safe='') +
               "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.15)")
        r = self.http.request('GET', url)
        # The JSON document is wrapped in callback(...).
        match = re.search(r'\((.*)\)', r.data)
        if match is None:
            raise ValueError('unexpected prelogin response: %r' % (r.data,))
        data = json.loads(match.group(1))

        PUBKEY = data['pubkey']
        pcid = data['pcid']
        servertime = str(data['servertime'])
        nonce = data['nonce']
        rsakv = str(data['rsakv'])
        sp = self.get_password_rsa(self.USER_PSWD, PUBKEY, servertime, nonce)
        return pcid, servertime, nonce, rsakv, sp, su

    def get_ticket(self):
        """Perform the SSO login and return the ``ticket`` from the reply.

        Raises KeyError if the reply has no "ticket" entry.
        """
        pcid, servertime, nonce, rsakv, sp, su = self.get_parameter()
        fields = urllib.urlencode({
            'entry': 'openapi',
            'gateway': '1',
            'from': '',
            'savestate': '0',
            'useticket': '1',
            'pagerefer': '',
            'pcid': pcid,
            'ct': '1800',
            's': '1',
            'vsnf': '1',
            'vsnval': '',
            'door': '',
            'appkey': 'kxR5R',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': 'rsa2',
            'rsakv': rsakv,
            'sp': sp,
            'sr': '1680*1050',
            'encoding': 'UTF-8',
            'cdult': '2',
            'domain': 'weibo.com',
            'prelt': '0',
            'returntype': 'TEXT',
        })
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        url = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)"
        req = urllib2.Request(url, fields, headers)
        f = urllib2.urlopen(req)
        try:
            data = json.loads(f.read())
        finally:
            f.close()
        return data["ticket"]

    def _authorize_fields(self, **overrides):
        """Return the url-encoded authorize form, with ``overrides`` applied."""
        fields = {
            'action': 'submit',          # must
            'display': 'default',
            'withOfficalFlag': '0',      # must
            'quick_auth': 'null',
            'withOfficalAccount': '',
            'scope': '',
            'ticket': '',                # must
            'isLoginSina': '',
            'response_type': 'code',     # must
            'regCallback': '',
            'redirect_uri': self.REDIRECT_URL,  # must
            'client_id': self.APP_KEY,   # must
            'appkey62': 'kxR5R',
            'state': '',                 # must
            'verifyToken': 'null',
            'from': '',                  # must
            'userId': '',
            'passwd': '',
        }
        fields.update(overrides)
        return urllib.urlencode(fields)

    def _authorize_request(self, fields):
        """Return the POST request for the OAuth2 authorize endpoint."""
        headers = {
            "User-agent": _USER_AGENT,
            "Referer": self.CALLBACK_URL,
            "Content-Type": "application/x-www-form-urlencoded",
        }
        return urllib2.Request(_AUTHORIZE_URL, fields, headers)

    def get_code_Security(self):
        """Authorize with an SSO ticket; no user id / password is posted.

        Returns the list of authorization codes found in the final
        redirect URL (empty if none was found).
        """
        ticket = self.get_ticket()
        reg_callback = ('https://api.weibo.com/2/oauth2/authorize?client_id='
                        + self.APP_KEY
                        + '&response_type=code&display=default&redirect_uri='
                        + self.REDIRECT_URL
                        + '&from=&with_cookie=')
        req = self._authorize_request(
            self._authorize_fields(ticket=ticket, regCallback=reg_callback))
        resp = urllib2.urlopen(req)
        try:
            return_redirect_uri = resp.geturl()
        finally:
            resp.close()
        print('return_redirect_uri: %s' % (return_redirect_uri,))
        return _CODE_RE.findall(return_redirect_uri)

    def get_code_NS(self):
        """Authorize by posting the plain user id and password.

        Installs a cookie-aware opener as the global urllib2 opener.
        Returns the list of authorization codes found in the final
        redirect URL (empty if none was found).
        """
        req = self._authorize_request(
            self._authorize_fields(userId=self.USER_ID, passwd=self.USER_PSWD))
        opener = get_opener(False)
        urllib2.install_opener(opener)
        try:
            f = opener.open(req)
        except urllib2.HTTPError as e:
            # An HTTP error on the redirect target still carries the URL
            # that was requested, which is where the code is read from.
            return_redirect_uri = e.geturl()
            print("NS2 %s" % (return_redirect_uri,))
        else:
            return_redirect_uri = f.url
            f.close()
            print("NS1 %s" % (return_redirect_uri,))
        code = _CODE_RE.findall(return_redirect_uri)
        print(code)
        return code


def weibo_login():
    """Log in and return an ``APIClient`` with its access token set."""
    # sina weibo basic secret information -- fill these in before running
    APP_KEY = u''       # app key
    APP_SECRET = u''    # app secret
    REDIRECT_URL = ''
    USER_NAME = ''
    USER_PASSWD = ''

    client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET,
                       redirect_uri=REDIRECT_URL)
    CALLBACK_URL = client.get_authorize_url()
    print('callback_url: %s' % (CALLBACK_URL,))
    API = SinaAPI(CALLBACK_URL, APP_KEY, REDIRECT_URL, USER_NAME, USER_PASSWD)
    code = API.get_code_Security()
    print('code: %s' % (code,))
    token = client.request_access_token(code)
    client.set_access_token(token.access_token, token.expires_in)
    return client


if __name__ == '__main__':
    # get weibo_api to access sina api
    weibo_api = weibo_login()
    print('weibo_api: %s' % (weibo_api,))
    # get 200 public weibo
    statuses = weibo_api.statuses.public_timeline.get(count=200)
    print(json.dumps(statuses, indent=1))

Result:

code: ['514b806e619c320b7b1ed85ec0d9880a']weibo_api: <weibo.APIClient object at 0x027786D0>{ "interval": 0,  "hasvisible": false,  "total_number": 2,  "previous_cursor": 0,  "next_cursor": 0,  "statuses": [  {   "reposts_count": 0,    "truncated": false,    "text": "\u7238\u7238\uff0c\u5c31\u50cf\u8fd9\u4e48\u75bc\u2026\u2026",    "visible": {    "type": 0,     "list_id": 0   },    "in_reply_to_status_id": "",    "bmiddle_pic": "http://ww1.sinaimg.cn/bmiddle/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg",    "id": 3792648903463407,    "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg",    "mid": "3792648903463407",    "source": "<a href=\"http://app.weibo.com/t/feed/3auC5p\" rel=\"nofollow\">\u76ae\u76ae\u65f6\u5149\u673a</a>",    "attitudes_count": 0,    "in_reply_to_screen_name": "",    "pic_urls": [    {     "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg"    }   ],    "in_reply_to_user_id": "",    "darwin_tags": [],    "favorited": false,    "original_pic": "http://ww1.sinaimg.cn/large/dea230dbjw1enp0tkfoy7j20cs281dm7.jpg",    "idstr": "3792648903463407",    "source_type": 1,    "user": {    "bi_followers_count": 310,     "domain": "",     "avatar_large": "http://tp4.sinaimg.cn/3735171291/180/40031850543/0",     "verified_source": "",     "ptype": 0,     "statuses_count": 1589,     "allow_all_comment": true,     "id": 3735171291,     "verified_reason_url": "",     "city": "1000",     "province": "50",     "credit_score": 80,     "block_app": 0,     "follow_me": false,     "verified_reason": "",     "followers_count": 563,     "location": "\u91cd\u5e86",     "verified_trade": "",     "mbtype": 0,     "verified_source_url": "",     "profile_url": "u/3735171291",     "block_word": 0,     "avatar_hd": "http://tp4.sinaimg.cn/3735171291/180/40031850543/0",     "star": 0,     "description": "",     "friends_count": 772,     "online_status": 0,     "mbrank": 0,     "idstr": "3735171291",     
"profile_image_url": "http://tp4.sinaimg.cn/3735171291/50/40031850543/0",     "allow_all_act_msg": false,     "verified": false,     "geo_enabled": true,     "class": 1,     "screen_name": "Cz_\u5a77\u7ea6",     "lang": "zh-cn",     "weihao": "",     "remark": "",     "favourites_count": 0,     "name": "Cz_\u5a77\u7ea6",     "url": "",     "gender": "f",     "created_at": "Thu Aug 22 12:14:23 +0800 2013",     "verified_type": -1,     "following": false,     "pagefriends_count": 0,     "urank": 8   },    "geo": null,    "created_at": "Sun Dec 28 07:03:45 +0800 2014",    "mlevel": 0,    "comments_count": 0  },   {   "reposts_count": 0,    "truncated": false,    "text": "#\u60c5\u4fc2\u963f\u54f2#\u4e70\u4e00\u5f20\u6f14\u5531\u4f1a\u95e8\u7968\uff0c\u5e26\u4e0a\u5c0f\u4f19\u4f34\uff0c\u7a7f\u7740\u4f1a\u670d\u62ff\u7740\u4f60\u7684\u4e13\u8f91\u548c\u71c8\u724c\u5954\u8d74\uff0c\u4e0d\u7528\u79bb\u4f60\u5f88\u8fd1\u54ea\u6015\u5750\u5728\u89d2\u843d\uff0c\u54ea\u6015\u4e0d\u80fd\u770b\u6e05\u53f0\u4e0a\u7684\u4f60\uff0c\u54ea\u6015\u4f60\u4e0d\u77e5\u9053\u4eba\u7fa4\u8fd8\u6709\u4e00\u4e2a\u6211\u3002\u4f46\u53ea\u8981\u80fd\u542c\u89c1\u5e38\u5e38\u5728\u8033\u673a\u91cc\u51fa\u73b0\u7684\u90a3\u4e2a\u58f0\u97f3\uff0c\u53ea\u8981\u80fd\u5750\u5728\u8fd9\u91cc\u4e00\u8d77\u4e3a\u4f60\u5450\u558a\uff0c\u53ea\u8981\u80fd\u548c\u4f60\u7ad9\u5728\u540c\u6837\u7684\u5929\u7a7a\u4e0b\u547c\u5438\u540c\u4e00\u7247\u6c27\u6c14\uff0c\u8fd9\u6837\u5c31\u5df2\u7ecf\u5f88\u597d\u4e86",    "visible": {    "type": 0,     "list_id": 0   },    "in_reply_to_status_id": "",    "bmiddle_pic": "http://ww4.sinaimg.cn/bmiddle/5d16267bjw1enp0tkhsxhj20c80klq39.jpg",    "id": 3792648903463367,    "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/5d16267bjw1enp0tkhsxhj20c80klq39.jpg",    "mid": "3792648903463367",    "source": "<a href=\"http://app.weibo.com/t/feed/4P1GTP\" rel=\"nofollow\">\u6735\u552f\u5973\u6027\u624b\u673a</a>",    "attitudes_count": 0,    "in_reply_to_screen_name": "",  
  "pic_urls": [    {     "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/5d16267bjw1enp0tkhsxhj20c80klq39.jpg"    }   ],    "in_reply_to_user_id": "",    "darwin_tags": [],    "favorited": false,    "original_pic": "http://ww4.sinaimg.cn/large/5d16267bjw1enp0tkhsxhj20c80klq39.jpg",    "idstr": "3792648903463367",    "source_type": 1,    "user": {    "bi_followers_count": 206,     "domain": "",     "avatar_large": "http://tp4.sinaimg.cn/1561732731/180/5709043270/0",     "verified_source": "",     "ptype": 0,     "statuses_count": 57711,     "allow_all_comment": false,     "id": 1561732731,     "verified_reason_url": "",     "city": "8",     "province": "37",     "credit_score": 80,     "block_app": 0,     "follow_me": false,     "verified_reason": "",     "followers_count": 1487,     "location": "\u5c71\u4e1c \u6d4e\u5b81",     "verified_trade": "",     "mbtype": 0,     "verified_source_url": "",     "profile_url": "u/1561732731",     "block_word": 0,     "avatar_hd": "http://ww3.sinaimg.cn/crop.100.0.318.318.1024/5d16267bjw8ellhct6ii9j20ef08ujs6.jpg",     "star": 0,     "description": "\u672c\u547d\u3001\u7537\u795e\uff1a\u5f35\u4fe1\u54f2/\u5973\u795e\uff1a\u6797\u4f9d\u6668/\u5076\u50cf:\u74ca\u7464\u963f\u59e8/\u559c\u6b61\u7684\u9b54\u8853\u5e2b:\u5289\u8b19/\u559c\u6b61\u53f0\u7063\u5287 \u91d1\u5eb8\u5287/",     "friends_count": 951,     "online_status": 1,     "mbrank": 0,     "idstr": "1561732731",     "profile_image_url": "http://tp4.sinaimg.cn/1561732731/50/5709043270/0",     "allow_all_act_msg": false,     "verified": false,     "geo_enabled": true,     "class": 1,     "screen_name": "\u88d9\u89d2\u98db\u63da0326\u4f9d\u5fc3\u54f2\u610f",     "lang": "zh-tw",     "weihao": "",     "remark": "",     "favourites_count": 22,     "name": "\u88d9\u89d2\u98db\u63da0326\u4f9d\u5fc3\u54f2\u610f",     "url": "http://blog.sina.com.cn/qunjiao2quan",     "gender": "f",     "created_at": "Sun Nov 01 17:38:36 +0800 2009",     "verified_type": 220,     "following": 
false,     "pagefriends_count": 0,     "urank": 23   },    "geo": null,    "created_at": "Sun Dec 28 07:03:45 +0800 2014",    "mlevel": 0,    "comments_count": 0  } ]}


0 0