python爬虫含登录

来源:互联网 发布:java执行输入参数命令 编辑:程序博客网 时间:2024/04/28 20:16
#!/usr/bin/python# -*- coding: utf-8 -*-import re;import cookielib;import urllib;import urllib2;import optparse; #------------------------------------------------------------------------------# check all cookies in cookiesDict is exist in cookieJar or notdef checkAllCookiesExist(cookieNameList, cookieJar) :    cookiesDict = {};    for eachCookieName in cookieNameList :        cookiesDict[eachCookieName] = False;         allCookieFound = True;    for cookie in cookieJar :        if(cookie.name in cookiesDict) :            cookiesDict[cookie.name] = True;         for eachCookie in cookiesDict.keys() :        if(not cookiesDict[eachCookie]) :            allCookieFound = False;            break;     return allCookieFound; #------------------------------------------------------------------------------# just for print delimiterdef printDelimiter():    print '-'*80; #------------------------------------------------------------------------------# main function to emulate login baidudef emulateLogin():    print "Function: Used to demostrate how to use Python code to emulate login baidu main page: http://www.baidu.com/";    print "Usage: emulate_login_baidu_python.py -u yourBaiduUsername -p yourBaiduPassword";    printDelimiter();     # parse input parameters    parser = optparse.OptionParser();    parser.add_option("-u","--username",action="store",type="string",default='',dest="username",help="Your Baidu Username");    parser.add_option("-p","--password",action="store",type="string",default='',dest="password",help="Your Baidu password");    (options, args) = parser.parse_args();    # export all options variables, then later variables can be used    for i in dir(options):        exec(i + " = options." 
+ i);     printDelimiter();    print "[preparation] using cookieJar & HTTPCookieProcessor to automatically handle cookies";    cj = cookielib.CookieJar();    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj));    urllib2.install_opener(opener);     printDelimiter();    print "[step1] to get cookie 58921";    mainUrl="http://58921.com/";    resp=urllib2.urlopen(mainUrl);    #baiduMainUrl = "http://www.baidu.com/";    #resp = urllib2.urlopen(baiduMainUrl);    #respInfo = resp.info();    #print "respInfo=",respInfo;    for index, cookie in enumerate(cj):        print '[',index, ']',cookie;     printDelimiter();    print "[step2] to get token value";    gettokenUrl="http://58921.com/user/login";    gettokenResp=urllib2.urlopen(gettokenUrl);    #getapiUrl = "https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true";    #getapiResp = urllib2.urlopen(getapiUrl);    #print "getapiResp=",getapiResp;    gettokenRespHtml = gettokenResp.read();    #print gettokenRespHtml;    foundTokenVal=re.search(\    "<input type=\"hidden\" name=\"form_id\" value=\"user_login_form\"/><input type=\"hidden\" name=\"form_token\" value=\"(?P<tokenVal>\w+)\"",gettokenRespHtml);    #print "getapiRespHtml=",getapiRespHtml;    #bdPass.api.params.login_token='5ab690978812b0e7fbbe1bfc267b90b3';    #foundTokenVal = re.search("bdPass\.api\.params\.login_token='(?P<tokenVal>\w+)';", getapiRespHtml);    if(foundTokenVal):        tokenVal = foundTokenVal.group("tokenVal");        print tokenVal;         printDelimiter();        print "[step3] emulate login 58921";        loginUrl = "http://58921.com/user/login/ajax?ajax=submit&__q=user/login";        postDict = {            'form_id': "user_login_form",            'submit'       : "%E7%99%BB%E5%BD%95",            'form_token'         : tokenVal, #de3dbf1e8596642fa2ddf2921cd6257f            'mail'      : username,            'pass'      : password,        };        postData = urllib.urlencode(postDict);        # here will 
automatically encode values of parameters        # such as:        # encode http://www.baidu.com/cache/user/html/jump.html into http%3A%2F%2Fwww.baidu.com%2Fcache%2Fuser%2Fhtml%2Fjump.html        #print "postData=",postData;        req = urllib2.Request(loginUrl, postData);        # in most case, for do POST request, the content-type, is application/x-www-form-urlencoded        req.add_header('Content-Type', "application/x-www-form-urlencoded;charset=UTF-8");        resp = urllib2.urlopen(req);        #for index, cookie in enumerate(cj):        #    print '[',index, ']',cookie;        cookiesToCheck = ['remember', 'time'];        loginBaiduOK = checkAllCookiesExist(cookiesToCheck, cj);        if(loginBaiduOK):            print "+++ Emulate login 58921 is OK, ^_^";        else:            print "--- Failed to emulate login 58921 !"    else:        print "Fail to extract token value from html=",gettokenRespHtml; if __name__=="__main__":    emulateLogin();

以上代码模拟登录 58921.com,整体流程参考了 CSDN 上 crifan 模拟登录百度首页的示例代码。

登录完之后,爬取上面的电影票房信息

#!/usr/bin/python# -*- coding: utf-8 -*-#---------------------------------import---------------------------------------import urllib2;import re;import login;from BeautifulSoup import BeautifulSoup;#from HTMLParser import HTMLParser#------------------------------------------------------------------------------#------------------------------------------------------------------------------def main():       file=open("C://test/testfilm.txt",'w');    j=0    while j<1100:        userMainUrl = "http://58921.com/content/film/"+str(j)+"/boxoffice"        print j;        j=j+1;        try:            req = urllib2.Request(userMainUrl);            resp = urllib2.urlopen(req);        except urllib2.HTTPError,e:            print "the page not found!";            print "error code:",e.code;            print "return content:",e.read();            continue;        except urllib2.URLError,e:            print "failed to reach the server";            print "the reason",e.reason;            continue;        else:            respHtml = resp.read();    #print "respHtml=",respHtml; # you should see the ouput html          #print "Method 2: Use python third lib BeautifulSoup to extract info from html";        songtasteHtmlEncoding = "GB2312";        soup = BeautifulSoup(respHtml, fromEncoding=songtasteHtmlEncoding);                foundFilmTable = soup.find(attrs={"class":"movie_chart_header_title"});       # print "foundFilmTable=%s",foundFilmTable;        if(foundFilmTable):            filmTableSoup = foundFilmTable;            foundAllh2=filmTableSoup.findAll("h2");            if(foundAllh2):                curFilm=foundAllh2[0];                print curFilm;                file.write(curFilm.string);                file.write("\n");            foundAlltr=filmTableSoup.findAll("tr");            l=len(foundAlltr);            #print l;            row=0;            while row<l:                curTr=foundAlltr[row];                #print curTr;                row=row+1;                
if(curTr):                    foundAlltd=curTr.findAll("td");                    ll=len(foundAlltd);                    #print ll;                    col=0;                    while col<ll:                        curTd=foundAlltd[col];                       # print curTd;                        col=col+1;                                                                       while(curTd):                             try:                                #preTd=curTd;                                curTd.contents[0].contents[0]                             except AttributeError,e:                                #curTd=preTd;                                #curTd=curTd.parent();                                #leafTd=curTd;                                #print curTd;                                break;                             else:                                 curTd=curTd.contents[0];                        file.write(curTd.string);                        file.write("\t");                    file.write("\n");                        file.close();              ###############################################################################if __name__=="__main__":    login.emulateLogin();    main();


原创粉丝点击