python爬虫含登录
来源:互联网 发布:java执行输入参数命令 编辑:程序博客网 时间:2024/04/28 20:16
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Emulate a browser login to http://58921.com/.

The script installs a cookie-aware opener, fetches the login page to read
the hidden ``form_token`` value, posts the credentials together with that
token, and finally checks that the expected session cookies were set.

The module imports under both Python 2 and Python 3.
"""
import optparse
import re
import urllib

try:  # Python 2 module names
    import cookielib
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 equivalents
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode


# Matches the hidden form_id/form_token inputs of the login form.  A bytes
# pattern is used because the HTTP body is searched without decoding it.
_FORM_TOKEN_RE = re.compile(
    br'<input type="hidden" name="form_id" value="user_login_form"/>'
    br'<input type="hidden" name="form_token" value="(?P<tokenVal>\w+)"'
)


#------------------------------------------------------------------------------
def checkAllCookiesExist(cookieNameList, cookieJar):
    """Check whether every cookie name in cookieNameList exists in cookieJar.

    cookieNameList -- iterable of cookie names that must be present
    cookieJar      -- iterable of objects exposing a ``name`` attribute

    Returns True when all names are found (also for an empty name list),
    False otherwise.
    """
    presentNames = set(cookie.name for cookie in cookieJar)
    return all(name in presentNames for name in cookieNameList)


#------------------------------------------------------------------------------
def printDelimiter():
    """Print an 80-character dashed separator line."""
    print('-' * 80)


#------------------------------------------------------------------------------
def _extractFormToken(html):
    """Return the login form token found in the raw page bytes, or None."""
    match = _FORM_TOKEN_RE.search(html)
    if match is None:
        return None
    token = match.group("tokenVal")
    if not isinstance(token, str):
        # Python 3: the match is bytes; the token is \w+ so ASCII is enough.
        token = token.decode("ascii")
    return token


#------------------------------------------------------------------------------
def emulateLogin():
    """Log in to 58921.com with the -u/-p command line credentials.

    Reads ``-u/--username`` and ``-p/--password`` from the command line
    (both default to an empty string), installs a global urllib opener that
    keeps cookies, and performs the three login steps.

    Returns True when the 'remember' and 'time' cookies are present after
    the login request, False otherwise (including when no token was found).
    """
    # NOTE(review): the banner and option help still mention Baidu although
    # the requests below target 58921.com; strings kept as in the original.
    print("Function: Used to demostrate how to use Python code to emulate login baidu main page: http://www.baidu.com/")
    print("Usage: emulate_login_baidu_python.py -u yourBaiduUsername -p yourBaiduPassword")
    printDelimiter()

    # parse input parameters
    parser = optparse.OptionParser()
    parser.add_option("-u", "--username", action="store", type="string",
                      default='', dest="username", help="Your Baidu Username")
    parser.add_option("-p", "--password", action="store", type="string",
                      default='', dest="password", help="Your Baidu password")
    (options, args) = parser.parse_args()
    # Read the two options explicitly instead of exec()-ing every attribute
    # of the options object into the local namespace.
    username = options.username
    password = options.password

    printDelimiter()
    print("[preparation] using cookieJar & HTTPCookieProcessor to automatically handle cookies")
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    printDelimiter()
    print("[step1] to get cookie 58921")
    mainUrl = "http://58921.com/"
    # Only the cookies set by this request matter; the body is not read.
    urllib2.urlopen(mainUrl).close()
    for index, cookie in enumerate(cj):
        print("[ %d ] %s" % (index, cookie))

    printDelimiter()
    print("[step2] to get token value")
    gettokenUrl = "http://58921.com/user/login"
    gettokenResp = urllib2.urlopen(gettokenUrl)
    try:
        gettokenRespHtml = gettokenResp.read()
    finally:
        gettokenResp.close()

    tokenVal = _extractFormToken(gettokenRespHtml)
    if tokenVal is None:
        print("Fail to extract token value from html= %s" % (gettokenRespHtml,))
        return False
    print(tokenVal)

    printDelimiter()
    print("[step3] emulate login 58921")
    loginUrl = "http://58921.com/user/login/ajax?ajax=submit&__q=user/login"
    postDict = {
        'form_id': "user_login_form",
        # NOTE(review): this value is already percent-escaped and urlencode()
        # escapes the '%' signs again; kept as in the original - confirm what
        # the server expects.
        'submit': "%E7%99%BB%E5%BD%95",
        'form_token': tokenVal,  # e.g. de3dbf1e8596642fa2ddf2921cd6257f
        'mail': username,
        'pass': password,
    }
    # urlencode() percent-encodes the values; the result is pure ASCII, and
    # Python 3 requires the request body to be bytes.
    postData = urlencode(postDict).encode("ascii")
    req = urllib2.Request(loginUrl, postData)
    # in most cases a POST request uses application/x-www-form-urlencoded
    req.add_header('Content-Type', "application/x-www-form-urlencoded;charset=UTF-8")
    urllib2.urlopen(req).close()

    cookiesToCheck = ['remember', 'time']
    loginOK = checkAllCookiesExist(cookiesToCheck, cj)
    if loginOK:
        print("+++ Emulate login 58921 is OK, ^_^")
    else:
        print("--- Failed to emulate login 58921 !")
    return loginOK


if __name__ == "__main__":
    emulateLogin()
以上代码用于模拟登录 58921.com,参考了 cfi 的登录百度首页代码。
登录完成之后,爬取该网站上的电影票房信息:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape box-office tables from 58921.com film pages into a text file.

Runs ``login.emulateLogin()`` first so that the globally installed opener
carries the session cookies, then walks the film ids and writes each film
title followed by its table rows (tab-separated cells) to the output file.

NOTE: ``from BeautifulSoup import BeautifulSoup`` is the BeautifulSoup 3
import path, so this script targets Python 2.
"""
#---------------------------------import---------------------------------------
import codecs
import re
import urllib2

import login
from BeautifulSoup import BeautifulSoup

#------------------------------------------------------------------------------
DEFAULT_OUTPUT_PATH = "C://test/testfilm.txt"
DEFAULT_MAX_FILM_ID = 1100
# NOTE(review): encoding hint kept from the original script - confirm it
# matches the charset the site actually serves.
HTML_ENCODING_HINT = "GB2312"


def _fetchPage(url):
    """Return the body of url, or None after reporting an HTTP/URL error."""
    try:
        resp = urllib2.urlopen(urllib2.Request(url))
    except urllib2.HTTPError as e:
        print("the page not found!")
        print("error code: %s" % (e.code,))
        print("return content: %s" % (e.read(),))
        return None
    except urllib2.URLError as e:
        print("failed to reach the server")
        print("the reason %s" % (e.reason,))
        return None
    try:
        return resp.read()
    finally:
        resp.close()


def _leafText(node):
    """Follow first children down from node and return the text found there.

    Descends while the first child itself has a first child; stops at the
    first node whose first child is plain text, missing, or empty.  Returns
    an empty string when the final node has no single string.
    """
    while True:
        try:
            node.contents[0].contents[0]
        except (AttributeError, IndexError):
            # AttributeError: first child is text; IndexError: no children.
            break
        node = node.contents[0]
    return node.string or u""


def _writeFilm(out, header):
    """Write the film title line and the tab-separated table rows to out."""
    titles = header.findAll("h2")
    if titles:
        curFilm = titles[0]
        print(curFilm)
        out.write(curFilm.string or u"")
        out.write("\n")
    for row in header.findAll("tr"):
        for cell in row.findAll("td"):
            out.write(_leafText(cell))
            out.write("\t")
        # NOTE(review): one line per table row - placement inferred, since
        # the original indentation was lost.
        out.write("\n")


def main(outputPath=DEFAULT_OUTPUT_PATH, maxFilmId=DEFAULT_MAX_FILM_ID):
    """Scrape film pages 0 .. maxFilmId-1 and write their data to outputPath.

    outputPath -- file to (over)write; opened as UTF-8 so the unicode text
                  returned by BeautifulSoup can be written
    maxFilmId  -- exclusive upper bound of the film ids to request

    Pages that fail with an HTTP or URL error are reported and skipped.
    """
    # The with-block closes the file even when a request or parse step fails.
    with codecs.open(outputPath, 'w', encoding='utf-8') as out:
        for filmId in range(maxFilmId):
            userMainUrl = "http://58921.com/content/film/" + str(filmId) + "/boxoffice"
            print(filmId)
            respHtml = _fetchPage(userMainUrl)
            if respHtml is None:
                continue
            soup = BeautifulSoup(respHtml, fromEncoding=HTML_ENCODING_HINT)
            header = soup.find(attrs={"class": "movie_chart_header_title"})
            if header is not None:
                _writeFilm(out, header)


###############################################################################
if __name__ == "__main__":
    login.emulateLogin()
    main()
- python爬虫含登录
- Python爬虫 - 登录csdn
- python爬虫程序-登录
- Python爬虫 模拟登录
- Python爬虫登录功能
- python爬虫-京东登录
- Python爬虫与模拟登录
- python爬虫之登录豆瓣
- python -- 拉勾网爬虫模拟登录
- python爬虫实践之模拟登录
- python 爬虫 自动登录人人网
- 转载:python爬虫实践之模拟登录
- Python爬虫实现自动登录、签到
- python 爬虫——登录知乎
- python爬虫基础登----网站登录
- Python爬虫之模拟登录总结
- python爬虫:用户名密码登录认证
- python爬虫模拟登录初体验
- (AI-TANK)走圆
- iphone实现声音的录制和播放
- 快速熟悉Oracle索引
- (AI-TANK)朝着机器人原点开火(最简单的第一个开火)
- python 编码问题
- python爬虫含登录
- mysql和Oracle在对clob和blob字段的处理
- 最近调试HEVC中码率控制, 发现HM里面一个重大bug
- 《循序渐进Oracle》部分笔记
- 在stack overflow上看到的关于tornado-RESTful的讨论
- Linux进程同步之记录锁(fcntl)
- 国外大学免费硕博全文数据库以及部分期刊全文
- 视频的Level有什么作用
- 新浪微博布局学习——妙用TabHost