爬虫 登录csdn并获取个人博客文章列表

来源:互联网 发布:国外网络购物网站 编辑:程序博客网 时间:2024/03/29 09:21
# coding:utf-8
"""Log in to CSDN and collect one user's blog-article list.

Flow:
  1. Fetch the passport login page and scrape the hidden ``lt``,
     ``execution`` and ``_eventId`` form tokens.
  2. POST the credentials + tokens; the ``requests`` session keeps the
     auth cookies.
  3. Fetch the personal-centre page to confirm the login worked.
  4. Walk the paginated article list, building a ``{url: title}`` dict.

NOTE(review): needs the third-party ``requests`` and ``lxml`` packages.
Originally written for Python 2.7; prints use the function form (via
``__future__``) so the file also parses under Python 3.
"""
from __future__ import print_function

import requests
from lxml import etree

# Module-level flag shared by crawl_blog_list/judge_next_page: True while
# another results page should be fetched.
is_next = True


class csdncrawl():

    def get_params(self, username, password, post_url, post_session, post_headers):
        """Build the POST payload for the CSDN login form.

        Fetches the login page and scrapes the hidden ``lt``,
        ``execution`` and ``_eventId`` inputs that the server expects to
        be echoed back alongside the credentials.

        Raises IndexError if the login page lacks the expected hidden
        fields (e.g. CSDN changed its markup).
        """
        index_page = post_session.get(post_url, headers=post_headers)
        html = etree.HTML(index_page.text)
        lt = html.xpath(".//input[@name='lt']//@value")[0]
        execution = html.xpath(".//input[@name='execution']//@value")[0]
        _eventId = html.xpath(".//input[@name='_eventId']//@value")[0]
        return {
            'username': username,
            'password': password,
            'lt': lt,
            'execution': execution,
            '_eventId': _eventId,
        }

    def csdn_login(self, username, password, index_url, session, headers):
        """POST the login form; auth cookies are retained on `session`."""
        postdata = self.get_params(username, password, index_url,
                                   session, headers)
        session.post(index_url, data=postdata, headers=headers)

    def startcrawl(self, session):
        """Entry point: log in, verify via personal centre, crawl the blog."""
        # FIXME(review): credentials are hard-coded; load them from the
        # environment or a config file instead of committing them.
        username = 'zkwniky'
        password = '+++++++'
        start_page_number = 1
        dict_blog = {}
        index_url = 'https://passport.csdn.net/account/login'
        agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
        headers = {
            'User-Agent': agent
        }
        self.csdn_login(username, password, index_url, session, headers)
        self.crawl_person_csdn(session, headers)  # confirms the login worked
        self.crawl_blog_list(dict_blog, username, session, headers,
                             start_page_number)

    def crawl_person_csdn(self, session, headers):
        """Fetch the personal-centre page to confirm the login succeeded."""
        person_url = 'http://my.csdn.net/my/mycsdn'
        person = session.get(person_url, headers=headers)
        print(person.text)

    def crawl_blog_list(self, dict_blog, username, session, headers,
                        start_page_number):
        """Walk the paginated article list, filling dict_blog with url->title.

        Returns dict_blog (also mutated in place).
        """
        global is_next
        page_number = start_page_number
        while is_next:
            blog_url = ('http://blog.csdn.net/' + username +
                        '/article/list/' + str(page_number))
            blog_page = session.get(blog_url, headers=headers)
            print(blog_page.text)
            html = etree.HTML(blog_page.text)
            href = html.xpath(".//span[@class='link_title']//a//@href")
            title = html.xpath(".//span[@class='link_title']//a/text()")
            current_pages = html.xpath(
                ".//div[@class='pagelist']//strong/text()")
            page_links = html.xpath(".//div[@class='pagelist']//a//@href")
            for link, text in zip(href, title):
                dict_blog['http://blog.csdn.net' + link] = text
            if not current_pages or not page_links:
                # No pagination widget on the page: single page of results.
                is_next = False
                break
            # The last pagination href ends in ".../article/list/<N>".
            # Take the whole trailing path segment — the original used
            # only its final character, which breaks for N >= 10.
            last_page = page_links[-1].rstrip('/').rsplit('/', 1)[-1]
            if self.judge_next_page(current_pages[0], last_page):
                page_number += 1
        print(len(dict_blog))
        return dict_blog

    def judge_next_page(self, current_page, next_page):
        """Set and return the module flag: is there a page after current_page?

        Both arguments arrive as strings scraped from the page, so they
        are compared numerically — a plain string comparison would claim
        '9' >= '10' and stop pagination too early.
        """
        global is_next
        is_next = int(current_page) < int(next_page)
        return is_next


if __name__ == '__main__':
    session = requests.session()
    # Use a distinct name for the instance — the original rebound the
    # class name itself (`csdncrawl = csdncrawl()`).
    crawler = csdncrawl()
    crawler.startcrawl(session)  # logs in and starts the crawl

解释如下:
1)
整体过程比较简单,登录时post数据如下:
eventId=submit 
execution=e1s1 
lt=LT-597060-IAanNajzYkoNV67gnQpFNT9m7goQ7U  
password=++++++
username=zkwniky

其中前三个的值需要在登录页面中的隐藏标签中获取。
2)
判断是否有下一页时,比较当前页码与分页栏中最后一页的页码:当前页码较小则说明还有下一页
3)
python 2.7 执行成功
4)登录到个人中心时 返回json数据地址如下
全部文章
http://my.csdn.net/my/mycsdn/get_read_list?lastId=-&size=10&direction=down&type=
热门博客列表
http://my.csdn.net/my/mycsdn/get_hot_blog_list?pageno=1&pagesize=5&username=zkwniky
热门资源列表
http://my.csdn.net/my/mycsdn/get_hot_download_list
热门搜索 :java,python,spring,mysql,php
http://so.csdn.net/so/search/hotQuery.do?&callback=jQuery19009254916422648101_1501121642771&size=5&_=1501121642773
精彩回答
http://my.csdn.net/my/mycsdn/get_ask_list





 

阅读全文
0 0
原创粉丝点击