python爬虫_糗事百科

来源:互联网 发布:淘宝网旅游帐篷 编辑:程序博客网 时间:2024/06/06 00:37
# -*- coding:utf-8 -*-
"""Interactive console reader for qiushibaike.com text posts and comments.

Each press of Enter downloads the next listing page, prints every post found
on it and, below each post, the comments scraped from that post's own page.
Typing ``exit`` (or sending end-of-file) quits.
"""
from __future__ import print_function

import contextlib
import re  # not used below; kept from the original import list
import sys

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 home of Request/urlopen

from bs4 import BeautifulSoup

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

articleUrl = "http://www.qiushibaike.com/textnew/page/%d"  # listing page URL
commentUrl = "http://www.qiushibaike.com/article/%s"  # single post (comments) URL

# Number of leading characters dropped from a post element's ``id`` attribute
# to obtain the id used in commentUrl.
# NOTE(review): presumably the prefix is "qiushi_tag_" -- confirm against the
# live markup.
_ID_PREFIX_LEN = 11


def getContentOrComment(Url):
    """Download *Url* and return the raw response body.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        Url: Address of the page to download.

    Returns:
        The body exactly as returned by ``response.read()``, or ``None`` when
        the request fails for any reason.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url=Url, headers=headers)
    try:
        # closing() releases the connection even if read() raises.
        with contextlib.closing(urllib2.urlopen(req)) as response:
            return response.read()
    except Exception as err:
        # Deliberately best effort: callers treat None as "nothing to show".
        # Report the cause instead of discarding it silently.
        sys.stderr.write("request for %s failed: %r\n" % (Url, err))
        return None


def _print_comments(article_id):
    """Fetch the page of post *article_id* and print its numbered comments."""
    comment_page = getContentOrComment(commentUrl % article_id)
    if comment_page is None:
        return
    soup = BeautifulSoup(comment_page, 'html.parser')
    # The keyword must be ``attrs``: ``attr="body"`` would be read as a filter
    # on an HTML attribute literally named "attr" and match nothing.
    for floor, comment in enumerate(soup.find_all(attrs="body"), 1):
        print("    ", floor, "楼回复:", comment.get_text())


def _print_articles(article_page):
    """Print every post found in *article_page*, each followed by its comments."""
    soup = BeautifulSoup(article_page, 'html.parser')
    floor = 0
    for article in soup.find_all(attrs="article block untagged mb15"):
        content = article.find(attrs="content")
        if content is None:
            # Post block without a content node: nothing to print.
            continue
        floor += 1
        print("\n")
        print(floor, ".", content.get_text().strip())

        raw_id = article.get('id')
        if raw_id is None:
            # Without an id there is no comment page to request.
            continue
        article_id = str(raw_id).strip()[_ID_PREFIX_LEN:]
        if article_id:
            _print_comments(article_id)


def main():
    """Run the prompt loop: Enter shows the next page, ``exit`` quits."""
    page = 0
    while True:
        try:
            raw = _read_line("点击enter查看或者输入exit退出")
        except EOFError:
            # Input stream closed; treat like "exit".
            break
        if raw == "exit":
            break
        article_page = getContentOrComment(articleUrl % (page + 1))
        if article_page is None:
            # Download failed; keep the counter so the next Enter retries
            # the same page instead of skipping it.
            continue
        page += 1
        _print_articles(article_page)


if __name__ == "__main__":
    main()
爬虫的小例子
原创粉丝点击