爬虫学习笔记1——爬取糗百段子

来源:互联网 发布:未来造价软件 编辑:程序博客网 时间:2024/06/06 19:48

 教程原文

# -*- coding: utf-8 -*-
"""Fetch one page of qiushibaike.com and print its text-only jokes.

The page HTML is downloaded with a browser-like User-Agent header, each
post is pulled out with a regular expression, and posts whose trailing
markup mentions ``img`` (i.e. posts that carry a picture) are skipped.
For every remaining post the author and the joke text are printed on one
line.

The script was written for Python 2 (``urllib2``); the import fallback
below lets the same code run on Python 3 as well.
"""
import re

try:  # Python 2
    import urllib2 as urlrequest
    from urllib2 import URLError
except ImportError:  # Python 3
    import urllib.request as urlrequest
    from urllib.error import URLError

page = 1
url = 'https://www.qiushibaike.com/' + str(page)
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Groups: (1) the raw value of the avatar's ``alt`` attribute (quotes
# included, because the pattern captures everything up to ``>``),
# (2) the joke text inside <span>, (3) the markup between the HTML comment
# and the stats block, which is inspected for an image.
# Raw strings keep ``\s`` / ``\S`` / ``\n`` as regex escapes instead of
# (invalid) string escapes; the compiled pattern is unchanged.
pattern = re.compile(
    r'<div class="author.*?>[\s\S]*?<a.*?>[\s\S]*?<img.*?alt=(.*?)>[\s\S]*?</div>[\s\S]*?'
    r'<div class="content">\n<span>([\s\S]*?)</span>[\s\S]*?<!--.*?-->([\s\S]*?)<div class="stats">',
    re.S)


def fetch_page(page_url, request_headers):
    """Download *page_url* and return its body decoded as UTF-8.

    Args:
        page_url: Address of the page to download.
        request_headers: Dict of HTTP headers sent with the request.

    Returns:
        The response body as text.

    Raises:
        URLError: If the request fails (``HTTPError`` is a subclass).
    """
    request = urlrequest.Request(page_url, headers=request_headers)
    response = urlrequest.urlopen(request)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()


def extract_jokes(content):
    """Return ``(author, text)`` pairs for the image-free posts in *content*.

    Args:
        content: HTML text of a listing page.

    Returns:
        A list of 2-tuples in page order. Posts whose third regex group
        contains the substring ``img`` are left out.
    """
    return [
        (author, text)
        for author, text, trailer in pattern.findall(content)
        if 'img' not in trailer
    ]


def main():
    """Fetch the configured page and print every text-only joke."""
    # Only the network call can raise URLError, so only it is guarded.
    try:
        content = fetch_page(url, headers)
    except URLError as e:
        # HTTPError carries both attributes; a plain URLError only ``reason``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    for author, text in extract_jokes(content):
        # Single formatted argument: prints "author text" on Python 2 and 3.
        print("%s %s" % (author, text))


if __name__ == '__main__':
    main()