糗事百科简单爬虫

来源:互联网 发布:青天白日t恤 淘宝 编辑:程序博客网 时间:2024/05/16 14:03

仅仅为了记录,只抓一页的

1、用正则写的

# coding=utf-8
"""Scrape one "hot" page of qiushibaike.com jokes with a hand-written regex.

Python 3 port of the original Python 2 script (the urllib2 module,
print statements and ``except E, e`` syntax no longer parse on any
supported interpreter).  The flat script is split into a network step
(fetch_page), a pure parsing step (parse_jokes) and a printing driver
(main) so the parser can be tested without network access.
"""
import re
import urllib.error
import urllib.request

# Browser User-Agent kept from the original script; presumably the site
# rejects urllib's default agent — TODO confirm against the live site.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')

# Capture groups: 1 = author (inside <h2>), 2 = joke body (content div),
# 3 = filler between the content div and the stats div (unused),
# 4 = vote count (stats number).  re.S lets .*? span newlines.
JOKE_PATTERN = re.compile(
    '<h2>(.*?)</h2.*?content">(.*?)</div>(.*?)'
    '<div class="stats.*?number">(.*?)</',
    re.S)


def fetch_page(page):
    """Download hot page *page* and return its HTML decoded as UTF-8.

    Raises urllib.error.URLError (including HTTPError) on failure.
    """
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    request = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')


def parse_jokes(content):
    """Parse page HTML into a list of (author, text, votes) str tuples.

    <br/> tags inside the joke body are dropped, matching the original
    script's .replace('<br/>', ''); surrounding whitespace is stripped.
    Returns [] when nothing matches (e.g. empty input).
    """
    return [
        (author.strip(), text.replace('<br/>', '').strip(), votes.strip())
        for author, text, _unused, votes in JOKE_PATTERN.findall(content)
    ]


def main(page=1):
    """Crawl a single page and print each joke plus a summary line."""
    print('开始爬取第 %d 页' % page)
    try:
        content = fetch_page(page)
    except urllib.error.URLError as e:
        # Original behaviour: report HTTP code and/or reason, then stop.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return
    count = 0
    for author, text, votes in parse_jokes(content):
        count += 1
        print('发布者:' + author, '\n段子:' + text + '\n', votes + '人点赞')
        print(count)
        print('-' * 40 + '华丽的分界线' + '-' * 40)
    print('第 %d 页共爬取 %d 个段子' % (page, count))


if __name__ == '__main__':
    main()

2、用xpath写的

# coding=utf-8
"""Scrape one "hot" page of qiushibaike.com jokes with lxml xpath.

Python 3 port of the original Python 2 script.  Two bugs fixed:

* The old code read only ``Element.text`` of the content div, which
  stops at the first child tag (e.g. an inline <br/>), silently
  truncating multi-line jokes — the likely cause of the "sometimes
  incomplete" symptom.  ``itertext()`` collects every text fragment.
* ``h2[0]`` raised IndexError when a post had no author <h2> (e.g.
  anonymous posts); such posts are now skipped instead of crashing.
"""
import urllib.error
import urllib.request

# Browser User-Agent kept from the original script; presumably the site
# rejects urllib's default agent — TODO confirm against the live site.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')


def fetch_page(page):
    """Download hot page *page* and return its HTML decoded as UTF-8.

    Raises urllib.error.URLError (including HTTPError) on failure.
    """
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    request = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')


def parse_jokes(content):
    """Parse page HTML into a list of (author, text, votes) str tuples."""
    # Third-party dependency kept function-local so importing this module
    # does not require lxml to be installed.
    from lxml import etree

    root = etree.HTML(content)
    jokes = []
    for article in root.xpath('//div[@class="article block untagged mb15"]'):
        h2 = article.xpath(
            './div[@class="author clearfix"]/a[2]/h2'
            '|./div[@class="author clearfix"]/span[2]/h2')
        if not h2:
            # No author heading — skip rather than crash (old IndexError).
            continue
        author = (h2[0].text or '').strip()
        content_div = article.xpath('./div[@class="content"]')[0]
        # itertext() joins the text around inline tags such as <br/>,
        # where plain .text would stop at the first child element.
        text = ''.join(content_div.itertext()).strip()
        votes = article.xpath('.//span[@class="stats-vote"]/i')[0].text
        jokes.append((author, text, votes))
    return jokes


def main(page=1):
    """Crawl a single page and print each joke with a running counter."""
    try:
        content = fetch_page(page)
    except urllib.error.URLError as e:
        # Original behaviour: report HTTP code and/or reason, then stop.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return
    for count, (author, text, votes) in enumerate(parse_jokes(content), 1):
        print('第' + str(count) + '条')
        print('发布者:' + author, '\n内容:' + text, '\n' + votes + '人赞')
        print('-' * 40 + '-' * 40)


if __name__ == '__main__':
    main()

有时候抓不全——原因应该是 xpath 版里只读了 content div 的 .text,而 .text 只返回第一个子标签(比如 <br/>)之前的文字,后面的内容就丢了;用 itertext() 把所有文本片段拼起来即可。



0 0
原创粉丝点击