Python爬虫实例1-抓取百度贴吧

来源:互联网 编辑:程序博客网 时间:2024/05/02 04:48

采集 网络爬虫吧 的所有贴吧信息

http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&fr=search

解决问题思路:

  1. 确认需求数据在哪

    右键查看源代码

  2. Fiddler模拟发送数据

# -*- coding:utf-8 -*-import urllib2import chardetfrom lxml import etreeimport jsonimport urllibdef GetTimeByArticle(url):    request = urllib2.Request(url)    response = urllib2.urlopen(request)    resHtml = response.read()    html = etree.HTML(resHtml)    return html.xpath('.//*[@class="tail-info"]')[1].textdef main():    output = open('tieba0628.json', 'w')    queryUrl = {'kw': '网络爬虫'}    request = urllib2.Request('http://tieba.baidu.com/f?ie=utf-8&'+ urllib.urlencode(queryUrl) +'&fr=search')    response = urllib2.urlopen(request)    print 'response start'    resHtml = response.read()    print 'response read'    print chardet.detect(resHtml)    html = etree.HTML(resHtml)    result = html.xpath('//li[@data-field]')    print result    print len(result)    for site in result:        #print etree.tostring(site, encoding='utf-8')        title = site.xpath('.//a[@title]')[0].text        #title = site.xpath('.//a/@title')[0]        author = site.xpath('.//*[@class="frs-author-name-wrap"]/a')[0].text        lastName = site.xpath('.//*[@class="tb_icon_author_rely j_replyer"]/a')[0].text        reply_date = site.xpath('.//span[@class="threadlist_reply_date pull_right j_reply_data"]')[0].text.strip()        Article_url = site.xpath('.//*[@class ="j_th_tit "]')[0].attrib['href']        reply_date = GetTimeByArticle('http://tieba.baidu.com/'+Article_url)        rep_num = site.xpath('.//*[@class="threadlist_rep_num center_text"]')[0].text        field = json.loads(site.attrib['data-field'])        print title,author,lastName,reply_date,rep_num,field        item = {}        item['title'] = title        item['author'] = author        item['lastName'] = lastName        item['reply_date'] = reply_date        item['rep_num'] = rep_num        item['field'] = field        print item        line = json.dumps(item, ensure_ascii=False)        print line        print type(line)        output.write(line.encode('utf-8') + "\n")        break    output.close()    print 'end'if __name__ == 
'__main__':    main()


1 0