第一篇python爬虫

来源:互联网 发布:历史周期律 知乎 编辑:程序博客网 时间:2024/05/21 06:55

爬取douban图书



# coding=UTF-8
import json,time
from lxml import etree
from retrying import retry
from gevent import monkey
monkey.patch_all()
import requests


# base_url='https://book.douban.com/tag/?view=type&icn=index-sort-all'
from string import punctuation  #去除标点符号
class Douban(object):
    """Crawl book categories and per-category book listings from book.douban.com.

    Workflow: fetch the tag index page, extract every category link, then walk
    each category's paginated listing pages. The category index is written to
    ``lw.txt`` (overwritten each run) and book details are appended to
    ``lw1.txt``, both as pretty-printed JSON.
    """

    def __init__(self):
        # Headers mimicking a desktop Chrome browser so Douban serves the
        # normal HTML pages instead of blocking the request.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
            "Host": "book.douban.com",
            "Referer": "https://book.douban.com/tag/?view=type&icn=index-sort-all"
        }
        # Entry point: the "all tags" index page.
        self.url = 'https://book.douban.com/tag/?view=type&icn=index-sort-all'
        self.base_url = 'https://book.douban.com'
        # NOTE(review): hard-coded public HTTP proxy — almost certainly stale;
        # confirm it is still reachable or make it configurable.
        self.proxy = {"http": "http://122.114.31.177:808"}

    @retry(stop_max_attempt_number=3)
    def parse_url(self, url):
        """Fetch *url* and return the decoded response body.

        The assert makes any non-200 status raise AssertionError, which the
        @retry decorator turns into up to 3 attempts.
        """
        response = requests.get(url, headers=self.header, timeout=3,
                                proxies=self.proxy)
        assert response.status_code == 200
        return response.content.decode()

    def get_content(self, html_str):
        """Parse the tag index page into a list of category dicts.

        Each dict has keys: x_href (absolute category URL or None),
        x_cate (category name), x_num (book count), b_cate (section heading).
        """
        result = etree.HTML(html_str)
        div_lists = result.xpath("//div[@class='article']/div[2]/div")
        items = []
        for div_list in div_lists:
            td_lists = div_list.xpath("./table[@class='tagCol']/tbody/tr/td")
            for td in td_lists:
                item = {}
                # Bug fix: the original indexed [0] before its length check,
                # so an <td> without a link raised IndexError and the
                # guard could never fire. Check the list first instead.
                hrefs = td.xpath("./a/@href")
                item['x_href'] = self.base_url + hrefs[0] if hrefs else None
                cates = td.xpath("./a/text()")
                item['x_cate'] = cates[0] if cates else None
                nums = td.xpath("./b/text()")
                item['x_num'] = nums[0] if nums else None
                headings = div_list.xpath("./a[1]/h2/text()")
                item["b_cate"] = headings[0] if headings else None
                items.append(item)
        return items

    def detail_content(self, html_str1):
        """Parse one category listing page.

        Returns (detail_item, next_url): the list of book dicts on this page,
        and the absolute URL of the next page, or None on the last page.
        """
        detail_result = etree.HTML(html_str1)
        li_lists = detail_result.xpath("//ul[@class='subject-list']/li")
        detail_item = []
        for li in li_lists:
            item1 = {}
            item1['book_url'] = li.xpath("./div[@class='pic']/a/@href")[0]
            item1['book_img'] = li.xpath("./div[@class='pic']/a/img/@src")[0]
            item1['book_name'] = li.xpath("./div[@class='info']/h2/a/text()")[0].strip()
            item1['book_chuban'] = li.xpath("./div[@class='info']/div[@class='pub']/text()")[0].strip()
            detail_item.append(item1)

        # Bug fix: the original did xpath(...)[0] unconditionally, which
        # raises IndexError on the last page (no "next" link) — the length
        # check after it was dead code. Guard before indexing instead.
        next_links = detail_result.xpath(
            "//div[@class='paginator']/span[@class='next']/a/@href")
        next_url = self.base_url + next_links[0] if next_links else None
        return detail_item, next_url

    def save_content(self, content):
        """Overwrite lw.txt with the category index as JSON."""
        with open("lw.txt", "w", encoding="utf-8") as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
            f.write("\n")

    def save_detail_content(self, detail_item):
        """Append one page of book details to lw1.txt as JSON."""
        with open("lw1.txt", 'a', encoding="utf-8") as f:
            f.write(json.dumps(detail_item, ensure_ascii=False, indent=4))
            f.write("\n")

    def run(self):
        """Crawl the tag index, then walk and save every category's pages."""
        # Fetch the index page and extract all category links.
        html_str = self.parse_url(self.url)
        content = self.get_content(html_str)
        self.save_content(content)
        time.sleep(2)  # be polite before hitting the category pages
        for it in content:
            # Bug fix: the original fetched the first page of each category
            # outside the while loop but never saved its items — page 1 of
            # every category was silently dropped. Driving the loop off
            # next_url saves every page, and skips categories whose link
            # extraction yielded None.
            next_url = it['x_href']
            while next_url is not None:
                html_str1 = self.parse_url(next_url)
                print(next_url)
                detail_item, next_url = self.detail_content(html_str1)
                self.save_detail_content(detail_item)


if __name__ == '__main__':
    # Script entry point: build the crawler and kick off the full crawl.
    spider = Douban()
    spider.run()