第一篇python爬虫
来源:互联网 发布:历史周期律 知乎 编辑:程序博客网 时间:2024/05/21 06:55
爬取douban图书
# coding=UTF-8
import json,time
from lxml import etree
from retrying import retry
from gevent import monkey
monkey.patch_all()
import requests
# base_url='https://book.douban.com/tag/?view=type&icn=index-sort-all'
from string import punctuation #去除标点符号
class Douban(object):
    """Crawl book categories and listing pages from book.douban.com.

    Walks the tag index page, extracts every category link, then pages
    through each category's book list, appending results as JSON to
    local text files.
    """

    def __init__(self):
        # Headers mimic a desktop Chrome browser so Douban serves normal pages.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
            "Host": "book.douban.com",
            "Referer": "https://book.douban.com/tag/?view=type&icn=index-sort-all",
        }
        # Entry point: the tag index page listing all book categories.
        self.url = 'https://book.douban.com/tag/?view=type&icn=index-sort-all'
        self.base_url = 'https://book.douban.com'
        # NOTE(review): hard-coded public HTTP proxy — almost certainly stale;
        # confirm or make configurable before running.
        self.proxy = {"http": "http://122.114.31.177:808"}

    @retry(stop_max_attempt_number=3)
    def parse_url(self, url):
        """Fetch *url* and return the decoded response body.

        Retried up to 3 times by the @retry decorator; any exception
        (timeout, connection error, HTTP error) triggers a retry.
        """
        response = requests.get(url, headers=self.header, timeout=3, proxies=self.proxy)
        # Bug fix: `assert status_code == 200` is silently stripped under
        # `python -O`; raise an explicit HTTPError instead. @retry still
        # re-attempts, since it retries on any exception.
        response.raise_for_status()
        return response.content.decode()

    def get_content(self, html_str):
        """Parse the tag index page into a list of category dicts.

        Each dict carries: x_href (absolute category URL or None),
        x_cate (category name), x_num (book count), b_cate (group heading).
        """
        result = etree.HTML(html_str)
        div_lists = result.xpath("//div[@class='article']/div[2]/div")
        items = []
        for div_list in div_lists:
            b_cate = div_list.xpath("./a[1]/h2/text()")
            td_lists = div_list.xpath("./table[@class='tagCol']/tbody/tr/td")
            for td in td_lists:
                item = {}
                # Bug fix: the original indexed [0] before checking for an
                # empty result, so a cell missing <a>/<b> raised IndexError.
                href = td.xpath("./a/@href")
                item['x_href'] = self.base_url + href[0] if href else None
                cate = td.xpath("./a/text()")
                item['x_cate'] = cate[0] if cate else None
                num = td.xpath("./b/text()")
                item['x_num'] = num[0] if num else None
                item["b_cate"] = b_cate[0] if b_cate else None
                items.append(item)
        return items

    def detail_content(self, html_str1):
        """Parse one category listing page.

        Returns (detail_item, next_url): a list of per-book dicts and the
        absolute URL of the next page, or None when on the last page.
        """
        detail_result = etree.HTML(html_str1)
        li_lists = detail_result.xpath("//ul[@class='subject-list']/li")
        detail_item = []
        for li in li_lists:
            item1 = {}
            item1['book_url'] = li.xpath("./div[@class='pic']/a/@href")[0]
            item1['book_img'] = li.xpath("./div[@class='pic']/a/img/@src")[0]
            item1['book_name'] = li.xpath("./div[@class='info']/h2/a/text()")[0].strip()
            item1['book_chuban'] = li.xpath("./div[@class='info']/div[@class='pub']/text()")[0].strip()
            detail_item.append(item1)
        # Bug fix: the original indexed [0] unconditionally and only THEN
        # checked the length, so the last page of every category (which has
        # no "next" link) raised IndexError instead of returning None.
        next_links = detail_result.xpath("//div[@class='paginator']/span[@class='next']/a/@href")
        next_url = self.base_url + next_links[0] if next_links else None
        return detail_item, next_url

    def save_content(self, content):
        """Overwrite lw.txt with the category list as pretty-printed JSON."""
        with open("lw.txt", "w", encoding="utf-8") as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
            f.write("\n")

    def save_detail_content(self, detail_item):
        """Append one page of book records to lw1.txt as pretty-printed JSON."""
        with open("lw1.txt", 'a', encoding="utf-8") as f:
            f.write(json.dumps(detail_item, ensure_ascii=False, indent=4))
            f.write("\n")

    def run(self):
        """Crawl the tag index, then every page of every category."""
        # Fetch and persist the category index.
        html_str = self.parse_url(self.url)
        content = self.get_content(html_str)
        self.save_content(content)
        time.sleep(2)  # be polite before hammering category pages
        for it in content:
            if it['x_href'] is None:
                continue
            html_str1 = self.parse_url(it['x_href'])
            detail_item, next_url = self.detail_content(html_str1)
            # Bug fix: the original saved only once, after the pagination
            # loop, so every page's items except the final page's were
            # silently discarded. Save each page as it is scraped.
            self.save_detail_content(detail_item)
            while next_url is not None:
                html_str1 = self.parse_url(next_url)
                print(next_url)
                detail_item, next_url = self.detail_content(html_str1)
                self.save_detail_content(detail_item)
if __name__ == '__main__':
    # Script entry point: build the crawler and start the full crawl.
    spider = Douban()
    spider.run()
- 第一篇python爬虫
- 爬虫 第一篇
- Python爬虫入门(四)PhatomJS+Selenium第一篇
- 【python】入门第一篇
- 第一篇:开始Python
- 第一篇python
- Python初学第一篇
- python第一篇
- python第一篇
- 第一篇:初识Python
- phantomjs爬虫系列---phyang ----第一篇 前言
- python爬虫抓取晋江网一篇小说
- python爬虫学习第一天
- python爬虫学习第三天
- python爬虫学习第五天
- python爬虫学习第七天
- python爬虫学习第八天
- python爬虫学习第九天
- CAS实现多线程计数器
- 寻找ZCMU
- 可绑定可扩展的帐号系统设计原理及其实现
- jsonp跨域百度
- mysql毫秒测试
- 第一篇python爬虫
- Moore-Penrose伪逆
- 【Android】【UI】透明度转十六进制
- OnTouchListenner
- CSS常见兼容性问题总结
- python3中,用Tkinter编写记事本功能
- mysql错误指令:Failed to open file "file_name" error 2/error 22
- logback之二:输出日志到控制台
- spring在web容器启动时执行初始化方法(四种方式)