斗图套图爬取
来源:互联网 发布:centos下搭建hadoop 编辑:程序博客网 时间:2024/06/06 02:11
# encoding: utf8
"""Download the image sets listed on doutula.com article pages.

Each listing page (``.../article/list/?page=N``) links to article pages;
every article page carries a title and a group of images.  The images of
one article are saved under ``img/<article title>/``.
"""
import os
from multiprocessing import Pool

import requests
from lxml import etree

# Seconds to wait on a request before giving up; without a timeout a stalled
# connection would block a worker process indefinitely.
REQUEST_TIMEOUT = 15


def _safe_name(name):
    """Return *name* with path separators replaced so it is a single folder."""
    cleaned = name.replace("/", "_").replace("\\", "_")
    return cleaned or "untitled"


def _file_name(url):
    """Return the file name for an image URL: its last 10 characters.

    Anything up to a ``/`` inside that tail is dropped so a short final path
    segment cannot turn the file name into a nested path.
    """
    tail = url[-10:].rsplit("/", 1)[-1]
    return tail or "image"


class Dt:
    """Crawler for the doutula.com article listing."""

    def __init__(self):
        # Listing URL prefix; the page number is appended to it.
        self.stit = "https://www.doutula.com/article/list/?page="
        self.root_url = "https://www.doutula.com/article/list/"
        # Browser-like User-Agent sent with every request.
        self.head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36"}

    def _fetch(self, url):
        """GET *url* with the crawler's headers.

        Raises:
            requests.RequestException: on connection failure, timeout or an
                HTTP error status.
        """
        response = requests.get(url, headers=self.head, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing or saving an error page.
        response.raise_for_status()
        return response

    def get_ts(self, url):
        """Process every article linked from the listing page at *url*.

        A failure in one article is reported and skipped so the remaining
        articles on the page are still processed.
        """
        page = etree.HTML(self._fetch(url).text)
        links = page.xpath('//*[@class="row"]/div[1]/a/@href')
        for link in links:
            try:
                self.img(link)
            except Exception as e:  # best-effort boundary: one article per try
                print("%s: %s" % (link, e))

    def img(self, url):
        """Read the title and image URLs of the article at *url* and download them."""
        page = etree.HTML(self._fetch(url).text)
        titles = page.xpath('//*[@class="pic-title"]/h1/a/text()')
        if not titles:
            # No title means no folder name; skip rather than raise IndexError.
            print("no title found, skipping: %s" % url)
            return
        name = titles[0].strip()
        # NOTE(review): "artile_des" is kept exactly as in the original;
        # presumably it matches the site's own class name - confirm.
        urls = page.xpath('//*[@class="artile_des"]//img/@src')
        self.download(name, urls)

    def download(self, name, urls):
        """Save each image in *urls* into the folder ``img/<name>``.

        The folder (and ``img`` itself) is created when missing and reused
        when it already exists.

        Raises:
            requests.RequestException: if an image cannot be fetched.
            OSError: if the folder or a file cannot be written.
        """
        folder = os.path.join("img", _safe_name(name))
        os.makedirs(folder, exist_ok=True)
        for i in urls:
            data = self._fetch(i).content
            with open(os.path.join(folder, _file_name(i)), "wb") as f:
                f.write(data)
            print("完成")

    def _crawl_page(self, url):
        """Pool worker: run ``get_ts`` and report, rather than propagate, errors.

        An exception escaping a worker would make ``Pool.map`` raise in the
        parent and abandon all other listing pages.
        """
        try:
            self.get_ts(url)
        except Exception as e:  # best-effort boundary: one listing page per try
            print("%s: %s" % (url, e))

    def dio(self, pages=range(1, 50), processes=10):
        """Crawl the given listing pages in parallel.

        Args:
            pages: iterable of listing page numbers (default: pages 1-49).
            processes: number of worker processes in the pool.
        """
        page_urls = [self.stit + str(i) for i in pages]
        # map() blocks until every page is done, so leaving the ``with``
        # block (which terminates the pool) cannot cut work short.
        with Pool(processes) as pool:
            pool.map(self._crawl_page, page_urls)


if __name__ == '__main__':
    wode = Dt()
    wode.dio()
阅读全文