A Small Image Crawler


I spent the holidays playing around with Python and had long wanted to write a crawler just for fun, but many sites have anti-crawler mechanisms, so the images I scraped were basically unusable. Recently I found a very obliging site, so I wrote a little crawler for it. It mainly uses the BeautifulSoup and requests libraries; it is fairly simple and not very efficient, so there is room for improvement, but for now it is just for my own amusement.
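
Before the full script, here is a minimal sketch of how those two libraries work together in this crawler: fetch a listing page with requests while sending a browser User-Agent (which is enough to get past the most basic anti-crawler checks), then parse it with BeautifulSoup. The listing URL and the 'ABox' div class are the ones the full script below assumes about the site's page layout; the html5lib parser needs the html5lib package installed.

import requests
from bs4 import BeautifulSoup

# Pretend to be a desktop browser; simple anti-crawler checks often only look at the User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
resp = requests.get('http://www.mmonly.cc/ktmh/list_28_1.html', headers=headers)

# Parse the HTML and print the link and title of every album box on the page.
soup = BeautifulSoup(resp.text, 'html5lib')
for box in soup.find_all('div', class_='ABox'):
    link = box.find('a')
    print(link['href'], link.find('img')['alt'])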

import os

import requests
from bs4 import BeautifulSoup

# Local folder that all albums are saved under.
BASE_DIR = r'G:\python\mmonly.cc'


class Mmonly(object):
    def all_page(self, url):
        # Walk through every listing page (the category has 387 pages).
        for page in range(1, 388):
            print('Now on page', page)
            page_html = url[:-6] + str(page) + '.html'
            self.all_url(page_html)

    def all_url(self, url):
        # Collect every album box on one listing page.
        html = self.request(url)
        all_a = BeautifulSoup(html.text, 'html5lib').find_all('div', class_='ABox')
        for a in all_a:
            href = a.contents[0]        # the <a> tag inside the box
            name = href.contents[0]     # the <img> tag carrying the album title
            detail_url = href['href']
            title = name['alt']
            path = str(title)
            if self.mkdir(path):
                self.html(detail_url)

    def mkdir(self, path):
        # Create a folder named after the album and cd into it;
        # skip the album if the folder already exists.
        path = path.strip()
        folder = os.path.join(BASE_DIR, path)
        if not os.path.exists(folder):
            print('Creating a folder named', path)
            os.mkdir(folder)
            os.chdir(folder)
            return True
        else:
            print(path, 'folder already exists')
            return False

    def html(self, url):
        # Read how many pictures the album has, then visit each picture page.
        pic_html = self.request(url)
        pic_num = BeautifulSoup(pic_html.text, 'html5lib').find('span', class_='totalpage').get_text()
        cnt = 0
        for page in range(1, int(pic_num) + 1):
            page_url = url[:-5] + '_' + str(page) + '.html'
            cnt += 1
            self.img(page_url, cnt)

    def img(self, url, cnt):
        # Pull the actual image URL out of one picture page.
        img_html = self.request(url)
        img_url = BeautifulSoup(img_html.text, 'html5lib').find('div', class_='big-pic').find('img')['src']
        self.save(img_url, cnt)

    def save(self, url, cnt):
        # Download the image into the current album folder.
        img = self.request(url)
        with open(str(cnt) + '.jpg', 'wb') as f:
            f.write(img.content)

    def request(self, url):
        # Every request goes out with a browser User-Agent to get past simple anti-crawler checks.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
        return requests.get(url, headers=headers)


mmonly = Mmonly()
url = 'http://www.mmonly.cc/ktmh/list_28_1.html'
mmonly.all_page(url)
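
As mentioned above, the script downloads everything sequentially, which is why it is slow. One possible improvement, not part of the original post, is to hand the per-image downloads to a thread pool. A rough sketch with concurrent.futures follows; download_all and its img_urls argument are hypothetical names for a list of direct image URLs the crawler has already collected.

import concurrent.futures
import os

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}


def download(img_url, path):
    # Fetch one image and write it to disk; each call runs in its own worker thread.
    img = requests.get(img_url, headers=HEADERS)
    with open(path, 'wb') as f:
        f.write(img.content)


def download_all(img_urls, folder):
    # img_urls is a hypothetical list of direct image URLs gathered by the crawler.
    os.makedirs(folder, exist_ok=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
        for cnt, img_url in enumerate(img_urls, start=1):
            pool.submit(download, img_url, os.path.join(folder, str(cnt) + '.jpg'))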