爬虫入门1——爬图

来源:互联网 发布:陕西网络作家协会 编辑:程序博客网 时间:2024/06/16 02:33

低级版

"""Crawler tutorial, basic version: download .jpg images from jandan.net/ooxx.

Finds the newest comment-page number, then walks backwards page by page,
scraping <img src=...> tags by plain string searching and saving each
.jpg into a local folder.
"""
import urllib.request
import os

# Pretend to be a desktop browser so the site does not reject the request.
USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36')


def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Bug fix: the original built a Request with a User-Agent header but then
    passed the bare URL to urlopen(), so the header was never actually sent.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', USER_AGENT)
    # Pass the prepared Request (not the bare URL) so the header takes effect.
    response = urllib.request.urlopen(req)
    return response.read()


def get_page(url):
    """Return the current (newest) comment-page number as a string."""
    html = url_open(url).decode('utf-8')
    # The page embeds: current-comment-page">[N]< — skip the 20-char marker
    # plus the 3 chars of '">[' to land on the number itself.
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]


def extract_img_addrs(html):
    """Parse *html* and return full http URLs for every 'img src=' tag whose
    address ends in .jpg (protocol-relative // links get an 'http:' prefix).
    """
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        # Only look for '.jpg' within 255 chars of the tag start; anything
        # further away is assumed to belong to a different tag.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            # a+9 skips over 'img src="' to the first character of the URL.
            img_addrs.append('http:' + html[a + 9:b + 4])
        else:
            # No .jpg here — resume searching just past this tag's src=.
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs


def find_imgs(url):
    """Download the page at *url* and return its list of .jpg image URLs."""
    return extract_img_addrs(url_open(url).decode('utf-8'))


def save_imgs(img_addrs):
    """Download every URL in *img_addrs* into the current working directory,
    naming each file after the last path component of its URL."""
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open(each))


def download_mm(folder='picture', pages=10):
    """Download images from the newest *pages* comment pages into *folder*.

    Bug fix: the original did ``page_num -= i`` inside the loop, stepping
    back 0, 1, 3, 6, ... pages instead of one page per iteration; decrement
    by 1 so consecutive pages are fetched.
    """
    if not os.path.exists(folder):
        os.mkdir(folder)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))
    for _ in range(pages):
        page_url = url + 'page-' + str(page_num) + '#comments'
        print(page_url)
        save_imgs(find_imgs(page_url))
        page_num -= 1


if __name__ == '__main__':
    download_mm()

正则表达式版

"""Crawler tutorial, regex version: download .jpg images from jandan.net/ooxx.

Same flow as the basic version, but image addresses are extracted with a
regular expression and fetched via urlretrieve.
"""
import urllib.request
import os
import re

# Pretend to be a desktop browser so the site does not reject the request.
USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36')


def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Bug fix: the original built a Request with a User-Agent header but then
    passed the bare URL to urlopen(), so the header was never actually sent.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', USER_AGENT)
    # Pass the prepared Request (not the bare URL) so the header takes effect.
    response = urllib.request.urlopen(req)
    return response.read()


def get_page(url):
    """Return the current (newest) comment-page number as a string."""
    html = url_open(url).decode('utf-8')
    # The page embeds: current-comment-page">[N]< — skip the 20-char marker
    # plus the 3 chars of '">[' to land on the number itself.
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]


def find_jpg_urls(html):
    """Return every src="... .jpg" address found in *html*.

    Bug fix: non-greedy ``.+?`` — the original greedy ``.+`` could swallow
    several src attributes on one line into a single bogus match.
    """
    return re.findall(r'src="(.+?\.jpg)"', html)


def get_imgs(url):
    """Download every .jpg referenced by the page at *url* into the current
    directory, naming each file after the last path component of its URL."""
    html = url_open(url).decode('utf-8')
    for each in find_jpg_urls(html):
        filename = each.split('/')[-1]
        # Addresses are protocol-relative (//...), so complete them first.
        urllib.request.urlretrieve('http:' + each, filename)


def download_mm(folder='picture', pages=10):
    """Download images from the newest *pages* comment pages into *folder*.

    Bug fix: the original did ``page_num -= i`` inside the loop, stepping
    back 0, 1, 3, 6, ... pages instead of one page per iteration; decrement
    by 1 so consecutive pages are fetched.
    """
    if not os.path.exists(folder):
        os.mkdir(folder)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))
    for _ in range(pages):
        page_url = url + 'page-' + str(page_num) + '#comments'
        print(page_url)
        get_imgs(page_url)
        page_num -= 1


if __name__ == '__main__':
    download_mm()
阅读全文
1 0