Crawler for 妹子图 (mmjpg.com)


import requests
from lxml import etree
import os

source_url = 'http://www.mmjpg.com'

# A Session carries the full browser-like header set on every request;
# note that Host is deliberately left out (see the summary below).
s = requests.Session()
s.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
s.headers['Cache-Control'] = 'max-age=0'
s.headers['Connection'] = 'keep-alive'
s.headers['Accept-Encoding'] = 'gzip, deflate, sdch'
s.headers['Upgrade-Insecure-Requests'] = '1'
# s.headers['Host'] = 'www.mmjpg.com'
s.headers['Referer'] = 'http://www.mmjpg.com/'
s.headers['Accept-Language'] = 'zh-CN,zh;q=0.8'
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'


def downpage(droute, ele_con_url, No=1):
    """Download picture No of one gallery, then recurse to the next picture."""
    print(No)
    con = requests.get(ele_con_url)
    page_con = con.content.decode('utf-8')
    page_con = etree.HTML(page_con)
    down_url = page_con.xpath('//div[@class="content"]/a/img/@src')[0]
    print(down_url)
    with open(droute + str(No) + '.jpg', 'wb') as f:
        con = s.get(down_url)  # use the session so Referer etc. are sent
        # print(con)
        f.write(con.content)
    try:
        # "下一张" = next picture; indexing [0] fails when the button is missing
        bldown = page_con.xpath('//div[@class="page"]/a[text()="下一张"]/@href')[0]
    except:
        print("pictures of this lady have been downloaded ")
    else:
        if bldown:
            ele_con_url = source_url + bldown
            No = No + 1
            downpage(droute, ele_con_url, No)


# Walk the list pages; each <li> is one gallery.
con_url = source_url
page = 0
while True:
    page = page + 1
    print(con_url)
    con = s.get(con_url)
    print()
    page_content = etree.HTML(con.content.decode('utf-8'))
    item_data = page_content.xpath('//div[@class="pic"]/ul/li')
    item_len = len(item_data)
    print(item_len)
    print("page :%d" % page)
    for eve_item in item_data:
        ele_con_url = eve_item.xpath('a/@href')[0]
        ele_name = eve_item.xpath('a/img/@alt')[0]
        print(ele_con_url)
        if os.path.exists(ele_name):
            continue
        else:
            os.mkdir(ele_name)
            downpage(ele_name + '/', ele_con_url)
        print(ele_con_url, ele_name)
    try:
        # "下一页" = next list page; an empty xpath result means the last page
        blnextpage = page_content.xpath('//div[@class="page"]/a[text()="下一页"]/@href')[0]
    except:
        print("pictures of all ladies have been downloaded ")
        break
    else:
        con_url = source_url + blnextpage



Summary:

1. etree's xpath() always returns a list, so you normally take the first element. I forgot that here and spent quite a while debugging.
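A minimal sketch of that pitfall (the HTML snippet below is made up for illustration):

from lxml import etree

doc = etree.HTML('<div class="content"><a href="#"><img src="/a.jpg"/></a></div>')
srcs = doc.xpath('//div[@class="content"]/a/img/@src')
print(srcs)     # ['/a.jpg'] -- a list, even when there is only one match
print(srcs[0])  # '/a.jpg'   -- take the first element before using it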

2. While writing this I hit a problem where images downloaded fine but could not be opened. The cause was not the download code itself; the data the server sent back was simply wrong, in two ways:

1. an error status code instead of 200;

2. a different image substituted for the one requested.

Fix: fill in the request headers as completely as possible, and when using a Session, leave the Host header out (as sketched below).
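As a rough sketch of that fix (the safe_download helper and its name are my own illustration, not part of the script above): the Session carries the header set, Host is never set by hand, and the status code is checked before the file is written.

import requests

s = requests.Session()
# Write the headers as completely as practical; do NOT set Host by hand --
# requests derives it from each request's URL.
s.headers['Referer'] = 'http://www.mmjpg.com/'
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'

def safe_download(url, path):
    # Hypothetical helper: only write the file when the server really
    # returned the picture (HTTP 200); otherwise report what came back.
    con = s.get(url)
    if con.status_code != 200:
        print('bad response %d for %s' % (con.status_code, url))
        return False
    with open(path, 'wb') as f:
        f.write(con.content)
    return True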

3. The try: / except: / else: pattern is very handy. To decide whether to jump to the next page, I simply extract the link of the 下一页 (next page) button; when the page has no such button, the extraction cannot return a link and raises an exception, so except and else tell me whether the end of the listing has been reached (see the sketch below).
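Schematically it looks like this (same xpath as in the script; page_content is assumed to be a page already parsed with etree.HTML, inside the main while loop):

try:
    # Indexing [0] raises IndexError when the page has no "下一页" button,
    # because xpath() then returns an empty list.
    next_href = page_content.xpath('//div[@class="page"]/a[text()="下一页"]/@href')[0]
except IndexError:
    print("pictures of all ladies have been downloaded")
    # break out of the surrounding while loop here
else:
    con_url = source_url + next_href  # go on to the next list page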


4. The gallery-download function can be rewritten without recursion.

For example:

import requests
from lxml import etree
import os

source_url = 'http://www.mmjpg.com'

s = requests.Session()
s.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
s.headers['Cache-Control'] = 'max-age=0'
s.headers['Connection'] = 'keep-alive'
s.headers['Accept-Encoding'] = 'gzip, deflate, sdch'
s.headers['Upgrade-Insecure-Requests'] = '1'
# s.headers['Host'] = 'www.mmjpg.com'
s.headers['Referer'] = 'http://www.mmjpg.com/'
s.headers['Accept-Language'] = 'zh-CN,zh;q=0.8'
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'


def downpage(droute, ele_con_url, No=1):
    """Iterative version: loop over the pictures of one gallery instead of recursing."""
    while True:
        print(No)
        con = requests.get(ele_con_url)
        page_con = con.content.decode('utf-8')
        page_con = etree.HTML(page_con)
        down_url = page_con.xpath('//div[@class="content"]/a/img/@src')[0]
        print(down_url)
        with open(droute + str(No) + '.jpg', 'wb') as f:
            con = s.get(down_url)
            # print(con)
            f.write(con.content)
        try:
            bldown = page_con.xpath('//div[@class="page"]/a[text()="下一张"]/@href')[0]
        except:
            print("pictures of this lady have been downloaded ")
            break
        else:
            ele_con_url = source_url + bldown
            No = No + 1


# The list-page loop is the same as in the recursive version.
con_url = source_url
page = 0
while True:
    page = page + 1
    print(con_url)
    con = s.get(con_url)
    print()
    page_content = etree.HTML(con.content.decode('utf-8'))
    item_data = page_content.xpath('//div[@class="pic"]/ul/li')
    item_len = len(item_data)
    print(item_len)
    print("page :%d" % page)
    for eve_item in item_data:
        ele_con_url = eve_item.xpath('a/@href')[0]
        ele_name = eve_item.xpath('a/img/@alt')[0]
        print(ele_con_url)
        if os.path.exists(ele_name):
            continue
        else:
            os.mkdir(ele_name)
            downpage(ele_name + '/', ele_con_url)
        print(ele_con_url, ele_name)
    try:
        blnextpage = page_content.xpath('//div[@class="page"]/a[text()="下一页"]/@href')[0]
    except:
        print("pictures of all ladies have been downloaded ")
        break
    else:
        con_url = source_url + blnextpage


Comments and suggestions are welcome.



