python3 scraper for the 27270 beauty-photo site (Part 2)


This post is an attempt at scraping the beauty-photo pages of the 27270 site, using the

Python 3 libraries: urllib, BeautifulSoup, lxml

The goal is to download the images from static web pages.

First, build a proxy pool from an IP-proxy listing site; then extract the image links from the static pages; finally, download the images. Two pieces are needed (a minimal sketch of both follows the list):

1. A scraper that downloads proxy IPs
2. Downloading while impersonating a browser
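As a minimal sketch of those two steps in isolation, the snippet below builds an opener that sends browser-like headers and routes requests through a proxy. The proxy address here is a placeholder, not one scraped from a real pool:

import urllib.request

# NOTE: placeholder proxy address; the real script draws one from the scraped pool
proxy = 'http://1.2.3.4:8080'

# route requests through the proxy and impersonate a desktop browser
opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy}))
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'),
    ('Accept', 'text/html, application/xhtml+xml, */*'),
]

html = opener.open('http://www.27270.com/ent/meinvtupian/list_11_1.html', timeout=30).read()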

Analyzing the page

<body>
  ...other page markup omitted...
  <div>
    ...other page markup omitted...
    <div class="MeinvTuPianBox">
      <ul>
        ...other page markup omitted...
        <li>
          <a href="*****" title="******" class="MMPic" target="_blank">
            <i><img src="*****" width="190" height="280" alt="*****" /></i>
          </a>
          ...other page markup omitted...
        </li>
        ...other page markup omitted...
      </ul>
    </div>
  </div>
</body>

From the markup above you can see how the elements nest, which pins down the path to the images we need:

body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img
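This path can be used verbatim as a CSS selector with BeautifulSoup's select(). A minimal, self-contained sketch against markup like the above (the href/src/alt values are made-up placeholders, and the lxml parser is assumed to be installed):

from bs4 import BeautifulSoup

sample = '''
<body><div><div class="MeinvTuPianBox"><ul>
  <li><a href="/detail_1.html" title="demo" class="MMPic" target="_blank">
    <i><img src="/img/demo.jpg" width="190" height="280" alt="demo" /></i>
  </a></li>
</ul></div></div></body>'''

soup = BeautifulSoup(sample, 'lxml')
for img in soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img'):
    print(img.get('src'), img.get('alt'))  # -> /img/demo.jpg demo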

The complete code:

import os, time
import sys
import random
import http.cookiejar
import urllib.request
from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

base_url = 'http://www.27270.com/'  # ent/meinvtupian/list_11_%s.html
one_url = ['word']
base_dir = ''
proxy_ip = []

# class myThread(threading.Thread):
#     def __init__(self, start, end):
#         threading.Thread.__init__(self)
#         # self.threadID = threadID
#         self.start = start
#         self.end = end
#
#     def run(self):
#         print("Starting thread: " + self.name)
#         # print_time(self.name, self.counter, 5)
#         get_url_list(self.start, self.end)
#         print("Exiting thread: " + self.name)

# build the IP proxy pool from xicidaili's listing pages
def getProxyIp():
    proxy = []
    for i in range(1, 3):
        header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                'Ubuntu Chromium/44.0.2403.89 '
                                'Chrome/44.0.2403.89 '
                                'Safari/537.36'}
        req = urllib.request.Request(url='http://www.xicidaili.com/nt/{0}'.format(i), headers=header)
        r = urllib.request.urlopen(req)
        soup = BeautifulSoup(r, 'html.parser', from_encoding='utf-8')
        table = soup.find('table', attrs={'id': 'ip_list'})
        tr = table.find_all('tr')[1:]
        # parse each row into proxy address, port, and type
        for item in tr:
            tds = item.find_all('td')
            kind = "{0}:{1}".format(tds[1].get_text().lower(), tds[2].get_text())
            proxy.append("http://" + kind)
    return proxy

# pick a random IP from the pool
def getIP():
    ip = random.choice(proxy_ip)
    return ip

def makeMyOpener(head={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'}):
    proxy = getIP()
    print(proxy)
    cj = http.cookiejar.CookieJar()
    # the original appended the proxy as a fake ('http', proxy) request header,
    # which has no effect; ProxyHandler is what actually routes the traffic
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj),
        urllib.request.ProxyHandler({'http': proxy}))
    opener.addheaders = list(head.items())
    return opener

# download one image into a per-page subdirectory
def download(url, file_name, index):
    dir = base_dir + str(index) + '/'
    if not os.path.isdir(dir):
        os.makedirs(dir)
    dir = dir + file_name
    try:
        with urlopen(url, timeout=30) as r:
            content = r.read()
            with open(dir, 'wb') as code:
                code.write(content)
        # time.sleep(1)
    except:
        pass

def get_url_list(index, end):
    girl_list = []
    try:
        # if end == index:
        #     print('All pages have been scraped')
        #     threading.currentThread().stop()
        oper = makeMyOpener()
        url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % index
        html = oper.open(url)
        # method 1: grab every <img>
        # bsObj = BeautifulSoup(html, 'lxml')
        # girl_list = bsObj.findAll('img')
        # method 2: CSS selector
        soup = BeautifulSoup(html, 'lxml')
        girl_list = soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img')
        if not girl_list:
            print('All pages have been scraped')
            sys.exit(0)
        # method 3: XPath via lxml
        # response = requests.get(image_detail_link).content
        # sel = html.fromstring(html)
        # girl_list = sel.xpath("//div[@class='MeinvTuPianBox']/ul/li/a[@class='MMPic']/i/img")[0]
        mm_down = []
        mm_names = []
        # method 4: regular expressions, omitted here
        for mpoto in girl_list:
            mm_link = mpoto.get('src')
            mm_nick = mpoto.get('alt')
            mm_down.append(mm_link)
            mm_names.append(mm_nick)
        for gril, name in zip(mm_down, mm_names):
            download(gril, name + '.jpg', index)
            print(gril + name)
        index = index + 1
        get_url_list(index, end)
    except HTTPError as e:
        print('HTTPError' + str(e.code))
        get_url_list(index, end)
    except URLError as e:
        print('URLError' + str(e.reason))
        get_url_list(index, end)
    # return girl_list

if __name__ == '__main__':
    proxy_ip = getProxyIp()
    base_dir = 'E:/cache-work/python3/images1/'
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    # start at page 163; 'end' is unused because the stop check above is commented out
    get_url_list(163, 100)
    """
    try:
        _thread.start_new_thread(get_url_list, (1, 35,))
        _thread.start_new_thread(get_url_list, (35, 70,))
        _thread.start_new_thread(get_url_list, (70, 110,))
        _thread.start_new_thread(get_url_list, (110, 150,))
        _thread.start_new_thread(get_url_list, (150, 500,))
    except:
        print("Error: unable to start thread")
    while 1:
        pass
    """
    """
    thread1 = myThread(1, 35)
    thread2 = myThread(35, 70)
    thread3 = myThread(70, 110)
    thread4 = myThread(110, 150)
    thread5 = myThread(150, 1000)
    thread1.start()
    thread2.start()
    thread3.start()
    thread4.start()
    thread5.start()
    """
    # create the worker threads
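One caveat about the script: get_url_list() recurses once per page and again after every HTTPError/URLError, so a long run can exhaust Python's default recursion limit (1000 frames). Below is a sketch of an iterative variant, reusing the makeMyOpener() and download() helpers defined above (crawl_pages is a hypothetical name, not part of the original script):

def crawl_pages(start, stop):
    # iterative rewrite of get_url_list: a plain loop cannot exhaust the call stack
    for index in range(start, stop):
        try:
            oper = makeMyOpener()
            url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % index
            soup = BeautifulSoup(oper.open(url), 'lxml')
            imgs = soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img')
            if not imgs:
                print('All pages have been scraped')
                break
            for img in imgs:
                download(img.get('src'), img.get('alt') + '.jpg', index)
        except (HTTPError, URLError) as e:
            print('skipping page %s: %s' % (index, e))

# e.g. crawl_pages(1, 200)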