Scraping a big wave of pretty girl pics with Python

Python version: 2.7
Happy to trade programming notes with everyone; if any part of the code could be improved, please leave your suggestions.

A big wave of girl pics coming up!

In brief, the script works in three steps: it POSTs to the site's AJAX search endpoint (girl_query_total.ashx) page by page to collect every /girl/<id> profile link (caching them in list.txt), then fetches each profile page and pulls the basic info fields out with regular expressions, and finally downloads the cover image into a per-profile folder under spider\.

# coding:utf-8
import os
import re
import urllib2
from urllib import unquote, urlencode
from time import sleep


class spider:
    def __init__(self):
        self.lst_girl = []   # all /girl/<id> profile links
        self.lst_fail = []   # links that failed to download
        self.lst_use = []
        self.PATH = os.getcwd()
        self.host = 'http://www.zngirls.com'

    # download one image into fdir; log the URL to err.txt on failure
    def saveimg(self, fdir, img_url):
        fn = img_url.split('/')
        try:
            data = urllib2.urlopen(img_url, timeout=20).read()
            f = open(fdir + '\\' + fn[-1], 'wb')
            f.write(data)
            f.close()
            print 'save image =========== ok'
        except Exception:
            print 'save image error'
            f = open(fdir + '\\err.txt', 'w')
            f.write(img_url)
            f.close()

    def mkdir(self, fdir):
        if not os.path.exists(fdir):
            os.makedirs(fdir)

    # fetch the full list of profile links via the site's AJAX endpoint
    def getgirllist(self):
        url = 'http://www.zngirls.com/ajax/girl_query_total.ashx'
        c = '%E9%9F%A9%E5%9B%BD'    # country filter: 'South Korea', URL-encoded
        # p = '%E8%BD%A6%E6%A8%A1'  # profession filter: 'car model' (unused)
        country = unquote(c)        # decode so urlencode() re-encodes it exactly once
        hd = {'Host': 'www.zngirls.com',
              'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0',
              'Referer': 'http://www.zngirls.com/find/',
              'X-Requested-With': 'XMLHttpRequest'}
        i = 1
        go = True
        lst_count = []
        while go:
            postdata = {'country': country,
                        'curpage': str(i),
                        'pagesize': '20'}
            post_data = urlencode(postdata)
            req = urllib2.Request(url, post_data, hd)
            html = urllib2.urlopen(req).read()
            pat = re.compile(r'/girl/\d+')
            lst_url = re.findall(pat, html)
            lst_count += lst_url
            print 'pages fetched: ' + str(i)
            if len(lst_url) > 1:
                i += 1
            else:
                go = False

        glst = list(set(lst_count))
        # cache the link list so later runs can skip this step
        fp = open('list.txt', 'w')
        for s in glst:
            fp.write(s + '\n')
        fp.close()
        print 'init done ================ OK'
        print 'number of links: ' + str(len(glst))
        return glst

    # parse one profile page: dump the info fields, then grab the cover image
    def solvedata(self, html):
        pat = re.compile(r"value='(.*?)'")
        found = re.findall(pat, html)

        ipat = re.compile(r'<td colspan="3">(.*?)</textarea></td>', re.S)
        tmp = ipat.search(html).group(1)
        info = re.sub(r'<[^>]+>', '', tmp)   # strip HTML tags from the bio text
        info = info.replace(' ', '')
        fdir = os.getcwd() + '\\spider\\' + found[0]
        print fdir
        self.mkdir(fdir)
        fp = open(fdir + '\\list.txt', 'w')
        for opt in found:
            fp.write(opt + '\n')
        fp.write(info)
        fp.close()
        print 'write file ====== ok'
        # === image ===
        im = re.compile(r"class='imglink' href='(.*?)'><img", re.I)
        imglink = im.search(html).group(1)
        self.saveimg(fdir, imglink)

    def main(self):
        url = 'http://www.zngirls.com'
        # read the cached link list if it exists; otherwise build it from scratch
        buf = ''
        if os.path.exists('list.txt'):
            fp = open('list.txt', 'r')
            buf = fp.read()
            fp.close()
        if len(buf) < 250:   # too short means the cache is missing or incomplete
            self.lst_girl = self.getgirllist()
        else:
            self.lst_girl = [s for s in buf.split('\n') if s]
            print 'read list from cache === ok'
        print 'number of links: ' + str(len(self.lst_girl))

        hd = {'Host': 'www.zngirls.com',
              'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:17.0)',
              'Referer': 'http://www.zngirls.com/'}
        for s in self.lst_girl:
            g_url = url + s
            # skip profiles that already have a folder
            if os.path.exists(os.getcwd() + '\\spider\\' + s[s.rfind('/') + 1:]):
                print s + ' exists'
            else:
                try:
                    req = urllib2.Request(g_url, headers=hd)
                    html = urllib2.urlopen(req).read()
                    self.solvedata(html)
                    sleep(2)   # be polite: pause between requests
                except urllib2.URLError, e:
                    self.lst_fail.append(s)
                    print '1.error:' + str(e.reason)
                    sleep(5)

        # record the failed links for a later retry
        fp = open('err.txt', 'w')
        for err in self.lst_fail:
            fp.write(err + '\n')
        fp.close()
        print 'spider success'


craw = spider()
craw.main()
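Since Python 2.7 reached end of life in 2020, here is a minimal Python 3 sketch of the paginated list query, using only the standard library. The endpoint, headers, and form fields are taken from the script above; the function name and everything else is illustrative, not from the original post, and untested against the live site.

# Python 3 sketch: the paginated AJAX query for profile links.
# Endpoint, headers and form fields come from the 2.7 script above;
# the function name and structure are illustrative assumptions.
import re
import urllib.parse
import urllib.request

def get_girl_list(country='韩国'):  # country filter: South Korea
    url = 'http://www.zngirls.com/ajax/girl_query_total.ashx'
    headers = {'User-Agent': 'Mozilla/5.0',
               'Referer': 'http://www.zngirls.com/find/',
               'X-Requested-With': 'XMLHttpRequest'}
    links, page = set(), 1
    while True:
        form = {'country': country, 'curpage': str(page), 'pagesize': '20'}
        data = urllib.parse.urlencode(form).encode('utf-8')  # POST body must be bytes
        req = urllib.request.Request(url, data=data, headers=headers)
        with urllib.request.urlopen(req, timeout=20) as resp:
            html = resp.read().decode('utf-8', errors='replace')
        found = re.findall(r'/girl/\d+', html)
        links.update(found)
        if len(found) <= 1:  # same stop condition as the 2.7 loop
            return sorted(links)
        page += 1

The main differences from 2.7: urllib2 and urllib merged into urllib.request and urllib.parse, the POST body has to be bytes, and the response has to be decoded explicitly.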


