Hands-On Web Crawler Code


This is my first crawler, my debut piece so to speak. The project scrapes the premium ("白金") source-code listings on ASP300 (listing page: http://www.asp300.com/SoftList/27/27_1.html).

It covers GET requests, forged request headers, cookie handling, extracting information with regular expressions and BeautifulSoup, overriding urllib2's redirect handling, saving images to disk, removing image watermarks with PIL, and using a cookie to obtain the real download address of a package. (I set out to write a crawler and discovered by accident that, with the right cookie, a plain GET to the download page is redirected, and the redirect target is the real download URL. I have already reported this bug to the site admin, so anyone with bad intentions can save themselves the trouble.) A short sketch of that redirect trick follows this paragraph; the full code comes after it.
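To make the trick concrete before the full listing: a redirect handler that returns the Location header instead of following it makes the opener's return value the redirect target, so opening DownBJ.jsp with the item's cookie hands back the real download address. The sketch below is only an illustration of that idea; GrabRedirect and real_download_url are made-up names, and the full, error-handled version is in the listing further down.

# Minimal sketch of the cookie + redirect trick (illustrative names only).
import urllib2
import cookielib

class GrabRedirect(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        # hand back the redirect target instead of following it
        return headers.getheaders('location')[0]
    http_error_301 = http_error_302

def real_download_url(code_id):
    jar = cookielib.CookieJar()
    cookie_handler = urllib2.HTTPCookieProcessor(jar)
    # first request sets the per-item download cookie
    opener1 = urllib2.build_opener(cookie_handler)
    opener1.open('http://www.asp300.com/2012dll/Down.jsp?CodeID=%d&id=1' % code_id)
    # second request is redirected; our handler returns the target URL
    opener2 = urllib2.build_opener(GrabRedirect, cookie_handler)
    return opener2.open('http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=%d' % code_id)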

The code is below, with comments at the key points. If anything is unclear, leave a message in the comments section of my blog.


#coding=gb2312
import urllib2
import urllib
import cookielib
import re
import socket
import time
from PIL import Image
from bs4 import BeautifulSoup


class MyError(Exception):
    # custom exception, raised when a download fails for good
    pass


class IgnoError(Exception):
    # custom exception, raised for pages we simply skip (e.g. 404)
    pass


class RedirctHandler(urllib2.HTTPRedirectHandler):
    """301/302 handler that returns the redirect target instead of following it."""
    def http_error_301(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl

    def http_error_302(self, req, fp, code, msg, headers):
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        return newurl


def Download(url, headers, num_retries=9):
    # fetch the page the url points to
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req, timeout=60)
        the_page = response.read()
        response.close()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return Download(url, headers, num_retries - 1)
        elif hasattr(e, 'code') and (e.code == 404):
            raise IgnoError
        else:
            print 'Download Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return Download(url, headers, num_retries - 1)
        raise socket.timeout
    return the_page


def resolve(html):
    # extract the package information from the detail page
    image_url = []
    soup = BeautifulSoup(html, "lxml")
    title = unicode(soup.head.title.string)
    title = re.search('(.*?)_ASP300', title).groups()[0]
    size = soup.find('div', class_='box').find('div', class_='box_1').find('div', id='goodsInfo').find('div', class_='textInfo').ul.find_all('li')[2].dd
    size = unicode(size)
    size = float(re.search(u'软件大小:(.*?)\D', size).groups()[0])  # "软件大小" = file size as printed on the page
    summary_tag = soup.find('div', class_='s')
    summary_content = unicode(summary_tag).strip()
    summary_content = summary_content.split('<br/>')
    summary_content[0] = summary_content[0][15:]        # drop the opening tag of the summary div
    del summary_content[len(summary_content) - 1]       # drop the closing tag
    summary_content = [c for c in summary_content if c != '\n']
    summary_cache = u''
    for c in summary_content:
        summary_cache += (c + u'<br/>')
    summary_content = summary_cache
    for i in summary_tag.p.find_all('img'):
        image_url.append('http://www.asp300.com' + i['src'])
    # the screenshot URLs are collected in image_url; its elements are str, not unicode
    return title, size, summary_content, image_url
    # title and summary_content are unicode, size is float, image_url holds str


def download_image(name, url, headers, num_tries=9):
    # download one screenshot to a local file
    req = urllib2.Request(url=url, headers=headers)
    try:
        f = urllib2.urlopen(req, timeout=60)
    except urllib2.URLError, e:
        if num_tries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_image(name, url, headers, num_tries - 1)
        else:
            print 'Image download error:', e.reason
            raise MyError
    except socket.timeout:
        if num_tries > 0:
            return download_image(name, url, headers, num_tries - 1)
        raise socket.timeout
    image = open(name, 'wb')
    image.write(f.read())
    f.close()
    image.close()


def screenshot(name, change, format):
    # remove the watermark by cropping `change` pixels off the bottom
    im = Image.open(name)
    w, h = im.size
    box = (0, 0, w, h - change)
    region = im.crop(box)
    region.save(name, format)


def soft_url(url, headers, num_retries=9):
    # obtain the real download address of the package
    id = int(re.search('SoftView_(.*?).html', url).groups()[0])
    url1 = 'http://www.asp300.com/2012dll/Down.jsp?CodeID=%d&id=1' % id
    # step 1: request the download page to obtain the per-item cookie
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener1 = urllib2.build_opener(handler)
    req1 = urllib2.Request(url=url1, headers=headers)
    try:
        opener1.open(req1, timeout=60)
        print '%s: download cookie obtained' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL1 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    # cookie acquired
    # step 2: with that cookie, the redirect target is the real download address
    # debug_handler = urllib2.HTTPHandler(debuglevel=1)
    opener2 = urllib2.build_opener(RedirctHandler, handler)
    url2 = 'http://www.asp300.com/2012dll/DownBJ.jsp?CodeID=%d' % id
    req2 = urllib2.Request(url=url2, headers=headers)
    try:
        html = opener2.open(req2, timeout=60)
        print '%s: download address obtained' % time.ctime()
    except urllib2.URLError, e:
        if num_retries > 0 and hasattr(e, 'code') and (500 <= e.code < 600):
            return soft_url(url, headers, num_retries - 1)
        else:
            print 'SOFT_URL2 Error:', e.reason
            raise MyError
    except socket.timeout:
        if num_retries > 0:
            return soft_url(url, headers, num_retries - 1)
        raise socket.timeout
    return html


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 '}
    url = 'http://www.asp300.com/SoftView/27/SoftView_61577.html'
    html = Download(url, headers)
    print 'page downloaded'
    title, size, summary_content, image_url = resolve(html)
    print 'information extracted'
    id = 0
    for i in image_url:
        name = './image/image_cache%d.jpg' % id
        download_image(name, i, headers)
        print 'image %d downloaded' % id
        screenshot(name, 52, 'jpeg')
        print 'image converted'
        id += 1
    download_url = soft_url(url, headers)
    print title
    print size
    summary_content = summary_content.replace(u'\u200b', u'')  # U+200B cannot be encoded as gb2312
    print summary_content, type(summary_content)
    print summary_content.encode('gb2312')
    print image_url
    print download_url
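One small note on the final print statements: the zero-width space U+200B that shows up in some scraped summaries has no gb2312 mapping, which is why it is stripped before encoding. A tiny illustration with a made-up string:

s = u'example\u200btext'  # hypothetical string containing a zero-width space
try:
    s.encode('gb2312')  # raises UnicodeEncodeError because of U+200B
except UnicodeEncodeError:
    print 'U+200B cannot be encoded as gb2312'
print s.replace(u'\u200b', u'').encode('gb2312')  # fine once the character is stripped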

