A Python crawler that starts from a root URL and keeps following every link it finds

The script below (Python 2) fetches the start page, pulls every http link out of it with a regular expression, then keeps following newly discovered links and appending them to the same list; each individual fetch is cut off after five seconds by a thread-based timeout decorator.

#coding=utf-8
import urllib2
import urllib
import re
import os
import time
from threading import Thread


class TimeoutException(Exception):
    pass


ThreadStop = Thread._Thread__stop  # grab Thread's private stop method (Python 2 only)


def timelimited(timeout):
    """Decorator: run the wrapped function in a thread and abort it after `timeout` seconds."""
    def decorator(function):
        def decorator2(*args, **kwargs):
            class TimeLimited(Thread):
                def __init__(self, _error=None):
                    Thread.__init__(self)
                    self._error = _error

                def run(self):
                    try:
                        self.result = function(*args, **kwargs)
                    except Exception, e:
                        self._error = e

                def _stop(self):
                    if self.isAlive():
                        ThreadStop(self)

            t = TimeLimited()
            t.start()
            t.join(timeout)
            if isinstance(t._error, TimeoutException):
                t._stop()
                raise TimeoutException('timeout for %s' % repr(function))
            if t.isAlive():
                t._stop()
                raise TimeoutException('timeout for %s' % repr(function))
            if t._error is None:
                return t.result
        return decorator2
    return decorator


@timelimited(5)
def fn_1(url):
    # fetch one page; raises TimeoutException if it takes longer than 5 seconds
    data = urllib.urlopen(url).read()
    return data


urlbase = 'https://baidu.com'
patt = r'a href="(http.+?)"'
p = re.compile(patt)


def downHtml(url):
    count = 1
    backage = 'g:/downHtml/' + str(count)
    # os.mkdir(backage)
    path = backage + '/' + str(count) + '.html'

    request = urllib2.Request(url)
    # request = urllib2.Request('http://jandan.net/ooxx/page-1507#comments')
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) '
                       'Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    f = opener.open(request)
    doc = f.read()

    m = re.findall(p, doc)  # crawl frontier: links found on the start page
    index = 1
    for i in m:
        print index, i
        index += 1
        if len(m) > 10000:  # stop once the frontier is large enough
            break
        try:
            data = fn_1(i)
        except:
            continue
        if data is None:
            continue
        n = re.findall(p, data)
        # keep only links we have not seen yet, then extend the list we are iterating over
        n = set(n) - set(m)
        m.extend(n)

    for i in m:
        print index, i
        index += 1

    # The block below (left commented out, as in the original) would save each page to disk:
    '''
    for i in m:
        print index, urlbase + i
        index += 1
        data = urllib.urlopen(url + i).read()
        n = re.findall('href="(\?.+?)"', data)
        f = open(path, 'wb')
        f.write(data)
        f.close()
        count = count + 1
        path = backage + '/' + str(count) + '.html'
    '''


downHtml(urlbase)
print 'down'
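The timeout decorator above relies on Python 2's private Thread._Thread__stop, which no longer exists in Python 3, so the script will not run on a modern interpreter as written. Purely as an illustration (not part of the original post), here is a minimal Python 3 sketch of the same single-frontier crawl; it replaces the thread-killing timeout with urlopen's built-in timeout argument. The names fetch, crawl, URL_BASE, LINK_RE and HEADERS are placeholders introduced for this sketch.

import re
import urllib.request

URL_BASE = 'https://baidu.com'               # start page, as in the original script
LINK_RE = re.compile(r'a href="(http.+?)"')  # same link pattern as the original
HEADERS = {'User-Agent': 'Mozilla/5.0'}      # any browser-like UA string


def fetch(url, timeout=5):
    """Download one page; return its text, or None on error or timeout."""
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode('utf-8', errors='ignore')
    except Exception:
        return None


def crawl(start_url, limit=10000):
    """Same strategy as the script above: append unseen links and walk the growing list."""
    frontier = LINK_RE.findall(fetch(start_url) or '')
    seen = set(frontier)
    for index, url in enumerate(frontier, 1):
        print(index, url)
        if len(frontier) > limit:  # stop once the frontier is large enough
            break
        page = fetch(url)
        if page is None:
            continue
        for link in LINK_RE.findall(page):
            if link not in seen:
                seen.add(link)
                frontier.append(link)
    return frontier


if __name__ == '__main__':
    crawl(URL_BASE)
    print('done')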
