使用Python下载整个网站的链接,适合能目录浏览的网站。

来源:互联网 发布:堤防网络诈骗 编辑:程序博客网 时间:2024/05/08 18:42
# Copyright (C) 2012 xxx(xxx) Co., LTD.
# All rights reserved.
#
# Developed by RD BIOS Team.
#
# Authors: perry <perry.peng@cn.xxx.com>
#
# Date: January 11, 2012
#
# Project Name: WEBDOWN
# Project Version: 1.0.0
#
# Project description:
#   Recursively mirror a directory-browsable web site: crawl the HTML
#   directory listings, record every file/directory in an in-memory
#   SQLite work queue, and download the files into a matching directory
#   tree under the current working directory.
#
# History:
#    Date        Author      Description
#    -----------------------------------------------------------------
#    2012/01/11  perry       created.

__version__ = "1.0.0"

import os
import sqlite3
import sys
import threading
import time
from html.parser import HTMLParser
from http.client import HTTPConnection
from urllib.parse import unquote, urlparse
from urllib.request import url2pathname


class DownloadThread(threading.Thread):
  """Worker thread that drains the Webdown file queue and downloads files."""

  def __init__(self, wd):
    # wd: the owning Webdown instance (shared queue + lock).
    self.wd = wd
    threading.Thread.__init__(self)

  def run(self):
    # Each worker keeps its own connection; Webdown.http belongs to the
    # crawler thread.  Bug fix: the original referenced the global `wd`
    # instead of `self.wd`.
    http = HTTPConnection(self.wd.url)

    while True:
      s = self.wd.get1()
      if s is None:
        if not self.wd.finished:
          break                 # crawler is done and the queue is drained
        time.sleep(1)           # crawler still discovering URLs; wait
        continue

      # Map the (percent-encoded) URL path onto a local filesystem path.
      p = os.getcwd() + url2pathname(unquote(s))

      if not os.path.exists(p):
        try:
          http.close()
          http.request('GET', s)
          r = http.getresponse()
          if r.status == 200:
            print(r.getheader('content-length', 0), s)
            # Bug fix: the original opened the file inside try and closed
            # it in finally, raising NameError if open() itself failed.
            with open(p, 'wb') as f:
              f.write(r.read())
        except OSError as e:    # network or filesystem failure: best effort
          print('FAIL ', s, e)
      else:
        print('EXISTS ', s)

      self.wd.set1(s, 1)        # mark processed whether or not it succeeded

    print('exit...')


class Webdown(HTMLParser):
  """Crawl a directory-listing site, queueing every link it finds.

  An in-memory SQLite table acts as the shared work queue between the
  crawler (go) and the DownloadThread worker; self.lock serializes all
  access to it.
  """

  # True while go() is still discovering URLs; the worker keeps polling
  # as long as this is set.
  finished = False

  def __init__(self, url):
    try:
      url_info = urlparse(url, 'http')
      self.url = url_info.netloc
      self.http = HTTPConnection(url_info.netloc)
      # check_same_thread=False: the connection is touched from both the
      # crawler and the download thread; self.lock guards every use.
      self.dbc = sqlite3.connect(':memory:', check_same_thread=False)
      self.lock = threading.Lock()
      self.path = url_info.path
      self.dbc.execute('''
        create table if not exists download (
          id integer primary key autoincrement,
          name text,
          url text,
          path text,
          local_path text,
          is_dir integer default 0,
          is_searched integer default 0,
          is_queried integer default 0,
          is_download integer default 0)''')
      # Normalize the root path to exactly one trailing slash and derive
      # a display name from its last component.
      name = self.path
      while name.endswith('/'):
        name = name[:-1]
      self.path = name + '/'
      i = name.rfind('/')
      if i > 0:
        name = name[i + 1:]
      # Seed the queue with the root directory (is_dir=1, unsearched).
      self.puturl(name, self.url, self.path, os.getcwd(), 1)
    except (ValueError, sqlite3.Error) as e:
      print('WebDown initialize failure...', e)
    HTMLParser.__init__(self)

  def handle_starttag(self, tag, attrs):
    """Queue each plain <a href> link found in a directory listing."""
    if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href':
      return
    href = attrs[0][1]
    if href == '../':     # ignore the parent folder.
      return
    if href == './':      # ignore the current folder.
      return
    if href.startswith('?'):   # ignore sort-order query links
      return
    if href.startswith('~'):
      return
    is_dir = 0
    name = href
    searched = 1
    if name.endswith('/'):
      # Trailing slash marks a sub-directory: it still needs crawling.
      name = name[:-1]
      searched = 0
      is_dir = 1
    self.puturl(name, self.url, self.path + href, '', is_dir, searched)

  def puturl(self, name, url, path, lpath='', isdir=0, searched=0):
    """Insert one discovered URL into the queue."""
    with self.lock:
      self.dbc.execute(
        'insert into download (name,url,path,local_path,is_dir,is_searched) '
        'values(?,?,?,?,?,?)',
        (name, url, path, lpath, isdir, searched))

  def set1(self, path, status=0):
    """Mark a file as queried (processed by the download worker)."""
    with self.lock:
      self.dbc.execute('update download set is_queried=? where path=?',
                       (status, path))

  def get1(self):
    """Return the next not-yet-queried file path, or None."""
    with self.lock:
      r = self.dbc.execute(
        'select path from download where is_dir=0 and is_queried=0 limit 1')
      s = r.fetchone()
    return s[0] if s is not None else None

  def set2(self, path, status=0):
    """Mark a directory as searched (its listing has been crawled)."""
    with self.lock:
      self.dbc.execute('update download set is_searched=? where path=?',
                       (status, path))

  def get2(self, url):
    """Return the next unsearched directory path (with trailing '/'), or None."""
    with self.lock:
      r = self.dbc.execute(
        'select path from download '
        'where url=? and is_searched=0 and is_dir=1 limit 1', (url,))
      s = r.fetchone()
    if s is not None and s[0] is not None:
      s = s[0]
      if not s.endswith('/'):
        s = s + '/'
    return s

  def set3(self, path, status=0):
    """Mark a file as downloaded.

    Bug fix: the original acquired the lock and never released it,
    deadlocking the next database access.
    """
    with self.lock:
      self.dbc.execute('update download set is_download=? where path=?',
                       (status, path))

  def get3(self):
    """Return the next not-yet-downloaded file path, or None."""
    with self.lock:
      r = self.dbc.execute(
        'select path from download where is_dir=0 and is_download=0 limit 1')
      s = r.fetchone()
    return s[0] if s is not None else None

  def go(self):
    """Crawl directories breadth-first while a worker thread downloads files."""
    self.finished = True
    q = DownloadThread(self)
    q.start()
    while self.path is not None:
      try:
        # Create the local directory mirroring the remote one.
        p = os.getcwd() + url2pathname(unquote(self.path))
        if not os.path.exists(p):
          os.makedirs(p)
      except OSError:
        pass                    # best effort; the worker re-checks paths
      try:
        self.http.close()
        self.http.request('GET', self.path)
        r = self.http.getresponse()
        if r.status == 200:
          self.reset()
          # HTMLParser.feed needs text; the response body is bytes.
          self.feed(r.read().decode('utf-8', 'replace'))
      except OSError:
        pass                    # skip unreachable directories
      self.set2(self.path, 1)
      self.path = self.get2(self.url)

    self.finished = False       # tell the worker it can stop when drained
    q.join()


if __name__ == "__main__":
  if len(sys.argv) > 1:
    # Bug fix: the original read sys.argv[0] (the script name).
    url = sys.argv[1].strip()
  else:
    # http://www.20cn.net/share/alalmn
    # http://www.gaby.de/ftp/pub/win3x/archive/
    print('You must provide a valid Url.\n')
    print('Usage:\n  Python %s target' % os.path.basename(sys.argv[0]))
    print('    target   --- specify a URL to download.\n')
    url = ''
    while len(url) == 0:
      url = input('Please enter a URL:').strip()
  wd = Webdown(url)
  wd.go()

原创粉丝点击