# Copyright (C) 2012 xxx(xxx) Co., LTD.# All rights reserved.## Developed by RD BIOS Team.## Authors: perry <perry.peng@cn.xxx.com>## Date: January 11, 2012## Project Name: WEBDOWN# Project Version: 1.0.0## Project descrition:## History:# Date Auther Description# -----------------------------------------------------------------# 2012/01/11 perry created.## Note:# xxx__version__ = "1.0.0"import os, sys, ioimport sqlite3try: # Python 2.7 from urlparse import urlparse from urllib import ( unquote, url2pathname)except ImportError: # Python 3.2 from urllib.parse import urlparsetry: # Python 2.7 from HTMLParser import HTMLParserexcept ImportError: # Python 3.2 from html.parser import HTMLParsertry: # Python 2.7 from httplib import HTTPConnectionexcept ImportError: # Python 3.2 from http.client import HTTPConnectionimport timeimport threadingclass DownloadThread(threading.Thread): def __init__(self, wd): self.wd = wd threading.Thread.__init__(self) def run(self): http = HTTPConnection(wd.url) while True: s = self.wd.get1() if s is None: if not self.wd.finished: break time.sleep(1) continue x = unquote(s.encode(sys.stdin.encoding)) p = os.getcwd() + url2pathname(x) if not os.path.exists(p): try: http.close() http.request('GET', s) r = http.getresponse() if r.status == 200: print r.getheader('content-length', 0), s try: f = open(p, 'wb') f.write(r.read()) finally: f.close() except: print 'FAIL ', s else: print 'EXISTS ', s self.wd.set1(s, 1) print('exit...')class Webdown(HTMLParser): finished = False def __init__(self, url): try: url_info = urlparse(url, 'http') self.url = url_info.netloc self.http = HTTPConnection(url_info.netloc) self.dbc = sqlite3.connect(':memory:', check_same_thread = False) self.lock = threading.Lock() self.path = url_info.path self.dbc.execute(''' create table if not exists download ( id integer primary key autoincrement, name text, url text, path text, local_path text, is_dir integer default 0, is_searched integer default 0, is_queried integer default 0, is_download integer default 0)''') name = self.path while name.endswith('/'): name = name[:-1] self.path = name + '/' i = name.rfind('/') if i > 0: name = name[i + 1:] self.puturl(name, self.url, self.path, os.getcwd(), 1) except: print('WebDown initialize failure...') HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href': return href = attrs[0][1] if href == '../': # ignore the parent folder. return if href == './': # ignore the current folder. return if href.startswith('?'): return if href.startswith('~'): return dir = 0 name = href searched = 1 if name.endswith('/'): name = name[:-1] searched = 0 dir = 1 self.puturl(name, self.url, self.path + href, '', dir, searched) def puturl(self, name, url, path, lpath='', isdir=0, searched=0): self.lock.acquire() self.dbc.execute('insert into download (name,url,path,local_path,is_dir,is_searched) values(?,?,?,?,?,?)', ( name,url, path, lpath, isdir, searched)) self.lock.release() def set1(self, path, status=0): self.lock.acquire() self.dbc.execute('update download set is_queried=? where path=?', (status, path)) self.lock.release() def get1(self): self.lock.acquire() r = self.dbc.execute('select path from download where is_dir=0 and is_queried=0 limit 1') s = r.fetchone() self.lock.release() if s is not None: return s[0] return s def set2(self, path, status=0): self.lock.acquire() self.dbc.execute('update download set is_searched=? where path=?', (status, path)) self.lock.release() def get2(self, url): self.lock.acquire() r = self.dbc.execute('select path from download where url=? and is_searched=0 and is_dir=1 limit 1', (url,)) s = r.fetchone() self.lock.release() if s is not None and s[0] is not None: s = s[0] if not s.endswith('/'): s = s + '/' return s def set3(self, path, status=0): self.lock.acquire() self.dbc.execute('update download set is_download=? where path=?', (status, path)) def get3(self): self.lock.acquire() r = self.dbc.execute('select path from download where is_dir=0 and is_download=0 limit 1') s = r.fetchone() self.lock.release() if s is not None: return s[0] return s def go(self): self.finished = True q = DownloadThread(self) q.start() while self.path is not None: try: s = unquote(self.path.encode(sys.stdin.encoding)) p = os.getcwd() + url2pathname(s) if not os.path.exists(p): os.makedirs(p) #print(s) except: pass try: self.http.close() self.http.request('GET', self.path) r = self.http.getresponse() if r.status == 200: self.reset() self.feed(r.read()) except: pass self.set2(self.path, 1) self.path = self.get2(self.url) self.finished = False q.join()if __name__ == "__main__": if len(sys.argv) > 1: url = sys.argv[0] url = url.strip() else: # http://www.20cn.net/share/alalmn # http://www.gaby.de/ftp/pub/win3x/archive/ print('You must provide a valid Url.\n') print('Usage:\n Python %s target' % os.path.basename(sys.argv[0])) print(' target --- specify a URL to donwload.\n') url = '' while len(url) == 0: if sys.version.startswith('3.2'): url = input('Please enter a URL:') else: url = raw_input('Please enter a URL:') url = url.strip() wd = Webdown(url) wd.go()