Python: downloading files with multiple processes

来源:互联网 发布:中国联通拨号软件 编辑:程序博客网 时间:2024/04/28 04:16
# Multi-process image downloader.
#
# Reads a list of URLs from a text file, downloads each one in its own
# worker process (bounded concurrency), then deletes any '.jpg' files in
# the current directory that are too small to be real pictures.

import urllib
import os
from multiprocessing import Process
import time

try:  # Python 3 location of urlretrieve
    from urllib.request import urlretrieve
except ImportError:  # Python 2 fallback
    from urllib import urlretrieve

# Defaults preserved from the original script.
DEFAULT_PHOTO_PREFIX = 'http://hwcdn.ddstatic.com/fhg/fhg_photos/2011_04_28/11121/roxanna_milan_11121_1-gal-1600-jpg/'
DEFAULT_URL_LIST = '/Users/chenhaoy/tutorial/piclinks.txt'
DEFAULT_MAX_PROCESSES = 50
DEFAULT_MIN_PICTURE_SIZE = 20000  # bytes; smaller '.jpg' files are deleted


def download(url, filename):
    """Download *url* into the local file *filename*.

    Failures are reported on stdout rather than raised, because this
    function is used as a ``Process`` target where an exception would only
    produce a traceback in the child. A partially written or stale
    *filename* is removed on failure.

    Returns:
        True when the download succeeded, False otherwise.
    """
    try:
        urlretrieve(url, filename)
    except Exception:
        # Deliberate best effort: any failure (network, bad URL, disk) is
        # handled the same way -- clean up and report.
        if os.path.isfile(filename):
            os.remove(filename)
        print("Failed Downloading... %s" % filename)
        return False
    return True


def getphotos(prefix=DEFAULT_PHOTO_PREFIX, count=20):
    """Sequentially download '0001.jpg' .. '<count>.jpg' found under *prefix*.

    Each file is saved in the current directory under its own name.
    """
    for i in range(count):
        filename = "%04d" % (i + 1) + '.jpg'
        print(filename)
        download(prefix + filename, filename)


def removepics(directory='.', min_size=DEFAULT_MIN_PICTURE_SIZE):
    """Delete '.jpg' files in *directory* that are smaller than *min_size* bytes.

    The size is taken from the file system instead of reading the whole
    file into memory, which also avoids leaking a file handle per picture.

    Returns:
        The number of files removed.
    """
    count = 0
    for name in os.listdir(directory):
        if not name.endswith('.jpg'):
            continue
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        if os.path.getsize(path) < min_size:
            os.remove(path)
            count += 1
    print('remove %d files' % count)
    return count


def down_from_file(urls=DEFAULT_URL_LIST, maxprocess=DEFAULT_MAX_PROCESSES,
                   grace=5):
    """Download every URL listed in the text file *urls*, one process per URL.

    Args:
        urls: Path of a text file with one URL per line. Blank lines are
            skipped and surrounding whitespace (including the line ending)
            is stripped from each URL.
        maxprocess: Maximum number of download processes alive at once.
        grace: Seconds to wait, in total, for the remaining downloads once
            every URL has been queued. Workers still running after that
            are terminated.

    Files are saved in the current directory as '00001.jpg', '00002.jpg',
    ... in queue order. Afterwards ``removepics()`` deletes undersized
    pictures, which also cleans up files left by terminated workers.
    """
    maxprocess = max(1, maxprocess)  # a limit below 1 would never admit work
    processlist = []
    itr = 0
    with open(urls) as url_file:
        for line in url_file:
            url = line.strip()
            if not url:
                continue
            # Throttle: wait until a slot frees up before starting another.
            while len(processlist) >= maxprocess:
                time.sleep(1)
                processlist = [p for p in processlist if p.is_alive()]
            itr += 1
            filename = "%05d.jpg" % itr
            print('downloading.. %s to %s' % (url, filename))
            p = Process(target=download, args=(url, filename))
            processlist.append(p)
            p.start()

    print('Process count=%d' % len(processlist))
    print('add all pic to download list..')

    # Give in-flight downloads up to `grace` seconds overall, returning
    # early if they all finish sooner.
    deadline = time.time() + grace
    for p in processlist:
        p.join(max(0.0, deadline - time.time()))

    # Anything still running is treated as stalled: stop it and reap it.
    for p in processlist:
        if p.is_alive():
            p.terminate()
            p.join(2)

    removepics()
    print('bye..')


def main():
    """Script entry point: download everything listed in the default URL file."""
    down_from_file()


if __name__ == '__main__':
    main()

0 0