人民的名义

来源:互联网 发布:淘宝怎么开多个店铺 编辑:程序博客网 时间:2024/04/25 23:45

当时追剧的时候写的。www.h6080.com 上还是有不少好东西。用到了 wget 和 progressbar,需要的朋友可以借鉴一下。

import re
from multiprocessing import Pool

import progressbar
import requests
import wget
from lxml import etree

# Request headers sent by get_mp4().  Kept at module level (instead of inside
# the __main__ guard) so get_mp4() also works when this file is imported.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) '
                  'Gecko/20100101 Firefox/22.0'
}


def get_page_url():
    """Return the list of play-page URLs to scrape.

    Page numbers 1-31 and 37-60 are used; 32-36 are deliberately left out,
    matching the two ranges the script has always used.
    """
    # list(...) is required on Python 3, where range() is not a list and has
    # no extend().
    page_numbers = list(range(1, 32)) + list(range(37, 61))
    return ['http://www.h6080.com/play/24892/1/{}.html'.format(i)
            for i in page_numbers]


def get_video_url(page_url):
    """Return the player URL embedded in the play page at *page_url*.

    Fetches the page, takes the text of the first <script> under the element
    whose id is "player", and returns whatever follows ``src="`` up to the
    first ``&``.

    Raises:
        IndexError: if no such script exists or it contains no ``src="...&``
            fragment.
    """
    response = requests.get(page_url).text
    # Parse from bytes, as the script always did.
    selector = etree.HTML(response.encode('utf-8'))
    scripts = selector.xpath('//*[@id="player"]/script/text()')
    return re.findall('src="(.*?)&', scripts[0])[0]


def get_mp4(video_url):
    """Return the media URL found in the response to POSTing *video_url*.

    Raises:
        IndexError: if the response text has no ``video=...'...']`` fragment.
    """
    text = requests.post(video_url, headers=HEADERS).text
    # Match against the decoded text: a str pattern cannot be applied to
    # bytes on Python 3.
    return re.findall("video=.'(.*)']", text)[0]


def save_mp4_urls(video_urls):
    """Write *video_urls* to ``mp4_urls.txt``, one per line.

    Each line is ``<1-based index><two spaces><url>``, which is the format
    read_mp4_urls() parses.
    """
    with open('mp4_urls.txt', 'w') as f:
        for i, video_url in enumerate(video_urls, 1):
            f.write('{}  {}\n'.format(i, video_url))


def read_mp4_urls(filename):
    """Return the URLs stored in *filename* by save_mp4_urls().

    The URL is everything after the first two-space separator on a line.
    Blank lines are skipped, and the last line does not need a trailing
    newline.

    Raises:
        IndexError: if a non-blank line has no two-space separator.
    """
    urls = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            urls.append(line.split('  ', 1)[1])
    return urls


def main():
    """Download every URL listed in ``mp4_urls.txt`` to ``<n>.mp4``."""
    # Disabled first stage (it is what produces mp4_urls.txt):
    #     video_urls = [get_video_url(u) for u in get_page_url()]
    #     mp4_urls = [get_mp4(u) for u in video_urls]
    #     print(len(video_urls))
    #     print(mp4_urls)
    #     save_mp4_urls(mp4_urls)
    mp4_urls = read_mp4_urls('mp4_urls.txt')
    bar = progressbar.ProgressBar()
    # Wrap the list itself: enumerate() has no len(), so wrapping
    # enumerate(...) would hide the total from the progress bar.
    for i, url in enumerate(bar(mp4_urls)):
        filename = str(i + 1) + '.mp4'
        print(filename)
        wget.download(url, filename)
    # Disabled alternative: download with a pool of 4 worker processes.
    #     p = Pool(4)
    #     for index, url in enumerate(mp4_urls):
    #         filename = str(index + 1) + '.mp4'
    #         p.apply_async(wget.download, (url, filename, ))
    #     p.close()
    #     p.join()


if __name__ == '__main__':
    main()