
来源:互联网 发布:lg扫地机器人.知乎 编辑:程序博客网 时间:2024/05/22 10:30



这里不得不感叹一句:Life is short, you need Python! 它简单优雅到你试一次后就会爱上它,对于爬取个什么美女图片之类的事情更是信手拈来。不多废话了,直接上代码,总共才 100 来行,代码只做了简单的注释。

# Baidu FM downloader: fetch a channel's playlist through the public JSON
# endpoints, resolve each song id to a direct MP3 link, and save the files
# next to this script.

import http.client
import json
import os
import re
import socket
import threading
from urllib.request import Request, urlopen

# Global socket timeout so a stalled connection cannot hang the script.
socket.setdefaulttimeout(10)

# Endpoint listing all channels: http://fm.baidu.com/dev/api/?tn=channellist
CHANNEL_LIST_URL = 'http://fm.baidu.com/dev/api/?tn=channellist'
PLAYLIST_URL = 'http://fm.baidu.com/dev/api/?tn=playlist&format=json&id=%s'
SONG_LINK_URL = "http://music.baidu.com/data/music/fmlink?type=mp3&rate=320&songIds=%s"
DEFAULT_CHANNEL_ID = 'public_yuzhong_yueyu'
BLOCK_SIZE = 8192

# Errors treated as "request failed": network/file errors and timeouts
# (OSError), malformed URLs, undecodable bodies and invalid JSON (ValueError),
# and protocol-level failures raised by http.client.  KeyboardInterrupt is
# deliberately NOT included so Ctrl+C still stops the script.
_FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


def _fetch_json(url, cache_path):
    """Fetch *url*, save the raw body to *cache_path*, return the parsed JSON.

    The body is decoded as UTF-8 and parsed directly from memory; the cache
    file is only a debugging copy of the last response.

    Returns:
        The decoded JSON value, or None when the request, the decoding, or the
        JSON parsing fails.
    """
    try:
        with urlopen(url) as response:
            body = response.read().decode('utf8')
    except _FETCH_ERRORS:
        return None
    with open(cache_path, mode='w', encoding='utf-8') as cache_file:
        cache_file.write(body)
    try:
        return json.loads(body)
    except ValueError:
        return None


def get_channel_list(page_url):
    """Return the 'channel_list' entry of the channel-list response.

    Prints every channel's 'channel_name' as a side effect and caches the raw
    response in ./channle.json.  Returns an empty dict when the response could
    not be fetched or parsed.  Raises KeyError if the response lacks
    'channel_list'.
    """
    content = _fetch_json(page_url, "./channle.json")
    if content is None:
        return {}
    channel_list = content['channel_list']
    for channel in channel_list:
        print(channel['channel_name'])
    return channel_list


def get_song_list(channel_url):
    """Return the 'list' entry of a channel's playlist response.

    Caches the raw response in ./songs.json.  Returns an empty dict when the
    response could not be fetched or parsed.  Raises KeyError if the response
    lacks 'list'.
    """
    content = _fetch_json(channel_url, "./songs.json")
    if content is None:
        return {}
    return content['list']


def get_song_real_url(song_url):
    """Resolve a song-info URL to ``(song_name, song_link, song_size)``.

    Reads the first entry of ``content['data']['songList']`` and caches the raw
    response in ./song.json.  Returns ``(None, None, 0)`` when the response
    could not be fetched or parsed, or when the expected fields are missing or
    the size is not an integer.
    """
    content = _fetch_json(song_url, "./song.json")
    if content is None:
        return None, None, 0
    try:
        first_song = content['data']['songList'][0]
        song_link = first_song['songLink']
        song_name = first_song['songName']
        song_size = int(first_song['size'])
    except (KeyError, IndexError, TypeError, ValueError):
        print('get real link failed')
        return None, None, 0
    return song_name, song_link, song_size


def _discard_partial(file_full_path, file_name):
    """Delete an incomplete download and record the removal in log.txt."""
    if not os.path.exists(file_full_path):
        return
    os.remove(file_full_path)
    print('download time out, file deleted')
    with open('log.txt', 'a', encoding='utf-8') as log_file:
        log_file.write("time out rm %s\n" % file_name)


def donwn_mp3_by_link(song_link, song_name, song_size):
    """Download *song_link* to ``<script dir>/<song_name>.mp3``.

    Does nothing when the target file already exists.  The download counts as
    finished once at least *song_size* bytes have been received; if the stream
    ends or fails before that, the partial file is deleted and the removal is
    appended to log.txt.

    Args:
        song_link: direct URL of the MP3 file.
        song_name: song title, used as the file name (without extension).
        song_size: expected size in bytes.
    """
    file_name = song_name + ".mp3"
    base_dir = os.path.dirname(__file__)
    file_full_path = os.path.join(base_dir, file_name)
    if os.path.exists(file_full_path):
        return

    print("begin DownLoad %s, size = %d" % (song_name, song_size))
    down_loaded_size = 0
    finished = False
    try:
        # Both handles are closed by the with-block before any cleanup runs,
        # so removing a partial file also works on platforms that refuse to
        # delete open files.
        with urlopen(song_link) as mp3, open(file_full_path, "wb") as out_file:
            while True:
                buffer = mp3.read(BLOCK_SIZE)
                if not buffer:
                    break  # stream ended (possibly early)
                down_loaded_size += len(buffer)
                print('%s %d of %d' % (song_name, down_loaded_size, song_size))
                out_file.write(buffer)
                if down_loaded_size >= song_size:
                    finished = True
                    break
    except _FETCH_ERRORS as err:
        print('download failed: %s (%s)' % (song_name, err))

    if finished:
        print('%s download finshed' % file_full_path)
    elif down_loaded_size < song_size:
        _discard_partial(file_full_path, file_name)


def downViaMutiThread(song_info_list):
    """Download every ``(song_name, song_link, song_size)`` in its own thread.

    Starts one thread per song and blocks until all of them have finished.
    """
    task_threads = [
        threading.Thread(
            target=donwn_mp3_by_link,
            args=(song_link, song_name, song_size),
        )
        for song_name, song_link, song_size in song_info_list
    ]
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()


def main():
    """Print the channel list, then keep downloading the default channel."""
    # Step 1: fetch (and print) the channel list.
    get_channel_list(CHANNEL_LIST_URL)

    channel_url = PLAYLIST_URL % DEFAULT_CHANNEL_ID
    # NOTE: loops forever, requesting the playlist again after each pass;
    # stop the script with Ctrl+C.
    while True:
        # Step 2: get the ids of all songs in the channel's current playlist.
        song_id_list = get_song_list(channel_url)

        # Step 3: resolve each id to its real link and download it, one song
        # at a time.  To download in parallel instead, collect the
        # (song_name, song_link, song_size) tuples and pass them to
        # downViaMutiThread().
        for song_id in song_id_list:
            song_url = SONG_LINK_URL % song_id['id']
            song_name, song_link, song_size = get_song_real_url(song_url)
            if song_size != 0:
                donwn_mp3_by_link(song_link, song_name, song_size)


if __name__ == '__main__':
    main()

2 0