A Simple YouTube Video Crawler

YouTube is the largest video site in the world. Its headquarters are in San Bruno, California, though its early offices famously sat above a pizzeria and a Japanese restaurant; the site lets users upload, watch, and share videos and short clips. The company was registered on February 15, 2005 and founded by Steve Chen (陈士骏), a Chinese-American, together with others. In November 2006, Google acquired YouTube for US$1.65 billion and has run it as a subsidiary ever since, although Google has always been cautious about how to turn it into a profit. YouTube remained hugely popular after the acquisition; a Citibank analyst estimated that, over the full year of 2012, Google could take in about US$2.4 billion of net revenue from YouTube. On January 3, 2014, YouTube announced it would demonstrate a 4K HD streaming service at the Consumer Electronics Show (CES) in Las Vegas, built on Google's VP9 video codec. Unregistered visitors can still watch videos directly, while registered users may upload an unlimited number of them; videos that may contain offensive material are shown only to registered users over 18. As a leading online video service provider, YouTube's systems handle tens of millions of video clips every day, delivering high-quality upload, distribution, display, and browsing services to users around the world. In February 2015, CCTV streamed its Spring Festival Gala to YouTube and other overseas sites for the first time.
1. Preparation
Install pytube. I installed it directly through PyCharm's package manager, so I won't go over those steps here.
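If you prefer the command line, installing from PyPI works just as well. Keep in mind that the script below was written against the 2017-era pytube 0.x interface (yt.filename, yt.filter('mp4')); later pytube releases changed the API, so you may need to pin an older version:

    pip install pytube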
2. Writing the code
    # coding: utf-8
    __author__ = "SK"
    __date__ = "2017-03-19"

    import urllib2
    from pytube import YouTube
    from lxml import etree
    import sys, getopt


    def getHtml(url):
        # Fetch a watch page with a browser-like User-Agent
        user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1284.0 Safari/537.13'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        html = response.read()
        return html


    def getUrl(html):
        global savepath
        global maxNumber
        global timeThreshold
        global cur_count
        global videoLists
        tree = etree.HTML(html)
        # Links and durations of the related videos shown on the page
        urllist = tree.xpath(u'//div[@class="thumb-wrapper"]/a/@href')
        urllist_time = tree.xpath(u'//div[@class="thumb-wrapper"]/a/span/span/text()')
        baseurl = r'https://www.youtube.com'
        for (item_name, item_length) in zip(urllist, urllist_time):
            try:
                yt = YouTube(baseurl + item_name)
            except Exception:
                print("Something went wrong while opening this video, skipping it")
                continue
            print("video name: " + yt.filename)
            print("video time: " + item_length)
            if yt.filename in videoLists:  # the file already exists
                print("This video has been downloaded!")
            else:
                if checktime(item_length):
                    video = yt.filter('mp4')[-1]
                    print("Now loading %s ------------>" % yt.filename)
                    #video.download(savepath)
                    print("ttt----------" + video.url)
                    print("---------------> %s video is loaded!" % yt.filename)
                    #cur_count += 1
                    #videoLists.append(yt.filename)
                    #if cur_count >= maxNumber:  # reached the requested count
                    #    print('There are %d videos downloaded! This task is completed!' % maxNumber)
                    #    # TODO: if necessary, the videoLists can be logged
                    #    sys.exit()
                else:
                    print('This video is too long; it will not be downloaded, just ignored!')
        if urllist:
            getUrl(getHtml(baseurl + urllist[0]))  # crawl the next page


    def checktime(timelength):
        # Convert an "mm:ss" duration string to seconds and compare with the threshold
        global timeThreshold
        strs = timelength.split(':')
        time = int(strs[0]) * 60 + int(strs[1])
        if time < timeThreshold:
            return True
        else:
            return False


    def usage():
        print('''
        usage: python dl_youtube [option] [arg]
        options and args:
        -s      : download path
        -t      : duration threshold of the videos to download, in seconds
        -u      : start url to be crawled; it can be given more than once
        -n      : maximum number of videos to download, default is 10000
        -h      : print this help message
        ''')


    # python dl_youtube.py -n 10 -s D://MyDownloads -t 600 -u https://www.youtube.com/watch?v=TThzH_sJo6o
    if __name__ == "__main__":
        start_urls = [
                        'https://www.youtube.com/watch?v=ykmvaJ2Cf3w'
                        #'https://www.youtube.com/watch?v=TThzH_sJo6o'
                      ]
        videoLists = []  # names of files already downloaded, to avoid duplicates
        # default values
        savepath = r"D://MyDownloads"
        maxNumber = 10000
        timeThreshold = 600
        cur_count = 0
    #    opts, args = getopt.getopt(sys.argv[1:], 'hs:t:n:u:')
    #    for op, value in opts:
    #        if op == "-s":    # download path, default D://MyDownloads
    #            savepath = value
    #        elif op == '-t':  # duration limit, default 600 s
    #            timeThreshold = int(value)
    #        elif op == "-h":  # help
    #            usage()
    #            sys.exit()
    #        elif op == '-n':
    #            maxNumber = int(value)
    #        elif op == '-u':  # extra start urls to crawl
    #            start_urls.append(value)
        for item in start_urls:
            html = getHtml(item)
            getUrl(html)
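As posted, the actual download line (video.download(savepath)) and the counting logic are commented out, so the script only prints each stream URL; uncomment them to save files and stop after maxNumber videos. If you just want to sanity-check the pytube installation before running the full crawler, a minimal sketch using the same 2017-era pytube 0.x calls as the script above (YouTube(url), yt.filename, yt.filter('mp4'), video.download(); newer pytube releases moved to a yt.streams interface) might look like this; the watch URL and download folder are simply the defaults from the script, so substitute your own:

    # coding: utf-8
    from pytube import YouTube

    # Placeholder watch URL and download directory taken from the script above
    yt = YouTube('https://www.youtube.com/watch?v=ykmvaJ2Cf3w')
    print(yt.filename)                  # the video title, used as the file name
    video = yt.filter('mp4')[-1]        # the last (highest-resolution) mp4 stream
    video.download(r"D://MyDownloads")  # save it to disk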

