百思不得姐视频爬取

来源:互联网 发布:sql去重复查询余一条 编辑:程序博客网 时间:2024/04/30 00:45
# -*- coding:utf-8 -*-
"""Small Tkinter GUI that downloads MP4 videos listed on budejie.com.

Pressing the button starts a worker thread that fetches listing pages
(``http://www.budejie.com/video/<page>``), extracts ``[title, mp4_url]``
pairs with regular expressions, and saves each video into the ``video``
directory while logging progress in the scrolled text widget.
"""
import os
import re
import sys
import threading

import requests

try:  # Python 2 module names, as used by the original script
    from Tkinter import END, Button, Label, StringVar, Tk
    from ScrolledText import ScrolledText
    from urllib import urlretrieve
except ImportError:  # Python 3 equivalents
    from tkinter import END, Button, Label, StringVar, Tk
    from tkinter.scrolledtext import ScrolledText
    from urllib.request import urlretrieve

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
PAGE_URL = 'http://www.budejie.com/video/'
VIDEO_DIR = 'video'
# Downloading stops once the running sequence number reaches this value,
# i.e. videos numbered 1..9 are fetched.
VIDEO_ID_LIMIT = 10

# Compiled once at import time instead of on every page / every item.
_ITEM_RE = re.compile(r'<div class="j-r-list-c">.*?</div>.*?</div>', re.S)
_MP4_RE = re.compile(r'data-mp4="(.*?)">')
_TITLE_RE = re.compile(r'<a href="/detail-.{8}?.html">(.*?)</\w', re.S)
# Characters that are not allowed in Windows file names, plus line breaks/tabs
# (titles are captured with re.S and may span lines).
_UNSAFE_CHARS_RE = re.compile(r'[\\/:*?"<>|\r\n\t]+')

page = 1      # number of the next listing page to fetch
video_id = 1  # 1-based sequence number of the next video to download

# GUI objects; created by main() and read by get()/write().
root = None
text = None
varl = None


def _parse_videos(html):
    """Return the ``[title, mp4_url]`` pairs found in one page of HTML.

    Items without a ``data-mp4`` attribute are skipped. Within an item,
    titles and URLs are paired positionally with ``zip``.
    """
    pairs = []
    for item_html in _ITEM_RE.findall(html):
        mp4_urls = _MP4_RE.findall(item_html)
        if not mp4_urls:
            continue
        titles = _TITLE_RE.findall(item_html)
        for title, mp4_url in zip(titles, mp4_urls):
            pairs.append([title, mp4_url])
    return pairs


def _safe_filename(title, fallback):
    """Turn a video title into a usable file name (without extension).

    Runs of unsafe characters become ``_``; surrounding spaces and dots are
    stripped. ``fallback`` is returned when nothing usable is left.
    """
    cleaned = _UNSAFE_CHARS_RE.sub('_', title).strip(' .')
    return cleaned or fallback


def get():
    """Fetch the next listing page and return its ``[title, mp4_url]`` pairs.

    Updates the status label, advances the global ``page`` counter, and
    prints every pair found to stdout. A fresh list is returned on each
    call; nothing is accumulated between pages.
    """
    global page
    url = PAGE_URL + str(page)
    varl.set('已经获取到第%s页视频' % (page))
    html = requests.get(url, headers=HEADERS).text
    page += 1
    entries = _parse_videos(html)
    for title, mp4_url in entries:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s %s' % (title, mp4_url))
    return entries


def write():
    """Download videos page by page until ``VIDEO_ID_LIMIT`` is reached.

    Intended to run in the worker thread started by ``start()``. Each
    download is logged to the text widget as ``<n>.<url>`` followed by the
    title on the next line.

    NOTE(review): this updates Tk widgets from a non-main thread, as the
    original did; Tkinter does not promise that is safe on every build --
    consider marshalling updates through ``root.after``.
    """
    global video_id
    if not os.path.isdir(VIDEO_DIR):
        os.makedirs(VIDEO_DIR)
    while video_id < VIDEO_ID_LIMIT:
        entries = get()
        if not entries:
            # An empty page would otherwise make this loop spin forever.
            break
        # Iterate without mutating the list: the original popped from the
        # list it was iterating, which skipped every other entry.
        for title, mp4_url in entries:
            if video_id >= VIDEO_ID_LIMIT:
                break
            filename = _safe_filename(title, str(video_id)) + '.mp4'
            urlretrieve(mp4_url, os.path.join(VIDEO_DIR, filename))
            text.insert(END, str(video_id) + '.' + mp4_url + '\n' + title + '\n')
            video_id += 1
    varl.set('抓取完毕')


def start():
    """Button callback: run ``write()`` in a background thread so the GUI stays responsive."""
    worker = threading.Thread(target=write)
    worker.start()


def main():
    """Build the window (log area, start button, status label) and run the Tk main loop."""
    global root, text, varl
    root = Tk()
    root.title('视频爬取')
    root.geometry('+400+100')  # window position on screen
    text = ScrolledText(root, font=('微软雅黑', 10))
    text.grid()
    button = Button(root, text='开始爬取', font=('微软雅黑', 10), fg='blue', command=start)
    button.grid()
    varl = StringVar()
    label = Label(root, font=('微软雅黑', 10), fg='black', textvariable=varl)
    label.grid()
    varl.set('已准备...')
    root.mainloop()


if __name__ == '__main__':
    main()
0 0
原创粉丝点击