Python 爬取电影下载链接

来源:互联网 发布:iphone6s plus 淘宝 编辑:程序博客网 时间:2024/06/06 19:38
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape movie titles and download links from ygdy8.net list pages.

For every list page the script collects the detail-page URLs, fetches each
detail page, extracts the movie title and the download link text, and
appends both to a text file.

The module runs on Python 2 (``urllib2``) and Python 3 (``urllib.request``).
"""
import io
import os  # noqa: F401  (unused here; kept from the original script)
import sys

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 exposes the same Request/urlopen

import chardet  # noqa: F401  (unused here; kept from the original script)
from bs4 import BeautifulSoup

# Prefix prepended to the hrefs found on list pages.
HOST = "http://www.ygdy8.net"
# List pages are addressed as <prefix><page number>.html.
LIST_URL_PREFIX = "http://www.ygdy8.net/html/gndy/dyzz/list_23_"
# File that receives the scraped titles and links.
OUTPUT_PATH = r'e:\down_load_url.txt'
# Encoding used to decode detail pages before parsing.
PAGE_ENCODING = 'GBK'
# Number of list pages crawled when the script is run directly.
DEFAULT_PAGES = 2


def get_movie_download_url(html):
    """Return the download link text found in a movie detail page.

    The link is the text of the first ``<a>`` inside the first ``<td>`` whose
    ``style`` attribute is ``WORD-WRAP: break-word``.

    Args:
        html: Markup of the detail page.

    Returns:
        The anchor's string, or ``None`` when the cell or the anchor is
        missing, or when the anchor has no single string child.
    """
    soup = BeautifulSoup(html, 'html.parser')
    cell = soup.find('td', attrs={'style': 'WORD-WRAP: break-word'})
    if cell is None:
        return None
    anchor = cell.find('a')
    if anchor is None:
        return None
    return anchor.string


def get_movie_title(html):
    """Return the text of the first ``<h1>`` of a movie detail page.

    Args:
        html: Markup of the detail page.

    Returns:
        The heading's string, or ``None`` when the page has no ``<h1>`` or
        the heading has no single string child.
    """
    soup = BeautifulSoup(html, 'html.parser')
    heading = soup.find('h1')
    if heading is None:
        return None
    return heading.string


def get_html(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to download.

    Returns:
        The undecoded bytes of the response.
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    # Open the Request object, not the bare URL, so the User-Agent header
    # set above is actually sent.
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        response.close()


def get_movie_list(url):
    """Return the absolute detail-page URLs linked from a list page.

    Args:
        url: Address of a movie list page.

    Returns:
        A list with ``HOST`` prepended to the ``href`` of every
        ``<a class="ulink">`` element; anchors without an ``href`` are
        skipped.
    """
    soup = BeautifulSoup(get_html(url), 'html.parser')
    anchors = soup.find_all('a', attrs={'class': 'ulink'})
    hrefs = (anchor.get('href') for anchor in anchors)
    return [HOST + href for href in hrefs if href]


def file_edit(wr_str, path=OUTPUT_PATH):
    """Append *wr_str* to the output text file, encoded as UTF-8.

    Args:
        wr_str: Text to append. Byte strings are decoded as UTF-8 first.
        path: Destination file; defaults to ``OUTPUT_PATH``.
    """
    if isinstance(wr_str, bytes):
        # io.open in text mode only accepts unicode text.
        wr_str = wr_str.decode('utf-8')
    with io.open(path, 'a', encoding='utf-8') as out:
        out.write(wr_str)


def write_to_txt(a_urls):
    """Fetch each detail page and append its title and download link.

    Each movie is written as a title line, a link line and a blank line.
    Pages where either value cannot be found are reported on stderr and
    skipped instead of aborting the whole run.

    Args:
        a_urls: Iterable of detail-page URLs.
    """
    for a_url in a_urls:
        # 'replace' keeps one undecodable byte from aborting the crawl.
        html = get_html(a_url).decode(PAGE_ENCODING, 'replace')
        title = get_movie_title(html)
        download_url = get_movie_download_url(html)
        if title is None or download_url is None:
            sys.stderr.write(
                "skipped %s: title or download link not found\n" % a_url)
            continue
        file_edit(u"{0}\n{1}\n\n".format(title, download_url))


def get_pages_url(num):
    """Return the URLs of list pages 1 through *num* (inclusive).

    Args:
        num: Number of list pages to address.

    Returns:
        A list of ``num`` URLs; empty when ``num`` is less than 1.
    """
    return [LIST_URL_PREFIX + str(n) + ".html" for n in range(1, num + 1)]


def main(pages=DEFAULT_PAGES):
    """Crawl *pages* list pages and write every movie found to the file."""
    for page_url in get_pages_url(pages):
        write_to_txt(get_movie_list(page_url))
    print("done")


if __name__ == '__main__':
    main()
原创粉丝点击