Scraping Bilibili anime (番剧) images with Python

# -*- coding: utf-8 -*-
# Python 2 script: download Bilibili bangumi cover images and titles by page ID.
import os
import urllib2

from bs4 import BeautifulSoup

print "Enter the start and end positions (1~999)"
page_start = raw_input("page_start:")
page_end = raw_input("page_end:")
start = int(page_start)
end = int(page_end)

# Create an output directory under the working directory,
# plus a text file that collects the titles and IDs.
path = os.getcwd()
new_path = os.path.join(path, 'bilibili')
if not os.path.isdir(new_path):
    os.mkdir(new_path)
file_url = os.path.join(new_path, "AnimeList" + page_start + "_" + page_end + ".txt")
data = open(file_url, 'w')

def GetPageInfo(page):
    # Each bangumi detail page lives at /anime/3<page>.
    url = 'http://bangumi.bilibili.com/anime/3' + str(page)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request).read().decode("utf-8")
    soup = BeautifulSoup(response, "html.parser")

    # Download the cover image from the preview block.
    pic_name = os.path.join(new_path, "3" + str(page) + ".jpg")
    div = soup.find("div", {'class': "bangumi-preview"})
    pic = div.find("img").get('src')
    getpic = urllib2.urlopen(pic).read()
    with open(pic_name, 'wb') as code:
        code.write(getpic)

    # Record the title and ID in the list file.
    title = soup.find("h1", {'class': "info-title"}).get_text()
    data.write(title.encode('utf-8'))
    info = " ID:3" + str(page) + "  \n"
    data.write(info)

count = start
while count <= end:
    try:
        GetPageInfo(count)
        print str(count) + " get"
        count = count + 1
    except Exception:
        # Skip IDs whose page is missing or laid out differently.
        print str(count) + " x"
        count = count + 1
        continue

# GetPageInfo(346)
data.close()
print "Done"
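The script above targets Python 2 (urllib2, print statements). For readers on Python 3, here is a minimal sketch of the same per-page step using the standard-library urllib.request, assuming the same /anime/3<page> URL pattern and the same bangumi-preview / info-title markup as above; the function name fetch_anime and the protocol-relative URL handling are illustrative additions, not part of the original post.

# -*- coding: utf-8 -*-
# Python 3 sketch of the per-page step (assumed equivalent, not from the original script).
import os
import urllib.request

from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def fetch_anime(page, out_dir):
    # Fetch the detail page at the same /anime/3<page> address used above.
    url = 'http://bangumi.bilibili.com/anime/3' + str(page)
    req = urllib.request.Request(url, headers=HEADERS)
    html = urllib.request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    # Pull the cover image URL out of the preview block and save the image.
    pic_url = soup.find('div', {'class': 'bangumi-preview'}).find('img').get('src')
    if pic_url.startswith('//'):
        # A protocol-relative src needs a scheme before urlopen will accept it.
        pic_url = 'http:' + pic_url
    with open(os.path.join(out_dir, '3%s.jpg' % page), 'wb') as f:
        f.write(urllib.request.urlopen(pic_url).read())

    # Return the title so the caller can append it to the list file.
    return soup.find('h1', {'class': 'info-title'}).get_text()

It would be called from the same while/try loop as above, e.g. title = fetch_anime(count, new_path).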
