python爬虫下载网站磁力链接
来源:互联网 发布:java进程同步编程 编辑:程序博客网 时间:2024/04/30 15:26
设计分三步走:
1.获取明星列表地址
2.获取明星作品序列号
3.根据作品序列号查找磁力链接
一、获取网站中明星列表的作品集地址
#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
from bs4 import BeautifulSoup

# Workbook that collects one (name, link) row per entry found on the
# listing pages; row 0 holds the column titles.
myfile = xlwt.Workbook()
table = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
table.write(0, 0, u"名字")
table.write(0, 1, u"链接")

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 '
headers = {'User-Agent': user_agent}

# Number of sheet rows reserved for each listing page.
# NOTE(review): presumably the site shows 50 entries per page -- confirm.
ROWS_PER_PAGE = 50

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


class geturl():
    """Collect names and profile links from the first ``page`` listing pages."""

    def __init__(self, page):
        # Number of listing pages to fetch (pages 1..page inclusive).
        self.page = page

    def get_url(self):
        """Fetch every listing page and write its entries into ``table``.

        Links go to column 1 and names to column 0.  Both columns start at
        the same row for a given page, so a name and a link are paired
        purely by their order of appearance in the HTML.

        Raises:
            requests.RequestException: if a page cannot be fetched within
                ``REQUEST_TIMEOUT`` seconds.
        """
        # Compiled once; the pattern does not change between pages.
        link_pattern = re.compile("https://avso.pw/cn/star")

        for p in range(1, self.page + 1):
            url = 'https://avso.pw/cn/actresses/page/' + str(p)
            # A timeout keeps a stalled connection from hanging the script.
            r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(r.text, 'html.parser')

            first_row = (p - 1) * ROWS_PER_PAGE + 1

            i = first_row
            for tag in soup.find_all(href=link_pattern):
                table.write(i, 1, tag.attrs['href'])
                i += 1

            j = first_row
            for tag in soup.find_all(class_='photo-info'):
                for gg in tag.find_all('span'):
                    table.write(j, 0, gg.string)
                    j += 1

            print(u"完成读取第%s页信息" % p)


test = geturl(2)
try:
    test.get_url()
finally:
    # Save even when a request fails so already-collected rows survive.
    # xlwt produces the legacy .xls format, so the file is named .xls
    # (the next script opens 'url.xls').
    filename = time.strftime('%Y%m%d%H%M%S', time.localtime()) + "url.xls"
    myfile.save(filename)
print(u"完成%s的url备份" % time.strftime('%Y%m%d%H%M%S', time.localtime()))

# 二、获取明星作品的番号
#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
import traceback
try:
    import ConfigParser
except ImportError:
    # Python 3 renamed the module; keep the Python 2 name used below.
    import configparser as ConfigParser
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 '
headers = {'User-Agent': user_agent}

# Output workbook: one row per serial number; row 0 holds the titles.
myfile = xlwt.Workbook()
wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
wtable.write(0, 0, u"名字")
wtable.write(0, 1, u"链接")
wtable.write(0, 2, u"番号")

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


class getserial():
    """Read (name, link) rows from 'url.xls' and collect serial numbers."""

    def get_serial(self):
        """Visit every link in 'url.xls' and record the serials it lists.

        The next free output row is kept in 'liao.ini' (section ``num``,
        option ``p``).  It is re-read before each input row and written
        back only after that row succeeded, so a failed row does not
        advance the counter and its partial output is overwritten by the
        next row.

        A failure on one row is reported, the workbook is saved as a
        timestamped backup, and processing continues with the next row.
        """
        data = xlrd.open_workbook('url.xls')
        table = data.sheets()[0]
        nrows = table.nrows

        # Row 0 is the header row, so start at 1.
        for j in range(1, nrows):
            try:
                cf = ConfigParser.ConfigParser()
                cf.read("liao.ini")
                p = cf.getint('num', 'p')

                name = table.cell(j, 0).value
                url = table.cell(j, 1).value
                r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                soup = BeautifulSoup(r.text, 'html.parser')

                # Only every other <date> tag is recorded.
                # NOTE(review): presumably each item carries two <date>
                # tags (serial, then release date) -- verify against the
                # page markup.
                for i, tag in enumerate(soup.find_all('date')):
                    if i % 2 == 0:
                        wtable.write(p, 2, tag.string)
                        wtable.write(p, 0, name)
                        wtable.write(p, 1, url)
                        p += 1

                print(j)
                # Option values must be strings for Python 3's configparser.
                cf.set("num", "p", str(p))
                with open("liao.ini", "w") as ini_file:
                    cf.write(ini_file)
            except Exception:
                # Show what went wrong instead of hiding it, then keep the
                # original best-effort behaviour: back up and carry on.
                traceback.print_exc()
                stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
                # xlwt produces the legacy .xls format.
                myfile.save(stamp + "serial.xls")
                print(u"出现异常自动保存%s的番号备份" % stamp)


test = getserial()
test.get_serial()
filename = time.strftime('%Y%m%d%H%M%S', time.localtime()) + "serial.xls"
myfile.save(filename)
print(u"完成%s的番号备份" % time.strftime('%Y%m%d%H%M%S', time.localtime()))
三、根据番号查找对应的磁力链接
#coding=utf8
import os
import requests
import re
import xlrd
import xlwt
import time
import traceback
try:
    import ConfigParser
except ImportError:
    # Python 3 renamed the module; keep the Python 2 name used below.
    import configparser as ConfigParser
import threading
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36'
# NOTE(review): 'sdch' and 'br' are advertised in Accept-Encoding; confirm
# the HTTP stack in use can decode them, otherwise r.text may be unreadable.
headers = {
    'Accept': 'text/css,*/*;q=0.1',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': user_agent,
}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Compiled once; used for every search-result page.
DETAIL_LINK = re.compile("https://btso.pw/magnet/detail/hash")


def _backup_name(excel):
    """Return a timestamped output filename that is unique per input file.

    The input workbook's base name is included because several workers run
    at once; a bare one-second timestamp would let them overwrite each
    other's files.
    """
    stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
    source = os.path.splitext(os.path.basename(excel))[0]
    return "%s_%s_link.xls" % (stamp, source)


class getlink():
    """Look up magnet links for the serial numbers stored in a workbook."""

    def get_link(self, conf, excel):
        """Search every serial in ``excel`` and save the results to a new .xls.

        Args:
            conf: path of an ini file whose ``[num] p`` option holds the
                next free output row.  It is re-read before each input row
                and written back only after that row succeeded.
            excel: path of the input workbook; column 0 is read as the
                name and column 2 as the serial number.

        A failure on one row is reported, the workbook is saved as a
        backup, and processing continues with the next row.  The workbook
        is saved once more when all rows are done.
        """
        myfile = xlwt.Workbook()
        wtable = myfile.add_sheet(u"信息", cell_overwrite_ok=True)
        titles = (u"名字", u"番号", u"文件大小", u"文件更新日期", u"链接", u"磁力链接")
        for col, title in enumerate(titles):
            wtable.write(0, col, title)

        data = xlrd.open_workbook(excel)
        table = data.sheets()[0]
        nrows = table.nrows

        # Row 0 is the header row, so start at 1.
        for j in range(1, nrows):
            try:
                cf = ConfigParser.ConfigParser()
                cf.read(conf)
                p = cf.getint('num', 'p')

                name = table.cell(j, 0).value
                serial = table.cell(j, 2).value
                url = 'https://btso.pw/search/' + serial
                r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                soup = BeautifulSoup(r.text, 'html.parser')

                for tag in soup.find_all('div', class_='row'):
                    for gg in tag.find_all(class_='col-sm-2 col-lg-1 hidden-xs text-right size'):
                        print(gg.string)
                        wtable.write(p, 0, name)
                        wtable.write(p, 1, serial)
                        wtable.write(p, 2, gg.string)
                    for aa in tag.find_all(class_='col-sm-2 col-lg-2 hidden-xs text-right date'):
                        print(aa.string)
                        wtable.write(p, 3, aa.string)
                    for xx in tag.find_all(href=DETAIL_LINK):
                        detail_url = xx.attrs['href']
                        print(detail_url)
                        wtable.write(p, 4, detail_url)
                        # The magnet URI itself is only on the detail page.
                        r1 = requests.get(detail_url, headers=headers, timeout=REQUEST_TIMEOUT)
                        soup1 = BeautifulSoup(r1.text, 'html.parser')
                        for tag1 in soup1.find_all('textarea', id='magnetLink'):
                            print(tag1.string)
                            wtable.write(p, 5, tag1.string)
                        # One output row per detail link.
                        p += 1

                # Option values must be strings for Python 3's configparser.
                cf.set("num", "p", str(p))
                with open(conf, "w") as ini_file:
                    cf.write(ini_file)
            except Exception:
                # Show what went wrong instead of hiding it, then keep the
                # original best-effort behaviour: back up and carry on.
                traceback.print_exc()
                myfile.save(_backup_name(excel))
                print(u"出现异常自动保存%s的磁力链接备份" % time.strftime('%Y%m%d%H%M%S', time.localtime()))

        myfile.save(_backup_name(excel))
        print(u"自动保存%s的磁力链接备份" % time.strftime('%Y%m%d%H%M%S', time.localtime()))


if __name__ == '__main__':
    test = getlink()
    threads = []
    # One worker per (linkN.ini, serialN.xls) pair, N = 1..6.
    for n in range(1, 7):
        t = threading.Thread(
            target=test.get_link,
            args=('link%d.ini' % n, 'serial%d.xls' % n),
        )
        t.daemon = True
        threads.append(t)

    # Start every worker before waiting on any of them; joining inside the
    # start loop would run them one after another.
    for t in threads:
        t.start()
    # Wait for all workers, otherwise daemon threads are killed on exit.
    for t in threads:
        t.join()
    print(u"完成所有进程")
看看最后的excel:
0 0
- python爬虫下载网站磁力链接
- 磁力链接+爬虫
- 磁力链接搜索引擎源码下载
- python爬虫爬取女u番号和磁力链接,封面,保存到csv文件
- python 爬虫下载网站图片
- 磁力链接搜索网站研究心得!
- Go语言爬取网站磁力链接
- 磁力链接方式下载完全攻略
- Python 实现自动获取种子磁力链接
- 磁力链接
- 磁力链接
- Python多线程爬虫获取电影下载链接
- 详解什么是BT种子、迅雷下载链接、磁力链接
- 用ubuntu下载电影:磁力链接,torrent,迅雷链接
- python爬虫下载网站所有文件
- python 爬虫 网络小说下载(静态网站)
- 种子文件转成为磁力链接 下载BT磁力转换小工具
- Java实现bt文件下载、制作、解析、磁力链接
- java基本数据类型与引用数据类型的区别
- Uva220黑白棋
- 关于Shiro中的Realm
- DateUtil
- runC源码分析——Create/Run Container
- python爬虫下载网站磁力链接
- 关于自己
- ijkplayer支持h264
- 工作五年的经历、吐槽
- Codeforces 283E
- oracle 忘记密码时,本机登陆时不需要密
- Celery源码分析(一)-------------从命令执行到生成Worker
- Ubuntu16.04+CUDA-8.0+opencv3.1+matlab2016b+mkl+caffe
- ubuntu源码安装phpredis拓展