Downloading All NIPS Papers


I'm a bit of a hoarder, so this time I decided to download every paper NIPS has ever published. I first tried downloading them directly from Python, but that turned out to be very slow. So the plan became: crawl all of the download links first, sort them into one folder per conference, and then manually copy each year's links into the Xunlei download manager to do the actual downloading (there have been 29 conferences so far, so the manual part is manageable).

The code, with explanations, is as follows:

```python
# -*- coding: utf-8 -*-
"""
Created on Sat Sep  9 19:10:39 2017

@author: shouhuxianjian
"""
# imports
import os
import re
#import wget
import os.path as osp
import requests
from bs4 import BeautifulSoup as bs

url0 = 'http://papers.nips.cc/'
# download the front page
html0 = requests.get(url=url0)
html0 = bs(html0.text, 'html5lib')

gResDir = r'e:\NIPS'  # where everything is stored
NIPSTimes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
             11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
             21, 22, 23, 24, 25, 26, 27, 28, 29]  # not actually used

# collect the link for every conference ("Advances in Neural Information Processing Systems ...")
books_hrefs = [(li.text, li.find('a')['href']) for li in
               html0.find_all('li', text=re.compile('Advances'))]
books_hrefs = [(book.replace(' ', '_'), url0.rstrip('/') + href) for book, href in
               books_hrefs]

# helper for creating a folder (ignore "already exists" errors)
def makedirs(indir):
    try:
        os.makedirs(indir)
    except Exception:
        pass

# create one folder per conference
[makedirs(osp.join(gResDir, indir)) for indir, _ in books_hrefs]

invalidChars = re.compile(r'[;?\*|"<>:/]')  # characters that are illegal in Windows file names
cur = 14  # manual resume point: skip conferences that have already been processed

# walk through every conference page and collect the paper links on it
for indb, (book, href) in enumerate(books_hrefs):
    if indb < cur:
        continue
    html1 = requests.get(url=href)
    html1 = bs(html1.text, 'html5lib')
    papers_hrefs = [(li.text, li.find('a')['href']) for li in html1.find_all('li')
                    if '/paper/' in li.find('a')['href']]
    papers_hrefs = [(invalidChars.sub(' ', paper), url0.rstrip('/') + hrefPaper)
                    for paper, hrefPaper in papers_hrefs]
    resPath = osp.join(gResDir, book)
    # read the papers of this conference that were already handled, so nothing is
    # processed twice; this is what gives the script its resume capability
    papersDi = {line.strip(): 1 for line in open(osp.join(resPath, 'book.txt'), encoding='utf-8')} \
               if osp.exists(osp.join(resPath, 'book.txt')) else {}
    # open every paper page and save its pdf and supplemental links to a file
    for ind, (paper, hrefPapers) in enumerate(papers_hrefs):
        if paper in papersDi:
            continue
        print('book:[{}/{}]   [{}]'.format(indb + 1, len(books_hrefs), book))
        print('paper:[{}/{}]  [{}]'.format(ind + 1, len(papers_hrefs), paper))
        print('=' * 50)
        # 30-second timeout; if a timeout exception is raised, just kill the script and restart it
        html2 = requests.get(url=hrefPapers, timeout=30)
        html2 = bs(html2.text, 'html5lib')
        # papers whose pdf link cannot be found are recorded in cannotDownload.txt
        try:
            hrefPDF = html2.find('a', text='[PDF]')['href']
        except Exception:
            with open(osp.join(resPath, 'cannotDownload.txt'), 'a', encoding='utf-8') as fa:
                fa.write(paper + '\n')
            continue
        # some papers have no supplemental material
        supplemental = html2.find('a', text='[Supplemental]')
        hrefSupplemental = supplemental['href'] if supplemental else ''
        # downloading directly (as below) turned out to be slow, which is why the links
        # are only collected here and handed to Xunlei or another download manager
#        pdf = wget.download(url=url0.rstrip('/') + hrefPDF,
#                            out=osp.join(resPath, paper))
#        pdfContent = requests.get(url=url0.rstrip('/') + hrefPDF)
#        with open(osp.join(resPath, paper.split(',')[0]) + '.pdf', 'wb') as fw:
#            fw.write(pdfContent.content)
        # append the pdf link to urls.txt
        with open(osp.join(resPath, 'urls.txt'), 'a', encoding='utf-8') as fw:
            fw.write(url0.rstrip('/') + hrefPDF + '\n')
        # handle the supplemental link the same way
        if hrefSupplemental:
            suppFormat = hrefSupplemental.split('.')[-1]
#            suppContent = requests.get(url=url0.rstrip('/') + hrefSupplemental)
#            with open(osp.join(resPath, paper.split(',')[0]) + '.' + suppFormat, 'wb') as fw:
#                fw.write(suppContent.content)
            with open(osp.join(resPath, 'urls.txt'), 'a', encoding='utf-8') as fw:
                fw.write(url0.rstrip('/') + hrefSupplemental + '\n')
        # finally, record the paper name in book.txt to mark it as done, which is what
        # makes resuming after an interruption possible
        with open(osp.join(resPath, 'book.txt'), 'a', encoding='utf-8') as fa:
            fa.write(paper + '\n')
```
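
Since the links still have to be fed to Xunlei by hand, one folder at a time, a small helper could merge every per-conference urls.txt into a single file that can be pasted into the download manager in one go. This is only a rough sketch: it assumes the folder layout produced above (e:\NIPS\<conference>\urls.txt, one link per line), and the output name all_urls.txt is just for illustration.

```python
# Rough sketch: merge every per-conference urls.txt into one all_urls.txt so the
# whole link list can be pasted into Xunlei (or any download manager) at once.
# Assumes the layout produced by the crawler above: e:\NIPS\<conference>\urls.txt.
import os
import os.path as osp

gResDir = r'e:\NIPS'                        # same root folder as the crawler
merged = osp.join(gResDir, 'all_urls.txt')  # combined output file (illustrative name)

with open(merged, 'w', encoding='utf-8') as fout:
    for book in sorted(os.listdir(gResDir)):
        urlFile = osp.join(gResDir, book, 'urls.txt')
        if not osp.isfile(urlFile):
            continue                        # skip anything that is not a conference folder
        with open(urlFile, encoding='utf-8') as fin:
            for line in fin:
                line = line.strip()
                if line:
                    fout.write(line + '\n')
```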

The results are shown in the screenshots below:

1 - One folder created per conference

[screenshot]

2 - The files generated inside each conference folder

[screenshot]

3 - The download links collected in urls.txt

[screenshot]
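
As a quick sanity check on the crawl, counting the lines of urls.txt and cannotDownload.txt in each conference folder shows how many links were collected and how many papers had no [PDF] link. Again just a rough sketch that assumes the folder layout above:

```python
# Rough sketch: summarize the crawl by counting, per conference folder, the links
# collected in urls.txt and the papers recorded in cannotDownload.txt.
import os
import os.path as osp

gResDir = r'e:\NIPS'  # same root folder as the crawler

def count_lines(path):
    # count non-empty lines; return 0 if the file was never created
    if not osp.isfile(path):
        return 0
    with open(path, encoding='utf-8') as f:
        return sum(1 for line in f if line.strip())

for book in sorted(os.listdir(gResDir)):
    bookDir = osp.join(gResDir, book)
    if not osp.isdir(bookDir):
        continue
    nLinks = count_lines(osp.join(bookDir, 'urls.txt'))
    nMissing = count_lines(osp.join(bookDir, 'cannotDownload.txt'))
    print('{}: {} links collected, {} papers without a [PDF] link'.format(book, nLinks, nMissing))
```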

I have already put the download links for all 29 conferences, 1988-2016, here.
