下载巨潮网络数据的python脚本

来源:互联网 发布:camshift跟踪算法 编辑:程序博客网 时间:2024/05/17 08:18

从巨潮网络下载财报数据,觉得手动比较麻烦,就做了一个简单的python脚本。具体主要代码如下:

    driver = webdriver.PhantomJS(executable_path='浏览器引擎/自己使用phantomjs')    if int(stockNumber) >= 600000:        dst_url = 'http://www.cninfo.com.cn/cninfo-new/disclosure/sse'    else:        dst_url = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse'        driver.get(dst_url)    prefixpath = "文件夹路径"    driver.find_element_by_class_name("input-stock").send_keys(stockNumber)    driver.find_element_by_xpath("//ul[@id='stock_list']/li[1]/a").click()      prefixpathname = prefixpath+stockNumber+"/"    if os.path.exists(prefixpathname):        pass    else:        os.mkdir(prefixpathname)    #driver.find_element_by_xpath("//ul[@id='stock_list']/li[1]").send_keys(Keys.ENTER)    #切换网页,以获取新弹出的网页窗口    for handle in driver.window_handles:        driver.switch_to_window(handle)        #print('current url:%s'%driver.current_url)    time.sleep(1)    urldata = driver.find_element_by_xpath("//ul[@id='category_list']")    #print('%s'%urldata.text)    chain = ActionChains(driver)    moveelment = driver.find_element_by_xpath("//div[@class='search-condition c5 drop-down']/a/div")    chain.move_to_element(moveelment).perform()    driver.find_element_by_xpath("//div[@class='search-condition c5 drop-down']/a/div").click()    urldata = driver.find_element_by_xpath("//div[@class='search-condition c5 drop-down']/a/div")    #print('%s'%urldata.text)    driver.find_element_by_xpath("//ul[@id='category_list']/li[1]/a").click()    #选择需要的项,年报,半年报...等等    ... ...    #如果需要下载全部数据,需要点击更多,直到数据全部显示    #while(rslt[0] != rslt[1]):    #    driver.find_element_by_link_text('更多').click()    #    #等待网页相应时间    #    time.sleep(1)    #    urldata = driver.find_element_by_xpath("//div[@id='con-div-his-fulltext']/div[@class='stat-right']")    #    print('%s'%urldata.text)    #    patternStr = '\d+'    #    rslt = re.findall(patternStr,urldata.text)    listNum = int(rslt[0])        #listNum    if(listNum != 0):        for indexValue in range(1,listNum+1):            findXpathStr = "//ul[@id='ul_his_fulltext']/li[%d]/div[@class='t3']/dd/span/a"%indexValue            #print('%s'%findXpathStr)            urlTextGet = driver.find_element_by_xpath(findXpathStr)            print('%s'%urlTextGet.text)                        driver.find_element_by_xpath(findXpathStr).click()            time.sleep(1)            for handle in driver.window_handles:                driver.switch_to_window(handle)            time.sleep(1)            #print('%s'%driver.current_url)            urldata1 = driver.find_element_by_class_name("year")            try:                 urldata2 = driver.find_element_by_class_name("new_day")            except:                urldata2 = driver.find_element_by_class_name("day")            timeStr = '%s%s'%(urldata1.text,urldata2.text)            #print('%s%s'%(urldata1.text,urldata2.text))            nameUrl = driver.find_element_by_xpath('//div[@class="bd-top"]/h2')            #print('%s'%nameUrl.text.replace(name,'').strip())            #srcUrl = driver.find_element_by_xpath('//div[@class="bd-ct"]/iframe.src')            #print('%s'%srcUrl.text)            #downloadfilename = './new/%s%s.pdf'%(nameUrl.text.replace(name,'').strip(),timeStr)            nameUrlList = nameUrl.text.split('\n')            if(len(nameUrlList)>1):                downloadfilename = '%s%s%s.pdf'%(prefixpathname,nameUrlList[1].strip(),timeStr)            else:                downloadfilename = '%s%s%s.pdf'%(prefixpathname,nameUrlList[0].strip(),timeStr)            pageRequest = request.urlopen(driver.current_url)            pageRead = pageRequest.read().decode('utf-8')            #pageRequest.readlines().decode('utf-8')            findlinkSuccess = 0            for eachline in pageRead.split('\n'):                webDownloadURL = re.findall('src="(.+)"',eachline)                if(len(webDownloadURL)>0) and re.search('iframe',eachline) and re.search('pdf',eachline.lower()):                   wgetURL = webDownloadURL[0]                   #print('%s'%wgetURL)                   findlinkSuccess = 1                   break            #关闭当前URL窗口            if(os.path.exists(downloadfilename)):                print('%s已存在'%downloadfilename)            elif(findlinkSuccess == 1):                wget.download(wgetURL,downloadfilename)            else:                print('无效链接!ignore')            driver.close()                        #返回指向前一次最新的URL            for handle in driver.window_handles:                driver.switch_to_window(handle)            time.sleep(1)    driver.close()        driver.quit()