抓取数据 格式化
来源:互联网 发布:北京超图软件 编辑:程序博客网 时间:2024/06/05 10:03
#!/usr/bin/python
#encoding=utf-8
# Scrape search results from lib.cqvip.com with Selenium (Chrome) and print
# the number / title / author / fund of each result as a PrettyTable.
__author__ = 'Administrator'

import os
import re
import sys
import time
import urllib

from bs4 import BeautifulSoup
from prettytable import PrettyTable
import requests
import selenium
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

CHROMEDRIVER_PATH = "/home/henson/Documents/pycharm/webdriver/chromedriver"
SEARCH_URL = 'http://lib.cqvip.com/zk/search.aspx'
SEARCH_WORD = "土壤"
SEARCH_BUTTON_XPATH = "//*[@id='searchnormal']/form/div[3]/div/input[1]"
# "{}" is replaced by the 1-based index of the pager link / result row.
PAGER_LINK_XPATH = ("/html/body/div/div[2]/div/div[5]/div[2]/div/div/div"
                    "/div[3]/div[2]/a[{}]")
ROW_XPATH = "//*[@id='result_divlist']/dl[{}]"
ROWS_PER_PAGE = 20


def myAlign(string, length):
    """Return *string* right-padded with spaces to at least *length* characters.

    A string that is already *length* characters or longer (which includes
    every string when *length* is 0) is returned unchanged.  Padding counts
    characters, not display columns, so full-width (CJK) text will still look
    misaligned in a monospace terminal.
    """
    return string.ljust(length)


def _row_text(driver, row, suffix):
    """Return the text of the element at *suffix* inside result row *row*."""
    return driver.find_element(By.XPATH, ROW_XPATH.format(row) + suffix).text


def _scrape_page(driver, table):
    """Add every complete result row of the current page to *table*.

    Returns the number of rows added.  A row that exists but lacks one of the
    four fields is skipped; the scan stops at the first row index that does
    not exist at all.
    """
    added = 0
    for row in range(1, ROWS_PER_PAGE + 1):
        try:
            title = _row_text(driver, row, "/dt/a")
            author = _row_text(driver, row, "/dd[@class='author']")
            number = _row_text(driver, row, "/dd[@class='num']")
            fund = _row_text(driver, row, "/dd[@class='fund']")
        except WebDriverException:
            # One of the fields is missing.  Tell "row present but
            # incomplete" (skip it) apart from "no such row" (stop).
            try:
                _row_text(driver, row, "/dd[@class='num']")
                _row_text(driver, row, "")
            except WebDriverException:
                break
            continue
        table.add_row([number, title, author, fund])
        added += 1
    return added


def main():
    """Run the search, scrape the results and print them as a table."""
    os.environ["webdriver.chrome.driver"] = CHROMEDRIVER_PATH
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        # NOTE(review): this scroll runs before any page has been loaded; if
        # it was meant to scroll the result list it should move after the
        # search click.  Kept in its original position.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.get(SEARCH_URL)
        driver.find_element(By.NAME, "b_Text0").send_keys(SEARCH_WORD)
        driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH).click()

        # Table layout is loop-invariant, so configure it once up front.
        table = PrettyTable(["number", "title", "author", "fund"])
        table.border = False
        table.align["number"] = "l"
        table.valign["title"] = "t"
        table.padding_width = 10

        # Author's note (translated): fetch separately --
        #   j (1-21): k=1, k=11;  j (19-n): k=12, k=10
        pager_index = 1
        for _page in range(1, 21):
            driver.find_element(
                By.XPATH, PAGER_LINK_XPATH.format(pager_index)).click()
            # Fixed pause after the click, presumably to let the page render.
            time.sleep(2)
            # Pager link index wraps from 11 back to 3, otherwise advances.
            pager_index = 3 if pager_index == 11 else pager_index + 1

            # Print once per page instead of re-printing the whole table
            # after every single row.
            if _scrape_page(driver, table):
                print(table)

            # NOTE(review): the original source lost its indentation; the
            # trailing `break` is interpreted as ending the page loop after
            # the first page.  Remove it to walk all 20 pager links.
            break
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()


if __name__ == "__main__":
    main()
目的:想要将抓取到的数据格式化输出,可以采用:
1.设定好字符串的目标长度m,字符串实际长度为n,则m-n为需要补齐的空格数(半角或全角空格)
2.导入prettytable,根据需求,设定输出。
3.最接地气的,最笨的方法,通过设大字符串长度,然后转换成xls,通过分列格式化。
问题:
1.为什么不能通过 %ns(例如 %-60s)来给定字符串的固定宽度,从而实现格式化输出?
2. #f=open('plus.txt','ab+')
#f.write(number)
#f.close()
无法追加写入文件
阅读全文
0 0
- 抓取数据 格式化
- 数据抓取
- 数据抓取
- 抓取数据
- 抓取数据
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 数据格式化
- 格式化数据
- 数据格式化###,###.##
- 数据格式化
- LightOJ-1141-Number Transformation
- 用node接口
- 读写锁
- TrickGCD HDU
- PTA L3-008 喊山 团体程序设计天梯赛 (bfs)
- 抓取数据 格式化
- MVC模式与三层架构的区别
- OPNET 模块计算机类型“X86”与目标计算机类型“X64”冲突 fatal error LNK1112
- 怎么想静态内部类可以有静态成员和方法而成员内部类不行?
- MySQL:控制台远程登录命令
- python面向对象高级编程
- [J
- 缓冲输出流写出数据的缓冲区问题
- 从HDFS上读取带lzo压缩的SequenceFile文件