抓取数据格式化

来源：互联网发布：北京超图软件编辑：程序博客网时间：2024/06/05 10:03

#!/usr/bin/python
# encoding=utf-8
"""Scrape search results from lib.cqvip.com and print them as one table.

The script drives Chrome through Selenium: it submits a fixed search word on
the search page, clicks through the result pager, reads the number, title,
author and fund fields of every result entry, and finally prints all rows
collected in a single PrettyTable.
"""
__author__ = 'Administrator'

import os
import re
import sys
import time
import urllib

import requests
import selenium
from bs4 import BeautifulSoup
from prettytable import PrettyTable
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

CHROMEDRIVER_PATH = "/home/henson/Documents/pycharm/webdriver/chromedriver"
SEARCH_URL = 'http://lib.cqvip.com/zk/search.aspx'
SEARCH_WORD = "土壤"
SEARCH_BUTTON_XPATH = "//*[@id='searchnormal']/form/div[3]/div/input[1]"
# ``{}`` is replaced by the 1-based position of the pager link to click.
PAGER_LINK_XPATH = ("/html/body/div/div[2]/div/div[5]/div[2]/div/div/div"
                    "/div[3]/div[2]/a[{}]")
# ``{}`` is replaced by the 1-based position of a result entry on the page.
RESULT_XPATH = "//*[@id='result_divlist']/dl[{}]"
PAGE_CLICKS = 20        # number of pager links clicked in one run
ROWS_PER_PAGE = 20      # maximum number of result entries probed per page
PAGE_LOAD_WAIT = 2      # seconds to sleep after clicking a pager link


def myAlign(string, length):
    """Right-pad *string* until it is *length* characters long.

    The width is measured with ``len()``, i.e. in characters, not in
    terminal columns. A ``str`` is padded with ASCII spaces, any other type
    with full-width spaces (U+3000). A value that is already long enough,
    or a *length* of 0, returns *string* unchanged.
    """
    if length == 0:
        return string
    missing = length - len(string)
    if missing <= 0:
        return string
    placeholder = ' ' if isinstance(string, str) else u'\u3000'
    return string + placeholder * missing


def _scrape_row(driver, index):
    """Return ``[number, title, author, fund]`` for result entry *index*.

    Raises a Selenium exception if any of the four fields cannot be located.
    """
    entry = RESULT_XPATH.format(index)
    title = driver.find_element(By.XPATH, entry + "/dt/a").text
    author = driver.find_element(
        By.XPATH, entry + "/dd[@class='author']").text
    number = driver.find_element(By.XPATH, entry + "/dd[@class='num']").text
    fund = driver.find_element(By.XPATH, entry + "/dd[@class='fund']").text
    return [number, title, author, fund]


def _scrape_page(driver, table):
    """Add every complete result entry of the current page to *table*.

    Returns the number of rows added.
    """
    added = 0
    for index in range(1, ROWS_PER_PAGE + 1):
        try:
            row = _scrape_row(driver, index)
        except WebDriverException:
            # At least one field is missing. If the entry and its number
            # still exist, only this entry is incomplete: skip it. If they
            # cannot be found either, stop scanning this page.
            entry = RESULT_XPATH.format(index)
            try:
                driver.find_element(By.XPATH, entry + "/dd[@class='num']")
                driver.find_element(By.XPATH, entry)
            except WebDriverException:
                break
            continue
        table.add_row(row)
        added += 1
    return added


def main():
    """Run the search, walk the result pages and print the collected table."""
    os.environ["webdriver.chrome.driver"] = CHROMEDRIVER_PATH
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention - confirm against the installed version.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)

    # Configure the table once instead of once per scraped row.
    table = PrettyTable(["number", "title", "author", "fund"])
    table.border = False
    table.align["number"] = "l"
    table.valign["title"] = "t"
    table.padding_width = 10

    total_rows = 0
    try:
        # NOTE(review): this scroll runs before any page has been loaded;
        # kept in its original position - confirm whether it is needed.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        driver.get(SEARCH_URL)

        driver.find_element(By.NAME, "b_Text0").send_keys(SEARCH_WORD)
        driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH).click()

        pager_index = 1
        for _ in range(PAGE_CLICKS):
            driver.find_element(
                By.XPATH, PAGER_LINK_XPATH.format(pager_index)).click()
            time.sleep(PAGE_LOAD_WAIT)
            # After the 11th pager link has been clicked the position wraps
            # back to 3; presumably the pager redraws around the current
            # page - verify against the live site.
            pager_index = 3 if pager_index == 11 else pager_index + 1
            total_rows += _scrape_page(driver, table)
    finally:
        # Print whatever was collected exactly once, even if a page failed,
        # and always release the browser process.
        if total_rows:
            print(table)
        driver.quit()


if __name__ == "__main__":
    main()

# NOTE (translated from the original author; the meaning of the value pairs
# is not explained in the source): fetch in separate runs -
#   j(1-21) : k=1 ,  k=11
#   j(19-n) : k=12,  k=10

目的：想要将抓取到的数据格式化输出，可以采用以下方法：
1. 设定好目标宽度 m，字符串实际长度为 n，则用 m-n 个空格（半角或全角）补齐。
2.导入prettytable，根据需求，设定输出。
3.最接地气的，最笨的方法，通过设大字符串长度，然后转换成xls，通过分列格式化。

问题：
1. 为什么不能通过 `%ns`（例如 `%-60s`）给字符串指定固定宽度，从而实现格式化输出？
2. 下面的代码无法追加写入文件：
#f=open('plus.txt','ab+')
#f.write(number)
#f.close()

阅读全文

0 0

抓取数据 格式化

抓取数据格式化