Web Scraping Journey (4)

#!/usr/bin/python
# encoding=utf-8
__author__ = 'Administrator'

import os
import time
import urllib.request  # plain "import urllib" is not enough for urllib.request.Request

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

if __name__ == "__main__":
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)

    driver.get('http://lib.cqvip.com/zk/search.aspx')
    # scroll to the bottom after the page has loaded (scrolling before get() does nothing)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # locate the search box by name; the XPath //*[@id='b_Text0'] works as well
    inputElement = driver.find_element_by_name("b_Text0")
    searchWord = "土壤"  # search keyword ("soil")
    inputElement.send_keys(searchWord)
    driver.find_element_by_xpath("//*[@id='searchnormal']/form/div[3]/div/input[1]").click()

    currentURL = driver.current_url
    urlList = []
    localDir = '/home/henson/Downloads/paper'

    # grabbing the fund field -- earlier attempts:
    # fund = driver.find_element_by_xpath("//*[@id='result_divlist']/dl/dd[4]")
    # fund = driver.find_element_by_class_name("fund")
    # print(fund.text)

    req = urllib.request.Request(url=currentURL, headers=headers)
    # html = urllib.request.urlopen(req)

    # k is the index of the pager link to click; once it reaches 11 the pager
    # window slides, so the "next page" link stays at position 3 from then on
    k = 1
    for j in range(1, 21):
        driver.find_element_by_xpath(
            "/html/body/div/div[2]/div/div[5]/div[2]/div/div/div/div[3]/div[2]/a[" + str(k) + "]").click()
        time.sleep(2)
        currentURL = driver.current_url
        print("Page No." + str(j))
        if k == 11:
            k = 3
        else:
            k = k + 1
        for i in range(1, 100):
            try:
                number = driver.find_element_by_xpath(
                    "//*[@id='result_divlist']/dl[" + str(i) + "]/dd[@class='num']").text
                html = driver.find_element_by_xpath(
                    "//*[@id='result_divlist']/dl[" + str(i) + "]/dd[@class='fund']").text
                print(number)
                print(html)
            except Exception:
                # this entry has no fund field: fall back to the whole <dl> text,
                # or stop once we run out of entries on this page
                try:
                    number = driver.find_element_by_xpath(
                        "//*[@id='result_divlist']/dl[" + str(i) + "]/dd[@class='num']").text
                    htmls = driver.find_element_by_xpath(
                        "//*[@id='result_divlist']/dl[" + str(i) + "]").text
                    continue
                except Exception:
                    break
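One thing worth noting: the script imports WebDriverWait but never uses it, and paging relies on a fixed time.sleep(2), which breaks as soon as the site responds slowly. A minimal sketch of an explicit wait instead, assuming the result container keeps the id result_divlist used in the XPaths above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# wait up to 10 s for the result list to appear instead of sleeping a fixed 2 s;
# assumes the container id 'result_divlist' from the XPaths above is stable
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "result_divlist")))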

So XPath can be used this way: you can splice a variable into the tag index, dl[" + str(i) + "]/dd[@class='num'], and tack on an attribute condition at the same time. Learned something new; noting it down right away (see the sketch below).
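For the notes: the numeric index and the attribute filter are independent pieces of the same XPath, so they combine freely. A couple of variations on the result list above (num and fund are the only class names actually seen on this page; the rest is just string mechanics):

i = 4
# index the i-th <dl> and filter its <dd> children by class at the same time
num_xpath = "//*[@id='result_divlist']/dl[" + str(i) + "]/dd[@class='num']"
# the same thing with str.format(), which keeps the quoting easier to read
fund_xpath = "//*[@id='result_divlist']/dl[{}]/dd[@class='fund']".format(i)
print(driver.find_element_by_xpath(num_xpath).text)
print(driver.find_element_by_xpath(fund_xpath).text)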
After half a day of debugging, something finally came out. Following this same routine, or rather the same line of thinking, grabbing the download links should be no problem. Hopefully. Once that's done tomorrow, I can go enjoy myself.
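Since the download anchors haven't been inspected yet, here is only a sketch of that plan, reusing the dl[i] indexing trick above; the //a[contains(@href, 'download')] filter is an assumption about the page, not something verified:

# sketch: collect candidate download links from the current result page;
# the href filter below is a guess and needs checking against the real markup
for i in range(1, 100):
    try:
        link = driver.find_element_by_xpath(
            "//*[@id='result_divlist']/dl[" + str(i) + "]//a[contains(@href, 'download')]")
        urlList.append(link.get_attribute("href"))
    except Exception:
        break
print(urlList)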
And finish up the CT images while I'm at it.
