Crawler Journey (Part 2)

#!/usr/bin/python
# encoding=utf-8
__author__ = 'Administrator'

from bs4 import BeautifulSoup
import urllib.request
import requests
import re

if __name__ == "__main__":
    import os
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait

    # Point Selenium at a local chromedriver binary.
    chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)

    # Open the search page and scroll to the bottom.
    driver.get('http://lib.cqvip.com/zk/search.aspx')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Fill in the search box and submit.
    # inputElement = driver.find_element_by_xpath("//*[@id='b_Text0']")
    inputElement = driver.find_element_by_name("b_Text0")
    searchWord = "大气"
    inputElement.send_keys(searchWord)
    driver.find_element_by_xpath("//*[@id='searchnormal']/form/div[3]/div/input[1]").click()

    # Re-fetch the result page with requests and try to pull links out of it.
    currentURL = driver.current_url
    urlList = []
    localDir = '/home/henson/Downloads/paper'
    r = requests.get(currentURL)
    # rr = urllib.request.urlopen(currentURL)
    data = r.text
    # Failed attempts at matching the download links with regexes:
    # link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
    # link_list = re.findall(r"a href=./download/ target=\"_blank\" ", data)
    # pattern = re.compile(r"<a href=.* target=\"_blank\" onclick=.*\s?.*<img src=.*\.jpg\" alt=.*title=\"\".*height=")
    # res_list = pattern.findall(rr)
    # for url in link_list:
    #     print(url)

    # Click the download link of the second result.
    driver.find_element_by_xpath("//*[@id='result_divlist']/dl[2]/dt/span/a[2]").click()
    currentURL = driver.current_url
    r = requests.get(currentURL)
    print(currentURL)
    # data = r.text
    # driver.find_element_by_xpath("/html/body/div/div[2]/div/div/div/div/div[2]/div[1]/div[1]/ul/li[0]/a").click()
    # re.findall(r"(?<=href=\/ download \/ confirm.aspx\?)", data)

    # Fetch the page again with urllib using a browser User-Agent and parse it with BeautifulSoup.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=currentURL, headers=headers)
    html = urllib.request.urlopen(req)
    soup = BeautifulSoup(html, "html.parser")
    # print(soup.prettify())
    a = soup.find_all(href=re.compile(r"\.dll"))
    b = soup.find_all("a", class_="main")
    print(a)
    print(b)

    # More failed attempts at locating the download link:
    currentURL = driver.current_url
    r = requests.get(currentURL)
    data = r.text
    # driver.find_element_by_class_name("op").click()
    # pattern = re.compile(r"/html/body/div/")
    # link_list = re.findall(pattern, data)
    # driver.find_element_by_xpath("/html/body/div[2]/div[2]/div/div[4]/div/div/div/div/div[1]/span[2]/a[2]").click()
    # driver.find_element_by_xpath("//*[@id='result_divlist']/dl[2]/dt/span/a[2]").click()
    # driver.find_element_by_class_name("btns_a down").click()
    # link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
    # Element = driver.find_elements_by_class("btnTitle")
    # LINK_PATTERN = re.findall(r'<a href="(http:\/\/.*)" class="down_link">', data)
    # LINK_PATTERN = re.findall(r'<li><a href="(http:\/\/.*)" </a></li>', data)
    # LINK_PATTERN = '<a href="(/ download / confirm.aspx?.*)" target="_blank">'
    # url_list = list(set(re.findall(LINK_PATTERN, data)))
    # url_list = re.findall(LINK_PATTERN, data)
    # for url in link_list:
    #     print(url)

Q1: I tried grabbing the links with BeautifulSoup plus regexes, but none of the patterns matched anything; every attempt came up empty.
Q2: I don't understand why, after the second .click(), driver.current_url still returns the previous page's URL when it should be the URL of the page the click navigated to. (A guess at both problems is sketched below.)
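One possible explanation, sketched under two assumptions: the result link opens in a new tab (the commented-out patterns above suggest target="_blank"), so after the click the driver is still attached to the old window (Q2); and requests.get re-fetches the URL without the browser's session cookies or JavaScript-rendered DOM, so the regexes never see the links the browser shows (Q1). The chromedriver path and XPaths come from the script above; the "download" href pattern is a guess.

import re
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome("/home/henson/Documents/pycharm/webdriver/chromedriver")
driver.get('http://lib.cqvip.com/zk/search.aspx')
driver.find_element_by_name("b_Text0").send_keys("大气")
driver.find_element_by_xpath("//*[@id='searchnormal']/form/div[3]/div/input[1]").click()

# Q2: the result link opens a new tab, so switch to the newest window handle
# before reading current_url; otherwise the old page's URL comes back.
driver.find_element_by_xpath("//*[@id='result_divlist']/dl[2]/dt/span/a[2]").click()
driver.switch_to.window(driver.window_handles[-1])
print(driver.current_url)

# Q1: parse the HTML the browser actually rendered instead of re-fetching the
# URL with requests, which has neither the cookies nor the rendered DOM.
soup = BeautifulSoup(driver.page_source, "html.parser")
for a in soup.find_all("a", href=re.compile(r"download")):  # guessed pattern
    print(a.get("href"))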

S1: To get an XPath, you can copy the XPath of a node two or three levels above the element you need and then append the remaining nodes by hand.
S2: a[2] refers to the second of consecutive <a> tags (see the short sketch below).
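A minimal illustration of S1 and S2, assuming driver is already sitting on the result-list page produced by the script above:

# S1: XPath copied from devtools for an ancestor a few nodes above the target,
# with the remaining steps appended by hand.
base = "//*[@id='result_divlist']/dl[2]/dt"
# S2: a[2] picks the second of the consecutive <a> tags under the <span>;
# XPath positions are 1-based.
driver.find_element_by_xpath(base + "/span/a[2]").click()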

I.   headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
II.  req = urllib.request.Request(url=currentURL, headers=headers)
III. html = urllib.request.urlopen(req)
IV.  soup = BeautifulSoup(html, "html.parser")
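Packaged as a self-contained fragment, the same four steps look roughly like this; the URL is only a stand-in for the currentURL grabbed by the script above:

import urllib.request
from bs4 import BeautifulSoup

currentURL = 'http://lib.cqvip.com/zk/search.aspx'  # stand-in for the real page URL

# I.  Spoof a browser User-Agent so the site serves the normal page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# II. Build the request with those headers.
req = urllib.request.Request(url=currentURL, headers=headers)
# III. Fetch it; urlopen returns a file-like response object.
html = urllib.request.urlopen(req)
# IV. BeautifulSoup can read the response object directly.
soup = BeautifulSoup(html, "html.parser")
print(soup.title)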

S3: For ages I could not import the bs4 module. It turned out I had made a stupid mistake: I had casually named one of my own files bs4.py, so of course importing bs4 from the same directory could not find BeautifulSoup. That cost me half a day... Damn
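For reference, a rough picture of that failure mode (the file names here are just an example):

# Directory layout that triggers the problem:
#   project/
#       bs4.py        <- my own file, accidentally named like the package
#       crawler.py
#
# Inside crawler.py:
from bs4 import BeautifulSoup
# Python resolves "bs4" to ./bs4.py instead of the installed package, so the
# import fails with "cannot import name 'BeautifulSoup'". Renaming bs4.py
# (and removing any stale bs4.pyc) fixes it.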
