Selenium 实现爬虫

来源:互联网 发布:维多利亚 知乎 编辑:程序博客网 时间:2024/04/29 17:58

1 下载

selenium-server-standalone-2.41.0.jar

chromedriver_win32.zip

       IEDriverServer_x64_2.42.0.zip

    

2 设置环境

1)解压chromedriver_win32.zip,把chromedriver.exe拷贝至C:/ selenium/chrome/

2)解压IEDriverServer_x64_2.42.0.zip把IEDriverServer.exe拷贝至C:/ selenium/ie/

3)把IE驱动(IEDriverServer.exe)所在的目录加入环境变量PATH

 

3 代码示例

新建一个Java工程,并把jar包selenium-server-standalone-2.41.0.jar添加到工程的构建路径中。

3.1 baidu

import org.openqa.selenium.By;import org.openqa.selenium.NoSuchElementException;import org.openqa.selenium.StaleElementReferenceException;import org.openqa.selenium.WebDriver;import org.openqa.selenium.WebDriverException;import org.openqa.selenium.WebElement;import org.openqa.selenium.ie.InternetExplorerDriver;import org.openqa.selenium.chrome.ChromeDriver;import org.openqa.selenium.chrome.ChromeOptions;import org.openqa.selenium.firefox.FirefoxDriver;import org.openqa.selenium.support.ui.ExpectedCondition;import org.openqa.selenium.support.ui.ExpectedConditions;import org.openqa.selenium.support.ui.WebDriverWait;

privatestaticvoid example(){//for firefox//WebDriver driver = new FirefoxDriver();//for chrome System.setProperty("webdriver.chrome.driver","C:/ selenium/chrome/chromedriver.exe");WebDriver driver =newChromeDriver();//for IE//WebDriver driver = new InternetExplorerDriver();WebDriverWait w =newWebDriverWait(driver,10); driver.get("http://www.baidu.com/");  

// 等价于 driver.navigate().to("http://www.baidu.com/"); w.until(ExpectedConditions.visibilityOfElementLocated(By.id("kw1"))); w.until(ExpectedConditions.elementToBeClickable(By.id("su1")));// Find the text input element by its nameWebElement element = driver.findElement(By.id("kw1")); element.sendKeys("liaoxiangui"); element.submit();  

//WebDriver会自动的找element对应的form,并提交. 等价于:  

//driver.findElement(By.id("su1")).click();System.out.println("Page title is: "+ driver.getTitle());// baidu's search is rendered dynamically with JavaScript.// Wait for the page to load, timeout after 10 seconds w.until(newExpectedCondition<Boolean>(){publicBoolean apply(WebDriver d){return d.getTitle().toLowerCase().startsWith("liaoxiangui");}});System.out.println("Page title is: "+ driver.getTitle());try{Thread.sleep(10000);}catch(Exception e){}//Close the browser driver.quit();}


3.2 一号店

privatestaticvoid yihaodian(){System.setProperty("webdriver.chrome.driver","C:/work/research/bijia/selenium/chromedriver_win32/chromedriver.exe");ChromeOptions options =newChromeOptions();//options.addArguments("--disable-images");WebDriver driver =newChromeDriver(options);try{driver.get("http://www.yhd.com/ctg/s2/c21289-0-60761/b/a-s1-v0-p15-price-d0-f0-m1-rt0-pid-mid0-k/#page=1&sort=1");Boolean first =true;int products=0;WebDriverWait ww =newWebDriverWait(driver,10);while(true){System.out.println("processing filter page:"+driver.getCurrentUrl());if(first){ first=false; ww.until(ExpectedConditions.presenceOfElementLocated(By.id("startShopping")));WebElement s = driver.findElement(By.id("selectProvince")); s.click(); s = driver.findElement(By.id("p_13")); s.click(); s = driver.findElement(By.id("startShopping")); s.click();} ww.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector("#itemSearchList > li")));

//窗口最大化。 必须使得那个按钮在屏幕上可见,否则会抛异常?//driver.manage().window().maximize();

final int liCounter = driver.findElements(By.cssSelector("#itemSearchList > li")).size(); for(int kk=0;kk<liCounter;kk++){try{String cssStr ="#itemSearchList > li:nth-of-type("+(kk+1)+")";WebElement li = driver.findElement(By.cssSelector(cssStr));; products++;String t;try{ ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector(".owner > a")))); t = li.findElement(By.cssSelector(".owner > a")).getAttribute("title");System.out.println("vendor name="+t);}catch(NoSuchElementException e){System.out.println("vendor=自营");} ww.until(ExpectedConditions.visibilityOf(li.findElement(By.className("electrical_item_box")))); t = li.findElement(By.className("electrical_item_box")).getAttribute("comproid");System.out.println("selfid="+t); ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector("div > .search_prod_img > img")))); t = li.findElement(By.cssSelector("div > .search_prod_img > img")).getAttribute("src");System.out.println("pic url="+t); ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector(".title > .title")))); t = li.findElement(By.cssSelector(".title > .title")).getText();System.out.println("title="+t); t = li.findElement(By.cssSelector(".title > .title")).getAttribute("href");System.out.println("detailed url="+t); ww.until(ExpectedConditions.visibilityOf(li.findElement(By.className("color_red")))); t = li.findElement(By.className("color_red")).getText();System.out.println("price="+t);}catch(StaleElementReferenceException ex)//see http://docs.seleniumhq.org/exceptions/stale_element_reference.jsp{ kk--;System.out.println("stale element. 
retry to get it.");}//break;}WebElement ne =null;while(true){try{ ww.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#rankOpDiv")));//如果找不到指定的element,selenium的做法是抛异常。 ne = driver.findElement(By.cssSelector("#rankOpDiv .select_page_btn a.next"));//throw NoSuchElementException ww.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#rankOpDiv a.next"))); ne.click();}catch(StaleElementReferenceException ex){System.out.println("retry going to next page.");continue;}catch(NoSuchElementException e){System.out.println("this category end!");}break;}if(ne ==null)break;}System.out.println("get "+products+" products.");}catch(Exception ex){System.out.println(ex);}//Close the browser driver.quit();}

0 0
原创粉丝点击