Java

来源:互联网 发布:达内c语言视频 编辑:程序博客网 时间:2024/06/06 09:27
import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.concurrent.TimeUnit;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.openqa.selenium.TimeoutException;import org.openqa.selenium.WebDriver;import org.openqa.selenium.phantomjs.PhantomJSDriver;import org.openqa.selenium.remote.DesiredCapabilities;import org.openqa.selenium.support.ui.WebDriverWait;public class Spider_1 {private static WebDriver browser = null;  private static WebDriverWait driverWait = null;  public Spider_1() {          System.setProperty("phantomjs.binary.path", "driver/phantomjs.exe");          DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();          this.browser = new PhantomJSDriver(desiredCapabilities);            // 设置超时        this.browser.manage().timeouts().pageLoadTimeout(5, TimeUnit.SECONDS);             }  String getDetail(String url){this.browser.get(url);return this.browser.getPageSource();}List<Map<String, String>> readTxtIntoList(String file_path) throws Exception {// 结果listList<Map<String, String>> res_list = new ArrayList<Map<String, String>>();// 编码String encoding = "UTF-8";File file = new File(file_path);// 通过文件流创建流readerInputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding);// 创建bufferreaderBufferedReader bufferedReader = new BufferedReader(read);// 读取String new_line = "";String line = "";Matcher matcher = null;int count = 0;while ((new_line = bufferedReader.readLine()) != null) {if (++count%11 == 0) {matcher = Pattern.compile("#id:(.*?)#.*?homepage:(.*?)#.*?").matcher(line); //非贪婪模式if (matcher.find()) {Map<String, String> map = new HashMap<String, String>();map.put("id", matcher.group(1));map.put("homepage", matcher.group(2));res_list.add(map);};line = "";} else {line += new_line;}}return res_list;}void saveAsTxt(String homepage_str, String base_path, String id) {File dir = new File(base_path);          if (!dir.exists()) {              dir.mkdirs();          }          String file_path = base_path + id + ".homepage";        try {FileOutputStream fos = new FileOutputStream(new File(file_path));fos.write(homepage_str.getBytes());} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}        System.out.println(file_path);}void run() throws Exception {String base_path = "F:\\javadata\\jdhacker\\";for(Map<String,String> map : readTxtIntoList("F:\\javadata\\data.txt")) {String id = map.get("id");String homepage_url = map.get("homepage");String homepage_str;try {homepage_str = getDetail(homepage_url);saveAsTxt(homepage_str, base_path, id);} catch (TimeoutException e) {System.out.println("id " + id + " 超时");}}}public static void main(String[] args) throws Exception {Spider_1 s = new Spider_1();s.run();}}

原创粉丝点击