Java
来源:互联网 发布:达内c语言视频 编辑:程序博客网 时间:2024/06/06 09:27
import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.concurrent.TimeUnit;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.openqa.selenium.TimeoutException;import org.openqa.selenium.WebDriver;import org.openqa.selenium.phantomjs.PhantomJSDriver;import org.openqa.selenium.remote.DesiredCapabilities;import org.openqa.selenium.support.ui.WebDriverWait;public class Spider_1 {private static WebDriver browser = null; private static WebDriverWait driverWait = null; public Spider_1() { System.setProperty("phantomjs.binary.path", "driver/phantomjs.exe"); DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs(); this.browser = new PhantomJSDriver(desiredCapabilities); // 设置超时 this.browser.manage().timeouts().pageLoadTimeout(5, TimeUnit.SECONDS); } String getDetail(String url){this.browser.get(url);return this.browser.getPageSource();}List<Map<String, String>> readTxtIntoList(String file_path) throws Exception {// 结果listList<Map<String, String>> res_list = new ArrayList<Map<String, String>>();// 编码String encoding = "UTF-8";File file = new File(file_path);// 通过文件流创建流readerInputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding);// 创建bufferreaderBufferedReader bufferedReader = new BufferedReader(read);// 读取String new_line = "";String line = "";Matcher matcher = null;int count = 0;while ((new_line = bufferedReader.readLine()) != null) {if (++count%11 == 0) {matcher = Pattern.compile("#id:(.*?)#.*?homepage:(.*?)#.*?").matcher(line); //非贪婪模式if (matcher.find()) {Map<String, String> map = new HashMap<String, String>();map.put("id", matcher.group(1));map.put("homepage", matcher.group(2));res_list.add(map);};line = "";} else {line += new_line;}}return res_list;}void saveAsTxt(String homepage_str, String base_path, String id) {File dir = new File(base_path); if (!dir.exists()) { dir.mkdirs(); } String file_path = base_path + id + ".homepage"; try {FileOutputStream fos = new FileOutputStream(new File(file_path));fos.write(homepage_str.getBytes());} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();} System.out.println(file_path);}void run() throws Exception {String base_path = "F:\\javadata\\jdhacker\\";for(Map<String,String> map : readTxtIntoList("F:\\javadata\\data.txt")) {String id = map.get("id");String homepage_url = map.get("homepage");String homepage_str;try {homepage_str = getDetail(homepage_url);saveAsTxt(homepage_str, base_path, id);} catch (TimeoutException e) {System.out.println("id " + id + " 超时");}}}public static void main(String[] args) throws Exception {Spider_1 s = new Spider_1();s.run();}}
阅读全文
0 0
- java
- JAVA
- JAVA
- JAVA
- java
- Java
- Java
- JAVA:
- java
- java
- java
- java
- Java
- java
- java
- java
- JAVA?
- java
- 【Portfolio】IC、IR 和 BR 详解
- FIDO U2F NFC协议
- 记录spring controller从页面接收参数的几种方法
- jquery 的 map类型操作
- LinkedList源码解析
- Java
- 不正确退出vim
- Struts2:通过action标签向页面传值
- 算法-重建二叉树
- Hadhoop与HBase服务器启动与停止相关操作
- HDU 1114 Piggy-Bank
- 一些常用git基本命令
- 遇到问题-----web前端----select默认选中无效
- 用 Demo 的形式快速入门 CORS、JSONP 等各种跨域