一个抓取网页解析内容的程序。
来源:互联网 发布:java 文件md5校验码 编辑:程序博客网 时间:2024/05/29 19:27
Crawler.java
package com.web.crawler;import java.util.*;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.util.EntityUtils;public class Crawler implements Runnable {private int startIndex;private int endIndex;private String url;private PuzzleList locatList;public Crawler(int startIndex, int endIndex, String url,PuzzleList locatList) {this.startIndex = startIndex;this.endIndex = endIndex;this.url = url;this.locatList = locatList;}public void run() {System.out.println("begin run!");int row = 0, column = 0, number = 0;String descr = null;try {for (int urlIndex = this.startIndex; urlIndex <= this.endIndex; urlIndex++) {System.out.println("URL_INDEX::" + urlIndex);String html = null;HttpClient httpClient = new DefaultHttpClient();HttpGet httpget = new HttpGet("http://www.menneske.no/arukone/5x5/eng/?number=" + urlIndex);try {HttpResponse responce = httpClient.execute(httpget);int resStatu = responce.getStatusLine().getStatusCode();if (resStatu == HttpStatus.SC_OK) {HttpEntity entity = responce.getEntity();if (entity != null) {html = EntityUtils.toString(entity);}}} catch (Exception e) {e.printStackTrace();} finally {httpClient.getConnectionManager().shutdown();}Pattern p = Pattern.compile("</table>Showing puzzle number: [\\d]+<br/>");Matcher m = p.matcher(html);while (m.find()) {String tmp = m.group();number = Integer.parseInt(tmp.substring(new String("</table>Showing puzzle number: ").length(),tmp.indexOf("<br/>")));}p = Pattern.compile("Difficulty: [A-Za-z ]+<br/><a href=");m = p.matcher(html);while (m.find()) {String tmp = m.group();descr = tmp.substring(new String("Difficulty: ").length(),tmp.indexOf("<br/><a href="));}p = Pattern.compile("<td class=\"white\">[1-9]*</td>");m = p.matcher(html);int tdIndex 
= 1;while (m.find()) {String tmp = m.group();String numberStr = tmp.substring(new String("<td class=\"white\">").length(), tmp.indexOf("</td>"));if(numberStr.length() > 0){row = getRow(tdIndex);column = getColumn(tdIndex);number = Integer.parseInt(numberStr);System.out.println(urlIndex +"," + descr + ","+ row + "," + column + "," + number);PuzzleLocation locat = new PuzzleLocation(urlIndex, descr, row, column, number);locatList.addLocation(locat);}tdIndex++;Thread.currentThread().sleep(100);}}} catch (Exception e) {e.printStackTrace();}System.out.println("end run!");//Thread.currentThread().notify();locatList.finishtask.incrementAndGet();}private static int getRow(int tdIndex) {if (0 == (tdIndex % 5))return (tdIndex / 5);elsereturn ((tdIndex / 5) + 1);}private static int getColumn(int tdIndex) {if (0 == (tdIndex % 5))return 5;elsereturn (tdIndex % 5);}}
PuzzleList.java
package com.web.crawler;import java.lang.*;import java.util.*;import com.web.crawler.PuzzleLocation;import java.util.concurrent.atomic.AtomicInteger;public class PuzzleList {private ArrayList<PuzzleLocation> locatList = null;public AtomicInteger finishtask = null;public PuzzleList(){locatList = new ArrayList<PuzzleLocation>(1500);finishtask = new AtomicInteger(0);}public synchronized void addLocation(PuzzleLocation locat){locatList.add(locat);//System.out.println("List Size::" + locatList.size());}public void sortResult(){Collections.sort(locatList);}public List<PuzzleLocation> getPuzzleList(){return this.locatList;}public String toString(){StringBuffer outputBuf = new StringBuffer(102400);for(PuzzleLocation locat : locatList){outputBuf.append(locat.getIndex()).append(",");outputBuf.append(locat.getDescr()).append(",");outputBuf.append(locat.getRow()).append(",");outputBuf.append(locat.getColumn()).append(",");outputBuf.append(locat.getNumber()).append("\n");}return new String(outputBuf);}}
PuzzleLocation.java
package com.web.crawler;import java.io.Serializable;public class PuzzleLocation implements Comparable<PuzzleLocation>, Serializable {private static final long serialVersionUID = 823498623L;private int index;private String descrp;private int row;private int column;private int number;public int getIndex() {return this.index;}public int getNumber() {return this.number;}public int getRow() {return this.row;}public int getColumn() {return this.column;}public String getDescr() {return this.descrp;}public PuzzleLocation(int index, String descrp, int row, int column,int number) {this.index = index;this.descrp = descrp;this.row = row;this.column = column;this.number = number;}public int compareTo(PuzzleLocation dest) {if (this.index > dest.index) {return 1;} else {if (this.index < dest.index) {return -1;} else {if (this.number > dest.number) {return 1;} else {if (this.number < dest.number) {return -1;} else {if (this.row > dest.row) {return 1;} else {if (this.row < dest.row) {return -1;} else {if (this.column > dest.column) {return 1;} else {if (this.column < dest.column) {return -1;} else {return 0;}}}}}}}}}}
ExcelUtil.java
package com.web.crawler;

import java.util.*;
import java.io.*;

/**
 * Writes the collected puzzle cells to a tab-separated text file.
 *
 * <p>Each entry becomes one line of the form
 * {@code index<TAB>descr<TAB>row<TAB>column<TAB>number}, encoded as UTF-8.
 */
public class ExcelUtil {

    /** Output location used by {@link #exportExcel(PuzzleList)}. */
    private static final String DEFAULT_PATH = "C:/puzzle.xls";

    /**
     * Writes all entries to {@code C:/puzzle.xls}, replacing any existing file.
     * Failures are reported on standard error and otherwise ignored.
     *
     * @param locatList entries to export
     */
    public static void exportExcel(PuzzleList locatList) {
        try {
            exportExcel(locatList, new File(DEFAULT_PATH));
        } catch (Exception exp) {
            // Best-effort export: report the problem but do not propagate it.
            exp.printStackTrace();
        }
    }

    /**
     * Writes all entries to {@code target}, replacing any existing content.
     *
     * @param locatList entries to export
     * @param target    file to create or overwrite
     * @throws IOException if the file cannot be opened or written
     */
    public static void exportExcel(PuzzleList locatList, File target) throws IOException {
        // Opening without append truncates an existing file, so no explicit
        // delete/create step is needed.
        FileOutputStream stream = new FileOutputStream(target);
        try {
            Writer out = new BufferedWriter(new OutputStreamWriter(stream, "utf-8"));
            for (PuzzleLocation locat : locatList.getPuzzleList()) {
                StringBuilder line = new StringBuilder();
                line.append(locat.getIndex()).append("\t");
                line.append(locat.getDescr()).append("\t");
                line.append(locat.getRow()).append("\t");
                line.append(locat.getColumn()).append("\t");
                line.append(locat.getNumber()).append("\n");
                out.write(line.toString());
            }
            out.flush();
        } finally {
            // Release the file handle even when a write fails part-way.
            stream.close();
        }
    }
}
WebCrawler.java
package com.web.crawler;import java.io.BufferedInputStream;import java.io.FileInputStream;import java.io.InputStream;import java.util.*;public class WebCrawler {public static void main(String[] args) {System.out.println("Main start!");String cfgUrl = null;try {Properties config = new Properties();InputStream inStream = new BufferedInputStream(new FileInputStream("resource/puzzle.property"));config.load(inStream);cfgUrl = config.getProperty("URL");} catch (Exception exp) {exp.printStackTrace();}String paramUrl = cfgUrl.substring(0, cfgUrl.indexOf("="));int paramIndex = Integer.parseInt(cfgUrl.substring(cfgUrl.indexOf("=") + 1));System.out.println(paramUrl);System.out.println(paramIndex);/*try {Thread.currentThread().sleep(50000);} catch (Exception exp) {exp.printStackTrace();}*/PuzzleList locatList = new PuzzleList();for(int threadIndex = 1; threadIndex <= 5; threadIndex++){int startIndex = (int)(paramIndex + (threadIndex - 1) * (1434 - paramIndex) / 5.0 );int endIndex = (int)(paramIndex + threadIndex * (1434 - paramIndex) / 5.0);if(1 != threadIndex){startIndex += 1;}Crawler crawler = new Crawler(startIndex, endIndex, paramUrl, locatList);new Thread(crawler).start();}while (locatList.finishtask.get() < 5) {try {Thread.currentThread().sleep(5000);} catch (Exception exp) {exp.printStackTrace();}}locatList.sortResult();ExcelUtil.exportExcel(locatList);// System.out.println(html);System.out.println("main end!");return;}}
- 一个抓取网页解析内容的程序。
- delphi 抓取网页内容的程序
- delphi抓取网页内容的程序
- 用XPATH解析网页并抓取要的内容
- 如何写asp.net抓取网页内容的小偷程序
- 【Day6】如何抓取一个网页的内容(多种方法)
- 网页内容抓取 图片的抓取方法
- 抓取网页内容的函数
- [python]抓取网页的内容
- 有关网页抓取的内容
- php 模拟用户抓取网页内容程序
- 服务程序抓取不到网页内容
- 一个简单的基于java的网页抓取程序
- 抓取网页萃取网页内容的代码
- Python实现一个简易的网页抓取程序
- android 开发--抓取网页解析网页内容的若干方法(网络爬虫)(正则表达式)
- 抓取一个网页并解析HTML
- 一个极其简洁的Python网页抓取程序,自动从雅虎财经抓取股票数据
- 【问题解决】在C#中使用C++编写的类 使用托管c++(managed c++)
- 《Android内核剖析》读书笔记 第3章 源码下载及开发环境配置
- android 设置APN
- varnish使用汇总
- iOS的编码格式
- 一个抓取网页解析内容的程序。
- JSP网页编程初解(三)
- 压缩与解压2---文件的压缩
- Hive HBase 集群整合
- Android程序连接Delphi DataSnap服务器
- 收藏的网页--素材资料下载(free)
- 从Hello World中了解程序的运行过程
- Eclipse 不能自动编译
- PHP内核探索:zend_parse_parameters函数