一个抓取网页并解析其内容的程序。

来源:互联网 发布:java 文件md5校验码 编辑:程序博客网 时间:2024/05/29 19:27

Crawler.java

package com.web.crawler;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

/**
 * Worker that downloads a contiguous range of puzzle pages, extracts the
 * difficulty label and every numbered cell of the 5-column grid from the
 * HTML, and appends one {@link PuzzleLocation} per numbered cell to a shared
 * {@link PuzzleList}.
 *
 * <p>When the worker is done (normally or not) it increments
 * {@code PuzzleList.finishtask} exactly once.
 */
public class Crawler implements Runnable {

    /** Page address without the puzzle number, which is appended per request. */
    private static final String PAGE_URL_PREFIX = "http://www.menneske.no/arukone/5x5/eng/?number=";

    /** Number of cells per grid row; used to turn a running cell index into row/column. */
    private static final int GRID_WIDTH = 5;

    /** Pause after each inspected table cell, in milliseconds. */
    private static final long CELL_DELAY_MILLIS = 100L;

    private static final String DIFFICULTY_PREFIX = "Difficulty: ";
    private static final String DIFFICULTY_SUFFIX = "<br/><a href=";
    private static final String CELL_PREFIX = "<td class=\"white\">";
    private static final String CELL_SUFFIX = "</td>";

    // Compiled once instead of once per downloaded page.
    private static final Pattern DIFFICULTY_PATTERN = Pattern.compile("Difficulty: [A-Za-z ]+<br/><a href=");
    private static final Pattern CELL_PATTERN = Pattern.compile("<td class=\"white\">[1-9]*</td>");

    private int startIndex;
    private int endIndex;
    // NOTE(review): stored but never read; requests always go to PAGE_URL_PREFIX.
    // Confirm whether the configured URL was meant to be used instead.
    private String url;
    private PuzzleList locatList;

    /**
     * @param startIndex first puzzle number to download (inclusive)
     * @param endIndex   last puzzle number to download (inclusive)
     * @param url        base URL supplied by the caller (currently unused, see field note)
     * @param locatList  shared collector that receives the extracted cells
     */
    public Crawler(int startIndex, int endIndex, String url, PuzzleList locatList) {
        this.startIndex = startIndex;
        this.endIndex = endIndex;
        this.url = url;
        this.locatList = locatList;
    }

    /**
     * Processes every puzzle number in {@code [startIndex, endIndex]}. A page
     * that cannot be downloaded is skipped instead of aborting the remaining
     * range.
     */
    public void run() {
        System.out.println("begin run!");
        try {
            for (int urlIndex = this.startIndex; urlIndex <= this.endIndex; urlIndex++) {
                System.out.println("URL_INDEX::" + urlIndex);
                String html = fetchPage(urlIndex);
                if (html == null) {
                    // Request failed or returned a non-200 status: nothing to parse.
                    System.err.println("No content for puzzle " + urlIndex + ", skipping");
                    continue;
                }
                // Parsed per page so a page without a label never inherits the previous one.
                String descr = parseDifficulty(html);
                collectCells(urlIndex, descr, html);
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            System.out.println("end run!");
            // In finally so the waiting main thread is always released.
            locatList.finishtask.incrementAndGet();
        }
    }

    /**
     * Downloads the page for one puzzle.
     *
     * @return the response body, or {@code null} when the request failed,
     *         the status was not 200, or the response had no entity
     */
    private String fetchPage(int urlIndex) {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();
        HttpGet httpget = new HttpGet(PAGE_URL_PREFIX + urlIndex);
        try {
            HttpResponse response = httpClient.execute(httpget);
            int status = response.getStatusLine().getStatusCode();
            if (status == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    html = EntityUtils.toString(entity);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            httpClient.getConnectionManager().shutdown();
        }
        return html;
    }

    /**
     * Extracts the text between "Difficulty: " and the following line break.
     *
     * @return the label of the last match, or {@code null} when the page has none
     */
    private static String parseDifficulty(String html) {
        String descr = null;
        Matcher m = DIFFICULTY_PATTERN.matcher(html);
        while (m.find()) {
            String tmp = m.group();
            descr = tmp.substring(DIFFICULTY_PREFIX.length(), tmp.indexOf(DIFFICULTY_SUFFIX));
        }
        return descr;
    }

    /**
     * Walks all "white" table cells in document order and records the ones
     * that contain a number. The running cell counter starts at 1 and is
     * advanced for empty cells too, because it determines row and column.
     */
    private void collectCells(int urlIndex, String descr, String html) throws InterruptedException {
        Matcher m = CELL_PATTERN.matcher(html);
        int tdIndex = 1;
        while (m.find()) {
            String tmp = m.group();
            String numberStr = tmp.substring(CELL_PREFIX.length(), tmp.indexOf(CELL_SUFFIX));
            if (numberStr.length() > 0) {
                int row = getRow(tdIndex);
                int column = getColumn(tdIndex);
                int number = Integer.parseInt(numberStr);
                System.out.println(urlIndex + "," + descr + "," + row + "," + column + "," + number);
                locatList.addLocation(new PuzzleLocation(urlIndex, descr, row, column, number));
            }
            tdIndex++;
            Thread.sleep(CELL_DELAY_MILLIS);
        }
    }

    /** Maps a 1-based running cell index to its 1-based row. */
    private static int getRow(int tdIndex) {
        if (0 == (tdIndex % GRID_WIDTH)) {
            return tdIndex / GRID_WIDTH;
        }
        return (tdIndex / GRID_WIDTH) + 1;
    }

    /** Maps a 1-based running cell index to its 1-based column. */
    private static int getColumn(int tdIndex) {
        if (0 == (tdIndex % GRID_WIDTH)) {
            return GRID_WIDTH;
        }
        return tdIndex % GRID_WIDTH;
    }
}


PuzzleList.java

package com.web.crawler;

import java.lang.*;
import java.util.*;
import com.web.crawler.PuzzleLocation;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Collector for {@link PuzzleLocation} entries plus a public counter of
 * finished tasks.
 *
 * <p>Only {@link #addLocation(PuzzleLocation)} is synchronized; sorting,
 * reading and formatting are not.
 */
public class PuzzleList {

    /** Backing storage, created with room for 1500 entries. */
    private ArrayList<PuzzleLocation> locatList = new ArrayList<PuzzleLocation>(1500);

    /** Counter of finished tasks, starting at zero. */
    public AtomicInteger finishtask = new AtomicInteger(0);

    /** Creates an empty list with a zeroed counter. */
    public PuzzleList() {
    }

    /** Appends one entry while holding this object's monitor. */
    public synchronized void addLocation(PuzzleLocation locat) {
        this.locatList.add(locat);
    }

    /** Sorts the stored entries in place by their natural order. */
    public void sortResult() {
        Collections.sort(this.locatList);
    }

    /** Returns the backing list itself, not a copy. */
    public List<PuzzleLocation> getPuzzleList() {
        return locatList;
    }

    /**
     * Renders one line per entry as
     * {@code index,descr,row,column,number} terminated by a newline.
     */
    public String toString() {
        StringBuilder text = new StringBuilder(102400);
        for (Iterator<PuzzleLocation> it = locatList.iterator(); it.hasNext();) {
            PuzzleLocation entry = it.next();
            text.append(entry.getIndex()).append(",")
                .append(entry.getDescr()).append(",")
                .append(entry.getRow()).append(",")
                .append(entry.getColumn()).append(",")
                .append(entry.getNumber()).append("\n");
        }
        return text.toString();
    }
}

PuzzleLocation.java


package com.web.crawler;

import java.io.Serializable;

/**
 * One numbered cell of a puzzle: the puzzle index, its difficulty text, the
 * cell's row and column, and the number found in the cell.
 *
 * <p>Natural order compares {@code index}, then {@code number}, then
 * {@code row}, then {@code column}; the difficulty text takes no part in it.
 */
public class PuzzleLocation implements Comparable<PuzzleLocation>, Serializable {

    private static final long serialVersionUID = 823498623L;

    private int index;
    private String descrp;
    private int row;
    private int column;
    private int number;

    public PuzzleLocation(int index, String descrp, int row, int column, int number) {
        this.index = index;
        this.descrp = descrp;
        this.row = row;
        this.column = column;
        this.number = number;
    }

    public int getIndex() {
        return index;
    }

    public String getDescr() {
        return descrp;
    }

    public int getRow() {
        return row;
    }

    public int getColumn() {
        return column;
    }

    public int getNumber() {
        return number;
    }

    /**
     * Compares key by key and stops at the first key that differs.
     *
     * @return -1, 0 or 1
     */
    public int compareTo(PuzzleLocation dest) {
        int[] mine = { this.index, this.number, this.row, this.column };
        int[] theirs = { dest.index, dest.number, dest.row, dest.column };
        for (int key = 0; key < mine.length; key++) {
            int verdict = sign(mine[key], theirs[key]);
            if (verdict != 0) {
                return verdict;
            }
        }
        return 0;
    }

    /** Three-way comparison of two ints without subtraction, so it cannot overflow. */
    private static int sign(int left, int right) {
        if (left == right) {
            return 0;
        }
        return left > right ? 1 : -1;
    }
}

ExcelUtil.java


package com.web.crawler;

import java.util.*;
import java.io.*;

/**
 * Writes the collected puzzle cells to a fixed file as tab-separated text.
 */
public class ExcelUtil {

    /** Destination file; any existing content is replaced on every export. */
    private static final String OUTPUT_PATH = "C:/puzzle.xls";

    /**
     * Writes one line per entry of {@code locatList} in the form
     * {@code index<TAB>descr<TAB>row<TAB>column<TAB>number}, encoded as UTF-8.
     *
     * <p>Failures are reported with a stack trace and not rethrown. The
     * output stream is always closed, including when a write fails.
     *
     * @param locatList source of the entries to export
     */
    public static void exportExcel(PuzzleList locatList) {
        OutputStream out = null;
        try {
            // Non-append mode creates the file or truncates an existing one,
            // which replaces the former delete / re-create / append sequence.
            out = new BufferedOutputStream(new FileOutputStream(new File(OUTPUT_PATH), false));
            for (PuzzleLocation locat : locatList.getPuzzleList()) {
                StringBuilder line = new StringBuilder();
                line.append(locat.getIndex()).append("\t");
                line.append(locat.getDescr()).append("\t");
                line.append(locat.getRow()).append("\t");
                line.append(locat.getColumn()).append("\t");
                line.append(locat.getNumber()).append("\n");
                out.write(line.toString().getBytes("utf-8"));
            }
            out.flush();
        } catch (Exception exp) {
            exp.printStackTrace();
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException closeExp) {
                    closeExp.printStackTrace();
                }
            }
        }
    }
}

WebCrawler.java

package com.web.crawler;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

/**
 * Entry point: reads the start URL from a properties file, splits the puzzle
 * number range across a fixed number of {@link Crawler} threads, waits for
 * them, then sorts and exports the collected result.
 */
public class WebCrawler {

    /** Properties file that must define the key {@code URL}. */
    private static final String CONFIG_PATH = "resource/puzzle.property";

    /** Number of crawler threads the range is divided between. */
    private static final int THREAD_COUNT = 5;

    /** Upper bound (inclusive) of the puzzle number range. */
    private static final int LAST_PUZZLE_INDEX = 1434;

    /**
     * Runs the crawl. The configured {@code URL} value is expected to look
     * like {@code <base>=<first puzzle number>}; the part after the first
     * {@code =} is the start of the range.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("Main start!");

        String cfgUrl = loadConfiguredUrl();
        if (cfgUrl == null) {
            System.err.println("Property 'URL' could not be read from " + CONFIG_PATH);
            return;
        }
        int separator = cfgUrl.indexOf("=");
        if (separator < 0) {
            System.err.println("Property 'URL' contains no '=': " + cfgUrl);
            return;
        }
        String paramUrl = cfgUrl.substring(0, separator);
        int paramIndex;
        try {
            // Properties values keep trailing whitespace, so trim before parsing.
            paramIndex = Integer.parseInt(cfgUrl.substring(separator + 1).trim());
        } catch (NumberFormatException exp) {
            System.err.println("Property 'URL' does not end in a puzzle number: " + cfgUrl);
            return;
        }
        System.out.println(paramUrl);
        System.out.println(paramIndex);

        PuzzleList locatList = new PuzzleList();
        Thread[] workers = new Thread[THREAD_COUNT];
        for (int threadIndex = 1; threadIndex <= THREAD_COUNT; threadIndex++) {
            int startIndex = (int) (paramIndex
                    + (threadIndex - 1) * (LAST_PUZZLE_INDEX - paramIndex) / (double) THREAD_COUNT);
            int endIndex = (int) (paramIndex
                    + threadIndex * (LAST_PUZZLE_INDEX - paramIndex) / (double) THREAD_COUNT);
            if (1 != threadIndex) {
                // The previous slice already ends on this value.
                startIndex += 1;
            }
            Crawler crawler = new Crawler(startIndex, endIndex, paramUrl, locatList);
            workers[threadIndex - 1] = new Thread(crawler);
            workers[threadIndex - 1].start();
        }

        // Joining returns as soon as each worker ends, however it ends,
        // instead of polling a counter every five seconds.
        try {
            for (int i = 0; i < workers.length; i++) {
                workers[i].join();
            }
        } catch (InterruptedException exp) {
            Thread.currentThread().interrupt();
            exp.printStackTrace();
            return;
        }

        locatList.sortResult();
        ExcelUtil.exportExcel(locatList);
        System.out.println("main end!");
    }

    /**
     * Loads {@link #CONFIG_PATH} and returns its {@code URL} property.
     *
     * @return the property value, or {@code null} when the file cannot be
     *         read or the key is missing
     */
    private static String loadConfiguredUrl() {
        InputStream inStream = null;
        try {
            Properties config = new Properties();
            inStream = new BufferedInputStream(new FileInputStream(CONFIG_PATH));
            config.load(inStream);
            return config.getProperty("URL");
        } catch (Exception exp) {
            exp.printStackTrace();
            return null;
        } finally {
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException closeExp) {
                    closeExp.printStackTrace();
                }
            }
        }
    }
}





原创粉丝点击