Implementing a Web Crawler in Java: First Version


Features:

1. Return the page's text content.
2. Extract the page title with a regular expression (a standalone check of the corrected pattern follows this list).
3. Automatically create the download directory, and check that the path is valid.
4. Rename the downloaded page after its title.
5. Fix garbled characters in the file name (see the sanitizer sketch after the listing).
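A note on feature 2 before the listing: the title pattern in the original posting, <title>([^</title>]*), is a character-class mistake. [^</title>] matches any single character other than <, /, t, i, l, e and >, rather than "everything up to </title>", so any title containing one of those letters is cut short. The corrected reluctant pattern used in the listing below can be checked in isolation; the class name TitleRegexDemo and the sample HTML string here are mine, for illustration only:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class TitleRegexDemo {
        public static void main(String[] args) {
            // Hypothetical input; any HTML with a <title> tag works.
            String html = "<html><head><title>Example Page</title></head><body>...</body></html>";
            // The reluctant group (.*?) stops at the first </title>;
            // DOTALL lets the title span line breaks.
            Pattern pattern = Pattern.compile("<title>(.*?)</title>", Pattern.DOTALL);
            Matcher matcher = pattern.matcher(html);
            if (matcher.find()) {
                System.out.println(matcher.group(1)); // prints: Example Page
            }
        }
    }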

package basicLearn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class CrawlPage1 {

    private HttpClient httpClient;
    private GetMethod getMethod;
    private int statusCode;
    private InputStream is;
    private OutputStream os;
    private File file;

    /**
     * Fetches the resource directly rather than through a proxy server.
     *
     * @param url the URL of the page to crawl
     */
    public CrawlPage1(String url) {
        // Initialize all resources here.
        httpClient = new HttpClient();
        getMethod = new GetMethod(url);
        try {
            statusCode = httpClient.executeMethod(getMethod);
        } catch (HttpException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        System.out.println("initial over");
    }

    // Utility method: read a file's text content.
    public static String readContent(File file) throws Exception {
        System.out.println("read content beginning...");
        // The source file's encoding must be specified when the stream is
        // constructed; otherwise the content read back will be garbled.
        BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "utf-8"));
        String line;
        StringBuilder stringBuilder = new StringBuilder();
        while ((line = bufferedReader.readLine()) != null) {
            stringBuilder.append(line).append("\n");
        }
        // close resource
        bufferedReader.close();
        System.out.println("read content over...");
        return stringBuilder.toString();
    }

    // Build the download file name from the value of the page's <title> tag.
    public static String getFileName(String content) {
        // The original pattern "<title>([^</title>]*)" was a character-class
        // mistake; a reluctant group that stops at </title> is what is meant.
        Pattern pattern = Pattern.compile("<title>(.*?)</title>", Pattern.DOTALL);
        Matcher matcher = pattern.matcher(content);
        String title = null;
        if (matcher.find()) {
            title = matcher.group(1);
            System.out.println("get file name over... " + title);
        }
        return title;
    }

    /**
     * @param directory a valid local directory; the downloaded page is saved
     *                  under it
     */
    public void downLoadPage(String directory) {
        try {
            if (statusCode == HttpStatus.SC_OK) {
                System.out.println("statuscode is " + getMethod.getStatusCode());
                // The resource was fetched, so prepare the local directory.
                File file = new File(directory);
                if (!file.exists()) {
                    file.mkdirs();
                }
                this.file = file;
                if (file.isFile()) {
                    throw new RuntimeException("This is a file, not a directory");
                }
                if (file.exists() && file.isDirectory()) {
                    // To read the <title>, download the page into a temporary file first.
                    File f = new File(directory + "\\temp.html");
                    if (!f.exists()) {
                        f.createNewFile();
                    }
                    System.out.println("begin downLoadPage...");
                    is = getMethod.getResponseBodyAsStream();
                    os = new FileOutputStream(f);
                    int i;
                    byte[] buf = new byte[2048];
                    while ((i = is.read(buf)) != -1) {
                        os.write(buf, 0, i);
                    }
                    os.close();
                    getMethod.releaseConnection();
                    System.out.println("downLoadPage over...");
                    String titleString = getFileName(readContent(f));
                    // Rename the temp file after the page title.
                    f.renameTo(new File(directory + "\\" + titleString + ".html"));
                    System.out.println("rename the file as " + titleString);
                    System.out.println("download over..");
                } else {
                    System.out.println("The directory given for saving the page is not a valid location..");
                }
            } else {
                System.out.println("statuscode is " + getMethod.getStatusCode());
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException("Read/write error!");
        }
    }

    /**
     * Release all resources.
     */
    public void closeResource() {
        try {
            if (is != null) {
                is.close();
            }
            if (os != null) {
                os.close();
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        if (getMethod != null) {
            getMethod.releaseConnection();
        }
        System.out.println("resource released...");
    }

    public static void main(String[] args) {
        CrawlPage1 crawlPage1 = new CrawlPage1("http://hao.360.cn/");
        crawlPage1.downLoadPage("E:\\工作\\搜索引擎\\pageDownload\\temp");
        crawlPage1.closeResource();
        System.out.println("ok");
    }
}
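Two caveats on features 4 and 5. First, the rename step uses the page title verbatim as a file name; on Windows that fails whenever the title contains a character such as \ / : * ? " < > |. A minimal sanitizer sketch that could be dropped into CrawlPage1 and applied to titleString before renameTo follows; the helper name toSafeFileName is my own, not part of the original listing. It also covers the case where no <title> was found and getFileName returns null:

    // Hypothetical helper, not in the original class: replaces the characters
    // Windows forbids in file names and falls back to a default for missing titles.
    public static String toSafeFileName(String title) {
        if (title == null || title.trim().isEmpty()) {
            return "untitled";
        }
        return title.trim().replaceAll("[\\\\/:*?\"<>|]", "_");
    }

Second, the garbled-name problem mostly stems from decoding the page with the wrong charset: readContent hard-codes utf-8, while the page's actual encoding is declared in the response's Content-Type header. With Commons HttpClient 3.x that value should be available via getMethod.getResponseCharSet(), which would be the safer thing to pass to the InputStreamReader. Note also that the listing targets the legacy Commons HttpClient 3.x API (org.apache.commons.httpclient), so the commons-httpclient jar plus its commons-codec and commons-logging dependencies need to be on the classpath.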
