定时抓取网页链接,提取网页内容,存入数据库
来源:互联网 发布:返利源码 编辑:程序博客网 时间:2024/06/03 20:57
流程
- 提供要抓取的网页地址(列表)
- 提取网页列表中所有目标LINK
- 抓取LINK中的所有网页(爬虫)
- 解析正文内容
- 存入数据库
一、抓取任务(主程序)
- package com.test;
- import java.text.SimpleDateFormat;
- import java.util.Date;
- import java.util.List;
- public class CatchJob {
- public String catchJob(String url){
- String document= null;
- List allLinks = null;
- try {
- // 获取网页内容
- document = ExtractPage.getContentByUrl(url);
- // 获取页面指定内容的Link
- allLinks = ExtractPage.getLinksByConditions(document, "http://www.free9.net/others/gift/");
- if(allLinks!=null&&!allLinks.isEmpty()){
- for(int i=0;i<allLinks.size();i++){
- String link = (String)allLinks.get(i);
- String content = ExtractPage.getContentByUrl(link);
- ExtractPage.readByHtml(content);
- }
- }
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return "success";
- }
- public static void main(String[] args){
- Long startTime = System.currentTimeMillis();
- System.out.println(">>start.......");
- String httpProxyHost = "211.167.0.131";//default http proxy
- String httpProxyPort = "80"; //default http port
- System.getProperties().setProperty( "http.proxyHost", httpProxyHost);
- System.getProperties().setProperty( "http.proxyPort", httpProxyPort);
- CatchJob job = new CatchJob();
- //System.out.println(job.catchJob("http://www.free9.net/others/gift/2008-12-09/12288046534312.htm"));
- System.out.println(job.catchJob("http://www.free9.net/others/gift/"));
- Date date = new Date(System.currentTimeMillis()-startTime);
- SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss ");
- String s = sdf.format(date);
- System.out.println(">>end.......USE"+s+"秒");
- }
- }
二、抓取网页内容,并解析
- package com.test;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.ArrayList;
- import java.util.List;
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.filters.OrFilter;
- import org.htmlparser.tags.Div;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.TitleTag;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- public class ExtractPage {
- //抓取页面内容
- public static String getContentByUrl(String url){
- System.out.println("**********抓取页面内容***********");
- StringBuffer document= null;
- URL targetUrl;
- try {
- targetUrl = new URL(url);
- HttpURLConnection con = (HttpURLConnection) targetUrl.openConnection();
- con.setFollowRedirects(true);
- con.setInstanceFollowRedirects(false);
- con.connect();
- BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(),"gb2312"));
- String s = "";
- document= new StringBuffer();
- while ((s = br.readLine()) != null) {
- document.append(s+"/r/n");
- }
- s=null;
- br.close();
- return document.toString();
- } catch (MalformedURLException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return null;
- }
- // 按页面方式处理.解析标准的html页面
- public static void readByHtml(String result) throws Exception
- {
- System.out.println("**********按页面方式处理.解析标准的html页面***********");
- Parser parser;
- NodeList nodelist;
- parser = Parser.createParser(result, "utf8");
- NodeFilter textFilter = new NodeClassFilter(Div.class);
- //NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
- NodeFilter titleFilter = new NodeClassFilter(TitleTag.class);
- OrFilter lastFilter = new OrFilter();
- lastFilter.setPredicates(new NodeFilter[] { textFilter,titleFilter});
- nodelist = parser.parse(lastFilter);
- Node[] nodes = nodelist.toNodeArray();
- StringBuffer page = new StringBuffer();
- String id = "";
- for (int i = 0; i < nodes.length; i++) {
- Node node = nodes[i];
- if (node instanceof Div) {
- Div textnode = (Div) node;
- id = textnode.getAttribute("id");
- if ("Zoom".equals(id)) {
- //System.out.println(textnode.getChild(5).toHtml());
- page.append(textnode.getChild(5).toHtml().toString());
- page.append(textnode.getChild(6).toHtml().toString());
- }
- }else if (node instanceof TitleTag) {
- TitleTag titlenode = (TitleTag) node;
- page.append(titlenode.getTitle().substring(0, titlenode.getTitle().indexOf("|")));
- }
- /*
- else if (node instanceof LinkTag) {
- LinkTag link = (LinkTag) node;
- line = link.getLink();
- } else if (node instanceof TitleTag) {
- TitleTag titlenode = (TitleTag) node;
- line = titlenode.getTitle();
- }
- */
- //if (isTrimEmpty(line))
- //continue;
- }
- System.out.println(page.toString());
- }
- //获取页面指定内容的Link
- public static List getLinksByConditions(String result,String coditions){
- System.out.println("**********//获取页面指定内容的Link***********");
- List links = null;
- Parser parser;
- NodeList nodelist;
- parser = Parser.createParser(result, "utf8");
- NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
- try {
- links = new ArrayList();
- nodelist = parser.parse(linkFilter);
- Node[] nodes = nodelist.toNodeArray();
- for (int i = 0; i < nodes.length; i++) {
- Node node = nodes[i];
- if (node instanceof LinkTag) {
- LinkTag link = (LinkTag) node;
- if(link.toHtml().indexOf(coditions)!=-1&&link.toHtml().indexOf("index")==-1&&link.toHtml().indexOf(".htm")!=-1){
- System.out.println(link.toHtml());
- links.add(link.getLink());
- }
- }
- //if (isTrimEmpty(line))
- //continue;
- }
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return links;
- }
- }
- 定时抓取网页连接,提取网页内容,存入数据库
- CURL抓取网页内容并用正则提取。
- CURL抓取网页内容并用正则提取。
- CURL抓取网页内容并用正则提取。
- Nutch定时抓取网页
- php抓取alexa网页内容 提取站点统计信息
- JAVA 抓取网页内容
- 【JAVA】 抓取网页内容
- 网页内容抓取
- 网页内容抓取
- fsockopen 抓取网页内容
- JAVA 抓取网页内容
- 抓取网页内容
- lotusscript 抓取网页内容
- C# 抓取网页内容
- PHP抓取网页内容
- java 抓取网页内容
- 抓取网页内容
- Oracle通过DBLink操作Excel
- linux下编译CxImage
- 调试成功的按键中断流驱动(转)
- VB函数详解(部分)
- .net 2.0 访问Oracle --与Sql Server的差异,注意事项,常见异常
- 定时抓取网页连接,提取网页内容,存入数据库
- jar 打包生成可执行文件
- ObjectUtils
- [C#]多线程读取数组的小例子
- JAVA如何调用DOS命令
- Linux
- 利用Oracle执行计划机制提高查询性能
- 240个jQuery实现AJAX插件
- 使用 IBM Data Studio Developer 中的 XSLT 转换 Data Web Services 消息