lucene2.0+heritrix

来源:互联网 发布:淘宝开店认证 上半身 编辑:程序博客网 时间:2024/06/04 20:14

由于lucene2.0+heritrix一书示例用的网站(http://mobile.pconline.com.cn/,http://mobile.163.com/)改版了,书上实例不能运行,我又找了一个http://mobile.younet.com/进行开发并成功实现示例,希望感兴趣的同学尽快实践,如果此网站也改了就又得改extractor了,哈哈!
search的Extractor代码如下(与书上实例不同),供大家参考:附件里有完整代码

 

来自:http://willpower88.iteye.com/blog/325722

package com.luceneheritrixbook.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * <p></p>
 * @author cnyqiao@hotmail.com
 * @date   Feb 6, 2009
 */

public class ExtractYounetMoblie extends Extractor {

 @Override
 public void extract() {
  BufferedWriter bw = null;
  NodeFilter title_filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mo_tit"));
  NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"))));
  NodeFilter img_filter = new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
  
  //提取标题信息
  try {
   //Parser根据过滤器返回所有满足过滤条件的节点
   // 迭代逐渐查找
      NodeList nodeList=this.getParser().parse(title_filter);
   NodeIterator it = nodeList.elements();
   StringBuffer title = new StringBuffer();
   while (it.hasMoreNodes()) {
    Node node = (Node) it.nextNode();
    String[] names = node.toPlainTextString().split(" ");
    for(int i = 0; i < names.length; i++)
     title.append(names[i]).append("-");
    title.append(new Date().getTime());
    //创建要生成的文件
    bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
    //获取当前提取页的完整URL地址
    int startPos = this.getInuputFilePath().indexOf("mirror") + 6;
    String url_seg = this.getInuputFilePath().substring(startPos);
    url_seg = url_seg.replaceAll("\\\\", "/");
    String url = "http:/" + url_seg;
    //写入当前提取页的完整URL地址
    bw.write(url + NEWLINE);
    bw.write(names[0] + NEWLINE);
    bw.write(names[1] + NEWLINE);
    
   }
   // 重置Parser
   this.getParser().reset();
   Parser attNameParser = null;
   Parser attValueParser = null;
            //Parser parser=new Parser("http://www.sina.com.cn");
   NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"));
      NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp2"));
            String attName = "";
            String attValue = "";
            // 迭代逐渐查找
      nodeList=this.getParser().parse(attribute_filter);
   it = nodeList.elements();
   while (it.hasMoreNodes()) {    
    Node node = (Node) it.nextNode();
    attNameParser = new Parser();
    attNameParser.setEncoding("GB2312");
    attNameParser.setInputHTML(node.toHtml());
    NodeList attNameNodeList = attNameParser.parse(attributeName_filter);
    attName = attNameNodeList.elements().nextNode().toPlainTextString();
    
    attValueParser = new Parser();
    attValueParser.setEncoding("GB2312");
    attValueParser.setInputHTML(node.toHtml());
    NodeList attValueNodeList = attValueParser.parse(attributeValue_filter);
    attValue = attValueNodeList.elements().nextNode().toPlainTextString();
    bw.write(attName.trim() + attValue.trim());
    bw.newLine();
   }
   // 重置Parser
   this.getParser().reset();
   String imgUrl = "";
   String fileType ="";
   // 迭代逐渐查找
      nodeList=this.getParser().parse(img_filter);
   it = nodeList.elements();
   while (it.hasMoreNodes()) {    
    Node node = (Node) it.nextNode();
    
    ImageTag imgNode = (ImageTag)node.getChildren().elements().nextNode();
    imgUrl = imgNode.getAttribute("src");    
    fileType = imgUrl.trim().substring(imgUrl
      .lastIndexOf(".") + 1);
    //生成新的图片的文件名
    String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
    //imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
    //利用miorr目录下的图片生成的新的图片
    this.copyImage(imgUrl, new_iamge_file);
    bw.write(SEPARATOR + NEWLINE);
    bw.write(new_iamge_file + NEWLINE);
   }
   
      
        } catch(Exception e) {
            e.printStackTrace();
        } finally {
         try{
       if (bw != null)
        bw.close();
      }catch(IOException e){
       e.printStackTrace();
      }
        }
  
 }
}

 

运行书上的heritrix实例,并按书上的默认设置进行抓取如下URI:(请自己分析整理)

http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html
...

 

 

 

原创粉丝点击