Nutch Source Code Analysis 1------inject (continued)


Below is the source of the inject method. The overall flow falls into three major steps, marked in the comments:

public void inject(Path crawlDb, Path urlDir) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");      // timestamp format
    long start = System.currentTimeMillis();                                 // record the start time
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =                                                           // create a temporary directory tempDir
      new Path(getConf().get("mapred.temp.dir", ".") +
               "/inject-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    // Step 1: normalize and filter the urls from the url directory, discard invalid
    // ones, and stage the results in tempDir
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);


    // merge with existing crawl db                               // Step 2: merge the injected urls into the existing crawl db, eliminating duplicates
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);


    // clean up                                                   // Step 3: clean up the temporary directory
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);


    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

  }
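
As a quick orientation: injection is normally kicked off from the command line with bin/nutch inject <crawldb> <url_dir>, which ends up calling the method above. Programmatically it boils down to something like the following sketch; the "crawldb" and "urls" paths are hypothetical, and this assumes the Nutch 1.x Injector constructor that takes a Configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.util.NutchConfiguration;

public class InjectDriver {
  public static void main(String[] args) throws Exception {
    // NutchConfiguration.create() loads nutch-default.xml / nutch-site.xml
    Configuration conf = NutchConfiguration.create();
    Injector injector = new Injector(conf);
    // "crawldb" is the CrawlDb directory; "urls" is a directory of
    // plain-text seed files, one url per line (both paths hypothetical)
    injector.inject(new Path("crawldb"), new Path("urls"));
  }
}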


Below we walk through the source for these three steps in more detail.

1. First, let's look at step one:

This work is all handled through Hadoop map/reduce jobs, so step one deserves the closest look.

    sortJob.setMapperClass(InjectMapper.class); names the mapper, so the real work is done by the InjectMapper class:

 public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum>

InjectMapper implements the Mapper interface, and therefore the map method. Here is the source of map:

    public void map(WritableComparable key, Text value,
                    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {
      String url = value.toString();              // value is line of text

      if (url != null && url.trim().startsWith("#")) {    // a url starting with # is a comment line; skip it
          /* Ignore line that start with # */
          return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      Map<String,String> metadata = new TreeMap<String,String>();
      if (url.indexOf("\t")!=-1){     // the url may carry metadata, e.g.: http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
          String[] splits = url.split("\t");  // split the line on "\t"
          url = splits[0];                    // the first element is the url itself; the rest are
                                              // name=value metadata entries stored into the metadata Map

          for (int s=1;s<splits.length;s++){
              // find separation between name and value
              int indexEquals = splits[s].indexOf("=");
              if (indexEquals==-1) {
                  // skip anything without a =
                  continue;            
              }
              String metaname = splits[s].substring(0, indexEquals);
              String metavalue = splits[s].substring(indexEquals+1);
              if (metaname.equals(nutchScoreMDName)) {
                  try {
                  customScore = Float.parseFloat(metavalue);}
                  catch (NumberFormatException nfe){}
              }
              else if (metaname.equals(nutchFetchIntervalMDName)) {
                  try {
                      customInterval = Integer.parseInt(metavalue);}
                  catch (NumberFormatException nfe){}
              }
              else metadata.put(metaname,metavalue);
          }
      }
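      // For the sample line in the comment above, after this block:
      //   url            = "http://www.nutch.org/"
      //   customScore    = 10.0f      (from nutch.score)
      //   customInterval = 2592000    (from nutch.fetchInterval)
      //   metadata       = { userType = open_source }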
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);   // normalize the url; governed by conf/regex-normalize.xml in the Nutch source tree
        url = filters.filter(url);             // filter the url            // governed by conf/regex-urlfilter.txt in the Nutch source tree

      } catch (Exception e) {
        if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); }
        url = null;
      }
      if (url != null) {                          // the url is well-formed and passed the filters
        value.set(url);                           // collect it: copy the metadata key/value pairs parsed above
                                                  // into the datum, then emit the pair as <value, datum>

        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval);
        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()){
            String keymd = keysIter.next();
            String valuemd = metadata.get(keymd);
            datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
            scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Cannot filter injected score for url " + url
                        + ", using default (" + e.getMessage() + ")");
            }
        }
        output.collect(value, datum);
      }
    }
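
The normalize-then-filter step at the end of map() can also be exercised on its own, which is handy when debugging why a seed url gets dropped. A minimal sketch, assuming the Nutch 1.x URLNormalizers/URLFilters APIs used above (the sample url is made up):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;

public class NormalizeFilterDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    // same scope that InjectMapper passes above
    URLNormalizers normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_INJECT);
    URLFilters filters = new URLFilters(conf);

    String url = "http://www.nutch.org/";                          // hypothetical seed url
    url = normalizers.normalize(url, URLNormalizers.SCOPE_INJECT); // conf/regex-normalize.xml
    url = filters.filter(url);                                     // returns null if any filter rejects it
    System.out.println(url == null ? "rejected" : url);
  }
}

If filter() returns null, the url is skipped, which is exactly the url = null branch in map() above.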