Nutch source code analysis --- 6


Nutch source code analysis: invertlinks

The command "bin/nutch invertlinks crawl/linkdb -dir crawl/segments" ultimately calls the main function of org.apache.nutch.crawl.LinkDb.

LinkDb::main

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args);
    System.exit(res);
  }

ToolRunner's run function eventually calls LinkDb's run function.
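
ToolRunner.run is Hadoop's standard entry helper. Conceptually (a simplified sketch, not the actual Hadoop source) it parses the generic Hadoop options, hands the Configuration to the tool, and then delegates to the tool's own run method:

  // Roughly what ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args) does:
  Configuration conf = NutchConfiguration.create();
  LinkDb tool = new LinkDb();
  tool.setConf(conf);            // the Tool receives the configuration
  int res = tool.run(args);      // args, minus any generic Hadoop options
  System.exit(res);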

LinkDb::run

  public int run(String[] args) throws Exception {
    invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
    return 0;
  }

  public void invert(Path linkDb, Path[] segments, boolean normalize,
      boolean filter, boolean force) throws IOException {
    JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
    Path lock = new Path(linkDb, LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);
    Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
    for (int i = 0; i < segments.length; i++) {
      FileInputFormat.addInputPath(job, new Path(segments[i],
          ParseData.DIR_NAME));
    }
    JobClient.runJob(job);
    if (fs.exists(currentLinkDb)) {
      Path newLinkDb = FileOutputFormat.getOutputPath(job);
      job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
      FileInputFormat.addInputPath(job, currentLinkDb);
      FileInputFormat.addInputPath(job, newLinkDb);
      JobClient.runJob(job);
      fs.delete(newLinkDb, true);
    }
    LinkDb.install(job, linkDb);
  }

  private static JobConf createJob(Configuration config, Path linkDb,
      boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    if (normalize || filter) {
      FileSystem fs = FileSystem.get(config);
      if (!fs.exists(linkDb)) {
        job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
        job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
      }
    }
    job.setReducerClass(LinkDbMerger.class);
    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);
    return job;
  }

  public static JobConf createMergeJob(Configuration config, Path linkDb,
      boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-merge-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf job = new NutchJob(config);
    job.setJobName("linkdb merge " + linkDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LinkDbFilter.class);
    job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
    job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
    job.setReducerClass(LinkDbMerger.class);
    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);
    return job;
  }

The invert function creates two Hadoop Jobs. The first Job takes the parse_data directories under crawl/segments/*/ as input, uses LinkDb's map function as the Mapper, and uses LinkDbMerger as both the Combiner and the Reducer. Its purpose is to invert the URL links recorded in parse_data and write the result to the temporary directory newLinkDb. "Inversion" means the following: if parse_data records the links A(key)->B(value), A(key)->C(value) and B(key)->C(value), then after inversion the output is B(key)->A(value), C(key)->A(value) and C(key)->B(value). In other words, a forward link records "which URLs does this URL link to", while an inverted link records "which URLs link to this URL".
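
To make the inversion concrete, here is a small standalone sketch (plain Java, not part of Nutch and without MapReduce) that performs the same transformation on an in-memory list of links:

  import java.util.*;

  public class InvertLinksDemo {
    public static void main(String[] args) {
      // Forward links: from -> to (what parse_data records).
      String[][] forward = { { "A", "B" }, { "A", "C" }, { "B", "C" } };

      // Inverted links: to -> set of froms (what the LinkDb job computes).
      Map<String, Set<String>> inverted = new TreeMap<>();
      for (String[] link : forward) {
        inverted.computeIfAbsent(link[1], k -> new TreeSet<>()).add(link[0]);
      }

      // Prints: B -> [A]  and  C -> [A, B]
      inverted.forEach((to, froms) -> System.out.println(to + " -> " + froms));
    }
  }
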
The second Job in invert is created by LinkDbMerger's createMergeJob function. Its purpose is to merge the temporary directory newLinkDb produced by the first Job with crawl/linkdb/current, i.e. the previously inverted link data, and to write the merged result to another temporary directory.
Finally, LinkDb's install function updates the database. Like the install functions analyzed in the previous chapters, it renames the temporary directory produced by the second Job to current, renames the original current directory to old, and deletes the previous old directory.
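
Based on that description, LinkDb.install behaves roughly like the sketch below (simplified, not the exact Nutch source; removing the lock file taken in invert at the end is an assumption here):

  // Simplified sketch of LinkDb.install: promote the newly written directory
  // to "current" and keep the previous version around as "old".
  public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = FileOutputFormat.getOutputPath(job);     // output of the last Job
    FileSystem fs = FileSystem.get(job);
    Path current = new Path(linkDb, CURRENT_NAME);
    Path old = new Path(linkDb, "old");
    if (fs.exists(old)) fs.delete(old, true);                 // drop the previous "old"
    if (fs.exists(current)) fs.rename(current, old);          // current becomes old
    fs.rename(newLinkDb, current);                            // new data becomes current
    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME)); // release the lock (assumed)
  }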

Let's start with the first Job.

LinkDb::map

  public void map(Text key, ParseData parseData,
      OutputCollector<Text, Inlinks> output, Reporter reporter)
      throws IOException {
    String fromUrl = key.toString();
    String fromHost = getHost(fromUrl);
    fromUrl = urlNormalizers
        .normalize(fromUrl, URLNormalizers.SCOPE_LINKDB);
    fromUrl = urlFilters.filter(fromUrl);
    Outlink[] outlinks = parseData.getOutlinks();
    Inlinks inlinks = new Inlinks();
    for (int i = 0; i < outlinks.length; i++) {
      Outlink outlink = outlinks[i];
      String toUrl = outlink.getToUrl();
      toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB);
      toUrl = urlFilters.filter(toUrl);
      inlinks.clear();
      String anchor = outlink.getAnchor();
      if (anchor.length() > maxAnchorLength) {
        anchor = anchor.substring(0, maxAnchorLength);
      }
      inlinks.add(new Inlink(fromUrl, anchor));
      output.collect(new Text(toUrl), inlinks);
    }
  }

The map function takes the links recorded in the parse_data directory, i.e. all outlinks extracted from a given URL fromUrl, normalizes and filters both fromUrl and each outlink, and finally emits each outlink as the key and fromUrl (wrapped in an Inlinks object together with its anchor text) as the value.

LinkDbMerger::reduce

  public void reduce(Text key, Iterator<Inlinks> values,
      OutputCollector<Text, Inlinks> output, Reporter reporter)
      throws IOException {
    Inlinks result = new Inlinks();
    while (values.hasNext()) {
      Inlinks inlinks = values.next();
      int end = Math.min(maxInlinks - result.size(), inlinks.size());
      Iterator<Inlink> it = inlinks.iterator();
      int i = 0;
      while (it.hasNext() && i++ < end) {
        result.add(it.next());
      }
    }
    if (result.size() == 0)
      return;
    output.collect(key, result);
  }

maxInlinks is the maximum number of links that may point to a single URL; the default is 10000. The reduce function simply gathers the links pointing to the same URL into one Inlinks object (up to that limit) and writes it to the output file.
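
maxInlinks comes from the job configuration: in Nutch the property is db.max.inlinks, with a default of 10000. LinkDbMerger reads it roughly as in the following sketch of its configure method:

  // Sketch: where maxInlinks comes from (property db.max.inlinks, default 10000).
  public void configure(JobConf job) {
    maxInlinks = job.getInt("db.max.inlinks", 10000);
  }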

The second Job's Mapper is LinkDbFilter, which re-applies URL normalization and filtering (as configured in createMergeJob), and its Reducer is the same LinkDbMerger as in the first Job. It merges the output of the first Job with the existing data under crawl/linkdb/current, so we will not go through it again here.
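
For completeness, LinkDbFilter's map conceptually does something like the sketch below (simplified and not the exact Nutch source; the field names normalizers, filters, normalize and filter are assumptions): it re-normalizes and re-filters the target URL (the key) as well as each inlink's source URL, and drops anything that gets filtered out.

  // Rough sketch of LinkDbFilter-style map logic (simplified, field names assumed):
  public void map(Text key, Inlinks value,
      OutputCollector<Text, Inlinks> output, Reporter reporter)
      throws IOException {
    String url = key.toString();
    if (normalize) url = normalizers.normalize(url, URLNormalizers.SCOPE_LINKDB);
    if (filter && url != null) url = filters.filter(url);
    if (url == null) return;                       // target URL rejected: drop the record

    Inlinks result = new Inlinks();
    Iterator<Inlink> it = value.iterator();
    while (it.hasNext()) {
      Inlink inlink = it.next();
      String fromUrl = inlink.getFromUrl();
      if (normalize) fromUrl = normalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB);
      if (filter && fromUrl != null) fromUrl = filters.filter(fromUrl);
      if (fromUrl != null) result.add(new Inlink(fromUrl, inlink.getAnchor()));
    }
    if (result.size() > 0) output.collect(new Text(url), result);
  }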
