nutch-2.0源码之GeneratorJob

来源：互联网 | 发布：类似金十数据的网站 | 编辑：程序博客网 | 时间：2024/05/18 01:52

/******************************************************************************* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at *  *     http://www.apache.org/licenses/LICENSE-2.0 *  * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/package org.apache.nutch.crawl;import java.io.IOException;import java.net.MalformedURLException;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;import org.apache.nutch.net.URLFilterException;import org.apache.nutch.net.URLFilters;import org.apache.nutch.net.URLNormalizers;import org.apache.nutch.scoring.ScoringFilterException;import org.apache.nutch.scoring.ScoringFilters;import org.apache.nutch.storage.Mark;import org.apache.nutch.storage.WebPage;import org.apache.nutch.util.TableUtil;import org.apache.gora.mapreduce.GoraMapper;public class GeneratorMapperextends GoraMapper<String, WebPage, SelectorEntry, WebPage> {  private URLFilters filters;  private URLNormalizers normalizers;  private boolean filter;  private boolean normalise;  private FetchSchedule schedule;  private ScoringFilters scoringFilters;  private long curTime;  @Override  public void map(String reversedUrl, WebPage page,      Context context) throws IOException, InterruptedException {    String url = 
TableUtil.unreverseUrl(reversedUrl);        //跳过已经generated的url    if (Mark.GENERATE_MARK.checkMark(page) != null) {      if (GeneratorJob.LOG.isDebugEnabled()) {        GeneratorJob.LOG.debug("Skipping " + url + "; already generated");      }      return;    }    // If filtering is on don't generate URLs that don't pass URLFilters    // 如果过滤开启，则不生成没有通过过滤的url    try {      //规格化url      if (normalise) {        url = normalizers.normalize(url, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);      }      if (filter && filters.filter(url) == null)        return;    } catch (URLFilterException e) {      if (GeneratorJob.LOG.isWarnEnabled()) {        GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");        return;      }    } catch (MalformedURLException e) {      if (GeneratorJob.LOG.isWarnEnabled()) {        GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() +")");        return;      }    }    // check fetch schedule    if (!schedule.shouldFetch(url, page, curTime)) {      if (GeneratorJob.LOG.isDebugEnabled()) {        GeneratorJob.LOG.debug("-shouldFetch rejected '" + url + "', fetchTime=" +            page.getFetchTime() + ", curTime=" + curTime);      }      return;    }    float score = page.getScore();    try {      score = scoringFilters.generatorSortValue(url, page, score);    } catch (ScoringFilterException e) {      //ignore    }    SelectorEntry entry = new SelectorEntry(url, score);    System.out.println(entry.score + entry.url);    context.write(entry, page);  }  @Override  public void setup(Context context) {    Configuration conf = context.getConfiguration();    filters = new URLFilters(conf);    curTime =      conf.getLong(GeneratorJob.GENERATOR_CUR_TIME, System.currentTimeMillis());    normalizers =      new URLNormalizers(conf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);    filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true);    normalise = 
conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true);    schedule = FetchScheduleFactory.getFetchSchedule(conf);    scoringFilters = new ScoringFilters(conf);  }  }

/******************************************************************************* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at *  *     http://www.apache.org/licenses/LICENSE-2.0 *  * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/package org.apache.nutch.crawl;import java.io.IOException;import java.net.MalformedURLException;import java.util.HashMap;import java.util.Map;import org.apache.avro.util.Utf8;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;import org.apache.nutch.fetcher.FetcherJob.FetcherMapper;import org.apache.nutch.storage.Mark;import org.apache.nutch.storage.WebPage;import org.apache.nutch.util.TableUtil;import org.apache.nutch.util.URLUtil;import org.apache.gora.mapreduce.GoraReducer;/** Reduce class for generate * * The #reduce() method write a random integer to all generated URLs. This random * number is then used by {@link FetcherMapper}. 
* */public class GeneratorReducerextends GoraReducer<SelectorEntry, WebPage, String, WebPage> {  private long limit;  private long maxCount;  private long count = 0;//记录表中的行数  private boolean byDomain = false;  private Map<String, Integer> hostCountMap = new HashMap<String, Integer>();  private Utf8 batchId;  @Override  protected void reduce(SelectorEntry key, Iterable<WebPage> values,      Context context) throws IOException, InterruptedException {//webpage中有几条记录，就循环几次    for (WebPage page : values) {      if (count >= limit) {        return;      }      if (maxCount > 0) {        String hostordomain;        if (byDomain) {          hostordomain = URLUtil.getDomainName(key.url);        } else {          hostordomain = URLUtil.getHost(key.url);        }        Integer hostCount = hostCountMap.get(hostordomain);        if (hostCount == null) {          hostCountMap.put(hostordomain, 0);          hostCount = 0;        }        if (hostCount >= maxCount) {          return;        }        hostCountMap.put(hostordomain, hostCount + 1);      }      //设置"mk:_gnmrk_"      Mark.GENERATE_MARK.putMark(page, batchId);      try {    //page中存储了当前行的所有信息        context.write(TableUtil.reverseUrl(key.url), page);      } catch (MalformedURLException e) {    context.getCounter("Generator", "MALFORMED_URL").increment(1);        continue;      }      context.getCounter("Generator", "GENERATE_MARK").increment(1);      count++;    }  }  @Override  protected void setup(Context context)      throws IOException, InterruptedException {    Configuration conf = context.getConfiguration();    long totalLimit = conf.getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE);    if (totalLimit == Long.MAX_VALUE) {      limit = Long.MAX_VALUE;    } else {      limit = totalLimit / context.getNumReduceTasks();    }    maxCount = conf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2);    batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID));    String countMode =      
conf.get(GeneratorJob.GENERATOR_COUNT_MODE, GeneratorJob.GENERATOR_COUNT_VALUE_HOST);    if (countMode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN)) {      byDomain = true;    }  }}