/******************************************************************************* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/package org.apache.nutch.crawl;import java.io.IOException;import java.net.MalformedURLException;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;import org.apache.nutch.net.URLFilterException;import org.apache.nutch.net.URLFilters;import org.apache.nutch.net.URLNormalizers;import org.apache.nutch.scoring.ScoringFilterException;import org.apache.nutch.scoring.ScoringFilters;import org.apache.nutch.storage.Mark;import org.apache.nutch.storage.WebPage;import org.apache.nutch.util.TableUtil;import org.apache.gora.mapreduce.GoraMapper;public class GeneratorMapperextends GoraMapper<String, WebPage, SelectorEntry, WebPage> { private URLFilters filters; private URLNormalizers normalizers; private boolean filter; private boolean normalise; private FetchSchedule schedule; private ScoringFilters scoringFilters; private long curTime; @Override public void map(String reversedUrl, WebPage page, Context context) throws IOException, InterruptedException { String url = TableUtil.unreverseUrl(reversedUrl); //跳过已经generated的url if (Mark.GENERATE_MARK.checkMark(page) != null) { if (GeneratorJob.LOG.isDebugEnabled()) { GeneratorJob.LOG.debug("Skipping " + url + "; already generated"); } return; } // If filtering is on don't generate URLs that don't pass URLFilters // 如果过滤开启,则不生成没有通过过滤的url try { //规格化url if (normalise) { url = normalizers.normalize(url, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); } if (filter && filters.filter(url) == null) return; } catch (URLFilterException e) { if (GeneratorJob.LOG.isWarnEnabled()) { GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")"); return; } } catch (MalformedURLException e) { if (GeneratorJob.LOG.isWarnEnabled()) { GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() +")"); return; } } // check fetch schedule if (!schedule.shouldFetch(url, page, curTime)) { if (GeneratorJob.LOG.isDebugEnabled()) { GeneratorJob.LOG.debug("-shouldFetch rejected '" + url + "', fetchTime=" + page.getFetchTime() + ", curTime=" + curTime); } return; } float score = page.getScore(); try { score = scoringFilters.generatorSortValue(url, page, score); } catch (ScoringFilterException e) { //ignore } SelectorEntry entry = new SelectorEntry(url, score); System.out.println(entry.score + entry.url); context.write(entry, page); } @Override public void setup(Context context) { Configuration conf = context.getConfiguration(); filters = new URLFilters(conf); curTime = conf.getLong(GeneratorJob.GENERATOR_CUR_TIME, System.currentTimeMillis()); normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true); normalise = conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true); schedule = FetchScheduleFactory.getFetchSchedule(conf); scoringFilters = new ScoringFilters(conf); } }
/******************************************************************************* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/package org.apache.nutch.crawl;import java.io.IOException;import java.net.MalformedURLException;import java.util.HashMap;import java.util.Map;import org.apache.avro.util.Utf8;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;import org.apache.nutch.fetcher.FetcherJob.FetcherMapper;import org.apache.nutch.storage.Mark;import org.apache.nutch.storage.WebPage;import org.apache.nutch.util.TableUtil;import org.apache.nutch.util.URLUtil;import org.apache.gora.mapreduce.GoraReducer;/** Reduce class for generate * * The #reduce() method write a random integer to all generated URLs. This random * number is then used by {@link FetcherMapper}. * */public class GeneratorReducerextends GoraReducer<SelectorEntry, WebPage, String, WebPage> { private long limit; private long maxCount; private long count = 0;//记录表中的行数 private boolean byDomain = false; private Map<String, Integer> hostCountMap = new HashMap<String, Integer>(); private Utf8 batchId; @Override protected void reduce(SelectorEntry key, Iterable<WebPage> values, Context context) throws IOException, InterruptedException {//webpage中有几条记录,就循环几次 for (WebPage page : values) { if (count >= limit) { return; } if (maxCount > 0) { String hostordomain; if (byDomain) { hostordomain = URLUtil.getDomainName(key.url); } else { hostordomain = URLUtil.getHost(key.url); } Integer hostCount = hostCountMap.get(hostordomain); if (hostCount == null) { hostCountMap.put(hostordomain, 0); hostCount = 0; } if (hostCount >= maxCount) { return; } hostCountMap.put(hostordomain, hostCount + 1); } //设置"mk:_gnmrk_" Mark.GENERATE_MARK.putMark(page, batchId); try { //page中存储了当前行的所有信息 context.write(TableUtil.reverseUrl(key.url), page); } catch (MalformedURLException e) { context.getCounter("Generator", "MALFORMED_URL").increment(1); continue; } context.getCounter("Generator", "GENERATE_MARK").increment(1); count++; } } @Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); long totalLimit = conf.getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE); if (totalLimit == Long.MAX_VALUE) { limit = Long.MAX_VALUE; } else { limit = totalLimit / context.getNumReduceTasks(); } maxCount = conf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2); batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID)); String countMode = conf.get(GeneratorJob.GENERATOR_COUNT_MODE, GeneratorJob.GENERATOR_COUNT_VALUE_HOST); if (countMode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN)) { byDomain = true; } }}