Nutch 2.3.1 执行 updatedb(DbUpdaterJob)时,错误的 URL 导致任务崩溃

来源:互联网 发布:人,学,立,知天命 编辑:程序博客网 时间:2024/06/08 01:59

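原因可能是:从格式错误的 HTML 中解析出了非法的外链 URL,map 阶段调用 TableUtil.reverseUrl 处理这些 URL 时抛出 MalformedURLException,导致任务失败。
解决办法:在 DbUpdateMapper.java 的 map 方法中加一个 try-catch,提前过滤掉这些非法 URL(见下面代码中的 "filter error url" 部分):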

 55  @Override 56  public void map(String key, WebPage page, Context context) 57      throws IOException, InterruptedException { 58    if (Mark.GENERATE_MARK.checkMark(page) == null) { 59      if (LOG.isDebugEnabled()) { 60        LOG.debug("Skipping " + TableUtil.unreverseUrl(key) 61            + "; not generated yet"); 62      } 63      return; 64    } 65 66    String url = TableUtil.unreverseUrl(key); 67 68    scoreData.clear(); 69    Map<CharSequence, CharSequence> outlinks = page.getOutlinks(); 70    if (outlinks != null) { 71      for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) { 72        int depth = Integer.MAX_VALUE; 73        CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE); 74        if (depthUtf8 != null) 75          depth = Integer.parseInt(depthUtf8.toString());           // add here to filter error url 76        try { 77            String testUrl = TableUtil.reverseUrl(e.getKey().toString()); 78        } catch (MalformedURLException ex) { 79            LOG.warn("dbupdate,error url:" + e.getKey().toString()); 80            continue; 81        } 82        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue() 83            .toString(), depth)); 84      } 85    } 86 87    // TODO: Outlink filtering (i.e. "only keep the first n outlinks") 88    try { 89      scoringFilters.distributeScoreToOutlinks(url, page, scoreData, 90          (outlinks == null ? 
0 : outlinks.size())); 91    } catch (ScoringFilterException e) { 92      LOG.warn("Distributing score failed for URL: " + key + " exception:" 93          + StringUtils.stringifyException(e)); 94    } 95 96    urlWithScore.setUrl(key); 97    urlWithScore.setScore(Float.MAX_VALUE); 98    pageWritable.setWebPage(page); 99    nutchWritable.set(pageWritable);100    context.write(urlWithScore, nutchWritable);101102    for (ScoreDatum scoreDatum : scoreData) {103      String reversedOut = TableUtil.reverseUrl(scoreDatum.getUrl());104      scoreDatum.setUrl(url);105      urlWithScore.setUrl(reversedOut);106      urlWithScore.setScore(scoreDatum.getScore());107      nutchWritable.set(scoreDatum);108      context.write(urlWithScore, nutchWritable);109    }110  }
0 0
原创粉丝点击