nutch2.3.1 构建solr6索引时meta_keywords longer than the max length 32766

来源:互联网 发布:网络错误678怎么解决 编辑:程序博客网 时间:2024/05/17 23:51

解决办法有3种:
1. 在 managed-schema 中将 meta_* 字段设置为 indexed="false"
2. 在 managed-schema 中将 meta_* 字段的 type 设置为任意一种 class 为 solr.TextField 的字段类型
3是修改nutch代码MetaTagsParser.java如下

  private void addIndexedMetatags(Map<CharSequence, ByteBuffer> metadata,      String metatag, String value) {      //add here      if(value.getBytes("utf-8").length > 32765) return;    String lcMetatag = metatag.toLowerCase(Locale.ROOT);    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {      if (LOG.isDebugEnabled()) {        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);      }      metadata.put(new Utf8(PARSE_META_PREFIX + lcMetatag),          ByteBuffer.wrap(value.getBytes()));    }  }

补充:如果数据库中已经存在过长的数据,还需要在建立索引(index)时进行过滤,修改文件 SolrIndexWriter.java 如下:

/**
 * Converts a {@code NutchDocument} into a {@code SolrInputDocument} and queues it for sending.
 *
 * <p>For every field value:
 * <ul>
 *   <li>values of fields whose name starts with {@code "meta_"} and whose UTF-8 encoding is
 *       longer than 32765 bytes are skipped with a warning;</li>
 *   <li>values of the {@code content} and {@code title} fields are passed through
 *       {@code SolrUtils.stripNonCharCodepoints};</li>
 *   <li>the value is added under the mapped key and, when the copy key differs from the
 *       source key, under the copy key as well.</li>
 * </ul>
 *
 * <p>The queued documents are sent to Solr and the queue is cleared once it holds at least
 * {@code batchSize} documents.
 *
 * @param doc the document to index
 * @throws IOException if key mapping fails or if Solr rejects the batch (the
 *         {@code SolrServerException} is wrapped as the cause)
 */
@Override
public void write(NutchDocument doc) throws IOException {
  final SolrInputDocument inputDoc = new SolrInputDocument();
  for (final Entry<String, List<String>> e : doc) {
    final String key = e.getKey();
    for (final String val : e.getValue()) {
      // Drop over-long meta_* values up front; nothing else is computed for them.
      if (key.startsWith("meta_")
          && val.getBytes(java.nio.charset.StandardCharsets.UTF_8).length > 32765) {
        LOG.warn("trim too long value for key:" + key);
        continue;
      }

      Object val2 = val;
      if (key.equals("content") || key.equals("title")) {
        val2 = SolrUtils.stripNonCharCodepoints(val);
      }

      inputDoc.addField(solrMapping.mapKey(key), val2);
      final String sCopy = solrMapping.mapCopyKey(key);
      // Compare by content, not by reference: an equal but distinct String instance must
      // not cause the same field to be added twice.
      if (!key.equals(sCopy)) {
        inputDoc.addField(sCopy, val2);
      }
    }
  }

  inputDoc.setDocumentBoost(doc.getScore());
  inputDocs.add(inputDoc);
  documentCount++;

  if (inputDocs.size() >= batchSize) {
    try {
      LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents");
      solr.add(inputDocs);
    } catch (final SolrServerException e) {
      throw new IOException(e);
    }
    inputDocs.clear();
  }
}
0 0
原创粉丝点击