solr 导入csv文件

来源:互联网 发布:aj2 wing it 编辑:程序博客网 时间:2024/05/16 18:02

今天想用DIH导入csv文件,于是数据源用FileDataSource+自定义转换器粗略实现了一下

package com.besttone.transformer;import java.util.Map;public class CsvTransformer {// 参考资料 http://wiki.apache.org/solr/DIHCustomTransformerpublic Object transformRow(Map<String, Object> row) {// TODO Auto-generated method stubObject rawLine = row.get("rawLine");if (rawLine != null) {String[] props = rawLine.toString().split(",");row.put("id", props[0]);row.put("name", props[1]);}return row;}}


发现很多问题,比如字段里面本来就有逗号符号等等,用这个粗略的转换器肯定是无法实现了,于是继续找文档发现solr自带了一个CSVRequestHandler,不过默认在solrconfig.xml里是没有配这个requestHandler的,于是先配一个:

  <!-- CSV update handler, loaded on demand -->  <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy">  </requestHandler> 


这样在浏览器里输入URL:http://localhost:8088/solr-src/csv-core/update/csv?stream.file=D:/dpimport/test_data2.csv&stream.contentType=text/plain;charset=utf-8&fieldnames=id,name&commit=true

就能将csv文件导入进去了,我的csv文件有两个字段一个id,一个name,做了点测试数据如:

1,aaa

2,bbb

...

连续行导入当然没有问题,当中间有空行时,office 的 csv 文件会变成:

1,aaa

,

2,bbb

也就是空行会有一个逗号,然后导入的时候恰巧ID字段的FiledSchema是唯一不能为空的,会导致创建索引文件的时候出异常,于是我将CSVRequestHandler源码扩展了一下,我增加了一个参数emptyLine,然后在load方法中加了一个逻辑:

        //是否支持空数据行        if(emptyLine)        {        int totalLength=0;        for (int i = 0; i < vals.length; i++) {        totalLength += vals[i].length();        }        if (totalLength==0)        {        continue;        }        }


修改后的CSVRequestHandler如下:

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler;

import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.update.*;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.internal.csv.CSVStrategy;
import org.apache.solr.internal.csv.CSVParser;
import org.apache.commons.io.IOUtils;

import java.util.regex.Pattern;
import java.util.List;
import java.io.*;

/**
 * Solr 3.x CSVRequestHandler, extended with an {@code emptyLine} request
 * parameter: when {@code emptyLine=true}, data rows whose values are all
 * zero-length (e.g. the lone "," Excel writes for a blank row) are skipped
 * instead of failing on a required unique-key field.
 *
 * @version $Id: CSVRequestHandler.java 1298169 2012-03-07 22:27:54Z uschindler $
 */
public class CSVRequestHandler extends ContentStreamHandlerBase {

  @Override
  protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    return new SingleThreadedCSVLoader(req, processor);
  }

  //////////////////////// SolrInfoMBeans methods //////////////////////
  @Override
  public String getDescription() {
    return "Add/Update multiple documents with CSV formatted rows";
  }

  @Override
  public String getVersion() {
    return "$Revision: 1298169 $";
  }

  @Override
  public String getSourceId() {
    return "$Id: CSVRequestHandler.java 1298169 2012-03-07 22:27:54Z uschindler $";
  }

  @Override
  public String getSource() {
    return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/handler/CSVRequestHandler.java $";
  }
}

/**
 * Parses CSV content streams and feeds each row to the update processor
 * chain as a SolrInputDocument. Request parameters (separator, escape,
 * encapsulator, field mapping/trimming/splitting, header handling) are
 * read once in the constructor.
 */
abstract class CSVLoader extends ContentStreamLoader {
  public static final String SEPARATOR="separator";
  public static final String FIELDNAMES="fieldnames";
  public static final String HEADER="header";
  public static final String SKIP="skip";
  public static final String SKIPLINES="skipLines";
  public static final String MAP="map";
  public static final String TRIM="trim";
  public static final String EMPTY="keepEmpty";
  public static final String SPLIT="split";
  public static final String ENCAPSULATOR="encapsulator";
  public static final String ESCAPE="escape";
  public static final String OVERWRITE="overwrite";
  public static final String EMPTYLINE="emptyLine"; // extension: tolerate all-empty data rows

  private static Pattern colonSplit = Pattern.compile(":");
  private static Pattern commaSplit = Pattern.compile(",");

  final IndexSchema schema;
  final SolrParams params;
  final CSVStrategy strategy;
  final UpdateRequestProcessor processor;

  String[] fieldnames;
  SchemaField[] fields;
  CSVLoader.FieldAdder[] adders;

  int skipLines;    // number of lines to skip at start of file
  boolean emptyLine; // extension: when true, rows whose values are all zero-length are skipped

  final AddUpdateCommand templateAdd;

  /** Add a field to a document unless it's zero length.
   * The FieldAdder hierarchy handles all the complexity of
   * further transforming or splitting field values to keep the
   * main logic loop clean.  All implementations of add() must be
   * MT-safe!
   */
  private class FieldAdder {
    void add(SolrInputDocument doc, int line, int column, String val) {
      if (val.length() > 0) {
        doc.addField(fields[column].getName(),val,1.0f);
      }
    }
  }

  /** add zero length fields */
  private class FieldAdderEmpty extends CSVLoader.FieldAdder {
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      doc.addField(fields[column].getName(),val,1.0f);
    }
  }

  /** trim fields */
  private class FieldTrimmer extends CSVLoader.FieldAdder {
    private final CSVLoader.FieldAdder base;
    FieldTrimmer(CSVLoader.FieldAdder base) { this.base=base; }
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      base.add(doc, line, column, val.trim());
    }
  }

  /** map a single value.
   * for just a couple of mappings, this is probably faster than
   * using a HashMap.
   */
 private class FieldMapperSingle extends CSVLoader.FieldAdder {
   private final String from;
   private final String to;
   private final CSVLoader.FieldAdder base;
   FieldMapperSingle(String from, String to, CSVLoader.FieldAdder base) {
     this.from=from;
     this.to=to;
     this.base=base;
   }
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      if (from.equals(val)) val=to;
      base.add(doc,line,column,val);
    }
 }

  /** Split a single value into multiple values based on
   * a CSVStrategy.
   */
  private class FieldSplitter extends CSVLoader.FieldAdder {
    private final CSVStrategy strategy;
    private final CSVLoader.FieldAdder base;
    FieldSplitter(CSVStrategy strategy, CSVLoader.FieldAdder base) {
      this.strategy = strategy;
      this.base = base;
    }
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      CSVParser parser = new CSVParser(new StringReader(val), strategy);
      try {
        String[] vals = parser.getLine();
        if (vals!=null) {
          for (String v: vals) base.add(doc,line,column,v);
        } else {
          base.add(doc,line,column,val);
        }
      } catch (IOException e) {
        throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,e);
      }
    }
  }

  String errHeader="CSVLoader:";

  // Reads all per-request CSV parameters and builds the parsing strategy
  // and the AddUpdateCommand template reused for every row.
  CSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    this.processor = processor;
    this.params = req.getParams();
    schema = req.getSchema();

    templateAdd = new AddUpdateCommand();
    templateAdd.allowDups=false;
    templateAdd.overwriteCommitted=true;
    templateAdd.overwritePending=true;

    if (params.getBool(OVERWRITE,true)) {
      templateAdd.allowDups=false;
      templateAdd.overwriteCommitted=true;
      templateAdd.overwritePending=true;
    } else {
      templateAdd.allowDups=true;
      templateAdd.overwriteCommitted=false;
      templateAdd.overwritePending=false;
    }
    templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

    strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
    String sep = params.get(SEPARATOR);
    if (sep!=null) {
      if (sep.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid separator:'"+sep+"'");
      strategy.setDelimiter(sep.charAt(0));
    }

    String encapsulator = params.get(ENCAPSULATOR);
    if (encapsulator!=null) {
      if (encapsulator.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+encapsulator+"'");
    }

    String escape = params.get(ESCAPE);
    if (escape!=null) {
      if (escape.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
    }

    // if only encapsulator or escape is set, disable the other escaping mechanism
    if (encapsulator == null && escape != null) {
      strategy.setEncapsulator( CSVStrategy.ENCAPSULATOR_DISABLED);
      strategy.setEscape(escape.charAt(0));
    } else {
      if (encapsulator != null) {
        strategy.setEncapsulator(encapsulator.charAt(0));
      }
      if (escape != null) {
        char ch = escape.charAt(0);
        strategy.setEscape(ch);
        if (ch == '\\') {
          // If the escape is the standard backslash, then also enable
          // unicode escapes (it's harmless since 'u' would not otherwise
          // be escaped.
          strategy.setUnicodeEscapeInterpretation(true);
        }
      }
    }

    String fn = params.get(FIELDNAMES);
    fieldnames = fn != null ? commaSplit.split(fn,-1) : null;

    Boolean hasHeader = params.getBool(HEADER);

    skipLines = params.getInt(SKIPLINES,0);
    emptyLine = params.getBool(EMPTYLINE, false); // extension

    if (fieldnames==null) {
      if (null == hasHeader) {
        // assume the file has the headers if they aren't supplied in the args
        hasHeader=true;
      } else if (!hasHeader) {
        throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"CSVLoader: must specify fieldnames=<fields>* or header=true");
      }
    } else {
      // if the fieldnames were supplied and the file has a header, we need to
      // skip over that header.
      if (hasHeader!=null && hasHeader) skipLines++;

      prepareFields();
    }
  }

  /** create the FieldAdders that control how each field  is indexed */
  void prepareFields() {
    // Possible future optimization: for really rapid incremental indexing
    // from a POST, one could cache all of this setup info based on the params.
    // The link from FieldAdder to this would need to be severed for that to happen.
    fields = new SchemaField[fieldnames.length];
    adders = new CSVLoader.FieldAdder[fieldnames.length];

    String skipStr = params.get(SKIP);
    List<String> skipFields = skipStr==null ? null : StrUtils.splitSmart(skipStr,',');

    CSVLoader.FieldAdder adder = new CSVLoader.FieldAdder();
    CSVLoader.FieldAdder adderKeepEmpty = new CSVLoader.FieldAdderEmpty();

    for (int i=0; i<fields.length; i++) {
      String fname = fieldnames[i];
      // to skip a field, leave the entries in fields and addrs null
      if (fname.length()==0 || (skipFields!=null && skipFields.contains(fname))) continue;

      fields[i] = schema.getField(fname);
      boolean keepEmpty = params.getFieldBool(fname,EMPTY,false);
      adders[i] = keepEmpty ? adderKeepEmpty : adder;

      // Order that operations are applied: split -> trim -> map -> add
      // so create in reverse order.
      // Creation of FieldAdders could be optimized and shared among fields
      String[] fmap = params.getFieldParams(fname,MAP);
      if (fmap!=null) {
        for (String mapRule : fmap) {
          String[] mapArgs = colonSplit.split(mapRule,-1);
          if (mapArgs.length!=2)
            throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Map rules must be of the form 'from:to' ,got '"+mapRule+"'");
          adders[i] = new CSVLoader.FieldMapperSingle(mapArgs[0], mapArgs[1], adders[i]);
        }
      }

      if (params.getFieldBool(fname,TRIM,false)) {
        adders[i] = new CSVLoader.FieldTrimmer(adders[i]);
      }

      if (params.getFieldBool(fname,SPLIT,false)) {
        String sepStr = params.getFieldParam(fname,SEPARATOR);
        char fsep = sepStr==null || sepStr.length()==0 ? ',' : sepStr.charAt(0);
        String encStr = params.getFieldParam(fname,ENCAPSULATOR);
        char fenc = encStr==null || encStr.length()==0 ? (char)-2 : encStr.charAt(0);
        String escStr = params.getFieldParam(fname,ESCAPE);
        char fesc = escStr==null || escStr.length()==0 ? CSVStrategy.ESCAPE_DISABLED : escStr.charAt(0);

        CSVStrategy fstrat = new CSVStrategy(fsep,fenc,CSVStrategy.COMMENTS_DISABLED,fesc, false, false, false, false);
        adders[i] = new CSVLoader.FieldSplitter(fstrat, adders[i]);
      }
    }
  }

  // Builds a BAD_REQUEST error message including the offending line number and values.
  private void input_err(String msg, String[] line, int lineno) {
    StringBuilder sb = new StringBuilder();
    sb.append(errHeader).append(", line=").append(lineno).append(",").append(msg).append("\n\tvalues={");
    for (String val: line) {
      sb.append("'").append(val).append("',"); }
    sb.append('}');
    throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,sb.toString());
  }

  // As above, but preserves the underlying cause and tolerates a null value array.
  private void input_err(String msg, String[] lines, int lineNo, Throwable e) {
    StringBuilder sb = new StringBuilder();
    sb.append(errHeader).append(", line=").append(lineNo).append(",").append(msg).append("\n\tvalues={");
    if (lines != null) {
      for (String val : lines) {
        sb.append("'").append(val).append("',");
      }
    } else {
      sb.append("NO LINES AVAILABLE");
    }
    sb.append('}');
    throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,sb.toString(), e);
  }

  /** load the CSV input */
  @Override
  public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream) throws IOException {
    errHeader = "CSVLoader: input=" + stream.getSourceInfo();
    Reader reader = null;

    try {
      reader = stream.getReader();
      if (skipLines>0) {
        if (!(reader instanceof BufferedReader)) {
          reader = new BufferedReader(reader);
        }
        BufferedReader r = (BufferedReader)reader;
        for (int i=0; i<skipLines; i++) {
          r.readLine();
        }
      }

      CSVParser parser = new CSVParser(reader, strategy);

      // parse the fieldnames from the header of the file
      if (fieldnames==null) {
        fieldnames = parser.getLine();
        if (fieldnames==null) {
          throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Expected fieldnames in CSV input");
        }
        prepareFields();
      }

      // read the rest of the CSV file
      for(;;) {
        int line = parser.getLineNumber();  // for error reporting in MT mode
        String[] vals = null;
        try {
          vals = parser.getLine();
        } catch (IOException e) {
          //Catch the exception and rethrow it with more line information
         input_err("can't read line: " + line, null, line, e);
        }
        if (vals==null) break;

        // Extension: when emptyLine=true, silently skip rows whose values
        // are all zero-length (blank spreadsheet rows exported as ",") so
        // a required unique-key field does not fail the whole import.
        if(emptyLine)
        {
        int totalLength=0;
        for (int i = 0; i < vals.length; i++) {
        totalLength += vals[i].length();
        }
        if (totalLength==0)
        {
        continue;
        }
        }

        if (vals.length != fields.length) {
          input_err("expected "+fields.length+" values but got "+vals.length, vals, line);
        }

        addDoc(line,vals);
      }
    } finally{
      if (reader != null) {
        IOUtils.closeQuietly(reader);
      }
    }
  }

  /** called for each line of values (document) */
  abstract void addDoc(int line, String[] vals) throws IOException;

  /** this must be MT safe... may be called concurrently from multiple threads. */
  void doAdd(int line, String[] vals, SolrInputDocument doc, AddUpdateCommand template) throws IOException {
    // the line number is passed simply for error reporting in MT mode.
    // first, create the lucene document
    for (int i=0; i<vals.length; i++) {
      if (fields[i]==null) continue;  // ignore this field
      String val = vals[i];
      adders[i].add(doc, line, i, val);
    }

    template.solrDoc = doc;
    processor.processAdd(template);
  }
}

/** Single-threaded loader: builds one document per CSV row and adds it immediately. */
class SingleThreadedCSVLoader extends CSVLoader {
  SingleThreadedCSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    super(req, processor);
  }

  @Override
  void addDoc(int line, String[] vals) throws IOException {
    templateAdd.indexedId = null;
    SolrInputDocument doc = new SolrInputDocument();
    doAdd(line, vals, doc, templateAdd);
  }
}

这样在上面那个请求URL的基础上再&emptyLine=true,就能避免空行出现的异常了。

以上是针对3.6版本的solr的修改。不同版本不一定可行

原创粉丝点击