Using Hadoop's CombineFileInputFormat


What CombineFileInputFormat does: it packs many small files into a single InputSplit handed to one map task, so that a large number of small files does not spawn an equally large number of tasks.

CombineFileInputFormat is a newer InputFormat that merges multiple files into one split and, when doing so, also takes data locality into account. The older MultiFileInputFormat split at whole-file granularity, which could make splits very uneven: a single large file would still be handled by one map task and become a severe straggler.

CombineFileInputFormat is an abstract class, so you have to implement a subclass yourself.

1. In Hive you can set:

set mapred.max.split.size=256000000;  // maximum size of each combined split, i.e. the amount of input handed to one map

set mapred.min.split.size.per.node=256000000;  // minimum split size on one node: after files are cut by mapred.max.split.size, the leftover bytes on a node become their own split if they exceed mapred.min.split.size.per.node; otherwise they are held back and combined at the rack level

set mapred.min.split.size.per.rack=256000000;  // minimum split size within one rack (switch): the leftover fragments in a rack are combined and cut by mapred.max.split.size, and if what still remains exceeds mapred.min.split.size.per.rack it becomes its own split

Finally, the fragments left over from different racks are combined and cut by mapred.max.split.size; whatever remains at the end becomes one split regardless of its size.

set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
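Putting the four settings together, a Hive session that combines small files on the input side might look like the sketch below; the 256 MB figures are only example values to tune against your block size, and small_files_table is a hypothetical table name:

set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set mapred.max.split.size=256000000;
set mapred.min.split.size.per.node=256000000;
set mapred.min.split.size.per.rack=256000000;
select count(*) from small_files_table;   -- hypothetical table backed by many small files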

 

2. Usage in MapReduce

Define a custom class MyMultiFileInputFormat (the code below is adapted from other blog posts):

 

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class MyMultiFileInputFormat
    extends CombineFileInputFormat<MultiFileInputWritableComparable, Text> {

  public RecordReader<MultiFileInputWritableComparable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) throws IOException {
    // CombineFileLineRecordReader is our custom per-file reader. One split may cover several
    // paths, so the framework class ...lib.input.CombineFileRecordReader uses Java reflection
    // to construct a separate CombineFileLineRecordReader for each path and read its key/value
    // records; see the CombineFileRecordReader source further below.
    return new CombineFileRecordReader<MultiFileInputWritableComparable, Text>(
        (CombineFileSplit) split, context, CombineFileLineRecordReader.class);
  }
}
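Note that as written MyMultiFileInputFormat sets no maximum split size, so all input files may end up in a single split and therefore a single map. A minimal sketch of capping the split size via the protected setMaxSplitSize() (shown in the CombineFileInputFormat source further down); the subclass name and the 128 MB value are only illustrative:

// Hypothetical subclass that also caps the size of each combined split;
// without a cap, every input file may be packed into one split / one map task.
public class MySizedMultiFileInputFormat extends MyMultiFileInputFormat {
  public MySizedMultiFileInputFormat() {
    // roughly one 128 MB split per map (example value)
    setMaxSplitSize(128L * 1024 * 1024);
  }
}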

The custom CombineFileLineRecordReader class:

 

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;

@SuppressWarnings("deprecation")
public class CombineFileLineRecordReader
    extends RecordReader<MultiFileInputWritableComparable, Text> {

  private long startOffset; // offset of the chunk
  private long end;         // end of the chunk
  private long pos;         // current pos
  private FileSystem fs;
  private Path path;        // path of hdfs
  private MultiFileInputWritableComparable key;
  private Text value;       // value should be string (hadoop Text)
  private FSDataInputStream fileIn;
  private LineReader reader;

  public CombineFileLineRecordReader(CombineFileSplit split, TaskAttemptContext context,
      Integer index) throws IOException {
    fs = FileSystem.get(context.getConfiguration());
    this.path = split.getPath(index);
    this.startOffset = split.getOffset(index);
    this.end = startOffset + split.getLength(index);

    boolean skipFirstLine = false;
    fileIn = fs.open(path); // open the file
    if (startOffset != 0) {
      skipFirstLine = true;
      --startOffset;
      fileIn.seek(startOffset);
    }
    reader = new LineReader(fileIn);
    if (skipFirstLine) { // skip first line and re-establish "startOffset"
      int readNum = reader.readLine(new Text(), 0,
          (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));
      startOffset += readNum;
    }
    this.pos = startOffset;
  }

  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
  }

  public void close() throws IOException {
    reader.close();
  }

  public float getProgress() throws IOException {
    if (startOffset == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
    }
  }

  public boolean nextKeyValue() throws IOException {
    if (key == null) {
      key = new MultiFileInputWritableComparable();
      key.setFileName(path.getName());
    }
    key.setOffset(pos);
    if (value == null) {
      value = new Text();
    }
    int newSize = 0;
    if (pos < end) {
      newSize = reader.readLine(value);
      pos += newSize;
    }
    if (newSize == 0) {
      key = null;
      value = null;
      return false;
    } else {
      return true;
    }
  }

  public MultiFileInputWritableComparable getCurrentKey()
      throws IOException, InterruptedException {
    return key;
  }

  public Text getCurrentValue() throws IOException, InterruptedException {
    return value;
  }
}


The MultiFileInputWritableComparable class:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

@SuppressWarnings("rawtypes")
public class MultiFileInputWritableComparable implements WritableComparable {

  private long offset;     // offset of this file block
  private String fileName; // filename of this block

  public long getOffset() {
    return offset;
  }

  public void setOffset(long offset) {
    this.offset = offset;
  }

  public String getFileName() {
    return fileName;
  }

  public void setFileName(String fileName) {
    this.fileName = fileName;
  }

  public void readFields(DataInput in) throws IOException {
    this.offset = in.readLong();
    this.fileName = Text.readString(in);
  }

  public void write(DataOutput out) throws IOException {
    out.writeLong(offset);
    Text.writeString(out, fileName);
  }

  public int compareTo(Object object) {
    MultiFileInputWritableComparable that = (MultiFileInputWritableComparable) object;
    int compare = this.fileName.compareTo(that.fileName);
    if (compare == 0) {
      return (int) Math.signum((double) (this.offset - that.offset));
    }
    return compare;
  }

  @Override
  public boolean equals(Object object) {
    if (object instanceof MultiFileInputWritableComparable)
      return this.compareTo(object) == 0;
    return false;
  }

  @Override
  public int hashCode() {
    assert false : "hashCode not designed";
    return 42; // an arbitrary constant
  }
}
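Because the key carries the source file name and the line offset, a mapper can use them directly. A small sketch of this (the class name PrefixByFileMapper is made up for illustration) that re-emits each line keyed by the file it came from:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: tags every line with the name of the small file it came from.
public class PrefixByFileMapper
    extends Mapper<MultiFileInputWritableComparable, Text, Text, Text> {

  private final Text outKey = new Text();

  @Override
  protected void map(MultiFileInputWritableComparable key, Text value, Context context)
      throws IOException, InterruptedException {
    // getFileName() and getOffset() are filled in by CombineFileLineRecordReader
    outKey.set(key.getFileName() + ":" + key.getOffset());
    context.write(outKey, value);
  }
}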


Test driver:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

@SuppressWarnings("deprecation")
public class MultiFileWordCount extends Configured implements Tool {

  public static class MapClass extends
      Mapper<MultiFileInputWritableComparable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(MultiFileInputWritableComparable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      StringTokenizer itr = new StringTokenizer(line);
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  private void printUsage() {
    System.out.println("Usage : multifilewc <input_dir> <input_dir> <output>");
  }

  public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 3) { // expect 3 arguments: two input directories and one output directory
      printUsage();
      return 2;
    }
    Job job = new Job(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    // set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyMultiFileInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // the first two arguments are input directories, the last one is the output directory
    FileInputFormat.addInputPaths(job, otherArgs[0]);
    FileInputFormat.addInputPaths(job, otherArgs[1]);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(new MultiFileWordCount(), args);
    System.exit(ret);
  }
}
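If you prefer not to hard-code split sizes in the InputFormat, the same limits can be supplied through the configuration; the keys below appear in the CombineFileInputFormat source further down, and the sizes are only example values:

// Sketch: inside run(), before constructing the Job (sizes are example values)
Configuration conf = getConf();
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 128L * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);
Job job = new Job(conf);

Because the driver goes through ToolRunner/GenericOptionsParser, the same keys can also be passed on the command line with -D, e.g. -Dmapreduce.input.fileinputformat.split.maxsize=256000000.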

 

The relevant classes that ship with MapReduce:

org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader

package org.apache.hadoop.mapreduce.lib.input;

import java.io.*;
import java.lang.reflect.*;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;

/**
 * A generic RecordReader that can hand out different recordReaders
 * for each chunk in a {@link CombineFileSplit}.
 * A CombineFileSplit can combine data chunks from multiple files.
 * This class allows using different RecordReaders for processing
 * these data chunks from different files.
 * @see CombineFileSplit
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CombineFileRecordReader<K, V> extends RecordReader<K, V> {

  static final Class[] constructorSignature = new Class[]
                                         {CombineFileSplit.class,
                                          TaskAttemptContext.class,
                                          Integer.class};

  protected CombineFileSplit split;
  protected Class<? extends RecordReader<K,V>> rrClass;
  protected Constructor<? extends RecordReader<K,V>> rrConstructor;
  protected FileSystem fs;
  protected TaskAttemptContext context;

  protected int idx;
  protected long progress;
  protected RecordReader<K, V> curReader;

  public void initialize(InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    this.split = (CombineFileSplit)split;
    this.context = context;
    if (null != this.curReader) {
      this.curReader.initialize(split, context);
    }
  }

  public boolean nextKeyValue() throws IOException, InterruptedException {
    // if the current record reader has no more data, initNextRecordReader builds the
    // record reader for the next path of this split
    while ((curReader == null) || !curReader.nextKeyValue()) {
      if (!initNextRecordReader()) {
        return false;
      }
    }
    return true;
  }

  public K getCurrentKey() throws IOException, InterruptedException {
    return curReader.getCurrentKey();
  }

  public V getCurrentValue() throws IOException, InterruptedException {
    return curReader.getCurrentValue();
  }

  public void close() throws IOException {
    if (curReader != null) {
      curReader.close();
      curReader = null;
    }
  }

  /**
   * return progress based on the amount of data processed so far.
   */
  public float getProgress() throws IOException, InterruptedException {
    long subprogress = 0;    // bytes processed in current split
    if (null != curReader) {
      // idx is always one past the current subsplit's true index.
      subprogress = (long)(curReader.getProgress() * split.getLength(idx - 1));
    }
    return Math.min(1.0f, (progress + subprogress)/(float)(split.getLength()));
  }

  /**
   * A generic RecordReader that can hand out different recordReaders
   * for each chunk in the CombineFileSplit.
   */
  public CombineFileRecordReader(CombineFileSplit split,
                                 TaskAttemptContext context,
                                 Class<? extends RecordReader<K,V>> rrClass)
    throws IOException {
    this.split = split;
    this.context = context;
    this.rrClass = rrClass;
    this.idx = 0;
    this.curReader = null;
    this.progress = 0;

    try {
      rrConstructor = rrClass.getDeclaredConstructor(constructorSignature);
      rrConstructor.setAccessible(true);
    } catch (Exception e) {
      throw new RuntimeException(rrClass.getName() +
                                 " does not have valid constructor", e);
    }
    initNextRecordReader();
  }

  /**
   * Get the record reader for the next chunk in this CombineFileSplit.
   */
  protected boolean initNextRecordReader() throws IOException {

    if (curReader != null) {
      curReader.close();
      curReader = null;
      if (idx > 0) {
        progress += split.getLength(idx-1);    // done processing so far
      }
    }

    // if all chunks have been processed, nothing more to do
    // (record readers have been built for every path of this split)
    if (idx == split.getNumPaths()) {
      return false;
    }

    // get a record reader for the idx-th chunk
    try {
      Configuration conf = context.getConfiguration();
      // setup some helper config variables.
      conf.set("map.input.file", split.getPath(idx).toString());
      conf.setLong("map.input.start", split.getOffset(idx));
      conf.setLong("map.input.length", split.getLength(idx));

      curReader = rrConstructor.newInstance(new Object []
                            {split, context, Integer.valueOf(idx)});

      if (idx > 0) {
        // initialize() for the first RecordReader will be called by MapTask;
        // we're responsible for initializing subsequent RecordReaders.
        curReader.initialize(split, context);
      }
    } catch (Exception e) {
      throw new RuntimeException (e);
    }
    idx++;
    return true;
  }
}


The CombineFileSplit class:

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapreduce.lib.input;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;

/**
 * A sub-collection of input files.
 *
 * Unlike {@link FileSplit}, CombineFileSplit class does not represent
 * a split of a file, but a split of input files into smaller sets.
 * A split may contain blocks from different file but all
 * the blocks in the same split are probably local to some rack <br>
 * CombineFileSplit can be used to implement {@link RecordReader}'s,
 * with reading one record per file.
 *
 * @see FileSplit
 * @see CombineFileInputFormat
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CombineFileSplit extends InputSplit implements Writable {

  private Path[] paths;
  private long[] startoffset;
  private long[] lengths;
  private String[] locations;
  private long totLength;

  /**
   * default constructor
   */
  public CombineFileSplit() {}

  public CombineFileSplit(Path[] files, long[] start,
                          long[] lengths, String[] locations) {
    initSplit(files, start, lengths, locations);
  }

  public CombineFileSplit(Path[] files, long[] lengths) {
    long[] startoffset = new long[files.length];
    for (int i = 0; i < startoffset.length; i++) {
      startoffset[i] = 0;
    }
    String[] locations = new String[files.length];
    for (int i = 0; i < locations.length; i++) {
      locations[i] = "";
    }
    initSplit(files, startoffset, lengths, locations);
  }

  private void initSplit(Path[] files, long[] start,
                         long[] lengths, String[] locations) {
    this.startoffset = start;
    this.lengths = lengths;
    this.paths = files;
    this.totLength = 0;
    this.locations = locations;
    for (long length : lengths) {
      totLength += length;
    }
  }

  /**
   * Copy constructor
   */
  public CombineFileSplit(CombineFileSplit old) throws IOException {
    this(old.getPaths(), old.getStartOffsets(),
         old.getLengths(), old.getLocations());
  }

  public long getLength() {
    return totLength;
  }

  /** Returns an array containing the start offsets of the files in the split */
  public long[] getStartOffsets() {
    return startoffset;
  }

  /** Returns an array containing the lengths of the files in the split */
  public long[] getLengths() {
    return lengths;
  }

  /** Returns the start offset of the i<sup>th</sup> Path */
  public long getOffset(int i) {
    return startoffset[i];
  }

  /** Returns the length of the i<sup>th</sup> Path */
  public long getLength(int i) {
    return lengths[i];
  }

  /** Returns the number of Paths in the split */
  public int getNumPaths() {
    return paths.length;
  }

  /** Returns the i<sup>th</sup> Path */
  public Path getPath(int i) {
    return paths[i];
  }

  /** Returns all the Paths in the split */
  public Path[] getPaths() {
    return paths;
  }

  /** Returns all the Paths where this input-split resides */
  public String[] getLocations() throws IOException {
    return locations;
  }

  public void readFields(DataInput in) throws IOException {
    totLength = in.readLong();
    int arrLength = in.readInt();
    lengths = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      lengths[i] = in.readLong();
    }
    int filesLength = in.readInt();
    paths = new Path[filesLength];
    for (int i = 0; i < filesLength; i++) {
      paths[i] = new Path(Text.readString(in));
    }
    arrLength = in.readInt();
    startoffset = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      startoffset[i] = in.readLong();
    }
  }

  public void write(DataOutput out) throws IOException {
    out.writeLong(totLength);
    out.writeInt(lengths.length);
    for (long length : lengths) {
      out.writeLong(length);
    }
    out.writeInt(paths.length);
    for (Path p : paths) {
      Text.writeString(out, p.toString());
    }
    out.writeInt(startoffset.length);
    for (long length : startoffset) {
      out.writeLong(length);
    }
  }

  @Override
  public String toString() {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < paths.length; i++) {
      if (i == 0) {
        sb.append("Paths:");
      }
      sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] +
                "+" + lengths[i]);
      if (i < paths.length - 1) {
        sb.append(",");
      }
    }
    if (locations != null) {
      String locs = "";
      StringBuffer locsb = new StringBuffer();
      for (int i = 0; i < locations.length; i++) {
        locsb.append(locations[i] + ":");
      }
      locs = locsb.toString();
      sb.append(" Locations:" + locs + "; ");
    }
    return sb.toString();
  }
}
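For debugging it can be handy to see which small files ended up in the current split; in the new API a mapper can reach the CombineFileSplit through its context. A rough sketch (the SplitLoggingMapper class and its setup() override are illustrative, not part of the original job):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

// Hypothetical mapper that logs the file chunks packed into its split.
public class SplitLoggingMapper
    extends Mapper<MultiFileInputWritableComparable, Text, Text, IntWritable> {

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    CombineFileSplit split = (CombineFileSplit) context.getInputSplit();
    for (int i = 0; i < split.getNumPaths(); i++) {
      System.err.println("chunk " + i + ": " + split.getPath(i)
          + " offset=" + split.getOffset(i) + " length=" + split.getLength(i));
    }
  }
}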


The abstract class CombineFileInputFormat:

See getSplits() below for how the splits are actually computed: blocks are first combined node by node (a split is cut whenever the accumulated size reaches the maximum, and a node's leftover becomes its own split only if it reaches the per-node minimum), then rack by rack against the per-rack minimum, and finally all remaining overflow blocks are combined regardless of size.

public abstract class CombineFileInputFormat<K, V>
  extends FileInputFormat<K, V> {

  public static final String SPLIT_MINSIZE_PERNODE =
    "mapreduce.input.fileinputformat.split.minsize.per.node";
  public static final String SPLIT_MINSIZE_PERRACK =
    "mapreduce.input.fileinputformat.split.minsize.per.rack";

  // ability to limit the size of a single split
  private long maxSplitSize = 0;
  private long minSplitSizeNode = 0;
  private long minSplitSizeRack = 0;

  // A pool of input paths filters. A split cannot have blocks from files
  // across multiple pools.
  private ArrayList<MultiPathFilter> pools = new ArrayList<MultiPathFilter>();

  // mapping from a rack name to the set of Nodes in the rack
  private HashMap<String, Set<String>> rackToNodes =
                            new HashMap<String, Set<String>>();

  /**
   * Specify the maximum size (in bytes) of each split. Each split is
   * approximately equal to the specified size.
   */
  protected void setMaxSplitSize(long maxSplitSize) {
    this.maxSplitSize = maxSplitSize;
  }

  /**
   * Specify the minimum size (in bytes) of each split per node.
   * This applies to data that is left over after combining data on a single
   * node into splits that are of maximum size specified by maxSplitSize.
   * This leftover data will be combined into its own split if its size
   * exceeds minSplitSizeNode.
   */
  protected void setMinSplitSizeNode(long minSplitSizeNode) {
    this.minSplitSizeNode = minSplitSizeNode;
  }

  /**
   * Specify the minimum size (in bytes) of each split per rack.
   * This applies to data that is left over after combining data on a single
   * rack into splits that are of maximum size specified by maxSplitSize.
   * This leftover data will be combined into its own split if its size
   * exceeds minSplitSizeRack.
   */
  protected void setMinSplitSizeRack(long minSplitSizeRack) {
    this.minSplitSizeRack = minSplitSizeRack;
  }

  /**
   * Create a new pool and add the filters to it.
   * A split cannot have files from different pools.
   */
  protected void createPool(List<PathFilter> filters) {
    pools.add(new MultiPathFilter(filters));
  }

  /**
   * Create a new pool and add the filters to it.
   * A pathname can satisfy any one of the specified filters.
   * A split cannot have files from different pools.
   */
  protected void createPool(PathFilter... filters) {
    MultiPathFilter multi = new MultiPathFilter();
    for (PathFilter f: filters) {
      multi.add(f);
    }
    pools.add(multi);
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec =
      new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
  }

  /**
   * default constructor
   */
  public CombineFileInputFormat() {
  }

  @Override
  public List<InputSplit> getSplits(JobContext job)
    throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
      minSizeNode = minSplitSizeNode;
    } else {
      minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
      minSizeRack = minSplitSizeRack;
    } else {
      minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
      maxSize = maxSplitSize;
    } else {
      maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
      throw new IOException("Minimum split size pernode " + minSizeNode +
                            " cannot be larger than maximum split size " +
                            maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
      throw new IOException("Minimum split size per rack" + minSizeRack +
                            " cannot be larger than maximum split size " +
                            maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
      throw new IOException("Minimum split size per node" + minSizeNode +
                            " cannot be smaller than minimum split " +
                            "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(
                     listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
      return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
      Path p = new Path(paths[i].toUri().getPath());
      newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
      ArrayList<Path> myPaths = new ArrayList<Path>();

      // pick one input path. If it matches all the filters in a pool,
      // add it to the output set
      for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
        Path p = iter.next();
        if (onepool.accept(p)) {
          myPaths.add(p); // add it to my output set
          iter.remove();
        }
      }
      // create splits for all files in this pool.
      getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]),
                    maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]),
                  maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
  }

  /**
   * Return all the splits in the specified set of paths
   */
  private void getMoreSplits(JobContext job, Path[] paths,
                             long maxSize, long minSizeNode, long minSizeRack,
                             List<InputSplit> splits)
    throws IOException {
    Configuration conf = job.getConfiguration();

    // all blocks for all the files in input set
    OneFileInfo[] files;

    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks =
                              new HashMap<String, List<OneBlockInfo>>();

    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes =
                              new HashMap<OneBlockInfo, String[]>();

    // mapping from a node to the list of blocks that it contains
    HashMap<String, List<OneBlockInfo>> nodeToBlocks =
                              new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
      return;
    }

    // populate all the blocks for all files
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
      files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]),
                                 rackToBlocks, blockToNodes, nodeToBlocks,
                                 rackToNodes, maxSize);
      totLength += files[i].getLength();
    }

    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    // process all nodes and create splits that are local
    // to a node.
    for (Iterator<Map.Entry<String,
         List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator();
         iter.hasNext();) {

      Map.Entry<String, List<OneBlockInfo>> one = iter.next();
      nodes.add(one.getKey());
      List<OneBlockInfo> blocksInNode = one.getValue();

      // for each block, copy it into validBlocks. Delete it from
      // blockToNodes so that the same block does not appear in
      // two different splits.
      for (OneBlockInfo oneblock : blocksInNode) {
        if (blockToNodes.containsKey(oneblock)) {
          validBlocks.add(oneblock);
          blockToNodes.remove(oneblock);
          curSplitSize += oneblock.length;

          // if the accumulated split size exceeds the maximum, then
          // create this split.
          if (maxSize != 0 && curSplitSize >= maxSize) {
            // create an input split and add it to the splits array
            addCreatedSplit(splits, nodes, validBlocks);
            curSplitSize = 0;
            validBlocks.clear();
          }
        }
      }
      // if there were any blocks left over and their combined size is
      // larger than minSplitNode, then combine them into one split.
      // Otherwise add them back to the unprocessed pool. It is likely
      // that they will be combined with other blocks from the
      // same rack later on.
      if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
        // create an input split and add it to the splits array
        addCreatedSplit(splits, nodes, validBlocks);
      } else {
        for (OneBlockInfo oneblock : validBlocks) {
          blockToNodes.put(oneblock, oneblock.hosts);
        }
      }
      validBlocks.clear();
      nodes.clear();
      curSplitSize = 0;
    }

    // if blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

      // Create one split for this rack before moving over to the next rack.
      // Come back to this rack after creating a single split for each of the
      // remaining racks.
      // Process one rack location at a time, Combine all possible blocks that
      // reside on this rack as one split. (constrained by minimum and maximum
      // split size).

      // iterate over all racks
      for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
           rackToBlocks.entrySet().iterator(); iter.hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        racks.add(one.getKey());
        List<OneBlockInfo> blocks = one.getValue();

        // for each block, copy it into validBlocks. Delete it from
        // blockToNodes so that the same block does not appear in
        // two different splits.
        boolean createdSplit = false;
        for (OneBlockInfo oneblock : blocks) {
          if (blockToNodes.containsKey(oneblock)) {
            validBlocks.add(oneblock);
            blockToNodes.remove(oneblock);
            curSplitSize += oneblock.length;

            // if the accumulated split size exceeds the maximum, then
            // create this split.
            if (maxSize != 0 && curSplitSize >= maxSize) {
              // create an input split and add it to the splits array
              addCreatedSplit(splits, getHosts(racks), validBlocks);
              createdSplit = true;
              break;
            }
          }
        }

        // if we created a split, then just go to the next rack
        if (createdSplit) {
          curSplitSize = 0;
          validBlocks.clear();
          racks.clear();
          continue;
        }

        if (!validBlocks.isEmpty()) {
          if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
            // if there is a minimum size specified, then create a single split
            // otherwise, store these blocks into overflow data structure
            addCreatedSplit(splits, getHosts(racks), validBlocks);
          } else {
            // There were a few blocks in this rack that
            // remained to be processed. Keep them in 'overflow' block list.
            // These will be combined later.
            overflowBlocks.addAll(validBlocks);
          }
        }
        curSplitSize = 0;
        validBlocks.clear();
        racks.clear();
      }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks
    for (OneBlockInfo oneblock : overflowBlocks) {
      validBlocks.add(oneblock);
      curSplitSize += oneblock.length;

      // This might cause an exiting rack location to be re-added,
      // but it should be ok.
      for (int i = 0; i < oneblock.racks.length; i++) {
        racks.add(oneblock.racks[i]);
      }

      // if the accumulated split size exceeds the maximum, then
      // create this split.
      if (maxSize != 0 && curSplitSize >= maxSize) {
        // create an input split and add it to the splits array
        addCreatedSplit(splits, getHosts(racks), validBlocks);
        curSplitSize = 0;
        validBlocks.clear();
        racks.clear();
      }
    }

    // Process any remaining blocks, if any.
    if (!validBlocks.isEmpty()) {
      addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
  }

  /**
   * Create a single split from the list of blocks specified in validBlocks
   * Add this new split into splitList.
   */
  private void addCreatedSplit(List<InputSplit> splitList,
                               Collection<String> locations,
                               ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
      fl[i] = validBlocks.get(i).onepath;
      offset[i] = validBlocks.get(i).offset;
      length[i] = validBlocks.get(i).length;
    }

    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
                                   length, locations.toArray(new String[0]));
    splitList.add(thissplit);
  }

  /**
   * This is not implemented yet.
   */
  public abstract RecordReader<K, V> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException;

  /**
   * information about one file from the File System
   */
  private static class OneFileInfo {
    private long fileSize;               // size of the file
    private OneBlockInfo[] blocks;       // all blocks in this file

    OneFileInfo(Path path, Configuration conf,
                boolean isSplitable,
                HashMap<String, List<OneBlockInfo>> rackToBlocks,
                HashMap<OneBlockInfo, String[]> blockToNodes,
                HashMap<String, List<OneBlockInfo>> nodeToBlocks,
                HashMap<String, Set<String>> rackToNodes,
                long maxSize)
                throws IOException {
      this.fileSize = 0;

      // get block locations from file system
      FileSystem fs = path.getFileSystem(conf);
      FileStatus stat = fs.getFileStatus(path);
      BlockLocation[] locations = fs.getFileBlockLocations(stat, 0,
                                                           stat.getLen());
      // create a list of all block and their locations
      if (locations == null) {
        blocks = new OneBlockInfo[0];
      } else {
        if (!isSplitable) {
          // if the file is not splitable, just create the one block with
          // full file length
          blocks = new OneBlockInfo[1];
          fileSize = stat.getLen();
          blocks[0] = new OneBlockInfo(path, 0, fileSize, locations[0]
              .getHosts(), locations[0].getTopologyPaths());
        } else {
          ArrayList<OneBlockInfo> blocksList = new ArrayList<OneBlockInfo>(
              locations.length);
          for (int i = 0; i < locations.length; i++) {
            fileSize += locations[i].getLength();

            // each split can be a maximum of maxSize
            long left = locations[i].getLength();
            long myOffset = locations[i].getOffset();
            long myLength = 0;
            while (left > 0) {
              if (maxSize == 0) {
                myLength = left;
              } else {
                if (left > maxSize && left < 2 * maxSize) {
                  // if remainder is between max and 2*max - then
                  // instead of creating splits of size max, left-max we
                  // create splits of size left/2 and left/2. This is
                  // a heuristic to avoid creating really really small
                  // splits.
                  myLength = left / 2;
                } else {
                  myLength = Math.min(maxSize, left);
                }
              }
              OneBlockInfo oneblock = new OneBlockInfo(path, myOffset,
                  myLength, locations[i].getHosts(), locations[i]
                      .getTopologyPaths());
              left -= myLength;
              myOffset += myLength;

              blocksList.add(oneblock);
            }
          }
          blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
        }

        for (OneBlockInfo oneblock : blocks) {
          // add this block to the block --> node locations map
          blockToNodes.put(oneblock, oneblock.hosts);

          // For blocks that do not have host/rack information,
          // assign to default rack.
          String[] racks = null;
          if (oneblock.hosts.length == 0) {
            racks = new String[]{NetworkTopology.DEFAULT_RACK};
          } else {
            racks = oneblock.racks;
          }

          // add this block to the rack --> block map
          for (int j = 0; j < racks.length; j++) {
            String rack = racks[j];
            List<OneBlockInfo> blklist = rackToBlocks.get(rack);
            if (blklist == null) {
              blklist = new ArrayList<OneBlockInfo>();
              rackToBlocks.put(rack, blklist);
            }
            blklist.add(oneblock);
            if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
              // Add this host to rackToNodes map
              addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
            }
          }

          // add this block to the node --> block map
          for (int j = 0; j < oneblock.hosts.length; j++) {
            String node = oneblock.hosts[j];
            List<OneBlockInfo> blklist = nodeToBlocks.get(node);
            if (blklist == null) {
              blklist = new ArrayList<OneBlockInfo>();
              nodeToBlocks.put(node, blklist);
            }
            blklist.add(oneblock);
          }
        }
      }
    }

    long getLength() {
      return fileSize;
    }

    OneBlockInfo[] getBlocks() {
      return blocks;
    }
  }

  /**
   * information about one block from the File System
   */
  private static class OneBlockInfo {
    Path onepath;                // name of this file
    long offset;                 // offset in file
    long length;                 // length of this block
    String[] hosts;              // nodes on which this block resides
    String[] racks;              // network topology of hosts

    OneBlockInfo(Path path, long offset, long len,
                 String[] hosts, String[] topologyPaths) {
      this.onepath = path;
      this.offset = offset;
      this.hosts = hosts;
      this.length = len;
      assert (hosts.length == topologyPaths.length ||
              topologyPaths.length == 0);

      // if the file system does not have any rack information, then
      // use dummy rack location.
      if (topologyPaths.length == 0) {
        topologyPaths = new String[hosts.length];
        for (int i = 0; i < topologyPaths.length; i++) {
          topologyPaths[i] = (new NodeBase(hosts[i],
                              NetworkTopology.DEFAULT_RACK)).toString();
        }
      }

      // The topology paths have the host name included as the last
      // component. Strip it.
      this.racks = new String[topologyPaths.length];
      for (int i = 0; i < topologyPaths.length; i++) {
        this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
      }
    }
  }

  protected BlockLocation[] getFileBlockLocations(
    FileSystem fs, FileStatus stat) throws IOException {
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
  }

  private static void addHostToRack(HashMap<String, Set<String>> rackToNodes,
                                    String rack, String host) {
    Set<String> hosts = rackToNodes.get(rack);
    if (hosts == null) {
      hosts = new HashSet<String>();
      rackToNodes.put(rack, hosts);
    }
    hosts.add(host);
  }

  private Set<String> getHosts(Set<String> racks) {
    Set<String> hosts = new HashSet<String>();
    for (String rack : racks) {
      if (rackToNodes.containsKey(rack)) {
        hosts.addAll(rackToNodes.get(rack));
      }
    }
    return hosts;
  }

  /**
   * Accept a path only if any one of filters given in the
   * constructor do.
   */
  private static class MultiPathFilter implements PathFilter {
    private List<PathFilter> filters;

    public MultiPathFilter() {
      this.filters = new ArrayList<PathFilter>();
    }

    public MultiPathFilter(List<PathFilter> filters) {
      this.filters = filters;
    }

    public void add(PathFilter one) {
      filters.add(one);
    }

    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (filter.accept(path)) {
          return true;
        }
      }
      return false;
    }

    public String toString() {
      StringBuffer buf = new StringBuffer();
      buf.append("[");
      for (PathFilter f: filters) {
        buf.append(f);
        buf.append(",");
      }
      buf.append("]");
      return buf.toString();
    }
  }
}
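getSplits() also shows the pool mechanism: paths are grouped by the filters registered through createPool(), and blocks from different pools are never combined into the same split. A minimal sketch of using it, with a made-up subclass name, an example .log filter, and an example split cap:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Hypothetical subclass that uses pools: .log files and all other files
// are combined separately, so one split never mixes the two groups.
public class PooledMultiFileInputFormat extends MyMultiFileInputFormat {
  public PooledMultiFileInputFormat() {
    setMaxSplitSize(256L * 1024 * 1024);   // example cap per combined split
    createPool(new PathFilter() {          // files accepted here form their own pool
      public boolean accept(Path path) {
        return path.getName().endsWith(".log");
      }
    });
  }
}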


 
