Using Hadoop's CombineFileInputFormat
Purpose of CombineFileInputFormat: pack multiple small files into a single InputSplit handled by one map task, so that a large number of small files does not launch a large number of tasks.
CombineFileInputFormat is an input format that combines multiple files into one split, and it takes the storage location of the data into account when doing so. The older MultiFileInputFormat split input on whole-file boundaries, which could produce very uneven splits: a single large file would be processed by one map task and become a severe straggler.
CombineFileInputFormat is an abstract class, so you have to implement it yourself.
1. In Hive you can set:
set mapred.max.split.size=256000000;           -- maximum size (in bytes) of each combined split handled by one map task
set mapred.min.split.size.per.node=256000000;  -- minimum split size per node: after files are carved up at mapred.max.split.size, the leftover bytes on a node become their own split if they exceed this value; otherwise they are held back and handled at the rack level
set mapred.min.split.size.per.rack=256000000;  -- minimum split size per rack (switch): leftover fragments are combined and again cut at mapred.max.split.size; if the remainder within the rack exceeds this value it becomes its own split
Finally, fragments from different racks are merged and cut at mapred.max.split.size; whatever is left over forms one last split regardless of its size. A small worked example follows the settings below.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
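As an illustration (the numbers are assumed only for the example): with the settings above, a node holding roughly 300 MB of small-file blocks produces one 256 MB split and a 44 MB remainder; because 44 MB is below mapred.min.split.size.per.node, that remainder is not emitted on its own but is passed up and merged with remainders from other nodes in the same rack, where the same mapred.max.split.size / mapred.min.split.size.per.rack rules apply before any final cross-rack leftovers are swept into a single split.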
2. Using it in MapReduce
Define a custom class MyMultiFileInputFormat (code adapted from other blog posts):
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class MyMultiFileInputFormat
    extends CombineFileInputFormat<MultiFileInputWritableComparable, Text> {

  public RecordReader<MultiFileInputWritableComparable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) throws IOException {
    // CombineFileLineRecordReader is our custom reader class. One split may cover
    // several paths; the framework class CombineFileRecordReader uses Java reflection
    // to construct a CombineFileLineRecordReader for each path in turn and read its
    // key/value pairs -- see the CombineFileRecordReader source further below.
    return new CombineFileRecordReader<MultiFileInputWritableComparable, Text>(
        (CombineFileSplit) split, context, CombineFileLineRecordReader.class);
  }
}
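One practical note: if neither setMaxSplitSize() nor the corresponding configuration value is set, the getSplits() logic shown further below has no upper bound and can end up combining all of the input into very few splits. A minimal sketch, assuming you want to cap the combined split size inside the InputFormat itself (the 128 MB figure is only an illustrative choice, not from the original post):

public class MyMultiFileInputFormat
    extends CombineFileInputFormat<MultiFileInputWritableComparable, Text> {

  public MyMultiFileInputFormat() {
    // Illustrative value: cap each combined split at roughly 128 MB.
    setMaxSplitSize(128L * 1024 * 1024);
  }

  // createRecordReader(...) unchanged from the version above
}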
The custom CombineFileLineRecordReader class:
import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;

@SuppressWarnings("deprecation")
public class CombineFileLineRecordReader
    extends RecordReader<MultiFileInputWritableComparable, Text> {

  private long startOffset;  // offset of the chunk
  private long end;          // end of the chunk
  private long pos;          // current position
  private FileSystem fs;
  private Path path;         // HDFS path of this chunk
  private MultiFileInputWritableComparable key;
  private Text value;        // value should be a string (hadoop Text)
  private FSDataInputStream fileIn;
  private LineReader reader;

  public CombineFileLineRecordReader(CombineFileSplit split,
      TaskAttemptContext context, Integer index) throws IOException {
    fs = FileSystem.get(context.getConfiguration());
    this.path = split.getPath(index);
    this.startOffset = split.getOffset(index);
    this.end = startOffset + split.getLength(index);
    boolean skipFirstLine = false;
    fileIn = fs.open(path); // open the file
    if (startOffset != 0) {
      skipFirstLine = true;
      --startOffset;
      fileIn.seek(startOffset);
    }
    reader = new LineReader(fileIn);
    if (skipFirstLine) {
      // skip first line and re-establish "startOffset"
      int readNum = reader.readLine(new Text(), 0,
          (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));
      startOffset += readNum;
    }
    this.pos = startOffset;
  }

  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
  }

  public void close() throws IOException {
    reader.close();
  }

  public float getProgress() throws IOException {
    if (startOffset == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
    }
  }

  public boolean nextKeyValue() throws IOException {
    if (key == null) {
      key = new MultiFileInputWritableComparable();
      key.setFileName(path.getName());
    }
    key.setOffset(pos);
    if (value == null) {
      value = new Text();
    }
    int newSize = 0;
    if (pos < end) {
      newSize = reader.readLine(value);
      pos += newSize;
    }
    if (newSize == 0) {
      key = null;
      value = null;
      return false;
    } else {
      return true;
    }
  }

  public MultiFileInputWritableComparable getCurrentKey()
      throws IOException, InterruptedException {
    return key;
  }

  public Text getCurrentValue() throws IOException, InterruptedException {
    return value;
  }
}
The MultiFileInputWritableComparable class:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

@SuppressWarnings("rawtypes")
public class MultiFileInputWritableComparable implements WritableComparable {

  private long offset;     // offset of this chunk within its file
  private String fileName; // filename of this chunk

  public long getOffset() {
    return offset;
  }

  public void setOffset(long offset) {
    this.offset = offset;
  }

  public String getFileName() {
    return fileName;
  }

  public void setFileName(String fileName) {
    this.fileName = fileName;
  }

  public void readFields(DataInput in) throws IOException {
    this.offset = in.readLong();
    this.fileName = Text.readString(in);
  }

  public void write(DataOutput out) throws IOException {
    out.writeLong(offset);
    Text.writeString(out, fileName);
  }

  public int compareTo(Object object) {
    MultiFileInputWritableComparable that = (MultiFileInputWritableComparable) object;
    int compare = this.fileName.compareTo(that.fileName);
    if (compare == 0) {
      return (int) Math.signum((double) (this.offset - that.offset));
    }
    return compare;
  }

  @Override
  public boolean equals(Object object) {
    if (object instanceof MultiFileInputWritableComparable)
      return this.compareTo(object) == 0;
    return false;
  }

  @Override
  public int hashCode() {
    assert false : "hashCode not designed";
    return 42; // an arbitrary constant
  }
}
Test driver:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

@SuppressWarnings("deprecation")
public class MultiFileWordCount extends Configured implements Tool {

  public static class MapClass
      extends Mapper<MultiFileInputWritableComparable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(MultiFileInputWritableComparable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      StringTokenizer itr = new StringTokenizer(line);
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  private void printUsage() {
    System.out.println("Usage : multifilewc <input_dir> <input_dir> <output>");
  }

  public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // three arguments expected: the first two are input directories, the last is the output directory
    if (otherArgs.length < 3) {
      printUsage();
      return 2;
    }

    Job job = new Job(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    // set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyMultiFileInputFormat.class);
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);
    // use the mapper defined above
    job.setMapperClass(MapClass.class);
    // use the WordCount reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // the first two arguments are input directories, the third is the output directory
    FileInputFormat.addInputPaths(job, otherArgs[0]);
    FileInputFormat.addInputPaths(job, otherArgs[1]);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(new MultiFileWordCount(), args);
    System.exit(ret);
  }
}
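For example, after packaging the three classes into a jar, a run might look like this (the jar name and HDFS paths are placeholders, not from the original post): hadoop jar multifilewc.jar MultiFileWordCount /data/small_files_a /data/small_files_b /data/wc_out. The first two arguments are input directories and the third is the output directory, matching the argument check in run().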
A few classes that ship with MapReduce:
org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader
package org.apache.hadoop.mapreduce.lib.input;

import java.io.*;
import java.lang.reflect.*;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;

/**
 * A generic RecordReader that can hand out different recordReaders
 * for each chunk in a {@link CombineFileSplit}.
 * A CombineFileSplit can combine data chunks from multiple files.
 * This class allows using different RecordReaders for processing
 * these data chunks from different files.
 * @see CombineFileSplit
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CombineFileRecordReader<K, V> extends RecordReader<K, V> {

  static final Class[] constructorSignature = new Class[]
      {CombineFileSplit.class, TaskAttemptContext.class, Integer.class};

  protected CombineFileSplit split;
  protected Class<? extends RecordReader<K, V>> rrClass;
  protected Constructor<? extends RecordReader<K, V>> rrConstructor;
  protected FileSystem fs;
  protected TaskAttemptContext context;
  protected int idx;
  protected long progress;
  protected RecordReader<K, V> curReader;

  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    this.split = (CombineFileSplit) split;
    this.context = context;
    if (null != this.curReader) {
      this.curReader.initialize(split, context);
    }
  }

  public boolean nextKeyValue() throws IOException, InterruptedException {
    // when the current record reader has no more data, initNextRecordReader
    // builds the record reader for the next path in this split
    while ((curReader == null) || !curReader.nextKeyValue()) {
      if (!initNextRecordReader()) {
        return false;
      }
    }
    return true;
  }

  public K getCurrentKey() throws IOException, InterruptedException {
    return curReader.getCurrentKey();
  }

  public V getCurrentValue() throws IOException, InterruptedException {
    return curReader.getCurrentValue();
  }

  public void close() throws IOException {
    if (curReader != null) {
      curReader.close();
      curReader = null;
    }
  }

  /**
   * return progress based on the amount of data processed so far.
   */
  public float getProgress() throws IOException, InterruptedException {
    long subprogress = 0; // bytes processed in current split
    if (null != curReader) {
      // idx is always one past the current subsplit's true index.
      subprogress = (long) (curReader.getProgress() * split.getLength(idx - 1));
    }
    return Math.min(1.0f, (progress + subprogress) / (float) (split.getLength()));
  }

  /**
   * A generic RecordReader that can hand out different recordReaders
   * for each chunk in the CombineFileSplit.
   */
  public CombineFileRecordReader(CombineFileSplit split, TaskAttemptContext context,
      Class<? extends RecordReader<K, V>> rrClass) throws IOException {
    this.split = split;
    this.context = context;
    this.rrClass = rrClass;
    this.idx = 0;
    this.curReader = null;
    this.progress = 0;
    try {
      rrConstructor = rrClass.getDeclaredConstructor(constructorSignature);
      rrConstructor.setAccessible(true);
    } catch (Exception e) {
      throw new RuntimeException(rrClass.getName() + " does not have valid constructor", e);
    }
    initNextRecordReader();
  }

  /**
   * Get the record reader for the next chunk in this CombineFileSplit.
   */
  protected boolean initNextRecordReader() throws IOException {
    if (curReader != null) {
      curReader.close();
      curReader = null;
      if (idx > 0) {
        progress += split.getLength(idx - 1); // done processing so far
      }
    }

    // if all chunks have been processed, nothing more to do:
    // record readers have been built for every path in this split
    if (idx == split.getNumPaths()) {
      return false;
    }

    // get a record reader for the idx-th chunk
    try {
      Configuration conf = context.getConfiguration();
      // setup some helper config variables.
      conf.set("map.input.file", split.getPath(idx).toString());
      conf.setLong("map.input.start", split.getOffset(idx));
      conf.setLong("map.input.length", split.getLength(idx));

      curReader = rrConstructor.newInstance(new Object[]
          {split, context, Integer.valueOf(idx)});

      if (idx > 0) {
        // initialize() for the first RecordReader will be called by MapTask;
        // we're responsible for initializing subsequent RecordReaders.
        curReader.initialize(split, context);
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    idx++;
    return true;
  }
}
The CombineFileSplit class:
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapreduce.lib.input;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;

/**
 * A sub-collection of input files.
 *
 * Unlike {@link FileSplit}, CombineFileSplit class does not represent
 * a split of a file, but a split of input files into smaller sets.
 * A split may contain blocks from different file but all
 * the blocks in the same split are probably local to some rack <br>
 * CombineFileSplit can be used to implement {@link RecordReader}'s,
 * with reading one record per file.
 *
 * @see FileSplit
 * @see CombineFileInputFormat
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CombineFileSplit extends InputSplit implements Writable {

  private Path[] paths;
  private long[] startoffset;
  private long[] lengths;
  private String[] locations;
  private long totLength;

  /**
   * default constructor
   */
  public CombineFileSplit() {}

  public CombineFileSplit(Path[] files, long[] start, long[] lengths, String[] locations) {
    initSplit(files, start, lengths, locations);
  }

  public CombineFileSplit(Path[] files, long[] lengths) {
    long[] startoffset = new long[files.length];
    for (int i = 0; i < startoffset.length; i++) {
      startoffset[i] = 0;
    }
    String[] locations = new String[files.length];
    for (int i = 0; i < locations.length; i++) {
      locations[i] = "";
    }
    initSplit(files, startoffset, lengths, locations);
  }

  private void initSplit(Path[] files, long[] start, long[] lengths, String[] locations) {
    this.startoffset = start;
    this.lengths = lengths;
    this.paths = files;
    this.totLength = 0;
    this.locations = locations;
    for (long length : lengths) {
      totLength += length;
    }
  }

  /**
   * Copy constructor
   */
  public CombineFileSplit(CombineFileSplit old) throws IOException {
    this(old.getPaths(), old.getStartOffsets(), old.getLengths(), old.getLocations());
  }

  public long getLength() {
    return totLength;
  }

  /** Returns an array containing the start offsets of the files in the split */
  public long[] getStartOffsets() {
    return startoffset;
  }

  /** Returns an array containing the lengths of the files in the split */
  public long[] getLengths() {
    return lengths;
  }

  /** Returns the start offset of the i<sup>th</sup> Path */
  public long getOffset(int i) {
    return startoffset[i];
  }

  /** Returns the length of the i<sup>th</sup> Path */
  public long getLength(int i) {
    return lengths[i];
  }

  /** Returns the number of Paths in the split */
  public int getNumPaths() {
    return paths.length;
  }

  /** Returns the i<sup>th</sup> Path */
  public Path getPath(int i) {
    return paths[i];
  }

  /** Returns all the Paths in the split */
  public Path[] getPaths() {
    return paths;
  }

  /** Returns all the Paths where this input-split resides */
  public String[] getLocations() throws IOException {
    return locations;
  }

  public void readFields(DataInput in) throws IOException {
    totLength = in.readLong();
    int arrLength = in.readInt();
    lengths = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      lengths[i] = in.readLong();
    }
    int filesLength = in.readInt();
    paths = new Path[filesLength];
    for (int i = 0; i < filesLength; i++) {
      paths[i] = new Path(Text.readString(in));
    }
    arrLength = in.readInt();
    startoffset = new long[arrLength];
    for (int i = 0; i < arrLength; i++) {
      startoffset[i] = in.readLong();
    }
  }

  public void write(DataOutput out) throws IOException {
    out.writeLong(totLength);
    out.writeInt(lengths.length);
    for (long length : lengths) {
      out.writeLong(length);
    }
    out.writeInt(paths.length);
    for (Path p : paths) {
      Text.writeString(out, p.toString());
    }
    out.writeInt(startoffset.length);
    for (long length : startoffset) {
      out.writeLong(length);
    }
  }

  @Override
  public String toString() {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < paths.length; i++) {
      if (i == 0) {
        sb.append("Paths:");
      }
      sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] + "+" + lengths[i]);
      if (i < paths.length - 1) {
        sb.append(",");
      }
    }
    if (locations != null) {
      String locs = "";
      StringBuffer locsb = new StringBuffer();
      for (int i = 0; i < locations.length; i++) {
        locsb.append(locations[i] + ":");
      }
      locs = locsb.toString();
      sb.append(" Locations:" + locs + "; ");
    }
    return sb.toString();
  }
}
The abstract class CombineFileInputFormat:
See getSplits for how the actual splits are computed.
public abstract class CombineFileInputFormat<K, V> extends FileInputFormat<K, V> {

  public static final String SPLIT_MINSIZE_PERNODE =
      "mapreduce.input.fileinputformat.split.minsize.per.node";
  public static final String SPLIT_MINSIZE_PERRACK =
      "mapreduce.input.fileinputformat.split.minsize.per.rack";

  // ability to limit the size of a single split
  private long maxSplitSize = 0;
  private long minSplitSizeNode = 0;
  private long minSplitSizeRack = 0;

  // A pool of input paths filters. A split cannot have blocks from files
  // across multiple pools.
  private ArrayList<MultiPathFilter> pools = new ArrayList<MultiPathFilter>();

  // mapping from a rack name to the set of Nodes in the rack
  private HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();

  /**
   * Specify the maximum size (in bytes) of each split. Each split is
   * approximately equal to the specified size.
   */
  protected void setMaxSplitSize(long maxSplitSize) {
    this.maxSplitSize = maxSplitSize;
  }

  /**
   * Specify the minimum size (in bytes) of each split per node.
   * This applies to data that is left over after combining data on a single
   * node into splits that are of maximum size specified by maxSplitSize.
   * This leftover data will be combined into its own split if its size
   * exceeds minSplitSizeNode.
   */
  protected void setMinSplitSizeNode(long minSplitSizeNode) {
    this.minSplitSizeNode = minSplitSizeNode;
  }

  /**
   * Specify the minimum size (in bytes) of each split per rack.
   * This applies to data that is left over after combining data on a single
   * rack into splits that are of maximum size specified by maxSplitSize.
   * This leftover data will be combined into its own split if its size
   * exceeds minSplitSizeRack.
   */
  protected void setMinSplitSizeRack(long minSplitSizeRack) {
    this.minSplitSizeRack = minSplitSizeRack;
  }

  /**
   * Create a new pool and add the filters to it.
   * A split cannot have files from different pools.
   */
  protected void createPool(List<PathFilter> filters) {
    pools.add(new MultiPathFilter(filters));
  }

  /**
   * Create a new pool and add the filters to it.
   * A pathname can satisfy any one of the specified filters.
   * A split cannot have files from different pools.
   */
  protected void createPool(PathFilter... filters) {
    MultiPathFilter multi = new MultiPathFilter();
    for (PathFilter f : filters) {
      multi.add(f);
    }
    pools.add(multi);
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec =
        new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
  }

  /**
   * default constructor
   */
  public CombineFileInputFormat() {
  }

  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
      minSizeNode = minSplitSizeNode;
    } else {
      minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
      minSizeRack = minSplitSizeRack;
    } else {
      minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
      maxSize = maxSplitSize;
    } else {
      maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
      throw new IOException("Minimum split size pernode " + minSizeNode +
                            " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
      throw new IOException("Minimum split size per rack" + minSizeRack +
                            " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
      throw new IOException("Minimum split size per node" + minSizeNode +
                            " cannot be smaller than minimum split " +
                            "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
      return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
      Path p = new Path(paths[i].toUri().getPath());
      newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
      ArrayList<Path> myPaths = new ArrayList<Path>();
      // pick one input path. If it matches all the filters in a pool,
      // add it to the output set
      for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
        Path p = iter.next();
        if (onepool.accept(p)) {
          myPaths.add(p); // add it to my output set
          iter.remove();
        }
      }
      // create splits for all files in this pool.
      getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]),
                    maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]),
                  maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
  }

  /**
   * Return all the splits in the specified set of paths
   */
  private void getMoreSplits(JobContext job, Path[] paths,
                             long maxSize, long minSizeNode, long minSizeRack,
                             List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // all blocks for all the files in input set
    OneFileInfo[] files;

    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks =
        new HashMap<String, List<OneBlockInfo>>();

    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes =
        new HashMap<OneBlockInfo, String[]>();

    // mapping from a node to the list of blocks that it contains
    HashMap<String, List<OneBlockInfo>> nodeToBlocks =
        new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
      return;
    }

    // populate all the blocks for all files
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
      files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]),
                                 rackToBlocks, blockToNodes, nodeToBlocks,
                                 rackToNodes, maxSize);
      totLength += files[i].getLength();
    }

    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    // process all nodes and create splits that are local to a node.
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
             nodeToBlocks.entrySet().iterator(); iter.hasNext();) {
      Map.Entry<String, List<OneBlockInfo>> one = iter.next();
      nodes.add(one.getKey());
      List<OneBlockInfo> blocksInNode = one.getValue();

      // for each block, copy it into validBlocks. Delete it from
      // blockToNodes so that the same block does not appear in
      // two different splits.
      for (OneBlockInfo oneblock : blocksInNode) {
        if (blockToNodes.containsKey(oneblock)) {
          validBlocks.add(oneblock);
          blockToNodes.remove(oneblock);
          curSplitSize += oneblock.length;

          // if the accumulated split size exceeds the maximum, then
          // create this split.
          if (maxSize != 0 && curSplitSize >= maxSize) {
            // create an input split and add it to the splits array
            addCreatedSplit(splits, nodes, validBlocks);
            curSplitSize = 0;
            validBlocks.clear();
          }
        }
      }
      // if there were any blocks left over and their combined size is
      // larger than minSplitNode, then combine them into one split.
      // Otherwise add them back to the unprocessed pool. It is likely
      // that they will be combined with other blocks from the
      // same rack later on.
      if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
        // create an input split and add it to the splits array
        addCreatedSplit(splits, nodes, validBlocks);
      } else {
        for (OneBlockInfo oneblock : validBlocks) {
          blockToNodes.put(oneblock, oneblock.hosts);
        }
      }
      validBlocks.clear();
      nodes.clear();
      curSplitSize = 0;
    }

    // if blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

      // Create one split for this rack before moving over to the next rack.
      // Come back to this rack after creating a single split for each of the
      // remaining racks.
      // Process one rack location at a time, Combine all possible blocks that
      // reside on this rack as one split. (constrained by minimum and maximum
      // split size).

      // iterate over all racks
      for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
               rackToBlocks.entrySet().iterator(); iter.hasNext();) {
        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        racks.add(one.getKey());
        List<OneBlockInfo> blocks = one.getValue();

        // for each block, copy it into validBlocks. Delete it from
        // blockToNodes so that the same block does not appear in
        // two different splits.
        boolean createdSplit = false;
        for (OneBlockInfo oneblock : blocks) {
          if (blockToNodes.containsKey(oneblock)) {
            validBlocks.add(oneblock);
            blockToNodes.remove(oneblock);
            curSplitSize += oneblock.length;

            // if the accumulated split size exceeds the maximum, then
            // create this split.
            if (maxSize != 0 && curSplitSize >= maxSize) {
              // create an input split and add it to the splits array
              addCreatedSplit(splits, getHosts(racks), validBlocks);
              createdSplit = true;
              break;
            }
          }
        }

        // if we created a split, then just go to the next rack
        if (createdSplit) {
          curSplitSize = 0;
          validBlocks.clear();
          racks.clear();
          continue;
        }

        if (!validBlocks.isEmpty()) {
          if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
            // if there is a minimum size specified, then create a single split
            // otherwise, store these blocks into overflow data structure
            addCreatedSplit(splits, getHosts(racks), validBlocks);
          } else {
            // There were a few blocks in this rack that
            // remained to be processed. Keep them in 'overflow' block list.
            // These will be combined later.
            overflowBlocks.addAll(validBlocks);
          }
        }
        curSplitSize = 0;
        validBlocks.clear();
        racks.clear();
      }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks
    for (OneBlockInfo oneblock : overflowBlocks) {
      validBlocks.add(oneblock);
      curSplitSize += oneblock.length;

      // This might cause an exiting rack location to be re-added,
      // but it should be ok.
      for (int i = 0; i < oneblock.racks.length; i++) {
        racks.add(oneblock.racks[i]);
      }

      // if the accumulated split size exceeds the maximum, then
      // create this split.
      if (maxSize != 0 && curSplitSize >= maxSize) {
        // create an input split and add it to the splits array
        addCreatedSplit(splits, getHosts(racks), validBlocks);
        curSplitSize = 0;
        validBlocks.clear();
        racks.clear();
      }
    }

    // Process any remaining blocks, if any.
    if (!validBlocks.isEmpty()) {
      addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
  }

  /**
   * Create a single split from the list of blocks specified in validBlocks
   * Add this new split into splitList.
   */
  private void addCreatedSplit(List<InputSplit> splitList,
                               Collection<String> locations,
                               ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
      fl[i] = validBlocks.get(i).onepath;
      offset[i] = validBlocks.get(i).offset;
      length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length,
        locations.toArray(new String[0]));
    splitList.add(thissplit);
  }

  /**
   * This is not implemented yet.
   */
  public abstract RecordReader<K, V> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException;

  /**
   * information about one file from the File System
   */
  private static class OneFileInfo {
    private long fileSize;         // size of the file
    private OneBlockInfo[] blocks; // all blocks in this file

    OneFileInfo(Path path, Configuration conf, boolean isSplitable,
                HashMap<String, List<OneBlockInfo>> rackToBlocks,
                HashMap<OneBlockInfo, String[]> blockToNodes,
                HashMap<String, List<OneBlockInfo>> nodeToBlocks,
                HashMap<String, Set<String>> rackToNodes,
                long maxSize) throws IOException {
      this.fileSize = 0;

      // get block locations from file system
      FileSystem fs = path.getFileSystem(conf);
      FileStatus stat = fs.getFileStatus(path);
      BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, stat.getLen());
      // create a list of all block and their locations
      if (locations == null) {
        blocks = new OneBlockInfo[0];
      } else {
        if (!isSplitable) {
          // if the file is not splitable, just create the one block with
          // full file length
          blocks = new OneBlockInfo[1];
          fileSize = stat.getLen();
          blocks[0] = new OneBlockInfo(path, 0, fileSize,
              locations[0].getHosts(), locations[0].getTopologyPaths());
        } else {
          ArrayList<OneBlockInfo> blocksList =
              new ArrayList<OneBlockInfo>(locations.length);
          for (int i = 0; i < locations.length; i++) {
            fileSize += locations[i].getLength();

            // each split can be a maximum of maxSize
            long left = locations[i].getLength();
            long myOffset = locations[i].getOffset();
            long myLength = 0;
            while (left > 0) {
              if (maxSize == 0) {
                myLength = left;
              } else {
                if (left > maxSize && left < 2 * maxSize) {
                  // if remainder is between max and 2*max - then
                  // instead of creating splits of size max, left-max we
                  // create splits of size left/2 and left/2. This is
                  // a heuristic to avoid creating really really small
                  // splits.
                  myLength = left / 2;
                } else {
                  myLength = Math.min(maxSize, left);
                }
              }
              OneBlockInfo oneblock = new OneBlockInfo(path, myOffset, myLength,
                  locations[i].getHosts(), locations[i].getTopologyPaths());
              left -= myLength;
              myOffset += myLength;
              blocksList.add(oneblock);
            }
          }
          blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
        }

        for (OneBlockInfo oneblock : blocks) {
          // add this block to the block --> node locations map
          blockToNodes.put(oneblock, oneblock.hosts);

          // For blocks that do not have host/rack information,
          // assign to default rack.
          String[] racks = null;
          if (oneblock.hosts.length == 0) {
            racks = new String[]{NetworkTopology.DEFAULT_RACK};
          } else {
            racks = oneblock.racks;
          }

          // add this block to the rack --> block map
          for (int j = 0; j < racks.length; j++) {
            String rack = racks[j];
            List<OneBlockInfo> blklist = rackToBlocks.get(rack);
            if (blklist == null) {
              blklist = new ArrayList<OneBlockInfo>();
              rackToBlocks.put(rack, blklist);
            }
            blklist.add(oneblock);
            if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
              // Add this host to rackToNodes map
              addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
            }
          }

          // add this block to the node --> block map
          for (int j = 0; j < oneblock.hosts.length; j++) {
            String node = oneblock.hosts[j];
            List<OneBlockInfo> blklist = nodeToBlocks.get(node);
            if (blklist == null) {
              blklist = new ArrayList<OneBlockInfo>();
              nodeToBlocks.put(node, blklist);
            }
            blklist.add(oneblock);
          }
        }
      }
    }

    long getLength() {
      return fileSize;
    }

    OneBlockInfo[] getBlocks() {
      return blocks;
    }
  }

  /**
   * information about one block from the File System
   */
  private static class OneBlockInfo {
    Path onepath;    // name of this file
    long offset;     // offset in file
    long length;     // length of this block
    String[] hosts;  // nodes on which this block resides
    String[] racks;  // network topology of hosts

    OneBlockInfo(Path path, long offset, long len, String[] hosts,
                 String[] topologyPaths) {
      this.onepath = path;
      this.offset = offset;
      this.hosts = hosts;
      this.length = len;
      assert (hosts.length == topologyPaths.length || topologyPaths.length == 0);

      // if the file system does not have any rack information, then
      // use dummy rack location.
      if (topologyPaths.length == 0) {
        topologyPaths = new String[hosts.length];
        for (int i = 0; i < topologyPaths.length; i++) {
          topologyPaths[i] =
              (new NodeBase(hosts[i], NetworkTopology.DEFAULT_RACK)).toString();
        }
      }

      // The topology paths have the host name included as the last
      // component. Strip it.
      this.racks = new String[topologyPaths.length];
      for (int i = 0; i < topologyPaths.length; i++) {
        this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
      }
    }
  }

  protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat)
      throws IOException {
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
  }

  private static void addHostToRack(HashMap<String, Set<String>> rackToNodes,
                                    String rack, String host) {
    Set<String> hosts = rackToNodes.get(rack);
    if (hosts == null) {
      hosts = new HashSet<String>();
      rackToNodes.put(rack, hosts);
    }
    hosts.add(host);
  }

  private Set<String> getHosts(Set<String> racks) {
    Set<String> hosts = new HashSet<String>();
    for (String rack : racks) {
      if (rackToNodes.containsKey(rack)) {
        hosts.addAll(rackToNodes.get(rack));
      }
    }
    return hosts;
  }

  /**
   * Accept a path only if any one of filters given in the
   * constructor do.
   */
  private static class MultiPathFilter implements PathFilter {
    private List<PathFilter> filters;

    public MultiPathFilter() {
      this.filters = new ArrayList<PathFilter>();
    }

    public MultiPathFilter(List<PathFilter> filters) {
      this.filters = filters;
    }

    public void add(PathFilter one) {
      filters.add(one);
    }

    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (filter.accept(path)) {
          return true;
        }
      }
      return false;
    }

    public String toString() {
      StringBuffer buf = new StringBuffer();
      buf.append("[");
      for (PathFilter f : filters) {
        buf.append(f);
        buf.append(",");
      }
      buf.append("]");
      return buf.toString();
    }
  }
}
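If you prefer not to hard-code sizes in the InputFormat subclass, the same three thresholds read by getSplits() above can also be supplied through the job configuration. A minimal sketch for the driver (the byte values are illustrative only):

Configuration conf = job.getConfiguration();
// upper bound on a combined split, read by getSplits() as "mapreduce.input.fileinputformat.split.maxsize"
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
// leftover threshold per node (CombineFileInputFormat.SPLIT_MINSIZE_PERNODE)
conf.setLong(CombineFileInputFormat.SPLIT_MINSIZE_PERNODE, 128L * 1024 * 1024);
// leftover threshold per rack (CombineFileInputFormat.SPLIT_MINSIZE_PERRACK)
conf.setLong(CombineFileInputFormat.SPLIT_MINSIZE_PERRACK, 128L * 1024 * 1024);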