Understanding and Customizing InputFormat and RecordReader
First, let's look at the InputFormat and RecordReader interfaces:
public interface InputFormat<K, V> {
    InputSplit[] getSplits(JobConf var1, int var2) throws IOException;

    RecordReader<K, V> getRecordReader(InputSplit var1, JobConf var2, Reporter var3) throws IOException;
}
public interface RecordReader<K, V> {
    boolean next(K var1, V var2) throws IOException;

    K createKey();

    V createValue();

    long getPos() throws IOException;

    void close() throws IOException;

    float getProgress() throws IOException;
}
InputFormat defines how the input data is split, while RecordReader defines how key/value pairs are extracted from a split and handed to the mapper.
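To make the RecordReader contract concrete, here is a minimal sketch (the class name and its in-memory data source are hypothetical, not part of Hadoop) that hands each element of a string array to the mapper as a (LongWritable, Text) pair; a real implementation would instead read bytes from the InputSplit it is given:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;

// Hypothetical reader: serves pre-loaded lines instead of reading a split.
public class InMemoryLineRecordReader implements RecordReader<LongWritable, Text> {
    private final String[] lines; // pretend these came from the split
    private int pos = 0;

    public InMemoryLineRecordReader(String[] lines) {
        this.lines = lines;
    }

    public boolean next(LongWritable key, Text value) throws IOException {
        if (pos >= lines.length) {
            return false;          // no more records in this split
        }
        key.set(pos);              // key: the record index
        value.set(lines[pos]);     // value: the record itself
        pos++;
        return true;
    }

    public LongWritable createKey() { return new LongWritable(); }

    public Text createValue() { return new Text(); }

    public long getPos() throws IOException { return pos; }

    public void close() throws IOException { }

    public float getProgress() throws IOException {
        return lines.length == 0 ? 1.0f : (float) pos / lines.length;
    }
}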
So how do we go about customizing these?
InputFormat has a concrete base implementation, FileInputFormat, whose code is well worth studying:
//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by Fernflower decompiler)
//

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
    public static final Log LOG = LogFactory.getLog(FileInputFormat.class);
    private static final double SPLIT_SLOP = 1.1D;
    private long minSplitSize = 1L;
    private static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };
    static final String NUM_INPUT_FILES = "mapreduce.input.num.files";

    public FileInputFormat() {
    }

    protected void setMinSplitSize(long minSplitSize) {
        this.minSplitSize = minSplitSize;
    }

    protected boolean isSplitable(FileSystem fs, Path filename) {
        return true;
    }

    public abstract RecordReader<K, V> getRecordReader(InputSplit var1, JobConf var2, Reporter var3) throws IOException;

    public static void setInputPathFilter(JobConf conf, Class<? extends PathFilter> filter) {
        conf.setClass("mapred.input.pathFilter.class", filter, PathFilter.class);
    }

    public static PathFilter getInputPathFilter(JobConf conf) {
        Class filterClass = conf.getClass("mapred.input.pathFilter.class", (Class)null, PathFilter.class);
        return filterClass != null ? (PathFilter)ReflectionUtils.newInstance(filterClass, conf) : null;
    }

    protected FileStatus[] listStatus(JobConf job) throws IOException {
        Path[] dirs = getInputPaths(job);
        if (dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        } else {
            TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);
            ArrayList result = new ArrayList();
            ArrayList errors = new ArrayList();
            ArrayList filters = new ArrayList();
            filters.add(hiddenFileFilter);
            PathFilter jobFilter = getInputPathFilter(job);
            if (jobFilter != null) {
                filters.add(jobFilter);
            }
            FileInputFormat.MultiPathFilter inputFilter = new FileInputFormat.MultiPathFilter(filters);
            Path[] arr$ = dirs;
            int len$ = dirs.length;
            for (int i$ = 0; i$ < len$; ++i$) {
                Path p = arr$[i$];
                FileSystem fs = p.getFileSystem(job);
                FileStatus[] matches = fs.globStatus(p, inputFilter);
                if (matches == null) {
                    errors.add(new IOException("Input path does not exist: " + p));
                } else if (matches.length == 0) {
                    errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
                } else {
                    FileStatus[] arr$1 = matches;
                    int len$1 = matches.length;
                    for (int i$1 = 0; i$1 < len$1; ++i$1) {
                        FileStatus globStat = arr$1[i$1];
                        if (globStat.isDir()) {
                            FileStatus[] arr$2 = fs.listStatus(globStat.getPath(), inputFilter);
                            int len$2 = arr$2.length;
                            for (int i$2 = 0; i$2 < len$2; ++i$2) {
                                FileStatus stat = arr$2[i$2];
                                result.add(stat);
                            }
                        } else {
                            result.add(globStat);
                        }
                    }
                }
            }
            if (!errors.isEmpty()) {
                throw new InvalidInputException(errors);
            } else {
                LOG.info("Total input paths to process : " + result.size());
                return (FileStatus[])result.toArray(new FileStatus[result.size()]);
            }
        }
    }

    // Carves the input files into InputSplits; the per-split size comes from computeSplitSize() below.
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        FileStatus[] files = this.listStatus(job);
        job.setLong("mapreduce.input.num.files", (long)files.length);
        long totalSize = 0L;
        FileStatus[] goalSize = files;
        int len$ = files.length;
        for (int minSize = 0; minSize < len$; ++minSize) {
            FileStatus file = goalSize[minSize];
            if (file.isDir()) {
                throw new IOException("Not a file: " + file.getPath());
            }
            totalSize += file.getLen();
        }
        long var29 = totalSize / (long)(numSplits == 0 ? 1 : numSplits);
        long var30 = Math.max(job.getLong("mapred.min.split.size", 1L), this.minSplitSize);
        ArrayList splits = new ArrayList(numSplits);
        NetworkTopology clusterMap = new NetworkTopology();
        FileStatus[] arr$ = files;
        int len$1 = files.length;
        for (int i$ = 0; i$ < len$1; ++i$) {
            FileStatus file1 = arr$[i$];
            Path path = file1.getPath();
            FileSystem fs = path.getFileSystem(job);
            long length = file1.getLen();
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file1, 0L, length);
            if (length != 0L && this.isSplitable(fs, path)) {
                long var28 = file1.getBlockSize();
                long splitSize = this.computeSplitSize(var29, var30, var28);
                long bytesRemaining;
                for (bytesRemaining = length; (double)bytesRemaining / (double)splitSize > 1.1D; bytesRemaining -= splitSize) {
                    String[] splitHosts1 = this.getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts1));
                }
                if (bytesRemaining != 0L) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0L) {
                String[] splitHosts = this.getSplitHosts(blkLocations, 0L, length, clusterMap);
                splits.add(new FileSplit(path, 0L, length, splitHosts));
            } else {
                splits.add(new FileSplit(path, 0L, length, new String[0]));
            }
        }
        LOG.debug("Total # of splits: " + splits.size());
        return (InputSplit[])splits.toArray(new FileSplit[splits.size()]);
    }

    // split size = max(minSize, min(goalSize, blockSize))
    protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
        return Math.max(minSize, Math.min(goalSize, blockSize));
    }

    protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
        for (int last = 0; last < blkLocations.length; ++last) {
            if (blkLocations[last].getOffset() <= offset && offset < blkLocations[last].getOffset() + blkLocations[last].getLength()) {
                return last;
            }
        }
        BlockLocation var7 = blkLocations[blkLocations.length - 1];
        long fileLength = var7.getOffset() + var7.getLength() - 1L;
        throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
    }

    public static void setInputPaths(JobConf conf, String commaSeparatedPaths) {
        setInputPaths(conf, (Path[])StringUtils.stringToPath(getPathStrings(commaSeparatedPaths)));
    }

    public static void addInputPaths(JobConf conf, String commaSeparatedPaths) {
        String[] arr$ = getPathStrings(commaSeparatedPaths);
        int len$ = arr$.length;
        for (int i$ = 0; i$ < len$; ++i$) {
            String str = arr$[i$];
            addInputPath(conf, new Path(str));
        }
    }

    public static void setInputPaths(JobConf conf, Path... inputPaths) {
        Path path = new Path(conf.getWorkingDirectory(), inputPaths[0]);
        StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString()));
        for (int i = 1; i < inputPaths.length; ++i) {
            str.append(",");
            path = new Path(conf.getWorkingDirectory(), inputPaths[i]);
            str.append(StringUtils.escapeString(path.toString()));
        }
        conf.set("mapred.input.dir", str.toString());
    }

    public static void addInputPath(JobConf conf, Path path) {
        path = new Path(conf.getWorkingDirectory(), path);
        String dirStr = StringUtils.escapeString(path.toString());
        String dirs = conf.get("mapred.input.dir");
        conf.set("mapred.input.dir", dirs == null ? dirStr : dirs + "," + dirStr);
    }

    private static String[] getPathStrings(String commaSeparatedPaths) {
        int length = commaSeparatedPaths.length();
        int curlyOpen = 0;
        int pathStart = 0;
        boolean globPattern = false;
        ArrayList pathStrings = new ArrayList();
        for (int i = 0; i < length; ++i) {
            char ch = commaSeparatedPaths.charAt(i);
            switch (ch) {
            case ',':
                if (!globPattern) {
                    pathStrings.add(commaSeparatedPaths.substring(pathStart, i));
                    pathStart = i + 1;
                }
                break;
            case '{':
                ++curlyOpen;
                if (!globPattern) {
                    globPattern = true;
                }
                break;
            case '}':
                --curlyOpen;
                if (curlyOpen == 0 && globPattern) {
                    globPattern = false;
                }
            }
        }
        pathStrings.add(commaSeparatedPaths.substring(pathStart, length));
        return (String[])pathStrings.toArray(new String[0]);
    }

    public static Path[] getInputPaths(JobConf conf) {
        String dirs = conf.get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);
        Path[] result = new Path[list.length];
        for (int i = 0; i < list.length; ++i) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        return result;
    }

    private void sortInDescendingOrder(List<FileInputFormat.NodeInfo> mylist) {
        Collections.sort(mylist, new Comparator() {
            public int compare(FileInputFormat.NodeInfo obj1, FileInputFormat.NodeInfo obj2) {
                return obj1 != null && obj2 != null ? (obj1.getValue() == obj2.getValue() ? 0 : (obj1.getValue() < obj2.getValue() ? 1 : -1)) : -1;
            }
        });
    }

    // Picks the hosts that hold the most bytes of a split, for data locality.
    protected String[] getSplitHosts(BlockLocation[] blkLocations, long offset, long splitSize, NetworkTopology clusterMap) throws IOException {
        int startIndex = this.getBlockIndex(blkLocations, offset);
        long bytesInThisBlock = blkLocations[startIndex].getOffset() + blkLocations[startIndex].getLength() - offset;
        if (bytesInThisBlock >= splitSize) {
            return blkLocations[startIndex].getHosts();
        } else {
            long bytesInFirstBlock = bytesInThisBlock;
            int index = startIndex + 1;
            for (splitSize -= bytesInThisBlock; splitSize > 0L; splitSize -= bytesInThisBlock) {
                bytesInThisBlock = Math.min(splitSize, blkLocations[index++].getLength());
            }
            long bytesInLastBlock = bytesInThisBlock;
            int endIndex = index - 1;
            IdentityHashMap hostsMap = new IdentityHashMap();
            IdentityHashMap racksMap = new IdentityHashMap();
            String[] allTopos = new String[0];
            for (index = startIndex; index <= endIndex; ++index) {
                if (index == startIndex) {
                    bytesInThisBlock = bytesInFirstBlock;
                } else if (index == endIndex) {
                    bytesInThisBlock = bytesInLastBlock;
                } else {
                    bytesInThisBlock = blkLocations[index].getLength();
                }
                allTopos = blkLocations[index].getTopologyPaths();
                if (allTopos.length == 0) {
                    allTopos = this.fakeRacks(blkLocations, index);
                }
                String[] arr$ = allTopos;
                int len$ = allTopos.length;
                for (int i$ = 0; i$ < len$; ++i$) {
                    String topo = arr$[i$];
                    Object node = clusterMap.getNode(topo);
                    if (node == null) {
                        node = new NodeBase(topo);
                        clusterMap.add((Node)node);
                    }
                    FileInputFormat.NodeInfo nodeInfo = (FileInputFormat.NodeInfo)hostsMap.get(node);
                    Node parentNode;
                    FileInputFormat.NodeInfo parentNodeInfo;
                    if (nodeInfo == null) {
                        nodeInfo = new FileInputFormat.NodeInfo((Node)node);
                        hostsMap.put(node, nodeInfo);
                        parentNode = ((Node)node).getParent();
                        parentNodeInfo = (FileInputFormat.NodeInfo)racksMap.get(parentNode);
                        if (parentNodeInfo == null) {
                            parentNodeInfo = new FileInputFormat.NodeInfo(parentNode);
                            racksMap.put(parentNode, parentNodeInfo);
                        }
                        parentNodeInfo.addLeaf(nodeInfo);
                    } else {
                        nodeInfo = (FileInputFormat.NodeInfo)hostsMap.get(node);
                        parentNode = ((Node)node).getParent();
                        parentNodeInfo = (FileInputFormat.NodeInfo)racksMap.get(parentNode);
                    }
                    nodeInfo.addValue(index, bytesInThisBlock);
                    parentNodeInfo.addValue(index, bytesInThisBlock);
                }
            }
            return this.identifyHosts(allTopos.length, racksMap);
        }
    }

    private String[] identifyHosts(int replicationFactor, Map<Node, FileInputFormat.NodeInfo> racksMap) {
        String[] retVal = new String[replicationFactor];
        LinkedList rackList = new LinkedList();
        rackList.addAll(racksMap.values());
        this.sortInDescendingOrder(rackList);
        boolean done = false;
        int index = 0;
        Iterator i$ = rackList.iterator();
        while (i$.hasNext()) {
            FileInputFormat.NodeInfo ni = (FileInputFormat.NodeInfo)i$.next();
            Set hostSet = ni.getLeaves();
            LinkedList hostList = new LinkedList();
            hostList.addAll(hostSet);
            this.sortInDescendingOrder(hostList);
            Iterator i$1 = hostList.iterator();
            while (i$1.hasNext()) {
                FileInputFormat.NodeInfo host = (FileInputFormat.NodeInfo)i$1.next();
                retVal[index++] = host.node.getName().split(":")[0];
                if (index == replicationFactor) {
                    done = true;
                    break;
                }
            }
            if (done) {
                break;
            }
        }
        return retVal;
    }

    private String[] fakeRacks(BlockLocation[] blkLocations, int index) throws IOException {
        String[] allHosts = blkLocations[index].getHosts();
        String[] allTopos = new String[allHosts.length];
        for (int i = 0; i < allHosts.length; ++i) {
            allTopos[i] = "/default-rack/" + allHosts[i];
        }
        return allTopos;
    }

    private static class NodeInfo {
        final Node node;
        final Set<Integer> blockIds;
        final Set<FileInputFormat.NodeInfo> leaves;
        private long value;

        NodeInfo(Node node) {
            this.node = node;
            this.blockIds = new HashSet();
            this.leaves = new HashSet();
        }

        long getValue() {
            return this.value;
        }

        void addValue(int blockIndex, long value) {
            if (this.blockIds.add(Integer.valueOf(blockIndex))) {
                this.value += value;
            }
        }

        Set<FileInputFormat.NodeInfo> getLeaves() {
            return this.leaves;
        }

        void addLeaf(FileInputFormat.NodeInfo nodeInfo) {
            this.leaves.add(nodeInfo);
        }
    }

    private static class MultiPathFilter implements PathFilter {
        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            Iterator i$ = this.filters.iterator();
            PathFilter filter;
            do {
                if (!i$.hasNext()) {
                    return true;
                }
                filter = (PathFilter)i$.next();
            } while (filter.accept(path));
            return false;
        }
    }

    public static enum Counter {
        BYTES_READ;

        private Counter() {
        }
    }
}

Suppose, for example, that we want to customize the size of each split. Here is how FileInputFormat defines the split size:
protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize));
}

Note that a split can actually end up larger than a single block (when the configured minimum split size exceeds the block size), in which case a map task must read across blocks and pull data over the network. If you need, say, a fixed split size of 1024 bytes, overriding this method is all it takes, as sketched below.
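As a minimal sketch of that override (the subclass name is hypothetical, and it assumes the old org.apache.hadoop.mapred API shown above), extend an existing FileInputFormat subclass such as TextInputFormat:

import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical subclass: pins every split to 1024 bytes, ignoring
// goalSize and blockSize entirely. getSplits() will then carve each
// input file into roughly 1024-byte FileSplits.
public class FixedSizeSplitInputFormat extends TextInputFormat {
    private static final long FIXED_SPLIT_SIZE = 1024L;

    @Override
    protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
        return FIXED_SPLIT_SIZE;
    }
}

You would then select it in the driver with conf.setInputFormat(FixedSizeSplitInputFormat.class). Keep in mind that 1024 bytes is only for illustration: splits that small would spawn a huge number of map tasks on any real dataset.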