Hadoop First Read -- setInputFormatClass
This "Hadoop First Read" series grows out of everyday Hadoop practice and rough readings of the Hadoop source. Each post has two parts, question and answer: the question is an exception or point of confusion hit in practice; the answer is a rough reading of the relevant source code, with explanations given as inline comments.
Problem code (a reduce-side join with multiple inputs; it fails with "No input paths specified in job"):
```java
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ReduceSideJoin extends Configured implements Tool {

    private static final Logger logger =
            LoggerFactory.getLogger(ReduceSideJoin.class);

    public static class LeftOutJoinReducer extends
            Reducer<IntWritable, OrderGoodsInfoWritble, IntWritable, OrderGoodsInfoWritble> {

        private ArrayList<OrderGoodsInfoWritble> leftTable =
                new ArrayList<OrderGoodsInfoWritble>();
        private ArrayList<OrderGoodsInfoWritble> rightTable =
                new ArrayList<OrderGoodsInfoWritble>();
        private Text secondPar = null;
        private Text output = new Text();

        @Override
        protected void reduce(IntWritable key,
                Iterable<OrderGoodsInfoWritble> value, Context context)
                throws IOException, InterruptedException {
            leftTable.clear();
            rightTable.clear();
            for (OrderGoodsInfoWritble cv : value) {
                String mRecordFlog = cv.getmRecordflag().toString().trim();
                if ("orderInfo".equals(mRecordFlog)) {
                    leftTable.add(cv);
                } else {
                    rightTable.add(cv);
                }
            }
            for (OrderGoodsInfoWritble leftPart : leftTable) {
                for (OrderGoodsInfoWritble rightPart : rightTable) {
                    if (leftPart.getOrderId().toString()
                            .equals(rightPart.getOrderId().toString())) {
                        leftPart.setGoodsSn(rightPart.getGoodsSn());
                        leftPart.setGoodsName(rightPart.getGoodsName());
                        leftPart.setmRecordflag(new Text("orderGoodsInfo"));
                        context.write(key, leftPart);
                    }
                }
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        FileSystem fs = FileSystem.get(
                URI.create("hdfs://***:9000/venus/ordergoodsinfo"), conf);
        Path outputPath = new Path("hdfs://***:9000/venus/ordergoodsinfo");
        fs.deleteOnExit(outputPath);
        fs.close();
        MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
                OrderInfoMapper.class);
        MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
                OrderGoodsMapper.class);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setJobName("LeftOutJoinMR");
        job.setJarByClass(ReduceSideJoin.class);
        job.setReducerClass(LeftOutJoinReducer.class);
        /** Problem line: setInputFormatClass is called even though
            MultipleInputs.addInputPath was already used, which causes the
            "No input paths specified in job" exception **/
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(OrderGoodsInfoWritble.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(OrderGoodsInfoWritble.class);
        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Tool rdf = new ReduceSideJoin();
        int returnCode = ToolRunner.run(rdf, args);
        System.exit(returnCode);
    }
}
```
Question:
1. Multiple input paths were already set via MultipleInputs.addInputPath, so why does the job still fail with the "No input paths specified in job" exception?
Answer:
Because the final setInputFormatClass call overwrites what the MultipleInputs setup configured. The details follow from the source, walked through below.
1) Source of job.setInputFormatClass(TextInputFormat.class):
```java
public void setInputFormatClass(Class<? extends InputFormat> cls)
        throws IllegalStateException {
    ensureState(JobState.DEFINE);
    /** The InputFormat class is set to whatever is passed in,
        here TextInputFormat **/
    conf.setClass(INPUT_FORMAT_CLASS_ATTR, cls, InputFormat.class);
}
```
2) Source of MultipleInputs.addInputPath:
```java
public static void addInputPath(Job job, Path path,
        Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass) {
    /** Delegates to the other addInputPath overload below **/
    addInputPath(job, path, inputFormatClass);
    Configuration conf = job.getConfiguration();
    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get(DIR_MAPPERS);
    conf.set(DIR_MAPPERS, mappers == null ? mapperMapping
            : mappers + "," + mapperMapping);
    job.setMapperClass(DelegatingMapper.class);
}

public static void addInputPath(Job job, Path path,
        Class<? extends InputFormat> inputFormatClass) {
    String inputFormatMapping = path.toString() + ";"
            + inputFormatClass.getName();
    Configuration conf = job.getConfiguration();
    String inputFormats = conf.get(DIR_FORMATS);
    conf.set(DIR_FORMATS, inputFormats == null ? inputFormatMapping
            : inputFormats + "," + inputFormatMapping);
    /** The InputFormat class that finally gets set here is
        DelegatingInputFormat **/
    job.setInputFormatClass(DelegatingInputFormat.class);
}
```
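In other words, both APIs funnel into the same single configuration key. A hypothetical probe of the job configuration (property names as in Hadoop 2.x's MRJobConfig and MultipleInputs; the printed values in the comments are illustrative assumptions, not captured output) shows the overwrite:

```java
// Hypothetical probe, assuming Hadoop 2.x property names.
Configuration conf = job.getConfiguration();

// After the two MultipleInputs.addInputPath calls:
System.out.println(conf.get("mapreduce.job.inputformat.class"));
// -> org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat
System.out.println(conf.get("mapreduce.input.multipleinputs.dir.formats"));
// -> <path1>;org.apache.hadoop.mapreduce.lib.input.TextInputFormat,<path2>;...

// The later setInputFormatClass call overwrites only the input format key;
// the per-path mappings survive, but TextInputFormat never reads them:
job.setInputFormatClass(TextInputFormat.class);
System.out.println(conf.get("mapreduce.job.inputformat.class"));
// -> org.apache.hadoop.mapreduce.lib.input.TextInputFormat
```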
At job submission time (JobSubmitter.submitJobInternal), the InputFormat's getSplits method is eventually called:
```java
JobStatus submitJobInternal(Job job, Cluster cluster)
        throws ClassNotFoundException, InterruptedException, IOException {
    checkSpecs(job);
    Configuration conf = job.getConfiguration();
    addMRFrameworkToDistributedCache(conf);
    Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
    // configure the command line options correctly on the submitting dfs
    InetAddress ip = InetAddress.getLocalHost();
    if (ip != null) {
        submitHostAddress = ip.getHostAddress();
        submitHostName = ip.getHostName();
        conf.set(MRJobConfig.JOB_SUBMITHOST, submitHostName);
        conf.set(MRJobConfig.JOB_SUBMITHOSTADDR, submitHostAddress);
    }
    JobID jobId = submitClient.getNewJobID();
    job.setJobID(jobId);
    Path submitJobDir = new Path(jobStagingArea, jobId.toString());
    JobStatus status = null;
    try {
        conf.set(MRJobConfig.USER_NAME,
                UserGroupInformation.getCurrentUser().getShortUserName());
        conf.set("hadoop.http.filter.initializers",
                "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
        conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
        LOG.debug("Configuring job " + jobId + " with " + submitJobDir
                + " as the submit dir");
        TokenCache.obtainTokensForNamenodes(job.getCredentials(),
                new Path[] { submitJobDir }, conf);
        populateTokenCache(conf, job.getCredentials());
        if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
            KeyGenerator keyGen;
            try {
                keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
                keyGen.init(SHUFFLE_KEY_LENGTH);
            } catch (NoSuchAlgorithmException e) {
                throw new IOException("Error generating shuffle secret key", e);
            }
            SecretKey shuffleKey = keyGen.generateKey();
            TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
                    job.getCredentials());
        }
        if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
            conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
            LOG.warn("Max job attempts set to 1 since encrypted intermediate"
                    + "data spill is enabled");
        }
        copyAndConfigureFiles(job, submitJobDir);
        Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
        LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
        /** This ends up calling the InputFormat's getSplits method **/
        int maps = writeSplits(job, submitJobDir);
        conf.setInt(MRJobConfig.NUM_MAPS, maps);
        LOG.info("number of splits:" + maps);
        String queue = conf.get(MRJobConfig.QUEUE_NAME,
                JobConf.DEFAULT_QUEUE_NAME);
        AccessControlList acl = submitClient.getQueueAdmins(queue);
        conf.set(toFullPropertyName(queue,
                QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
        TokenCache.cleanUpTokenReferral(conf);
        if (conf.getBoolean(MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
                MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
            ArrayList<String> trackingIds = new ArrayList<String>();
            for (Token<? extends TokenIdentifier> t :
                    job.getCredentials().getAllTokens()) {
                trackingIds.add(t.decodeIdentifier().getTrackingId());
            }
            conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
                    trackingIds.toArray(new String[trackingIds.size()]));
        }
        ReservationId reservationId = job.getReservationId();
        if (reservationId != null) {
            conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
        }
        writeConf(conf, submitJobFile);
        printTokens(jobId, job.getCredentials());
        status = submitClient.submitJob(jobId, submitJobDir.toString(),
                job.getCredentials());
        if (status != null) {
            return status;
        } else {
            throw new IOException("Could not launch job");
        }
    } finally {
        if (status == null) {
            LOG.info("Cleaning up the staging area " + submitJobDir);
            if (jtFs != null && submitJobDir != null)
                jtFs.delete(submitJobDir, true);
        }
    }
}
```
writeSplits in turn delegates (for the new MapReduce API) to writeNewSplits, which instantiates the configured InputFormat via reflection and calls its getSplits:

```java
private <T extends InputSplit> int writeNewSplits(JobContext job,
        Path jobSubmitDir)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input =
            ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    /** Calls the InputFormat's getSplits **/
    List<InputSplit> splits = input.getSplits(job);
    T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
    Arrays.sort(array, new SplitComparator());
    JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
            jobSubmitDir.getFileSystem(conf), array);
    return array.length;
}
```
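Note that job.getInputFormatClass() resolves the class from that same single configuration key. Paraphrased from JobContextImpl (a sketch, not a verbatim quote), it looks roughly like this; TextInputFormat is also the default when nothing was set:

```java
// Paraphrased from org.apache.hadoop.mapreduce.task.JobContextImpl:
// the InputFormat is resolved from the single INPUT_FORMAT_CLASS_ATTR key,
// falling back to TextInputFormat when the key is absent.
@SuppressWarnings("unchecked")
public Class<? extends InputFormat<?, ?>> getInputFormatClass()
        throws ClassNotFoundException {
    return (Class<? extends InputFormat<?, ?>>)
            conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
}
```

So whichever class was written last into that key, here TextInputFormat, is the one instantiated for split calculation.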
If the InputFormat class that ends up configured is TextInputFormat, its getSplits method (inherited from FileInputFormat) looks up the concrete input paths in the mapreduce.input.fileinputformat.inputdir property. MultipleInputs.addInputPath never writes that property (it records its path/format and path/mapper pairs under its own keys, which only DelegatingInputFormat reads), so the property is empty and the job fails with the "No input paths specified in job" exception.
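The exception itself is thrown inside FileInputFormat, which TextInputFormat extends. Paraphrased (a sketch of the relevant check, not a verbatim quote), the path lookup in FileInputFormat.listStatus, reached from getSplits, is roughly:

```java
// Paraphrased from org.apache.hadoop.mapreduce.lib.input.FileInputFormat:
// listStatus first resolves the paths configured under
// mapreduce.input.fileinputformat.inputdir; MultipleInputs never sets
// that property, so dirs is empty and the familiar exception is thrown.
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // ... list and filter the files under each input path ...
}
```

The fix is to let MultipleInputs own the input side of the job and drop the redundant call, as in this corrected driver excerpt:

```java
// Corrected excerpt of run(): MultipleInputs already installs
// DelegatingInputFormat and DelegatingMapper, so do not override them.
MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
        OrderInfoMapper.class);
MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
        OrderGoodsMapper.class);
FileOutputFormat.setOutputPath(job, outputPath);
// job.setInputFormatClass(TextInputFormat.class);  // removed: it
// overwrote DelegatingInputFormat and broke the multiple-input setup
job.setOutputFormatClass(TextOutputFormat.class);
```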