Hadoop First Read -- setInputFormatClass
This "Hadoop First Read" series grows out of everyday Hadoop practice and rough readings of the Hadoop source. Each post has two parts, question and answer: the question is an exception or point of confusion hit in practice; the answer is a rough reading of the relevant source code, with explanations given as inline comments.
Problem code (a reduce-side join with multiple inputs; it fails with "No input paths specified in job"):
```java
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ReduceSideJoin extends Configured implements Tool {

    private static final Logger logger =
            LoggerFactory.getLogger(ReduceSideJoin.class);

    public static class LeftOutJoinReducer extends
            Reducer<IntWritable, OrderGoodsInfoWritble, IntWritable, OrderGoodsInfoWritble> {

        private ArrayList<OrderGoodsInfoWritble> leftTable =
                new ArrayList<OrderGoodsInfoWritble>();
        private ArrayList<OrderGoodsInfoWritble> rightTable =
                new ArrayList<OrderGoodsInfoWritble>();
        private Text secondPar = null;
        private Text output = new Text();

        @Override
        protected void reduce(IntWritable key,
                Iterable<OrderGoodsInfoWritble> value, Context context)
                throws IOException, InterruptedException {
            leftTable.clear();
            rightTable.clear();
            for (OrderGoodsInfoWritble cv : value) {
                String mRecordFlog = cv.getmRecordflag().toString().trim();
                if ("orderInfo".equals(mRecordFlog)) {
                    leftTable.add(cv);
                } else {
                    rightTable.add(cv);
                }
            }
            for (OrderGoodsInfoWritble leftPart : leftTable) {
                for (OrderGoodsInfoWritble rightPart : rightTable) {
                    if (leftPart.getOrderId().toString()
                            .equals(rightPart.getOrderId().toString())) {
                        leftPart.setGoodsSn(rightPart.getGoodsSn());
                        leftPart.setGoodsName(rightPart.getGoodsName());
                        leftPart.setmRecordflag(new Text("orderGoodsInfo"));
                        context.write(key, leftPart);
                    }
                }
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        FileSystem fs = FileSystem.get(
                URI.create("hdfs://***:9000/venus/ordergoodsinfo"), conf);
        Path outputPath = new Path("hdfs://***:9000/venus/ordergoodsinfo");
        fs.deleteOnExit(outputPath);
        fs.close();
        MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
                OrderInfoMapper.class);
        MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
                OrderGoodsMapper.class);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setJobName("LeftOutJoinMR");
        job.setJarByClass(ReduceSideJoin.class);
        job.setReducerClass(LeftOutJoinReducer.class);
        /** Problem line: setInputFormatClass is called even though
            MultipleInputs.addInputPath was already used, which causes the
            "No input paths specified in job" exception **/
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(OrderGoodsInfoWritble.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(OrderGoodsInfoWritble.class);
        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Tool rdf = new ReduceSideJoin();
        int returnCode = ToolRunner.run(rdf, args);
        System.exit(returnCode);
    }
}
```
Question:
1. Multiple input paths were already set via MultipleInputs.addInputPath, so why does the job still fail with the "No input paths specified in job" exception?
Answer:
Because the final setInputFormatClass call overwrites what the MultipleInputs setup configured. The details follow from the source, walked through below.
1) Source of job.setInputFormatClass(TextInputFormat.class):
```java
public void setInputFormatClass(Class<? extends InputFormat> cls)
        throws IllegalStateException {
    ensureState(JobState.DEFINE);
    /** The InputFormat class is set to whatever is passed in,
        here TextInputFormat **/
    conf.setClass(INPUT_FORMAT_CLASS_ATTR, cls, InputFormat.class);
}
```
2) Source of MultipleInputs.addInputPath:
```java
public static void addInputPath(Job job, Path path,
        Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass) {
    /** Delegates to the other addInputPath overload below **/
    addInputPath(job, path, inputFormatClass);
    Configuration conf = job.getConfiguration();
    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get(DIR_MAPPERS);
    conf.set(DIR_MAPPERS, mappers == null ? mapperMapping
            : mappers + "," + mapperMapping);
    job.setMapperClass(DelegatingMapper.class);
}

public static void addInputPath(Job job, Path path,
        Class<? extends InputFormat> inputFormatClass) {
    String inputFormatMapping = path.toString() + ";"
            + inputFormatClass.getName();
    Configuration conf = job.getConfiguration();
    String inputFormats = conf.get(DIR_FORMATS);
    conf.set(DIR_FORMATS, inputFormats == null ? inputFormatMapping
            : inputFormats + "," + inputFormatMapping);
    /** The InputFormat class that finally gets set here is
        DelegatingInputFormat **/
    job.setInputFormatClass(DelegatingInputFormat.class);
}
```
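In other words, both APIs funnel into the same single configuration key. A hypothetical probe of the job configuration (property names as in Hadoop 2.x's MRJobConfig and MultipleInputs; the printed values in the comments are illustrative assumptions, not captured output) shows the overwrite:

```java
// Hypothetical probe, assuming Hadoop 2.x property names.
Configuration conf = job.getConfiguration();

// After the two MultipleInputs.addInputPath calls:
System.out.println(conf.get("mapreduce.job.inputformat.class"));
// -> org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat
System.out.println(conf.get("mapreduce.input.multipleinputs.dir.formats"));
// -> <path1>;org.apache.hadoop.mapreduce.lib.input.TextInputFormat,<path2>;...

// The later setInputFormatClass call overwrites only the input format key;
// the per-path mappings survive, but TextInputFormat never reads them:
job.setInputFormatClass(TextInputFormat.class);
System.out.println(conf.get("mapreduce.job.inputformat.class"));
// -> org.apache.hadoop.mapreduce.lib.input.TextInputFormat
```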
At job submission time (JobSubmitter.submitJobInternal), the InputFormat's getSplits method is eventually called:
```java
JobStatus submitJobInternal(Job job, Cluster cluster)
        throws ClassNotFoundException, InterruptedException, IOException {
    checkSpecs(job);
    Configuration conf = job.getConfiguration();
    addMRFrameworkToDistributedCache(conf);
    Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
    // configure the command line options correctly on the submitting dfs
    InetAddress ip = InetAddress.getLocalHost();
    if (ip != null) {
        submitHostAddress = ip.getHostAddress();
        submitHostName = ip.getHostName();
        conf.set(MRJobConfig.JOB_SUBMITHOST, submitHostName);
        conf.set(MRJobConfig.JOB_SUBMITHOSTADDR, submitHostAddress);
    }
    JobID jobId = submitClient.getNewJobID();
    job.setJobID(jobId);
    Path submitJobDir = new Path(jobStagingArea, jobId.toString());
    JobStatus status = null;
    try {
        conf.set(MRJobConfig.USER_NAME,
                UserGroupInformation.getCurrentUser().getShortUserName());
        conf.set("hadoop.http.filter.initializers",
                "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
        conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
        LOG.debug("Configuring job " + jobId + " with " + submitJobDir
                + " as the submit dir");
        TokenCache.obtainTokensForNamenodes(job.getCredentials(),
                new Path[] { submitJobDir }, conf);
        populateTokenCache(conf, job.getCredentials());
        if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
            KeyGenerator keyGen;
            try {
                keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
                keyGen.init(SHUFFLE_KEY_LENGTH);
            } catch (NoSuchAlgorithmException e) {
                throw new IOException("Error generating shuffle secret key", e);
            }
            SecretKey shuffleKey = keyGen.generateKey();
            TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
                    job.getCredentials());
        }
        if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
            conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
            LOG.warn("Max job attempts set to 1 since encrypted intermediate"
                    + "data spill is enabled");
        }
        copyAndConfigureFiles(job, submitJobDir);
        Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
        LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
        /** This ends up calling the InputFormat's getSplits method **/
        int maps = writeSplits(job, submitJobDir);
        conf.setInt(MRJobConfig.NUM_MAPS, maps);
        LOG.info("number of splits:" + maps);
        String queue = conf.get(MRJobConfig.QUEUE_NAME,
                JobConf.DEFAULT_QUEUE_NAME);
        AccessControlList acl = submitClient.getQueueAdmins(queue);
        conf.set(toFullPropertyName(queue,
                QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
        TokenCache.cleanUpTokenReferral(conf);
        if (conf.getBoolean(MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
                MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
            ArrayList<String> trackingIds = new ArrayList<String>();
            for (Token<? extends TokenIdentifier> t :
                    job.getCredentials().getAllTokens()) {
                trackingIds.add(t.decodeIdentifier().getTrackingId());
            }
            conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
                    trackingIds.toArray(new String[trackingIds.size()]));
        }
        ReservationId reservationId = job.getReservationId();
        if (reservationId != null) {
            conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
        }
        writeConf(conf, submitJobFile);
        printTokens(jobId, job.getCredentials());
        status = submitClient.submitJob(jobId, submitJobDir.toString(),
                job.getCredentials());
        if (status != null) {
            return status;
        } else {
            throw new IOException("Could not launch job");
        }
    } finally {
        if (status == null) {
            LOG.info("Cleaning up the staging area " + submitJobDir);
            if (jtFs != null && submitJobDir != null)
                jtFs.delete(submitJobDir, true);
        }
    }
}
```
writeSplits in turn delegates (for the new MapReduce API) to writeNewSplits, which instantiates the configured InputFormat via reflection and calls its getSplits:

```java
private <T extends InputSplit> int writeNewSplits(JobContext job,
        Path jobSubmitDir)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input =
            ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    /** Calls the InputFormat's getSplits **/
    List<InputSplit> splits = input.getSplits(job);
    T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
    Arrays.sort(array, new SplitComparator());
    JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
            jobSubmitDir.getFileSystem(conf), array);
    return array.length;
}
```
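Note that job.getInputFormatClass() resolves the class from that same single configuration key. Paraphrased from JobContextImpl (a sketch, not a verbatim quote), it looks roughly like this; TextInputFormat is also the default when nothing was set:

```java
// Paraphrased from org.apache.hadoop.mapreduce.task.JobContextImpl:
// the InputFormat is resolved from the single INPUT_FORMAT_CLASS_ATTR key,
// falling back to TextInputFormat when the key is absent.
@SuppressWarnings("unchecked")
public Class<? extends InputFormat<?, ?>> getInputFormatClass()
        throws ClassNotFoundException {
    return (Class<? extends InputFormat<?, ?>>)
            conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
}
```

So whichever class was written last into that key, here TextInputFormat, is the one instantiated for split calculation.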
If the InputFormat class that ends up configured is TextInputFormat, its getSplits method (inherited from FileInputFormat) looks up the concrete input paths in the mapreduce.input.fileinputformat.inputdir property. MultipleInputs.addInputPath never writes that property (it records its path/format and path/mapper pairs under its own keys, which only DelegatingInputFormat reads), so the property is empty and the job fails with the "No input paths specified in job" exception.
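The exception itself is thrown inside FileInputFormat, which TextInputFormat extends. Paraphrased (a sketch of the relevant check, not a verbatim quote), the path lookup in FileInputFormat.listStatus, reached from getSplits, is roughly:

```java
// Paraphrased from org.apache.hadoop.mapreduce.lib.input.FileInputFormat:
// listStatus first resolves the paths configured under
// mapreduce.input.fileinputformat.inputdir; MultipleInputs never sets
// that property, so dirs is empty and the familiar exception is thrown.
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // ... list and filter the files under each input path ...
}
```

The fix is to let MultipleInputs own the input side of the job and drop the redundant call, as in this corrected driver excerpt:

```java
// Corrected excerpt of run(): MultipleInputs already installs
// DelegatingInputFormat and DelegatingMapper, so do not override them.
MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
        OrderInfoMapper.class);
MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
        OrderGoodsMapper.class);
FileOutputFormat.setOutputPath(job, outputPath);
// job.setInputFormatClass(TextInputFormat.class);  // removed: it
// overwrote DelegatingInputFormat and broke the multiple-input setup
job.setOutputFormatClass(TextOutputFormat.class);
```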