Source Code Analysis of the Mapper Process in Hadoop


A simple WordCount example

Three small classes, WordCount, MyMapper and MyReducer, implement a basic word-count job.

The WordCount driver class:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
    public static void main(String[] args) throws Exception {
        // Run the job as the "root" HDFS user (setProperty, not getProperty)
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration(true);

        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("myjob");

        // Declare the mapper's output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Declare the job's (reducer's) final output key and value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        Path input = new Path("/temp/wc/input");
        FileInputFormat.addInputPath(job, input);

        Path output = new Path("/temp/wc/output");
        if (output.getFileSystem(conf).exists(output)) {
            output.getFileSystem(conf).delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, output);

        job.waitForCompletion(true);
    }
}

The MyMapper class:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    /**
     * @param key   the byte offset of the current line within the split
     * @param value the content of the current line
     */
    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}

The MyReducer class:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
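For a concrete picture of what these three classes do, suppose /temp/wc/input contains a file with the following two lines (made-up sample data):

hello hadoop
hello world

The job would then produce output along these lines in /temp/wc/output (TextOutputFormat writes one key, a tab, and the count per line):

hadoop	1
hello	2
world	1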

Preparing the map task

The MapTask class has a run() method which, for the new MapReduce API, delegates the actual work to runNewMapper():

@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException,
                           InterruptedException {
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                                                                getTaskID(),
                                                                reporter);
  // make a mapper: the Mapper class is obtained via reflection. If the user
  // configured a Mapper it is used, otherwise the default Mapper is used.
  // See taskContext.getMapperClass() for how it is read from the configuration.
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
    (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  // make the input format: also obtained via reflection; the user-defined
  // InputFormat is used if present, otherwise the default one.
  // See taskContext.getInputFormatClass() for how it is read from the configuration.
  org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
      ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
  // rebuild the input split: the split describes the file, the start offset,
  // the length, the hosts, and so on
  org.apache.hadoop.mapreduce.InputSplit split = null;
  split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  LOG.info("Processing split: " + split);

  // build the record reader (input) from the split, inputFormat, reporter and
  // taskContext; with the default TextInputFormat the underlying reader is a
  // LineRecordReader
  org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
    new NewTrackingRecordReader<INKEY,INVALUE>
      (split, inputFormat, reporter, taskContext);

  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.RecordWriter output = null;

  // get an output object (the map-side output collector)
  if (job.getNumReduceTasks() == 0) {
    output =
      new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
  } else {
    output = new NewOutputCollector(taskContext, job, umbilical, reporter);
  }

  org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
  mapContext =
    new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
        input, output,
        committer,
        reporter, split);

  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
      mapperContext =
        new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
            mapContext);

  try {
    // initialize the input
    input.initialize(split, mapperContext);
    mapper.run(mapperContext);
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    input.close();
    input = null;
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}

Analyzing the map input

From the source above we can see that input is a NewTrackingRecordReader wrapping the RecordReader created by the InputFormat; with the default TextInputFormat that reader is a LineRecordReader.
input.initialize(split, mapperContext) performs the input initialization, and the implementation that actually runs is LineRecordReader.initialize(). Let's take a look inside:

public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  // the next three lines read the split's start offset, end position and file
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  // open an input stream on the split's file
  fileIn = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    // uncompressed input: seek the stream to the split's start offset
    fileIn.seek(start);
    // "in" is an UncompressedSplitLineReader that reads lines from fileIn
    in = new UncompressedSplitLineReader(
        fileIn, job, this.recordDelimiterBytes, split.getLength());
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away the first record,
  // because we always (except for the last split) read one extra line in
  // next(). In plain words: if the start offset is not 0, skip the first
  // (possibly partial) line and start reading from the next one, because a
  // line of text can be cut in two when the file is divided into blocks.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}

Back in MapTask.runNewMapper(), we have:

// initialize the input
input.initialize(split, mapperContext);
// once the input is initialized, the mapper itself starts running
mapper.run(mapperContext);
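For reference, unless the user overrides it, mapper.run(mapperContext) ends up in the run() method of the base Mapper class, which (in Hadoop 2.x) is roughly the following loop over the records supplied by the record reader:

public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    // pull key/value pairs from the record reader until the split is exhausted
    while (context.nextKeyValue()) {
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
  } finally {
    cleanup(context);
  }
}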

Analyzing the map output

Back in MapTask.runNewMapper(), look at the code that creates the map output:

// get an output object
// if the number of reduce tasks is 0, create a NewDirectOutputCollector
if (job.getNumReduceTasks() == 0) {
  output =
    new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
  output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}

Here we analyze the case where the number of reduce tasks is greater than 0, so a NewOutputCollector is created. Step into NewOutputCollector:

private class NewOutputCollector<K,V>
  extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
  private final MapOutputCollector<K,V> collector;
  private final org.apache.hadoop.mapreduce.Partitioner<K,V> partitioner;
  private final int partitions;

  @SuppressWarnings("unchecked")
  NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                     JobConf job,
                     TaskUmbilicalProtocol umbilical,
                     TaskReporter reporter
                     ) throws IOException, ClassNotFoundException {
    collector = createSortingCollector(job, reporter);
    // there are as many partitions as there are reduce tasks
    partitions = jobContext.getNumReduceTasks();
    if (partitions > 1) {
      // reflection again: either the user-configured Partitioner or the default one
      partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
        ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
    } else {
      partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
        @Override
        public int getPartition(K key, V value, int numPartitions) {
          return partitions - 1;
        }
      };
    }
  }

  @Override
  public void write(K key, V value) throws IOException, InterruptedException {
    collector.collect(key, value,
                      partitioner.getPartition(key, value, partitions));
  }

  @Override
  public void close(TaskAttemptContext context
                    ) throws IOException, InterruptedException {
    try {
      collector.flush();
    } catch (ClassNotFoundException cnf) {
      throw new IOException("can't find class ", cnf);
    }
    collector.close();
  }
}

Creating the collector

The NewOutputCollector constructor creates the collector:

collector = createSortingCollector(job, reporter);

Let's look at how createSortingCollector(job, reporter) is implemented:

@SuppressWarnings("unchecked")
private <KEY, VALUE> MapOutputCollector<KEY, VALUE>
        createSortingCollector(JobConf job, TaskReporter reporter)
  throws IOException, ClassNotFoundException {
  MapOutputCollector.Context context =
    new MapOutputCollector.Context(this, job, reporter);

  // Reflection again: if the user configured MAP_OUTPUT_COLLECTOR_CLASS_ATTR,
  // collectorClasses contains that class; otherwise it falls back to
  // MapOutputBuffer.class. MapOutputBuffer is a fairly involved class and is
  // rarely replaced in practice, so the default is normally used.
  Class<?>[] collectorClasses = job.getClasses(
    JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
  int remainingCollectors = collectorClasses.length;
  for (Class clazz : collectorClasses) {
    try {
      if (!MapOutputCollector.class.isAssignableFrom(clazz)) {
        throw new IOException("Invalid output collector class: " + clazz.getName() +
          " (does not implement MapOutputCollector)");
      }
      Class<? extends MapOutputCollector> subclazz =
        clazz.asSubclass(MapOutputCollector.class);
      LOG.debug("Trying map output collector class: " + subclazz.getName());
      MapOutputCollector<KEY, VALUE> collector =
        ReflectionUtils.newInstance(subclazz, job);
      // once the collector is created, initialize it; normally this is the
      // default MapOutputBuffer
      collector.init(context);
      LOG.info("Map output collector class = " + collector.getClass().getName());
      return collector;
    } catch (Exception e) {
      String msg = "Unable to initialize MapOutputCollector " + clazz.getName();
      if (--remainingCollectors > 0) {
        msg += " (" + remainingCollectors + " more collector(s) to try)";
      }
      LOG.warn(msg, e);
    }
  }
  throw new IOException("Unable to initialize any output collector");
}

From the analysis above, the collector is normally a MapOutputBuffer. Let's see how collector.init(context) initializes it:

@SuppressWarnings("unchecked")
public void init(MapOutputCollector.Context context
                ) throws IOException, ClassNotFoundException {
  job = context.getJobConf();
  reporter = context.getReporter();
  mapTask = context.getMapTask();
  mapOutputFile = mapTask.getMapOutputFile();
  sortPhase = mapTask.getSortPhase();
  spilledRecordsCounter = reporter.getCounter(TaskCounter.SPILLED_RECORDS);
  partitions = job.getNumReduceTasks();
  rfs = ((LocalFileSystem)FileSystem.getLocal(job)).getRaw();

  // sanity checks
  final float spillper =
    job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
  final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
  indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
                                     INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
  if (spillper > (float)1.0 || spillper <= (float)0.0) {
    throw new IOException("Invalid \"" + JobContext.MAP_SORT_SPILL_PERCENT +
        "\": " + spillper);
  }
  if ((sortmb & 0x7FF) != sortmb) {
    throw new IOException(
        "Invalid \"" + JobContext.IO_SORT_MB + "\": " + sortmb);
  }
  sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
        QuickSort.class, IndexedSorter.class), job);

  // buffers and accounting
  int maxMemUsage = sortmb << 20;
  maxMemUsage -= maxMemUsage % METASIZE;
  kvbuffer = new byte[maxMemUsage];
  bufvoid = kvbuffer.length;
  kvmeta = ByteBuffer.wrap(kvbuffer)
     .order(ByteOrder.nativeOrder())
     .asIntBuffer();
  setEquator(0);
  bufstart = bufend = bufindex = equator;
  kvstart = kvend = kvindex;
  maxRec = kvmeta.capacity() / NMETA;
  softLimit = (int)(kvbuffer.length * spillper);
  bufferRemaining = softLimit;
  LOG.info(JobContext.IO_SORT_MB + ": " + sortmb);
  LOG.info("soft limit at " + softLimit);
  LOG.info("bufstart = " + bufstart + "; bufvoid = " + bufvoid);
  LOG.info("kvstart = " + kvstart + "; length = " + maxRec);

  // k/v serialization
  // Fetch the key comparator: the user-defined comparator if one was
  // configured, otherwise the default comparator for the map output key class.
  // getOutputKeyComparator() is short enough to read on your own.
  comparator = job.getOutputKeyComparator();
  keyClass = (Class<K>)job.getMapOutputKeyClass();
  valClass = (Class<V>)job.getMapOutputValueClass();
  serializationFactory = new SerializationFactory(job);
  keySerializer = serializationFactory.getSerializer(keyClass);
  keySerializer.open(bb);
  valSerializer = serializationFactory.getSerializer(valClass);
  valSerializer.open(bb);

  // output counters
  mapOutputByteCounter = reporter.getCounter(TaskCounter.MAP_OUTPUT_BYTES);
  mapOutputRecordCounter =
    reporter.getCounter(TaskCounter.MAP_OUTPUT_RECORDS);
  fileOutputByteCounter = reporter
      .getCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);

  // compression
  if (job.getCompressMapOutput()) {
    Class<? extends CompressionCodec> codecClass =
      job.getMapOutputCompressorClass(DefaultCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, job);
  } else {
    codec = null;
  }

  // combiner
  final Counters.Counter combineInputCounter =
    reporter.getCounter(TaskCounter.COMBINE_INPUT_RECORDS);
  combinerRunner = CombinerRunner.create(job, getTaskID(),
                                         combineInputCounter,
                                         reporter, null);
  if (combinerRunner != null) {
    final Counters.Counter combineOutputCounter =
      reporter.getCounter(TaskCounter.COMBINE_OUTPUT_RECORDS);
    combineCollector = new CombineOutputCollector<K,V>(combineOutputCounter, reporter, job);
  } else {
    combineCollector = null;
  }
  spillInProgress = false;
  minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
  spillThread.setDaemon(true);
  spillThread.setName("SpillThread");
  spillLock.lock();
  try {
    // start the background thread that spills buffer contents to disk
    spillThread.start();
    while (!spillThreadRunning) {
      spillDone.await();
    }
  } catch (InterruptedException e) {
    throw new IOException("Spill thread failed to initialize", e);
  } finally {
    spillLock.unlock();
  }
  if (sortSpillException != null) {
    throw new IOException("Spill thread failed to initialize",
        sortSpillException);
  }
}
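As the code shows, the size of the in-memory sort buffer comes from IO_SORT_MB (default 100 MB) and the spill threshold from MAP_SORT_SPILL_PERCENT (default 0.8). A minimal sketch of how a driver could tune these values, assuming the Hadoop 2.x property names mapreduce.task.io.sort.mb and mapreduce.map.sort.spill.percent:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class TunedDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(true);
    // Sketch only: property names assume Hadoop 2.x (JobContext.IO_SORT_MB
    // and JobContext.MAP_SORT_SPILL_PERCENT); defaults are 100 MB and 0.8.
    conf.setInt("mapreduce.task.io.sort.mb", 200);           // bigger sort buffer
    conf.setFloat("mapreduce.map.sort.spill.percent", 0.9f); // spill at 90% full
    Job job = Job.getInstance(conf);
    // ... the rest of the job setup is the same as in the WordCount driver above
  }
}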

init() also starts the spill thread (spillThread.start() in the code above). To see how the map output is spilled to disk, step into SpillThread's run() method:

protected class SpillThread extends Thread {

  @Override
  public void run() {
    spillLock.lock();
    spillThreadRunning = true;
    try {
      while (true) {
        spillDone.signal();
        while (!spillInProgress) {
          spillReady.await();
        }
        try {
          spillLock.unlock();
          // sort the buffered records and spill them to disk
          sortAndSpill();
        } catch (Throwable t) {
          sortSpillException = t;
        } finally {
          spillLock.lock();
          if (bufend < bufstart) {
            bufvoid = kvbuffer.length;
          }
          kvstart = kvend;
          bufstart = bufend;
          spillInProgress = false;
        }
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    } finally {
      spillLock.unlock();
      spillThreadRunning = false;
    }
  }
}

Let's see what sortAndSpill() actually does:

private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  // approximate the length of the output file to be the length of the
  // buffer + header lengths for the partitions
  final long size = distanceTo(bufstart, bufend, bufvoid) +
              partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    final int mstart = kvend / NMETA;
    final int mend = 1 + // kvend is a valid record
      (kvstart >= kvend
      ? kvstart
      : kvmeta.capacity() + kvstart) / NMETA;
    // The sorter sorts the buffered map output using the comparator we saw
    // being obtained in init(). The details of the sort are not covered here.
    sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
    int spindex = mstart;
    final IndexRecord rec = new IndexRecord();
    final InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
            final int kvoff = offsetFor(spindex % maxRec);
            int keystart = kvmeta.get(kvoff + KEYSTART);
            int valstart = kvmeta.get(kvoff + VALSTART);
            key.reset(kvbuffer, keystart, valstart - keystart);
            getVBytesForOffset(kvoff, value);
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec)
                        + PARTITION) == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}

Why did we dig through all of this? Our goal was to obtain the NewOutputCollector collector, and the analysis above showed how it is created. Why do we care about this collector?
In MyMapper's map() method there is the line context.write(word, one);

public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        // this is where the map output is handed to the output collector
        context.write(word, one);
    }
}

Let's follow context.write(word, one). The context here is the WrappedMapper.Context built in runNewMapper(); its write() method looks like this:

@Override
public void write(KEYOUT key, VALUEOUT value) throws IOException,
    InterruptedException {
  mapContext.write(key, value);
}

Step into mapContext.write(key, value), which is implemented in TaskInputOutputContextImpl:

public void write(KEYOUT key, VALUEOUT value
                  ) throws IOException, InterruptedException {
  output.write(key, value);
}

This output is exactly the NewOutputCollector we analyzed above: its write() method asks the partitioner for a partition number and hands the record to the MapOutputBuffer collector. At this point the map output path has been covered.
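To summarize the write path traced above (this is just a recap of the calls already shown, not additional framework code):

// Map-side write path:
// MyMapper.map()
//   -> context.write(word, one)                  // WrappedMapper.Context
//   -> mapContext.write(key, value)              // MapContextImpl / TaskInputOutputContextImpl
//   -> output.write(key, value)                  // output == NewOutputCollector
//   -> collector.collect(key, value, partition)  // collector == MapOutputBuffer
//   -> SpillThread.sortAndSpill()                // sorts by key and spills to disk when the buffer fills up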

Ensuring that identical keys from the map go to the same reduce

Next, let's analyze how the map side guarantees that records with the same key are sent to the same reduce task.
Back in the NewOutputCollector class, when the number of reduce tasks is greater than 1:

if (partitions > 1) {
  // reflection again: the user-configured Partitioner if present, otherwise the default one
  partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
    ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
}

Here the runtime type of jobContext is JobContextImpl. Step into its implementation of jobContext.getPartitionerClass():

@SuppressWarnings("unchecked")
public Class<? extends Partitioner<?,?>> getPartitionerClass()
    throws ClassNotFoundException {
  // if the user configured PARTITIONER_CLASS_ATTR, use that class;
  // otherwise fall back to HashPartitioner.class
  return (Class<? extends Partitioner<?,?>>)
    conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
}

Let's look at the logic inside HashPartitioner:

public class HashPartitioner<K, V> extends Partitioner<K, V> {

  /** Use {@link Object#hashCode()} to partition. */
  public int getPartition(K key, V value,
                          int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}

As the code shows, taking the key's hash code (masked to a non-negative value) modulo the number of reduce tasks guarantees that the same key always yields the same partition number; in plain words, records with the same key end up in the same reduce task.
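A minimal standalone sketch of the same formula (the key strings and the reduce count of 3 are made-up values for illustration):

public class PartitionDemo {
  // Same formula as HashPartitioner.getPartition()
  static int partition(String key, int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }

  public static void main(String[] args) {
    int reduces = 3;
    // Every occurrence of "hadoop" yields the same partition number,
    // so all ("hadoop", 1) pairs are routed to the same reduce task.
    System.out.println(partition("hadoop", reduces));
    System.out.println(partition("hadoop", reduces)); // identical to the line above
    System.out.println(partition("spark", reduces));  // may differ
  }
}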

Back in the NewOutputCollector class:

@SuppressWarnings("unchecked")
NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                   JobConf job,
                   TaskUmbilicalProtocol umbilical,
                   TaskReporter reporter
                   ) throws IOException, ClassNotFoundException {
  collector = createSortingCollector(job, reporter);
  // there are as many partitions as there are reduce tasks
  partitions = jobContext.getNumReduceTasks();
  if (partitions > 1) {
    // Reflection again: either the user-configured Partitioner or the default
    // one. These lines map each key emitted by the mapper to a partition,
    // which in turn determines the reduce task that will process it.
    partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
      ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
  } else {
    // if there is only one reduce task, every key goes to that single reduce
    partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
      @Override
      public int getPartition(K key, V value, int numPartitions) {
        return partitions - 1;
      }
    };
  }
}
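As the comments note, the Partitioner obtained via reflection may be user-defined. A minimal sketch of how a custom partitioner could be plugged into the WordCount job; FirstLetterPartitioner is a hypothetical class invented here for illustration, while Job.setPartitionerClass() and Job.setNumReduceTasks() are the standard hooks:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route words by their first character instead of the full hash.
public class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable value, int numReduceTasks) {
    if (key.getLength() == 0) {
      return 0;
    }
    // Text.charAt() returns the code point at the given position
    return (key.charAt(0) & Integer.MAX_VALUE) % numReduceTasks;
  }
}

In the driver this would be registered with job.setPartitionerClass(FirstLetterPartitioner.class) together with job.setNumReduceTasks(...), so that getPartitionerClass() above returns it instead of HashPartitioner.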