MapReduce in Detail


1. After the Mapper reads key-value pairs from the RecordReader and emits its output, the records are written to a buffer and sorted by key. The default ordering is the key's natural order (for IntWritable, ascending numeric order). You can also plug in a custom comparator to sort by key in your own way.
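A minimal sketch of registering a custom sort order (the comparator shown here simply reverses IntWritable's natural order; the same comparator appears again in the full example at the end of this post):

job.setSortComparatorClass(MyComparator.class);

// Reverses IntWritable's natural order so keys come out in descending order
public static class MyComparator extends IntWritable.Comparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return -super.compare(b1, s1, l1, b2, s2, l2);
    }
}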

2. What does a Combiner buy you?

1. After map-side reduction there is less data, so transfer time is shorter; the Reducer receives less data, so it runs faster, and the overall job time drops as well.

* When can a Combiner be used? Only when the reduction is safe to apply any number of times, i.e. the operation is associative and commutative (sum, max, and the like). It is registered like this (see the sketch after this section for a case where the Reducer cannot be reused):

job.setCombinerClass(MyReducer.class);

If the map side cannot simply reuse the existing Reducer, a dedicated Combiner class has to be written.
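Computing an average is the classic case: averages of averages are wrong, so the Combiner must emit partial results rather than the Reducer's final output. A minimal sketch, assuming the map output values are "sum,count" strings (the class and field names are illustrative):

public static class AvgCombiner extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        long count = 0;
        for (Text v : values) {
            // Each incoming value is a partial "sum,count" pair
            String[] parts = v.toString().split(",");
            sum += Long.parseLong(parts[0]);
            count += Long.parseLong(parts[1]);
        }
        // Emit a merged partial pair; applying this zero, one, or many times is safe
        context.write(key, new Text(sum + "," + count));
    }
}

job.setCombinerClass(AvgCombiner.class);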


3. After the map-side Combiner runs, the map output can be compressed before it is stored locally. This saves local disk space and also reduces the amount of data sent over the network.

Configure this through the job's Configuration:

Configuration conf = new Configuration();
// Compress the map output to reduce local storage and network transfer
conf.setBoolean("mapred.compress.map.output", true);
conf.setBoolean("mapred.output.compress", true);
conf.setIfUnset("mapred.output.compression.type", "BLOCK");
conf.setClass("mapred.output.compression.codec", GzipCodec.class,
        CompressionCodec.class);
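These mapred.* property names come from the old API. On Hadoop 2.x and later the same settings can be made with the mapreduce.* property names or the output-format helper methods, roughly as sketched below (check the exact property names against your Hadoop version):

// New-API equivalents (assumes a Job instance named job already exists)
conf.setBoolean("mapreduce.map.output.compress", true);
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
// For SequenceFile output, block compression usually gives the best ratio
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);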


4. Partitioner

job.setNumReduceTasks(tasks) sets the number of Reduce tasks for the job.

Why it matters: if the job is configured with more than one Reducer, the intermediate map output has to be distributed to the different Reducers by key. Records with the same key are sent to the same Reduce node, so the Reducers never need to communicate with each other.

job.setPartitionerClass(MyPartitioner.class);

Users can define their own Partitioner based on the key's value. There are two ways to do it.

The first way, and the more common one, is to extend HashPartitioner and delegate to its getPartition (shown here with Text keys and values):

public class MyPartitioner extends HashPartitioner<Text, Text> {

    @Override
    public int getPartition(Text key, Text value, int numReduces) {
        // Convert the key into the string we actually want to partition on,
        // e.g. the part before the first comma
        String term = key.toString().split(",")[0];
        return super.getPartition(new Text(term), value, numReduces);
    }
}

The second way is to extend Partitioner directly and implement getPartition yourself:

public class MyPartition extends Partitioner<Text, Text> {

    @Override
    public int getPartition(Text key, Text value, int numPartition) {
        // Must return a value in [0, numPartition); here the key is hashed
        // the same way HashPartitioner does it
        return (key.hashCode() & Integer.MAX_VALUE) % numPartition;
    }
}
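Wiring either partitioner into a job looks the same; a minimal usage sketch (the class name matches the second example above):

job.setNumReduceTasks(4);                   // more than one Reducer, so partitioning matters
job.setPartitionerClass(MyPartition.class); // route equal keys to the same Reducer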

A complete example putting this together:

package bigdatabasealgorithm;


import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


/*
 * Sort words by occurrence count, from highest to lowest.
 */
public class WordCount2 {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private IntWritable one = new IntWritable(1);
        private Text text = new Text();
        // Matches every character that is NOT a word character (a-z, A-Z, 0-9, _)
        private String pattern = "[^\\w]";

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Replace every non-word character with a space
            line = line.replaceAll(pattern, " ");
            StringTokenizer token = new StringTokenizer(line, " ");
            while (token.hasMoreTokens()) {
                text.set(token.nextToken());
                context.write(text, one);
            }
        }
    }


    public static class IntSumReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable(0);

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }


    // Sort comparator for the second job: reverses IntWritable's natural order
    // so the word counts come out in descending order
    private static class MyComparator extends IntWritable.Comparator {

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        /*
        conf.setBoolean("mapred.compress.map.output", true);
        conf.setBoolean("mapred.output.compress", true);
        conf.setIfUnset("mapred.output.compression.type", "BLOCK");
        conf.setClass("mapred.output.compression.codec", GzipCodec.class,
                CompressionCodec.class);
        */
        // Temporary directory holding the intermediate word-count output
        Path tempDir = new Path("/test" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        try {
            Job job = new Job(conf, "word count2");
            job.setJarByClass(WordCount2.class);
            // Configure the Map side
            job.setMapperClass(TokenizerMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // Configure the Combiner
            job.setCombinerClass(IntSumReduce.class);
            // Configure the Reduce side
            job.setReducerClass(IntSumReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Write the reduce output as a SequenceFile so the sort job can read it
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, tempDir);
            if (job.waitForCompletion(true)) {
                // Second job: swap key and value, then sort counts in descending order
                Job sortJob = new Job(conf, "sort");
                sortJob.setJarByClass(WordCount2.class);
                sortJob.setInputFormatClass(SequenceFileInputFormat.class);
                sortJob.setMapperClass(InverseMapper.class);
                sortJob.setNumReduceTasks(1);
                sortJob.setOutputKeyClass(IntWritable.class);
                sortJob.setOutputValueClass(Text.class);
                FileInputFormat.addInputPath(sortJob, tempDir);
                FileOutputFormat.setOutputPath(sortJob, new Path(otherArgs[1]));
                sortJob.setSortComparatorClass(MyComparator.class);
                System.exit(sortJob.waitForCompletion(true) ? 0 : 1);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } finally {
            FileSystem.get(conf).deleteOnExit(tempDir);
        }
    }
}

