MapReduce处理数据去重与数据排序

来源:互联网 发布:软件资格水平考试 编辑:程序博客网 时间:2024/04/29 09:19

一:MapReduce处理数据去重

MapReduce 在 shuffle 阶段会把 key 相同的记录归并到同一次 reduce 调用中,因此只要把整行数据作为 map 输出的 key,并让 reduce 对每个 key 只输出一次,就能实现数据去重。

/* * 去除数据中相同数据 * 数据去重问题 * 以整个数据作为key发送出去, value为null */public class DelsameMap extends Mapper<LongWritable, Text, Text, Text> {@Overrideprotected void map(LongWritable key, Text value,Mapper<LongWritable, Text, Text, Text>.Context context)throws IOException, InterruptedException {String line = value.toString();if (line.length() > 0) {context.write(new Text(line.trim()), new Text(""));}}}


 

public class DelsameRedu extends Reducer<Text, Text, Text, NullWritable> {@Overrideprotected void reduce(Text key, Iterable<Text> values,Reducer<Text, Text, Text, NullWritable>.Context context)throws IOException, InterruptedException {context.write(key, NullWritable.get());}}
/**
 * Driver for the de-duplication job: wires {@code DelsameMap} and
 * {@code DelsameRedu} together and submits the job.
 *
 * <p>Usage: {@code DelsameMain <input path> <output path>}
 */
public class DelsameMain {

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: DelsameMain <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(DelsameMain.class);

        job.setMapperClass(DelsameMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(DelsameRedu.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate job failure to the caller through the process exit code.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}


二:MapReduce处理数据排序

将原始数据解析为 int 类型(IntWritable)后作为 map 输出的 key。MapReduce 框架在 shuffle 阶段会自动地按 key 排序,reduce 只需按收到的顺序依次输出即可。

/* * mapreduce处理数据排序 *将原始数据作为map输出的key设置为int类型。map会自动的根据key进行排序 */public class SortMap extends Mapper<LongWritable, Text, IntWritable, Text> {@Overrideprotected void map(LongWritable key, Text value,Mapper<LongWritable, Text, IntWritable, Text>.Context context)throws IOException, InterruptedException {String line = value.toString();if (line.length() > 0) {context.write(new IntWritable(Integer.parseInt(line.trim())),new Text(""));}}}


/* * 将values作为次序key。将map排序好的key作为value输出 */public class SortRedu extendsReducer<IntWritable, Text, IntWritable, IntWritable> {private IntWritable num = new IntWritable(1);@Overrideprotected void reduce(IntWritable key, Iterable<Text> values,Reducer<IntWritable, Text, IntWritable, IntWritable>.Context context)throws IOException, InterruptedException {// 将values作为排序的次序。将map拍好序的key作为reduce的value输出for (Text val : values) {context.write(num, key);num = new IntWritable(num.get() + 1);}}}


/**
 * Driver for the sorting job: wires {@code SortMap} and {@code SortRedu}
 * together and submits the job.
 *
 * <p>Usage: {@code SortMain <input path> <output path>}
 */
public class SortMain {

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: SortMain <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(SortMain.class);

        job.setMapperClass(SortMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(SortRedu.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // SortRedu numbers records with a per-instance counter, so the output is
        // a single globally ordered sequence only with exactly one reduce task.
        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate job failure to the caller through the process exit code.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}



 

1 0
原创粉丝点击