MapReduce Algorithm 2: Data Deduplication (HashSet)

This example uses a HashSet in the reducer to deduplicate values: the mapper emits (province, kind) pairs, and the reducer counts the distinct product kinds seen for each province.

package MRDemo;

import java.io.IOException;
import java.util.HashSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ProductKind {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: ProductKind <input path> <output path>");
            System.exit(1);
        }
        Job job = Job.getInstance(new Configuration(), "ProductKind");
        job.setJarByClass(ProductKind.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(ProductCountMap.class);
        job.setReducerClass(ProductCountReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper: emits a (province, kind) pair for every well-formed record.
    public static class ProductCountMap extends Mapper<LongWritable, Text, Text, Text> {
        private final Text province = new Text();
        private final Text kind = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Records are tab-separated; only lines with exactly six fields
            // are processed. Field 0 is the product kind, field 4 the province.
            String[] fields = value.toString().split("\t");
            if (fields.length == 6) {
                kind.set(fields[0].trim());
                province.set(fields[4].trim());
                context.write(province, kind);
            }
        }
    }

    // Reducer: collects all kinds seen for one province into a HashSet,
    // which silently drops duplicates, then writes the distinct count.
    public static class ProductCountReduce extends Reducer<Text, Text, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            HashSet<String> hashSet = new HashSet<String>();
            for (Text value : values) {
                hashSet.add(value.toString());
            }
            if (hashSet.size() > 0) {
                context.write(key, new IntWritable(hashSet.size()));
            }
        }
    }
}
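To see the deduplication step on its own, outside Hadoop, here is a minimal standalone sketch of the reducer's core logic. The records and field values below are made up for illustration; they just follow the same six-field tab-separated layout the mapper expects.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

// Standalone sketch: group kinds by province, let a HashSet drop
// duplicates, then report the distinct count per province.
public class HashSetDedupSketch {
    public static void main(String[] args) {
        // Hypothetical records: six tab-separated fields, with the
        // product kind at index 0 and the province at index 4.
        String[] records = {
            "book\t1\t2\t3\tBeijing\t5",
            "book\t1\t2\t3\tBeijing\t5",   // duplicate kind for Beijing
            "phone\t1\t2\t3\tBeijing\t5",
            "book\t1\t2\t3\tShanghai\t5"
        };

        Map<String, HashSet<String>> kindsByProvince = new HashMap<String, HashSet<String>>();
        for (String record : records) {
            String[] fields = record.split("\t");
            if (fields.length == 6) {
                String kind = fields[0].trim();
                String province = fields[4].trim();
                // HashSet.add is a no-op for values already present,
                // so each kind is counted at most once per province.
                if (!kindsByProvince.containsKey(province)) {
                    kindsByProvince.put(province, new HashSet<String>());
                }
                kindsByProvince.get(province).add(kind);
            }
        }

        // Prints "Beijing 2" and "Shanghai 1" (iteration order may vary).
        for (Map.Entry<String, HashSet<String>> e : kindsByProvince.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue().size());
        }
    }
}

Assuming the job classes are packaged into a jar (the jar name here is hypothetical), the MapReduce version would typically be launched with something like: hadoop jar productkind.jar MRDemo.ProductKind /input/path /output/path. The output is one province, a tab, and that province's distinct kind count per line.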