Several MapReduce Examples


http://www.cnblogs.com/xia520pi/archive/2012/06/04/2534533.html — original article; I typed the examples out and ran them myself for practice.

1. Text Deduplication

Requirement: remove duplicate lines from a text file, so that the output contains every distinct line exactly once.
Rough flow:
The map function reads each line and emits the whole line as the key with an empty value. After the shuffle, identical keys are merged, which performs the deduplication automatically; the keys arriving at the reduce function are exactly the distinct lines we want.
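For instance (the sample lines here are my own illustration, not from the article), given an input file containing duplicates:

2012-3-1 a
2012-3-2 b
2012-3-1 a
2012-3-2 b
2012-3-3 c

the job would output each distinct line once:

2012-3-1 a
2012-3-2 b
2012-3-3 c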

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Dedup {

    // The mapper copies each input line into the output key and emits it directly
    public static class Map extends Mapper<Object, Text, Text, Text> {
        // Implement the map function
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, new Text(""));
        }
    }

    // The reducer copies the input key to the output key and emits it directly;
    // the values are ignored, so each distinct line is written exactly once
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // Implement the reduce function
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "dedup");
        job.setJarByClass(Dedup.class);

        // Set the Mapper, Combiner, and Reducer classes
        // (this reducer is idempotent, so it is safe to reuse as a combiner)
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
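To run it, a minimal sketch, assuming the class is packaged into a jar named dedup.jar and the input already sits in HDFS (the jar name and paths are my own placeholders):

hadoop jar dedup.jar Dedup /user/hadoop/dedup_in /user/hadoop/dedup_out
hdfs dfs -cat /user/hadoop/dedup_out/part-r-00000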

2. Data Sorting

Requirement: the input file contains one integer per line, as follows

4545
54544
866
565
432
34
53
2
-90
45
10
21

Sort these numbers, producing for each number its rank followed by the number itself:

1   -90
2   2
3   10
4   21
5   34
6   45
7   53
8   432
9   565
10  866
11  4545
12  54544

Rough flow:
map function: receives each number as a string; parse it to an int, wrap it in an IntWritable, and emit it as the key with 1 as the value.
shuffle: sorts the keys and gathers duplicate numbers into a value list.
reduce function: receives each number as the key and one value per occurrence; a global counter variable supplies the rank, so writing (rank, number) once per value completes the sort. Note that the output is only globally sorted with a single reducer (the default); with several reducers a total-order partitioner would also be needed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Sort {

    // The mapper parses each input line into an IntWritable and emits it as the key
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> {
        private static IntWritable data = new IntWritable();

        // Implement the map function
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            data.set(Integer.parseInt(line.trim()));
            context.write(data, new IntWritable(1));
        }
    }

    // The reducer copies the input key to the output value and writes it once per
    // element of the value list; the global linenum counter supplies the rank
    public static class Reduce extends
            Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        private static IntWritable linenum = new IntWritable(1);

        // Implement the reduce function
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            for (IntWritable val : values) {
                context.write(linenum, key);
                linenum = new IntWritable(linenum.get() + 1);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Data Sort");
        job.setJarByClass(Sort.class);

        // Set the Mapper and Reducer classes
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Set the output key/value types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
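As a variant, the sort order used during the shuffle can be customized with job.setSortComparatorClass. A minimal sketch of a descending sort, assuming the comparator is added as another nested class inside Sort (the name DescComparator is my own, not from the original article):

    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;

    // Sorts the IntWritable keys in descending order during the shuffle
    public static class DescComparator extends WritableComparator {
        protected DescComparator() {
            super(IntWritable.class, true); // create key instances for comparing
        }

        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return b.compareTo(a); // reverse of the natural ascending order
        }
    }

and in main, before submitting the job:

    job.setSortComparatorClass(DescComparator.class);

The rank counter in the reducer is unchanged; only the order in which keys arrive is reversed.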

3. Averaging Scores

There are three input files:
file1:

张三    88
李四    99
王五    66
赵六    77

file2:

张三    78
李四    89
王五    96
赵六    67

file3:

张三    80
李四    82
王五    84
赵六    86

Requirement: compute each student's average score.

Approach: call FileInputFormat.addInputPath() once per input file to read several files in one job.
map function: split each line into name and score, emitting the name as the key and the score as the value.
shuffle: values with the same name key are merged, forming e.g. <张三, <88, 78, 80>>.
reduce function: iterate over the value list, sum the scores, and divide by the count to get the average.
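For example, 张三's average is (88 + 78 + 80) / 3 = 246 / 3 = 82, while 赵六's is (77 + 67 + 86) / 3 = 230 / 3, which integer division truncates to 76.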

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Score {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Implement the map function
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // TextInputFormat hands the mapper one line at a time;
            // split the line on whitespace into name and score
            StringTokenizer tokenizerLine = new StringTokenizer(value.toString());
            String strName = tokenizerLine.nextToken();   // student name
            String strScore = tokenizerLine.nextToken();  // score

            // Emit the name and the score
            context.write(new Text(strName), new IntWritable(Integer.parseInt(strScore)));
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Implement the reduce function
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            for (IntWritable val : values) {
                sum += val.get();  // running total of this student's scores
                count++;           // number of scores seen
            }
            int average = sum / count;  // integer division truncates
            context.write(key, new IntWritable(average));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Score Average");
        job.setJarByClass(Score.class);

        // Set the Mapper and Reducer classes. Do NOT reuse the reducer as a
        // combiner here: averaging partial groups and then averaging those
        // averages would give a wrong result in general.
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // TextInputFormat splits the input into lines via a RecordReader
        job.setInputFormatClass(TextInputFormat.class);
        // TextOutputFormat writes the results via a RecordWriter
        job.setOutputFormatClass(TextOutputFormat.class);

        // All arguments but the last are input paths; the last is the output directory
        for (int i = 0; i < args.length - 1; i++) {
            FileInputFormat.addInputPath(job, new Path(args[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
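A sketch of running it, assuming the job is packaged as score.jar and the three files have been uploaded to HDFS (the jar name and paths are my own placeholders):

hadoop jar score.jar Score /user/hadoop/score_in/file1 /user/hadoop/score_in/file2 /user/hadoop/score_in/file3 /user/hadoop/score_out
hdfs dfs -cat /user/hadoop/score_out/part-r-00000

With the sample data above, the expected output is:

张三    82
李四    90
王五    82
赵六    76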