MapReduce练习一（计数，去重，排序，平均成绩）

来源：互联网发布：windows恶意删除工具编辑：程序博客网时间：2024/05/22 09:54

相关链接：
MapReduce练习一（计数，去重，排序，平均成绩）
MapReduce练习二（单表关联，多表关联，倒排索引）

公司服务器的Hadoop版本是0.20，把hadoop-eclipse-plugin-0.20的jar包放到eclipse安装目录下的plugins目录中，新建项目，导入这个jar包解压之后得到的jar包，本地写代码，写完打成jar包上传到服务器运行（shell命令：hadoop jar jar包名字 [主类名] 输入路径 输出路径），注意jar包是放在服务器的本地文件系统上的，而不是在HDFS上。

一，WordCount代码：

import java.io.IOException;import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.lib.input.*;import org.apache.hadoop.mapreduce.lib.output.*;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner; //类Configured实现了接口Configurable，接口Tool继承了接口Configurablepublic class Wc extends Configured implements Tool {    public static class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {        private final static IntWritable one = new IntWritable(1);        private Text word = new Text();        // map函数的输入key：字符串偏移量（下标），LongWritable类型；输入value:一行字符串内容，Text类型；context为map的输出        @Override        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {            String line = value.toString();            StringTokenizer tokenizer = new StringTokenizer(line);  //将字符串分割成单词，默认空格            while (tokenizer.hasMoreTokens()) {                word.set(tokenizer.nextToken());                context.write(word, one);  //迭代输出key/value对            }        }    }    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {        // reduce函数的输入和输出对应类型相同，key：一个单词，Text类型；value：该单词出现的次数列表，因为经过了shuffle，所以用迭代器        @Override        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {            int sum = 0;            for (IntWritable val : values) {               sum += val.get();            }            context.write(key, new IntWritable(sum));        }    }    //run()是Tool接口的一个方法    public int run(String[] args) throws Exception {        if (args.length 
!= 2) {            System.out.println("Usage:hadoop jar WordCount.jar WordCount <input> <output> ");            System.exit(-1);        }        // 必要的配置参数，getConf()是Configured的方法，返回一个Configuration        Configuration conf = getConf();        Job job = new Job(conf);        job.setJarByClass(Wc.class);                        // 设置主类        job.setJobName("WordCount");            // 设置JobName        job.setMapperClass(MyMap.class);                    // 设置Mapper类        job.setReducerClass(Reduce.class);                  // 设置Reducer类        job.setMapOutputKeyClass(Text.class);               // 设置Job的Map输出key类型(如需指定map的输出Key类型，可通过此参数设置)        job.setMapOutputValueClass(IntWritable.class);      // 设置Job的Map输出value类型(如需指定map的输出Value类型，可通过此参数设置)        job.setOutputKeyClass(Text.class);                  // 设置Job的最终输出key类型        job.setOutputValueClass(IntWritable.class);         // 设置Job的最终输出value类型        job.setNumReduceTasks(1);                           // 设置Reduce task的个数        job.setInputFormatClass(TextInputFormat.class);     // 设置Job输入分割格式(InputFormat)        job.setOutputFormatClass(TextOutputFormat.class);   // 设置Job的输出格式(OutputFormat)        //从命令行参数中获取输入输出路径        FileInputFormat.setInputPaths(job, new Path(args[0]));        FileOutputFormat.setOutputPath(job, new Path(args[1]));        boolean success = job.waitForCompletion(true);        return success ? 0 : 1;    }    //在main()方法中通过ToolRunner.run()方法调用上述类的run(String[] args），默认加载core-default.xml与core-site.xml中的参数。    public static void main(String[] args) throws Exception {        int ret = ToolRunner.run(new Wc(), args);        System.exit(ret);    } }

这个程序基本框架都有了，后边列出的代码省略的部分在这里都能找到，一些问题比如StringTokenizer和Split的区别，多路径输入，Combine类，等等都在后边的例子中有提到。

二，数据去重

要求：对数据文件中的数据进行去重。数据文件中的每行都是一个数据。

在MapReduce中，map的输出< key，value>经过shuffle过程（合并相同key）聚集成< key，value-list>后会交给reduce。所以从设计好的reduce输入可以反推出map的输出key应为数据，value任意。继续反推，map输出数据的key为数据，而在这个实例中每个数据代表输入文件中的一行内容，所以map阶段要完成的任务就是将输入value作为输出key，输出value任意。map中的结果经过shuffle过程之后交给reduce（此时key已经去重了）。reduce阶段直接将输入的key作为输出key，输出value置空。
这里只列出map和reduce的代码：

    //map将输入中的value复制到输出数据的key上，并直接输出。Mapper类是一个泛型类型，有4个形参类型，分别制定map函数的输入键，输入值，输出键，输出值的类型。Text相当于String。    public static class Map extends Mapper<LongWritable, Text, Text, Text>{        private static Text line = new Text();//每行数据        //实现map函数，把输入值直接作为输出键，输出值为空。        public void map(LongWritable key, Text value, Context context) throws IOException,InterruptedException{            line = value;            context.write(line, new Text(""));        }    }    //reduce将输入中的key复制到输出数据的key上，并直接输出    public static class Reduce extends Reducer<Text, Text, Text, Text>{        //实现reduce函数        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException,InterruptedException{            context.write(key, new Text(""));        }    }

三，简单排序

要求：对输入文件中数据进行排序。输入文件中的每行内容均为一个数字，即一个数据。要求在输出中每行有两个间隔的数字，其中，第一个为递增序号，第二个按升序排列的数字。

MapReduce默认就会按照key值进行排序，如果key为IntWritable类型，按照数字大小对key排序【默认升序，降序好像要自己写个类继承IntWritable.Comparator类，然后job.setSortComparatorClass(自己写的类名.class)】，如果key为Text类型，按照字典顺序对字符串排序。
在map中将读入的数据转化成IntWritable型，然后作为key值输出（value任意）。reduce拿到< key，value-list>之后，将输入的key作为value输出，并根据value-list中元素的个数决定输出的次数。
这里只列出map和reduce的代码：

    // map ,注意数字也要以文本的形式输入，Text转成String，再转成int    public static class Map extends Mapper<LongWritable, Text, IntWritable, IntWritable>{        @Override        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{            String line = value.toString();            int data = Integer.parseInt(line);            context.write(new IntWritable(data), new IntWritable(1));        }    }    // reduce    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{        static int sortIndex = 1;        @Override        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{            // 升序，很简单            for(IntWritable value : values){                context.write(new IntWritable(sortIndex++), key);            }        }    }

四，计算平均成绩

要求：输入文件中的每行内容均为一个学生的姓名和他相应的成绩，每门学科为一个文件。要求在输出中每行有两个间隔的数据，其中，第一个代表学生的姓名，第二个代表其平均成绩。

Mapper处理的数据是由InputFormat分解过的数据集，其中InputFormat的作用是将数据集切割成小数据集InputSplit，每一个InputSplit将由一个Mapper负责处理（gz格式的压缩文件不能被split分块，整个文件由一个Mapper处理，知道这一点就行了，其实在程序写法上没有任何区别）。此外，InputFormat中还提供了一个RecordReader的实现，并将一个InputSplit解析成<key, value>对提供给了map函数。InputFormat的默认值是TextInputFormat，它针对文本文件，将文件切割成InputSplit，并用LineRecordReader将InputSplit逐行解析成<key, value>对，key是该行在文件中的起始偏移量，value是文件中的一行。Reduce的结果按OutputFormat指定的格式输出。
注意：Windows的TXT文件默认编码为“ANSI”，需要另存为UTF-8，否则在Linux上会出现中文乱码。
下面只列出map和reduce的代码：

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{            String line = value.toString();            // 默认就是空格切分，但提倡使用split或者regex。因为可能中间隔了多个空格，这里用了正则，+匹配一个或多个，            //StringTokenizer token = new StringTokenizer(line);              String[] words = line.split(" +");            Text name = new Text(words[0]);            float score = Float.parseFloat(words[1]);            context.write(name, new FloatWritable(score));        }    }    public static class Reduce extends Reducer<Text, FloatWritable, Text, FloatWritable>{        public void reduce(Text key, Iterable<FloatWritable> values, Context context) throws IOException, InterruptedException{            float sum = 0;            int num = 0;            float avg = 0;            for(FloatWritable value : values){                sum += value.get();                num++;            }            avg = sum / num;            context.write(key, new FloatWritable(avg));        }    }

本文内容参考自：
http://www.cnblogs.com/xia520pi/archive/2012/06/04/2534533.html

阅读全文

0 0