Algorithms (2): TopN with MapReduce/Spark


The idea: each mapper keeps its own local TopN and sends that list to the reducer, of which exactly one is configured. So with 10 mappers, each filters its input down to a TopN list of 10 records, the 10 mappers together send at most 10 x 10 = 100 records, and the single reducer filters those down to the final 10.


Every Mapper has a setup method that runs before the first map() call and a cleanup method that runs after the last one; each is invoked exactly once per task, and both are often left unused. The algorithm above uses map() to maintain the filtered TopN list, then hands off to cleanup() to emit the whole list to the reducer in one pass.

The reduce side works the same way: if 10 mappers each send their top 10, the single reducer receives up to 100 records, collects them, filters down to the final 10 in cleanup(), and writes the output. This is also why exactly one reducer is configured.
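Incidentally, Spark bakes this same pattern (a per-partition TopN followed by a merge of the partial lists) into its top() action, so the whole job shrinks to a few lines. A minimal sketch in Spark's Java API, assuming the same semicolon-delimited input with age as the first field (the class name and argument handling here are illustrative):

    import java.io.Serializable;
    import java.util.Comparator;
    import java.util.List;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    public class TopNSpark {

        // The comparator must be Serializable so Spark can ship it to executors.
        static class AgeComparator implements Comparator<String>, Serializable {
            public int compare(String a, String b) {
                return Integer.compare(Integer.parseInt(a.split(";")[0]),
                                       Integer.parseInt(b.split(";")[0]));
            }
        }

        public static void main(String[] args) {
            JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("topN"));
            JavaRDD<String> lines = sc.textFile(args[0]);
            // top() computes a per-partition top N, then merges the partial
            // lists on the driver: the mapper-local TopN + single-reducer
            // scheme described above.
            List<String> top10 = lines.top(10, new AgeComparator());
            top10.forEach(System.out::println);
            sc.stop();
        }
    }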


The raw data looks like this:


30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"no"35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"no"30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"no"59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"35;"management";"single";"tertiary";"no";747;"no";"no";"cellular";23;"feb";141;2;176;3;"failure";"no"36;"self-employed";"married";"tertiary";"no";307;"yes";"no";"cellular";14;"may";341;1;330;2;"other";"no"39;"technician";"married";"secondary";"no";147;"yes";"no";"cellular";6;"may";151;2;-1;0;"unknown";"no"41;"entrepreneur";"married";"tertiary";"no";221;"yes";"no";"unknown";14;"may";57;2;-1;0;"unknown";"no"43;"services";"married";"primary";"no";-88;"yes";"yes";"cellular";17;"apr";313;1;147;2;"failure";"no"39;"services";"married";"secondary";"no";9374;"yes";"no";"unknown";20;"may";273;1;-1;0;"unknown";"no"43;"admin.";"married";"secondary";"no";264;"yes";"no";"cellular";17;"apr";113;2;-1;0;"unknown";"no"36;"technician";"married";"tertiary";"no";1109;"no";"no";"cellular";13;"aug";328;2;-1;0;"unknown";"no"20;"student";"single";"secondary";"no";502;"no";"no";"cellular";30;"apr";261;1;-1;0;"unknown";"yes"31;"blue-collar";"married";"secondary";"no";360;"yes";"yes";"cellular";29;"jan";89;1;241;1;"failure";"no"40;"management";"married";"tertiary";"no";194;"no";"yes";"cellular";29;"aug";189;2;-1;0;"unknown";"no"56;"technician";"married";"secondary";"no";4073;"no";"no";"cellular";27;"aug";239;5;-1;0;"unknown";"no"37;"admin.";"single";"tertiary";"no";2317;"yes";"no";"cellular";20;"apr";114;1;152;2;"failure";"no"25;"blue-collar";"single";"primary";"no";-221;"yes";"no";"unknown";23;"may";250;1;-1;0;"unknown";"no"31;"services";"married";"secondary";"no";132;"no";"no";"cellular";7;"jul";148;1;152;1;"other";"no"38;"management";"divorced";"unknown";"no";0;"yes";"no";"cellular";18;"nov";96;2;-1;0;"unknown";"no"42;"management";"divorced";"tertiary";"no";16;"no";"no";"cellular";19;"nov";140;3;-1;0;"unknown";"no"44;"services";"single";"secondary";"no";106;"no";"no";"unknown";12;"jun";109;2;-1;0;"unknown";"no"
.............................. The above is only a sample; the real dataset contains many more rows.


The implementation is as follows:


package com.isesol.mapreduce;

import java.io.IOException;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class topN {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

        // This mapper's local top 10, ordered by key. Keys are compared as
        // strings; lexicographic order matches numeric order here because
        // every age in the dataset has two digits.
        private SortedMap<String, String> top10 = new TreeMap<String, String>();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Field 0 (age) is the ranking key; the remaining fields are
            // concatenated as the value. Note this drops the last field and
            // loses the ";" delimiters, which is why the output below has no
            // separators.
            String[] val = value.toString().split(";");
            String str = "";
            for (int i = 1; i < val.length - 1; i++) {
                str += val[i];
            }
            top10.put(val[0], str);
            // Once the map exceeds 10 entries, evict the smallest key.
            if (top10.size() > 10) {
                top10.remove(top10.firstKey());
            }
        }

        // Runs exactly once, after the last map() call: emit the surviving top 10.
        public void cleanup(Context context) throws IOException, InterruptedException {
            for (Map.Entry<String, String> ent : top10.entrySet()) {
                context.write(new Text(ent.getKey()), new Text(ent.getValue()));
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

        private SortedMap<String, String> top10 = new TreeMap<String, String>();

        public void reduce(Text key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            // Collect every candidate sent by the mappers. Duplicate keys
            // overwrite one another here (see the note below the output).
            for (Text val : value) {
                top10.put(key.toString(), val.toString());
            }
        }

        // Runs exactly once, after the last reduce() call: trim to the
        // global top 10, then emit.
        public void cleanup(Context context) throws IOException, InterruptedException {
            while (top10.size() > 10) {
                top10.remove(top10.firstKey());
            }
            for (Map.Entry<String, String> ent : top10.entrySet()) {
                context.write(new Text(ent.getKey()), new Text(ent.getValue()));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "topN");
        job.setJarByClass(topN.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // A single reducer, so it sees every mapper's local top 10.
        job.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}


The result is as follows:

76"retired""married""primary""no"2590"no""no""telephone"9"feb"6812-10"unknown"77"retired""married""primary""no"680"no""no""telephone"27"nov"3414943"failure"78"retired""married""tertiary""no"226"no""no""telephone"6"nov"1361-10"unknown"79"retired""divorced""unknown""no"2628"no""no""telephone"8"jul"22074502"failure"80"housemaid""married""primary""no"0"no""no""cellular"23"feb"63911891"failure"81"retired""married""secondary""no"1"no""no""cellular"19"aug"655-10"unknown"83"retired""divorced""primary""no"1097"no""no""telephone"5"mar"1811-10"unknown"84"retired""divorced""primary""no"639"no""no""telephone"18"may"3533-10"unknown"86"retired""married""secondary""no"1503"no""no""telephone"18"mar"16531011"other"87"retired""married""primary""no"230"no""no""cellular"30"oct"1441-10"unknown"
The code above does not account for duplicate keys: because the TreeMap is keyed by age, records with the same age overwrite one another. Handling duplicates takes only a small change, as sketched below.
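One way to do it, as a minimal sketch inside the same TokenizerMapper (the reducer would mirror the change): bucket records by age in a List so ties survive, and make the eviction count records rather than keys. The field name held is illustrative, and java.util.List / java.util.ArrayList must be added to the imports:

    // Per-key bucket, so duplicate ages are kept instead of overwritten.
    private SortedMap<String, List<String>> top10 = new TreeMap<String, List<String>>();
    private int held = 0; // total records currently held across all buckets

    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String[] val = value.toString().split(";");
        StringBuilder rest = new StringBuilder();
        for (int i = 1; i < val.length - 1; i++) {
            rest.append(val[i]);
        }
        List<String> bucket = top10.get(val[0]);
        if (bucket == null) {
            bucket = new ArrayList<String>();
            top10.put(val[0], bucket);
        }
        bucket.add(rest.toString());
        held++;
        // Evict one record at a time from the smallest key until N remain.
        while (held > 10) {
            List<String> lowest = top10.get(top10.firstKey());
            lowest.remove(0);
            held--;
            if (lowest.isEmpty()) {
                top10.remove(top10.firstKey());
            }
        }
    }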


The TopN above is hard-coded. In real programs N is usually not fixed: if you want the top 100, you should not have to edit the source and recompile, so passing N in as a variable is the best approach. MapReduce passes such parameters through the job Configuration: the driver sets a value with conf.set(name, value), and the tasks read it back with get().

conf.set("topn", args[2]);

// in the mapper or reducer, typically in setup()
String topn = context.getConfiguration().get("topn");
if (topn == null) {
    topn = "10";
}
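Putting the two halves together, a minimal sketch of the reducer parameterized this way: the driver calls conf.set("topn", args[2]) before creating the Job, and the reducer reads the value once in setup(), which, like cleanup(), runs exactly once per task:

    public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

        private SortedMap<String, String> topn = new TreeMap<String, String>();
        private int n = 10; // default when no third argument is supplied

        // setup() runs once, before the first reduce() call.
        public void setup(Context context) {
            String conf = context.getConfiguration().get("topn");
            if (conf != null) {
                n = Integer.parseInt(conf);
            }
        }

        public void reduce(Text key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            for (Text val : value) {
                topn.put(key.toString(), val.toString());
            }
        }

        public void cleanup(Context context) throws IOException, InterruptedException {
            // Trim to the configured N instead of a hard-coded 10.
            while (topn.size() > n) {
                topn.remove(topn.firstKey());
            }
            for (Map.Entry<String, String> ent : topn.entrySet()) {
                context.write(new Text(ent.getKey()), new Text(ent.getValue()));
            }
        }
    }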