topk代码

来源:互联网 发布:山寨币源码 编辑:程序博客网 时间:2024/04/30 00:52

1、生成随机数代码:

[java] view plain copy print?
  1. package test;  
  2.   
  3.   
  4. import java.util.Random;  
  5.   
  6.   
  7. public class RandomTest {  
  8.      public static void main(String[] args){          
  9.          int number=0;  
  10.          for (int i = 0; i < 100000; i++) {  
  11.               number = new Random().nextInt(100000) + 1;  
  12.              System.out.println(number);  
  13.         }  
  14.     }  
  15. }  



2、生成部分数据如下所示:
[shell] view plain copy print?
  1. [hadoop@hadoop ~]$ tail -10 top_k.txt  
  2. 16287  
  3. 86786  
  4. 63942  
  5. 5960  
  6. 87524  
  7. 53898  
  8. 81409  
  9. 92020  
  10. 68327  
  11. 92706  
  12. [hadoop@hadoop ~]$  



3、代码:
[java] view plain copy print?
  1. package test;  
  2.   
  3.   
  4. import java.io.IOException;  
  5. import java.util.TreeMap;  
  6. import org.apache.hadoop.conf.Configuration;  
  7. import org.apache.hadoop.fs.Path;  
  8. import org.apache.hadoop.io.LongWritable;  
  9. import org.apache.hadoop.io.NullWritable;  
  10. import org.apache.hadoop.io.Text;  
  11. import org.apache.hadoop.mapreduce.Job;  
  12. import org.apache.hadoop.mapreduce.Mapper;  
  13. import org.apache.hadoop.mapreduce.Reducer;  
  14. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  15. import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;  
  16. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  17. import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;  
  18.   
  19.   
  20. public class Topk {  
  21. /** 
  22. * map方法 
  23. */  
  24. private static final int k=10;  
  25. public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text>{   
  26. LongWritable lw=new LongWritable();  
  27. Text text=new Text();  
  28. TreeMap<Long, String > treemap_mapper=new TreeMap<Long, String>();  
  29. @Override  
  30. protected void map(LongWritable key, Text value,Context context)  
  31. throws IOException, InterruptedException {  
  32. String line=value.toString();  
  33. String[] splited=line.split("\t");  
  34.   
  35. treemap_mapper.put(Long.parseLong(splited[0]), line);  
  36.   
  37. if(treemap_mapper.size()>k){  
  38. treemap_mapper.remove(treemap_mapper.firstKey());  
  39. }  
  40.   
  41. }  
  42.   
  43. @Override  
  44. protected void cleanup(Context context)  
  45. throws IOException, InterruptedException {  
  46.   
  47. for (Long numLong : treemap_mapper.keySet()) {  
  48. context.write(new LongWritable(numLong), new Text(treemap_mapper.get(numLong)));  
  49. }  
  50. }  
  51.   
  52. }  
  53.   
  54. /** 
  55. * reducer方法 
  56. * @author Administrator 
  57. * 
  58. */  
  59. public static class MyReducer extends Reducer<LongWritable, Text, LongWritable, NullWritable>{  
  60. TreeMap<Long, String> treemap_reducer=new TreeMap<Long, String>();  
  61. @Override  
  62. protected void reduce(LongWritable k2,Iterable<Text> v2s,Context context)  
  63. throws IOException, InterruptedException {  
  64. treemap_reducer.put(k2.get(), v2s.iterator().next().toString());  
  65.   
  66. if(treemap_reducer.size()>k){   
  67. treemap_reducer.remove(treemap_reducer.firstKey());  
  68. }  
  69. }  
  70.   
  71. @Override  
  72. protected void cleanup(Context context)  
  73. throws IOException, InterruptedException {  
  74. Long[] outLong=new Long[10];  
  75. int flag=0;  
  76. for (Long numLong : treemap_reducer.keySet()) {  
  77. outLong[flag]=numLong;  
  78. flag++;  
  79. }  
  80.   
  81. for (int i=k-1;i>=0;i--) {  
  82. context.write(new LongWritable(outLong[i]),NullWritable.get());  
  83. }  
  84. }  
  85.   
  86. }  
  87. /** 
  88. * 主方法 
  89. * @param args 
  90. * @throws Exception 
  91. */  
  92. public static void main(String[] args) throws Exception {  
  93. Configuration conf = new Configuration();  
  94. Job job=Job.getInstance(conf, Topk.class.getSimpleName());  
  95. job.setJarByClass(Topk.class);  
  96.   
  97. job.setNumReduceTasks(1);  
  98.   
  99. job.setMapperClass(MyMapper.class);  
  100. job.setReducerClass(MyReducer.class);  
  101.   
  102. job.setMapOutputKeyClass(LongWritable.class);  
  103. job.setMapOutputValueClass(Text.class);  
  104.   
  105. job.setOutputKeyClass(LongWritable.class);  
  106. job.setOutputValueClass(NullWritable.class);  
  107.   
  108. job.setInputFormatClass(TextInputFormat.class);  
  109. job.setOutputFormatClass(TextOutputFormat.class);  
  110.   
  111. FileInputFormat.setInputPaths(job, args[0]);  
  112. FileOutputFormat.setOutputPath(job, new Path(args[1]));  
  113.   
  114. job.waitForCompletion(true);  
  115.   
  116. }  
  117. }  




4、代码运行后输出在HDFS上面的目录:
[shell] view plain copy print?
  1. [hadoop@hadoop ~]$ hdfs dfs -ls /user/hadoop/output_topk/output3  
  2. Found 2 items  
  3. -rw-r--r--   3 hadoop supergroup          0 2015-08-30 21:09 /user/hadoop/output_topk/output3/_SUCCESS  
  4. -rw-r--r--   3 hadoop supergroup         80 2015-08-30 21:09 /user/hadoop/output_topk/output3/part-r-00000  



5、查看top10的数据:
[shell] view plain copy print?
  1. [hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output_topk/output3/part-r-00000  
  2. 9999706  
  3. 9999594  
  4. 9999424  
  5. 9999199  
  6. 9997208  
  7. 9996513  
  8. 9995640  
  9. 9995515  
  10. 9993977  
  11. 9991946 
0 0
原创粉丝点击