Hadoop in Action: Search Data Analysis -- Top-K Computation (2)
Original work by 林炳文Evankaka. Please credit the source when reposting: http://blog.csdn.net/evankaka
Project source code: https://github.com/appleappleapple/BigDataLearning/tree/master/Hadoop-Demo
This post continues from Hadoop in Action: Search Data Analysis -- Data Deduplication (1).
Building on the results computed there, this post computes the top-K search terms. To find the hottest search terms, we first need the number of times each term was searched. Recall the format of the output data from the previous post; that output is the input for this computation.
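As a rough illustration (the exact format comes from part 1 and is an assumption here): each input line holds at least two whitespace-separated columns, and the mapper below takes the second column as the search keyword. For example, with illustrative values:

```
20111230000005	keyword1
20111230000006	keyword2
```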
1. WordCount: count the occurrences of each keyword
```java
package com.lin.keyword;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Purpose: count how many times each search keyword appears.
 *
 * @author linbingwen
 */
public class KeyWordCount {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Convert the input text to a String
            String line = value.toString();
            // First split the input into lines
            StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");
            // Process each line
            while (tokenizerArticle.hasMoreElements()) {
                // Split each line on whitespace
                StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());
                String c1 = tokenizerLine.nextToken(); // first column (e.g. timestamp)
                String c2 = tokenizerLine.nextToken(); // second column: the search keyword
                Text newline = new Text(c2);
                context.write(newline, one);
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        // The reduce function: sum the counts for each keyword
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable val : values) {
                count += val.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // JobTracker host and port
        conf.set("mapred.job.tracker", "10.75.201.125:9000");
        // Input and output directories
        String[] ioArgs = new String[] { "hdfs://hmaster:9000/clean_same_out", "hdfs://hmaster:9000/Key_out" };
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: <in> <out>");
            System.exit(2);
        }
        // Set up the job
        Job job = Job.getInstance(conf, "key word count");
        job.setJarByClass(KeyWordCount.class);
        // Mapper, Combiner and Reducer classes
        job.setMapperClass(KeyWordCount.Map.class);
        job.setCombinerClass(KeyWordCount.Reduce.class);
        job.setReducerClass(KeyWordCount.Reduce.class);
        // Output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input format: splits the input into InputSplits and provides a RecordReader
        job.setInputFormatClass(TextInputFormat.class);
        // Output format: provides a RecordWriter for the output
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input and output paths
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```

The output data format is as follows:
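Since the job writes Text keys and IntWritable values through TextOutputFormat, each output line should be a keyword, a tab, and its count. The values below are illustrative only, not the post's actual results:

```
keyword1	12
keyword2	3
```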
2. TopK

The TopK job computes on top of the data output by the WordCount job above.
```java
package com.lin.keyword;

import java.io.IOException;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Purpose: compute the top-K hottest search keywords.
 *
 * @author linbingwen
 */
public class TopK {
    public static final int K = 100;

    public static class KMap extends Mapper<LongWritable, Text, IntWritable, Text> {
        // TreeMap keeps entries sorted by count; note that two keywords with the
        // same count share one key, so the later one overwrites the earlier one.
        TreeMap<Integer, String> map = new TreeMap<Integer, String>();

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (line.trim().length() > 0 && line.indexOf("\t") != -1) {
                String[] arr = line.split("\t", 2);
                String name = arr[0];              // the keyword
                Integer num = Integer.parseInt(arr[1]); // its count
                map.put(num, name);
                // Keep only the K largest counts: evict the smallest
                if (map.size() > K) {
                    map.remove(map.firstKey());
                }
            }
        }

        @Override
        protected void cleanup(Mapper<LongWritable, Text, IntWritable, Text>.Context context) throws IOException, InterruptedException {
            for (Integer num : map.keySet()) {
                context.write(new IntWritable(num), new Text(map.get(num)));
            }
        }
    }

    public static class KReduce extends Reducer<IntWritable, Text, IntWritable, Text> {
        TreeMap<Integer, String> map = new TreeMap<Integer, String>();

        @Override
        public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Only the first keyword for each count is kept
            map.put(key.get(), values.iterator().next().toString());
            if (map.size() > K) {
                map.remove(map.firstKey());
            }
        }

        @Override
        protected void cleanup(Reducer<IntWritable, Text, IntWritable, Text>.Context context) throws IOException, InterruptedException {
            for (Integer num : map.keySet()) {
                context.write(new IntWritable(num), new Text(map.get(num)));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // JobTracker host and port
        conf.set("mapred.job.tracker", "10.75.201.125:9000");
        // Input and output directories
        String[] ioArgs = new String[] { "hdfs://hmaster:9000/Key_out", "hdfs://hmaster:9000/top_out" };
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: <in> <out>");
            System.exit(2);
        }
        // Set up the job
        Job job = Job.getInstance(conf, "top K");
        job.setJarByClass(TopK.class);
        // Mapper, Combiner and Reducer classes
        job.setMapperClass(KMap.class);
        job.setCombinerClass(KReduce.class);
        job.setReducerClass(KReduce.class);
        // Output key/value types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        // Input format: splits the input into InputSplits and provides a RecordReader
        job.setInputFormatClass(TextInputFormat.class);
        // Output format: provides a RecordWriter for the output
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input and output paths
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
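One caveat, which is my observation rather than part of the original post: because the TreeMap is keyed by the count, two keywords with the same count collide and only one survives, so entries can be silently dropped. Below is a minimal standalone sketch of a tie-safe variant of the same TreeMap top-K technique; the class name and sample data are hypothetical:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/** Tie-safe top-K sketch: key by count, keep a list of keywords per count. */
public class TopKSketch {
    static final int K = 3;

    public static void main(String[] args) {
        // Hypothetical (keyword, count) pairs; in the real job these would come
        // from the WordCount output lines.
        String[][] data = { {"a", "5"}, {"b", "5"}, {"c", "2"}, {"d", "9"}, {"e", "1"} };

        TreeMap<Integer, List<String>> map = new TreeMap<>();
        int size = 0; // total keywords currently kept, across all counts
        for (String[] kv : data) {
            int num = Integer.parseInt(kv[1]);
            map.computeIfAbsent(num, n -> new ArrayList<>()).add(kv[0]);
            size++;
            if (size > K) {
                // Evict one keyword with the smallest count
                List<String> smallest = map.firstEntry().getValue();
                smallest.remove(smallest.size() - 1);
                if (smallest.isEmpty()) {
                    map.remove(map.firstKey());
                }
                size--;
            }
        }
        // Ascending by count, the same order as the Hadoop job's output
        for (Map.Entry<Integer, List<String>> e : map.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}
```

The same change could be applied inside KMap and KReduce by switching the TreeMap's value type to a list and tracking the total number of keywords kept.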
Output: the hot search terms, sorted by search count in ascending order; only the top 100 are kept. The results come out correctly.
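Since both main() methods hard-code the HDFS input and output paths, the two jobs can presumably be run back to back with hadoop jar, with TopK running after KeyWordCount has produced /Key_out. The jar name below is illustrative, not from the post:

```
hadoop jar Hadoop-Demo.jar com.lin.keyword.KeyWordCount
hadoop jar Hadoop-Demo.jar com.lin.keyword.TopK
```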