Getting Started with Hadoop: Several Demos of the MapReduce Process
1. A simple word-count demo to walk through the MapReduce flow
package com.demo.wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @Description: word-count MapReduce job
 * @author: songqinghu
 * @date: 2017-08-24 18:12
 * Version: 1.0
 */
public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // when run on a cluster node this picks up the config files under $HADOOP_HOME
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class); // locate and ship the jar

        // map & reduce settings
        job.setMapperClass(WordCountMap.class);        // mapper class
        job.setReducerClass(WordCountReduce.class);    // reducer class
        job.setMapOutputKeyClass(Text.class);          // mapper output key
        job.setMapOutputValueClass(IntWritable.class); // mapper output value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // input/output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean flag = job.waitForCompletion(true);
        System.out.println("the job exe is :" + flag);
    }
}

class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}

class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text text, Iterable<IntWritable> iter, Context context)
            throws IOException, InterruptedException {
        // accumulate the total count for this word
        int count = 0;
        for (IntWritable num : iter) {
            count = count + num.get();
        }
        // done accumulating, emit the result
        context.write(text, new IntWritable(count));
    }
}
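To try the demo, package it as a jar and submit it to the cluster. A minimal run sketch, assuming the class is packaged as wordcount.jar and the input text already sits in HDFS (the jar name and both paths are placeholders):

    hadoop jar wordcount.jar com.demo.wordcount.WordCount /wordcount/input /wordcount/output
    hdfs dfs -cat /wordcount/output/part-r-00000

Note that the output directory must not exist beforehand: FileOutputFormat refuses to overwrite an existing path and fails the job at submission time.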
2. Handling custom serialization in the MapReduce flow
package com.demo.flowsum;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @Description: traffic totals per phone number (custom Writable as the value type)
 * @author: songqinghu
 * @date: 2017-08-24 20:04
 * Version: 1.0
 */
public class FlowSum {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowSum.class);

        job.setMapperClass(FlowSumMapper.class);
        job.setReducerClass(FlowSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Flow.class);

        // input/output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean flag = job.waitForCompletion(true);
        System.out.println("the job exe is :" + flag);
    }
}

class FlowSumMapper extends Mapper<LongWritable, Text, Text, Flow> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split("\t");
        // words[1] is the phone number; the third- and second-from-last fields
        // are the upstream and downstream byte counts
        context.write(new Text(words[1]),
                new Flow(Integer.parseInt(words[words.length - 3]),
                         Integer.parseInt(words[words.length - 2])));
    }
}

class FlowSumReducer extends Reducer<Text, Flow, Text, Flow> {

    @Override
    protected void reduce(Text text, Iterable<Flow> iters, Context context)
            throws IOException, InterruptedException {
        int uFlow = 0;
        int dFlow = 0;
        for (Flow flow : iters) {
            uFlow = flow.getuFlow() + uFlow;
            dFlow = flow.getdFlow() + dFlow;
        }
        context.write(text, new Flow(uFlow, dFlow));
    }
}

// implements Hadoop's serialization interface
class Flow implements Writable {

    private int uFlow; // upstream
    private int dFlow; // downstream
    private int sFlow; // total

    // no-arg constructor required: the framework instantiates Flow via reflection
    public Flow() {}

    public Flow(int uFlow, int dFlow) {
        super();
        this.uFlow = uFlow;
        this.dFlow = dFlow;
        this.sFlow = uFlow + dFlow;
    }

    public int getuFlow() { return uFlow; }
    public void setuFlow(int uFlow) { this.uFlow = uFlow; }
    public int getdFlow() { return dFlow; }
    public void setdFlow(int dFlow) { this.dFlow = dFlow; }
    public int getsFlow() { return sFlow; }
    public void setsFlow(int sFlow) { this.sFlow = sFlow; }

    @Override
    public void write(DataOutput out) throws IOException {
        // serialize: the field order here must match readFields()
        out.writeInt(uFlow);
        out.writeInt(dFlow);
        out.writeInt(sFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // deserialize, in the same order as write()
        this.uFlow = in.readInt();
        this.dFlow = in.readInt();
        this.sFlow = in.readInt();
    }

    @Override
    // how the value is rendered in the final output files
    public String toString() {
        return uFlow + "\t" + dFlow + "\t" + sFlow;
    }
}
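The mapper assumes tab-separated log lines in which field 1 (zero-based) is the phone number and the third- and second-from-last fields carry the upstream and downstream byte counts. A hypothetical input line, purely to illustrate the field positions (the real log format may differ):

    1363157985066	13726230503	00-FD-07-A4-72-B8	120.196.100.82	24	27	2481	24681	200

For this line the mapper emits the key 13726230503 with a Flow of (2481, 24681); after the reducer sums all records for that number, Flow.toString() renders the output record as 13726230503	2481	24681	27162.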
3. Applying custom partitioning logic to the mapper output
package com.demo.flowpart;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowPart {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowPart.class);

        job.setMapperClass(FlowSumMapper.class);
        job.setReducerClass(FlowSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Flow.class);

        // input/output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // route mapper output through the custom partitioner, one reduce task per partition
        job.setPartitionerClass(FlowPartition.class);
        job.setNumReduceTasks(5);

        boolean flag = job.waitForCompletion(true);
        System.out.println("the job exe is :" + flag);
    }
}

class FlowSumMapper extends Mapper<LongWritable, Text, Text, Flow> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split("\t");
        context.write(new Text(words[1]),
                new Flow(Integer.parseInt(words[words.length - 3]),
                         Integer.parseInt(words[words.length - 2])));
    }
}

class FlowSumReducer extends Reducer<Text, Flow, Text, Flow> {

    @Override
    protected void reduce(Text text, Iterable<Flow> iters, Context context)
            throws IOException, InterruptedException {
        int uFlow = 0;
        int dFlow = 0;
        for (Flow flow : iters) {
            uFlow = flow.getuFlow() + uFlow;
            dFlow = flow.getdFlow() + dFlow;
        }
        context.write(text, new Flow(uFlow, dFlow));
    }
}

// implements Hadoop's serialization interface
class Flow implements Writable {

    private int uFlow; // upstream
    private int dFlow; // downstream
    private int sFlow; // total

    // no-arg constructor required: the framework instantiates Flow via reflection
    public Flow() {}

    public Flow(int uFlow, int dFlow) {
        super();
        this.uFlow = uFlow;
        this.dFlow = dFlow;
        this.sFlow = uFlow + dFlow;
    }

    public int getuFlow() { return uFlow; }
    public void setuFlow(int uFlow) { this.uFlow = uFlow; }
    public int getdFlow() { return dFlow; }
    public void setdFlow(int dFlow) { this.dFlow = dFlow; }
    public int getsFlow() { return sFlow; }
    public void setsFlow(int sFlow) { this.sFlow = sFlow; }

    @Override
    public void write(DataOutput out) throws IOException {
        // serialize: the field order here must match readFields()
        out.writeInt(uFlow);
        out.writeInt(dFlow);
        out.writeInt(sFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // deserialize, in the same order as write()
        this.uFlow = in.readInt();
        this.dFlow = in.readInt();
        this.sFlow = in.readInt();
    }

    @Override
    // how the value is rendered in the final output files
    public String toString() {
        return uFlow + "\t" + dFlow + "\t" + sFlow;
    }
}
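The driver above differs from the previous demo only in its last two settings: it registers a custom Partitioner and raises the number of reduce tasks to 5, one per partition. The partitioner itself sits in a separate source file of the same package: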
package com.demo.flowpart;

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @Description: routes mapper output to reducers by phone-number prefix
 * @author: songqinghu
 * @date: 2017-08-24 20:41
 * Version: 1.0
 */
public class FlowPartition extends Partitioner<Text, Flow> {

    private static Map<String, Integer> iphonePri = new HashMap<String, Integer>();

    static {
        iphonePri.put("136", 0);
        iphonePri.put("137", 1);
        iphonePri.put("138", 2);
        iphonePri.put("139", 3);
    }

    @Override
    public int getPartition(Text key, Flow value, int numPartitions) {
        String iphoneNum = key.toString();
        String num = iphoneNum.substring(0, 3); // first three digits of the phone number
        if (iphonePri.containsKey(num)) {
            return iphonePri.get(num);
        }
        return 4; // everything else falls through to the last partition
    }
}
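With this partitioner, numbers starting with 136-139 land in reducers 0-3 and all other prefixes fall through to reducer 4, so the driver's setNumReduceTasks(5) matches exactly. Two things to watch: if getPartition ever returned an index greater than or equal to the number of reduce tasks, the map tasks would fail with an "Illegal partition" error; and with a single reduce task Hadoop bypasses the custom partitioner entirely. A minimal run sketch, assuming the job is packaged as flowpart.jar (the jar name and both paths are placeholders):

    hadoop jar flowpart.jar com.demo.flowpart.FlowPart /flow/input /flow/output

The output directory should then contain five result files, part-r-00000 through part-r-00004, one per partition.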