Hadoop 自定义序列化编程
来源:互联网 发布:java spit 编辑:程序博客网 时间:2024/06/17 00:46
一 自定义序列化需求
二 MapReduce代码编写
1 自定义序列化类
package com.cakin.hadoop.mr;import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import org.apache.hadoop.io.WritableComparable;public class UserWritable implements WritableComparable<UserWritable> { private Integer id; private Integer income; private Integer expenses; private Integer sum; public void write(DataOutput out) throws IOException { // TODO Auto-generated method stub out.writeInt(id); out.writeInt(income); out.writeInt(expenses); out.writeInt(sum); } public void readFields(DataInput in) throws IOException { // TODO Auto-generated method stub this.id=in.readInt(); this.income=in.readInt(); this.expenses=in.readInt(); this.sum=in.readInt(); } public Integer getId() { return id; } public UserWritable setId(Integer id) { this.id = id; return this; } public Integer getIncome() { return income; } public UserWritable setIncome(Integer income) { this.income = income; return this; } public Integer getExpenses() { return expenses; } public UserWritable setExpenses(Integer expenses) { this.expenses = expenses; return this; } public Integer getSum() { return sum; } public UserWritable setSum(Integer sum) { this.sum = sum; return this; } public int compareTo(UserWritable o) { // TODO Auto-generated method stub return this.id>o.getId()?1:-1; } @Override public String toString() { return id + "\t"+income+"\t"+expenses+"\t"+sum; } }
2 编写MapReduce
package com.cakin.hadoop.mr;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.Reducer;/** 测试数据* 用户id 收入 支出* 1 1000 0* 2 500 300* 1 2000 1000* 2 500 200** 需求:* 用户id 总收入 总支出 总的余额* 1 3000 1000 2000* 2 1000 500 500* */public class CountMapReduce { public static class CountMapper extends Mapper<LongWritable,Text,IntWritable,UserWritable> { private UserWritable userWritable =new UserWritable(); private IntWritable id =new IntWritable(); @Override protected void map(LongWritable key,Text value, Mapper<LongWritable,Text,IntWritable,UserWritable>.Context context) throws IOException, InterruptedException{ String line = value.toString(); String[] words = line.split("\t"); if(words.length ==3) { userWritable.setId(Integer.parseInt(words[0])) .setIncome(Integer.parseInt(words[1])) .setExpenses(Integer.parseInt(words[2])) .setSum(Integer.parseInt(words[1])-Integer.parseInt(words[2])); id.set(Integer.parseInt(words[0])); } context.write(id, userWritable); } } public static class CountReducer extends Reducer<IntWritable,UserWritable,UserWritable,NullWritable> { /* * 输入数据 * <1,{[1,1000,0,1000],[1,2000,1000,1000]}> * <2,[2,500,300,200],[2,500,200,300]> * * */ private UserWritable userWritable = new UserWritable(); private NullWritable n = NullWritable.get(); protected void reduce(IntWritable key,Iterable<UserWritable> values, Reducer<IntWritable,UserWritable,UserWritable,NullWritable>.Context context) throws IOException, InterruptedException{ Integer income=0; Integer expenses = 0; Integer sum =0; for(UserWritable u:values) { income += u.getIncome(); expenses+=u.getExpenses(); } sum = income - expenses; userWritable.setId(key.get()) .setIncome(income) .setExpenses(expenses) .setSum(sum); context.write(userWritable, n); } } public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException { Configuration conf=new Configuration(); /* * 集群中节点都有配置文件 conf.set("mapreduce.framework.name.", "yarn"); conf.set("yarn.resourcemanager.hostname", "mini1"); */ Job job=Job.getInstance(conf,"countMR"); //jar包在哪里,现在在客户端,传递参数 //任意运行,类加载器知道这个类的路径,就可以知道jar包所在的本地路径 job.setJarByClass(CountMapReduce.class); //指定本业务job要使用的mapper/Reducer业务类 job.setMapperClass(CountMapper.class); job.setReducerClass(CountReducer.class); //指定mapper输出数据的kv类型 job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(UserWritable.class); //指定最终输出的数据kv类型 job.setOutputKeyClass(UserWritable.class); job.setOutputKeyClass(NullWritable.class); //指定job的输入原始文件所在目录 FileInputFormat.setInputPaths(job, new Path(args[0])); //指定job的输出结果所在目录 FileOutputFormat.setOutputPath(job, new Path(args[1])); //将job中配置的相关参数及job所用的java类在的jar包,提交给yarn去运行 //提交之后,此时客户端代码就执行完毕,退出 //job.submit(); //等集群返回结果在退出 boolean res=job.waitForCompletion(true); System.exit(res?0:1); //类似于shell中的$? }}
三 通过eclipse将程序打包为mapreduce.jar
四 MapReduce的自定义序列化测试
1 准备数据
[root@centos hadoop-2.7.4]# bin/hdfs dfs -cat /input/data1 1000 02 500 3001 2000 10002 500 200
2 运行MapReduce
[root@centos hadoop-2.7.4]# bin/yarn jar /root/jar/mapreduce.jar /input/data /output317/12/20 21:24:45 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:803217/12/20 21:24:46 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.17/12/20 21:24:47 INFO input.FileInputFormat: Total input paths to process : 117/12/20 21:24:47 INFO mapreduce.JobSubmitter: number of splits:117/12/20 21:24:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1513775596077_000117/12/20 21:24:49 INFO impl.YarnClientImpl: Submitted application application_1513775596077_000117/12/20 21:24:49 INFO mapreduce.Job: The url to track the job: http://centos:8088/proxy/application_1513775596077_0001/17/12/20 21:24:49 INFO mapreduce.Job: Running job: job_1513775596077_000117/12/20 21:25:13 INFO mapreduce.Job: Job job_1513775596077_0001 running in uber mode : false17/12/20 21:25:13 INFO mapreduce.Job: map 0% reduce 0%17/12/20 21:25:38 INFO mapreduce.Job: map 100% reduce 0%17/12/20 21:25:54 INFO mapreduce.Job: map 100% reduce 100%17/12/20 21:25:56 INFO mapreduce.Job: Job job_1513775596077_0001 completed successfully17/12/20 21:25:57 INFO mapreduce.Job: Counters: 49 File System Counters FILE: Number of bytes read=94 FILE: Number of bytes written=241391 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=135 HDFS: Number of bytes written=32 HDFS: Number of read operations=6 HDFS: Number of large read operations=0 HDFS: Number of write operations=2 Job Counters Launched map tasks=1 Launched reduce tasks=1 Data-local map tasks=1 Total time spent by all maps in occupied slots (ms)=23672 Total time spent by all reduces in occupied slots (ms)=11815 Total time spent by all map tasks (ms)=23672 Total time spent by all reduce tasks (ms)=11815 Total vcore-milliseconds taken by all map tasks=23672 Total vcore-milliseconds taken by all reduce tasks=11815 Total megabyte-milliseconds taken by all map tasks=24240128 Total megabyte-milliseconds taken by all reduce tasks=12098560 Map-Reduce Framework Map input records=4 Map output records=4 Map output bytes=80 Map output materialized bytes=94 Input split bytes=94 Combine input records=0 Combine output records=0 Reduce input groups=2 Reduce shuffle bytes=94 Reduce input records=4 Reduce output records=2 Spilled Records=8 Shuffled Maps =1 Failed Shuffles=0 Merged Map outputs=1 GC time elapsed (ms)=157 CPU time spent (ms)=1090 Physical memory (bytes) snapshot=275660800 Virtual memory (bytes) snapshot=4160692224 Total committed heap usage (bytes)=139264000 Shuffle Errors BAD_ID=0 CONNECTION=0 IO_ERROR=0 WRONG_LENGTH=0 WRONG_MAP=0 WRONG_REDUCE=0 File Input Format Counters Bytes Read=41 File Output Format Counters Bytes Written=32
3 测试结果
[root@centos hadoop-2.7.4]# bin/hdfs dfs -cat /output3/part-r-000001 3000 1000 20002 1000 500 500
五 参考
http://www.jikexueyuan.com/course/2710.html
阅读全文
0 0
- Hadoop 自定义序列化编程
- hadoop自定义序列化
- Hadoop自定义可序列化的类
- Hadoop中的序列化:自定义value类型
- hadoop 序列化和排序,自定义分区
- hadoop学习之路_10、自定义序列化类
- hadoop自定义对象序列化Writable和WritableComparable
- Hadoop Serialization -- hadoop序列化详解 (3)【ObjectWritable,集合Writable以及自定义的Writable】
- Hadoop Serialization -- hadoop序列化详解 (3)【ObjectWritable,集合Writable以及自定义的Writable】
- Hadoop入门(七)之java对应的Hadoop数据类型及自定义类型序列化
- Hadoop自定义数据类型编程练习
- Hadoop--序列化
- hadoop序列化
- Hadoop的序列化
- Hadoop序列化
- Hadoop序列化
- Hadoop序列化案例
- hadoop序列化
- File I/O
- Integer.bitCount(int i)求二进制数中1的个数
- windows vs2013~2015 MS-MPI v8.1 环境配置
- 单例模式
- Font Awesome(八)
- Hadoop 自定义序列化编程
- 线性代数 02.02 矩阵的运算
- Font Awesome(九)
- RFC2616--100状态码(规范与测试)
- java--集合
- 详解DNS工作流程及原理 域名、IP与DNS的关系
- Font Awesome(十)
- PHP删除数据库数据
- Font Awesome(十一)