Hadoop Custom Serialization Programming

I The custom serialization requirement

The input is a tab-separated transaction log with one record per line: user id, income, expenses. For each user we want the total income, total expenses, and the balance (total income minus total expenses). Hadoop's built-in Writable types each carry a single value, so a custom WritableComparable is needed to move all of these fields through the shuffle as one record.

Input (user id / income / expenses):
1    1000    0
2    500     300
1    2000    1000
2    500     200

Expected output (user id / total income / total expenses / balance):
1    3000    1000    2000
2    1000    500     500

II Writing the MapReduce code
1 The custom serialization class
package com.cakin.hadoop.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Carries a user's id, income, expenses and balance through the shuffle.
 * Implementing WritableComparable lets the type serve as either key or
 * value; write() and readFields() must handle the fields in the same order.
 */
public class UserWritable implements WritableComparable<UserWritable> {

    private Integer id;
    private Integer income;
    private Integer expenses;
    private Integer sum;

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields in a fixed order.
        out.writeInt(id);
        out.writeInt(income);
        out.writeInt(expenses);
        out.writeInt(sum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize in exactly the same order as write().
        this.id = in.readInt();
        this.income = in.readInt();
        this.expenses = in.readInt();
        this.sum = in.readInt();
    }

    public Integer getId() {
        return id;
    }

    public UserWritable setId(Integer id) {
        this.id = id;
        return this;
    }

    public Integer getIncome() {
        return income;
    }

    public UserWritable setIncome(Integer income) {
        this.income = income;
        return this;
    }

    public Integer getExpenses() {
        return expenses;
    }

    public UserWritable setExpenses(Integer expenses) {
        this.expenses = expenses;
        return this;
    }

    public Integer getSum() {
        return sum;
    }

    public UserWritable setSum(Integer sum) {
        this.sum = sum;
        return this;
    }

    @Override
    public int compareTo(UserWritable o) {
        // Order by id; delegating to Integer.compareTo returns 0 for
        // equal ids, as the compareTo contract requires.
        return this.id.compareTo(o.getId());
    }

    @Override
    public String toString() {
        return id + "\t" + income + "\t" + expenses + "\t" + sum;
    }
}
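Before wiring the class into a job, the symmetry of write() and readFields() can be checked with plain Java streams, outside of Hadoop entirely. A minimal sketch (the class name and test values are invented for illustration):

package com.cakin.hadoop.mr;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class UserWritableRoundTrip {
    public static void main(String[] args) throws IOException {
        UserWritable original = new UserWritable()
                .setId(1).setIncome(1000).setExpenses(0).setSum(1000);

        // Serialize the same way the MapReduce framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance; both lines printed
        // should be identical if the field order matches.
        UserWritable copy = new UserWritable();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(original + " -> " + copy);
    }
}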

2 Writing the MapReduce job
package com.cakin.hadoop.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Test data (user id / income / expenses):
 * 1    1000    0
 * 2    500     300
 * 1    2000    1000
 * 2    500     200
 *
 * Requirement (user id / total income / total expenses / balance):
 * 1    3000    1000    2000
 * 2    1000    500     500
 */
public class CountMapReduce {

    public static class CountMapper extends Mapper<LongWritable, Text, IntWritable, UserWritable> {

        private UserWritable userWritable = new UserWritable();
        private IntWritable id = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split("\t");
            if (words.length == 3) {
                userWritable.setId(Integer.parseInt(words[0]))
                        .setIncome(Integer.parseInt(words[1]))
                        .setExpenses(Integer.parseInt(words[2]))
                        .setSum(Integer.parseInt(words[1]) - Integer.parseInt(words[2]));
                id.set(Integer.parseInt(words[0]));
                // Emit only well-formed lines.
                context.write(id, userWritable);
            }
        }
    }

    public static class CountReducer extends Reducer<IntWritable, UserWritable, UserWritable, NullWritable> {

        /*
         * Input at the reducer, grouped by user id:
         * <1, {[1,1000,0,1000], [1,2000,1000,1000]}>
         * <2, {[2,500,300,200], [2,500,200,300]}>
         */
        private UserWritable userWritable = new UserWritable();
        private NullWritable n = NullWritable.get();

        @Override
        protected void reduce(IntWritable key, Iterable<UserWritable> values, Context context)
                throws IOException, InterruptedException {
            int income = 0;
            int expenses = 0;
            for (UserWritable u : values) {
                income += u.getIncome();
                expenses += u.getExpenses();
            }
            userWritable.setId(key.get())
                    .setIncome(income)
                    .setExpenses(expenses)
                    .setSum(income - expenses);
            context.write(userWritable, n);
        }
    }

    public static void main(String[] args)
            throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        /*
         * Not needed here, because every cluster node already has these
         * settings in its configuration files:
         * conf.set("mapreduce.framework.name", "yarn");
         * conf.set("yarn.resourcemanager.hostname", "mini1");
         */
        Job job = Job.getInstance(conf, "countMR");

        // The class loader knows where this class lives, which tells the
        // framework which local jar to ship to the cluster.
        job.setJarByClass(CountMapReduce.class);

        // Mapper and Reducer classes for this job.
        job.setMapperClass(CountMapper.class);
        job.setReducerClass(CountReducer.class);

        // Key/value types of the mapper output.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(UserWritable.class);

        // Key/value types of the final output.
        job.setOutputKeyClass(UserWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and output directories of the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // job.submit() would hand the configuration and jar to YARN and
        // return immediately; waitForCompletion(true) blocks until the
        // cluster reports the result.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1); // exit code, like $? in a shell
    }
}
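The run log in section IV includes the warning "Implement the Tool interface and execute your application with ToolRunner to remedy this." A minimal sketch of how the driver above could be restructured that way, reusing the same Mapper and Reducer (the class name ToolCountMapReduce is invented for illustration):

package com.cakin.hadoop.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ToolCountMapReduce extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already contains any -D options parsed by ToolRunner.
        Job job = Job.getInstance(getConf(), "countMR");
        job.setJarByClass(ToolCountMapReduce.class);
        job.setMapperClass(CountMapReduce.CountMapper.class);
        job.setReducerClass(CountMapReduce.CountReducer.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(UserWritable.class);
        job.setOutputKeyClass(UserWritable.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner strips generic Hadoop options before calling run().
        System.exit(ToolRunner.run(new Configuration(), new ToolCountMapReduce(), args));
    }
}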

III Packaging the program as mapreduce.jar with Eclipse

In Eclipse, export the project via File > Export > Java > JAR file and set the destination to mapreduce.jar.
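If Eclipse is not at hand, a plain command-line build achieves the same result. A rough sketch, assuming the sources live under src/ (the paths are illustrative; `bin/hadoop classpath` prints the compile-time classpath):

[root@centos hadoop-2.7.4]# mkdir -p classes
[root@centos hadoop-2.7.4]# javac -classpath "$(bin/hadoop classpath)" -d classes src/com/cakin/hadoop/mr/*.java
[root@centos hadoop-2.7.4]# jar -cf /root/jar/mapreduce.jar -C classes .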

IV Testing the custom serialization
1 Prepare the data
[root@centos hadoop-2.7.4]# bin/hdfs dfs -cat /input/data
1    1000    0
2    500     300
1    2000    1000
2    500     200
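If the file is not yet in HDFS, it must be uploaded first; a sketch assuming the local copy sits at /root/data (the local path is illustrative):

[root@centos hadoop-2.7.4]# bin/hdfs dfs -mkdir -p /input
[root@centos hadoop-2.7.4]# bin/hdfs dfs -put /root/data /input/data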

2 Run the MapReduce job
[root@centos hadoop-2.7.4]# bin/yarn jar /root/jar/mapreduce.jar /input/data /output3
17/12/20 21:24:45 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
17/12/20 21:24:46 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/12/20 21:24:47 INFO input.FileInputFormat: Total input paths to process : 1
17/12/20 21:24:47 INFO mapreduce.JobSubmitter: number of splits:1
17/12/20 21:24:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1513775596077_0001
17/12/20 21:24:49 INFO impl.YarnClientImpl: Submitted application application_1513775596077_0001
17/12/20 21:24:49 INFO mapreduce.Job: The url to track the job: http://centos:8088/proxy/application_1513775596077_0001/
17/12/20 21:24:49 INFO mapreduce.Job: Running job: job_1513775596077_0001
17/12/20 21:25:13 INFO mapreduce.Job: Job job_1513775596077_0001 running in uber mode : false
17/12/20 21:25:13 INFO mapreduce.Job:  map 0% reduce 0%
17/12/20 21:25:38 INFO mapreduce.Job:  map 100% reduce 0%
17/12/20 21:25:54 INFO mapreduce.Job:  map 100% reduce 100%
17/12/20 21:25:56 INFO mapreduce.Job: Job job_1513775596077_0001 completed successfully
17/12/20 21:25:57 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=94
        FILE: Number of bytes written=241391
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=135
        HDFS: Number of bytes written=32
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=23672
        Total time spent by all reduces in occupied slots (ms)=11815
        Total time spent by all map tasks (ms)=23672
        Total time spent by all reduce tasks (ms)=11815
        Total vcore-milliseconds taken by all map tasks=23672
        Total vcore-milliseconds taken by all reduce tasks=11815
        Total megabyte-milliseconds taken by all map tasks=24240128
        Total megabyte-milliseconds taken by all reduce tasks=12098560
    Map-Reduce Framework
        Map input records=4
        Map output records=4
        Map output bytes=80
        Map output materialized bytes=94
        Input split bytes=94
        Combine input records=0
        Combine output records=0
        Reduce input groups=2
        Reduce shuffle bytes=94
        Reduce input records=4
        Reduce output records=2
        Spilled Records=8
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=157
        CPU time spent (ms)=1090
        Physical memory (bytes) snapshot=275660800
        Virtual memory (bytes) snapshot=4160692224
        Total committed heap usage (bytes)=139264000
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=41
    File Output Format Counters
        Bytes Written=32

3 Test results

[root@centos hadoop-2.7.4]# bin/hdfs dfs -cat /output3/part-r-00000
1    3000    1000    2000
2    1000    500     500

Each line is produced by UserWritable.toString(): user id, total income, total expenses, and balance, separated by tabs, matching the requirement in section I.

V References
http://www.jikexueyuan.com/course/2710.html