Hadoop自定义数据类型编程练习

来源:互联网 发布:vb连接oracle 编辑:程序博客网 时间:2024/05/22 09:47

 

 Hadoop自定义数据类型编程练习

 

 

代码:

package zidongyi; import java.io.DataInput;import java.io.DataOutput;import java.io.IOException; importorg.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.io.Writable;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;importorg.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;importorg.apache.hadoop.mapreduce.lib.output.FileOutputFormat;importorg.apache.hadoop.mapreduce.lib.output.TextOutputFormat;importorg.apache.hadoop.mapreduce.lib.partition.HashPartitioner; public class KpiApp {       staticfinal String INPUT_PATH ="hdfs://192.168.1.100:9000/input/HTTP_20130313143750.dat";       staticfinal String OUT_PATH = "hdfs://192.168.1.100:9000/output/out02";       publicstatic void main(String[] args) throws Exception{              finalJob job = new Job(new Configuration(), KpiApp.class.getSimpleName());              //1.1指定输入文件路径              FileInputFormat.setInputPaths(job,INPUT_PATH);              //指定哪个类用来格式化输入文件              job.setInputFormatClass(TextInputFormat.class);                           //1.2指定自定义的Mapper类              job.setMapperClass(MyMapper.class);              //指定输出<k2,v2>的类型              job.setMapOutputKeyClass(Text.class);              job.setMapOutputValueClass(KpiWritable.class);                           //1.3指定分区类              job.setPartitionerClass(HashPartitioner.class);              job.setNumReduceTasks(1);                           //1.4TODO 排序、分区                           //1.5  TODO (可选)合并                           //2.2指定自定义的reduce类              job.setReducerClass(MyReducer.class);              //指定输出<k3,v3>的类型              job.setOutputKeyClass(Text.class);              job.setOutputValueClass(KpiWritable.class);                           //2.3指定输出到哪里              
FileOutputFormat.setOutputPath(job,new Path(OUT_PATH));              //设定输出文件的格式化类              job.setOutputFormatClass(TextOutputFormat.class);                           //把代码提交给JobTracker执行              job.waitForCompletion(true);       }        staticclass MyMapper extends Mapper<LongWritable, Text, Text, KpiWritable>{              protectedvoid map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,Text,KpiWritable>.Contextcontext) throws IOException ,InterruptedException {                     finalString[] splited = value.toString().split("\t");                     finalString msisdn = splited[1];                     finalText k2 = new Text(msisdn);                     finalKpiWritable v2 = new KpiWritable(splited[6],splited[7],splited[8],splited[9]);                     context.write(k2,v2);              };       }             staticclass MyReducer extends Reducer<Text, KpiWritable, Text, KpiWritable>{              /**               * @param     k2    表示整个文件中不同的手机号码                     * @param     v2s  表示该手机号在不同时段的流量的集合               */              protectedvoid reduce(Text k2, java.lang.Iterable<KpiWritable> v2s,org.apache.hadoop.mapreduce.Reducer<Text,KpiWritable,Text,KpiWritable>.Contextcontext) throws IOException ,InterruptedException {                     longupPackNum = 0L;                     longdownPackNum = 0L;                     longupPayLoad = 0L;                     longdownPayLoad = 0L;                                         for(KpiWritable kpiWritable : v2s) {                            upPackNum+= kpiWritable.upPackNum;                            downPackNum+= kpiWritable.downPackNum;                            upPayLoad+= kpiWritable.upPayLoad;                            downPayLoad+= kpiWritable.downPayLoad;                     }                                         finalKpiWritable v3 = new KpiWritable(upPackNum+"",downPackNum+"", upPayLoad+"", downPayLoad+"");                     
context.write(k2,v3);              };       }} class KpiWritable implements Writable{       longupPackNum;       longdownPackNum;       longupPayLoad;       longdownPayLoad;             publicKpiWritable(){}             publicKpiWritable(String upPackNum, String downPackNum, String upPayLoad, StringdownPayLoad){              this.upPackNum= Long.parseLong(upPackNum);              this.downPackNum= Long.parseLong(downPackNum);              this.upPayLoad= Long.parseLong(upPayLoad);              this.downPayLoad= Long.parseLong(downPayLoad);       }                   @Override       publicvoid readFields(DataInput in) throws IOException {              this.upPackNum= in.readLong();              this.downPackNum= in.readLong();              this.upPayLoad= in.readLong();              this.downPayLoad= in.readLong();       }        @Override       publicvoid write(DataOutput out) throws IOException {              out.writeLong(upPackNum);              out.writeLong(downPackNum);              out.writeLong(upPayLoad);              out.writeLong(downPayLoad);       }             @Override       publicString toString() {              returnupPackNum + "\t" + downPackNum + "\t" + upPayLoad +"\t" + downPayLoad;       }} 





数据上传到HDFS上面:

 

 

运行过程Console:

15/02/22 00:04:22 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/02/22 00:04:22 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/02/22 00:04:22 WARN mapred.JobClient: No job jar file set.  User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
15/02/22 00:04:22 INFO input.FileInputFormat: Total input paths to process : 1
15/02/22 00:04:22 WARN snappy.LoadSnappy: Snappy native library not loaded
15/02/22 00:04:23 INFO mapred.JobClient: Running job: job_local1887351217_0001
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Waiting for map tasks
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Starting task: attempt_local1887351217_0001_m_000000_0
15/02/22 00:04:23 INFO mapred.Task:  Using ResourceCalculatorPlugin : null
15/02/22 00:04:23 INFO mapred.MapTask: Processing split: hdfs://192.168.1.100:9000/input/HTTP_20130313143750.dat:0+2214
15/02/22 00:04:23 INFO mapred.MapTask: io.sort.mb = 100
15/02/22 00:04:23 INFO mapred.MapTask: data buffer = 79691776/99614720
15/02/22 00:04:23 INFO mapred.MapTask: record buffer = 262144/327680
15/02/22 00:04:23 INFO mapred.MapTask: Starting flush of map output
15/02/22 00:04:23 INFO mapred.MapTask: Finished spill 0
15/02/22 00:04:23 INFO mapred.Task: Task:attempt_local1887351217_0001_m_000000_0 is done. And is in the process of commiting
15/02/22 00:04:23 INFO mapred.LocalJobRunner:
15/02/22 00:04:23 INFO mapred.Task: Task 'attempt_local1887351217_0001_m_000000_0' done.
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Finishing task: attempt_local1887351217_0001_m_000000_0
15/02/22 00:04:23 INFO mapred.LocalJobRunner: Map task executor complete.
15/02/22 00:04:23 INFO mapred.Task:  Using ResourceCalculatorPlugin : null
15/02/22 00:04:23 INFO mapred.LocalJobRunner:
15/02/22 00:04:23 INFO mapred.Merger: Merging 1 sorted segments
15/02/22 00:04:23 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 1011 bytes
15/02/22 00:04:23 INFO mapred.LocalJobRunner:
15/02/22 00:04:24 INFO mapred.Task: Task:attempt_local1887351217_0001_r_000000_0 is done. And is in the process of commiting
15/02/22 00:04:24 INFO mapred.LocalJobRunner:
15/02/22 00:04:24 INFO mapred.Task: Task attempt_local1887351217_0001_r_000000_0 is allowed to commit now
15/02/22 00:04:24 INFO mapred.JobClient:  map 100% reduce 0%
15/02/22 00:04:24 INFO output.FileOutputCommitter: Saved output of task 'attempt_local1887351217_0001_r_000000_0' to hdfs://192.168.1.100:9000/output/out02
15/02/22 00:04:24 INFO mapred.LocalJobRunner: reduce > reduce
15/02/22 00:04:24 INFO mapred.Task: Task 'attempt_local1887351217_0001_r_000000_0' done.
15/02/22 00:04:25 INFO mapred.JobClient:  map 100% reduce 100%
15/02/22 00:04:25 INFO mapred.JobClient: Job complete: job_local1887351217_0001
15/02/22 00:04:25 INFO mapred.JobClient: Counters: 19
15/02/22 00:04:25 INFO mapred.JobClient:   File Output Format Counters
15/02/22 00:04:25 INFO mapred.JobClient:     Bytes Written=556
15/02/22 00:04:25 INFO mapred.JobClient:   File Input Format Counters
15/02/22 00:04:25 INFO mapred.JobClient:     Bytes Read=2214
15/02/22 00:04:25 INFO mapred.JobClient:   FileSystemCounters
15/02/22 00:04:25 INFO mapred.JobClient:     FILE_BYTES_READ=1365
15/02/22 00:04:25 INFO mapred.JobClient:     HDFS_BYTES_READ=4428
15/02/22 00:04:25 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=141054
15/02/22 00:04:25 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=556
15/02/22 00:04:25 INFO mapred.JobClient:   Map-Reduce Framework
15/02/22 00:04:25 INFO mapred.JobClient:     Map output materialized bytes=1015
15/02/22 00:04:25 INFO mapred.JobClient:     Map input records=22
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce shuffle bytes=0
15/02/22 00:04:25 INFO mapred.JobClient:     Spilled Records=44
15/02/22 00:04:25 INFO mapred.JobClient:     Map output bytes=965
15/02/22 00:04:25 INFO mapred.JobClient:     Total committed heap usage (bytes)=323878912
15/02/22 00:04:25 INFO mapred.JobClient:     Combine input records=0
15/02/22 00:04:25 INFO mapred.JobClient:     SPLIT_RAW_BYTES=120
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce input records=22
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce input groups=21
15/02/22 00:04:25 INFO mapred.JobClient:     Combine output records=0
15/02/22 00:04:25 INFO mapred.JobClient:     Reduce output records=21
15/02/22 00:04:25 INFO mapred.JobClient:     Map output records=22


 

 

HDFS显示运行结果:

 

 

 


 

 

 

附数据:

 

格式为:记录报告时间戳、手机号码、AP mac、AC mac、访问的网址、网址种类、上行数据包数、下行数据包数、上行总流量、下行总流量、HTTP Response的状态。

 

 

1363157985066     13726230503  00-FD-07-A4-72-B8:CMCC  120.196.100.82      i02.c.aliimg.com           24    27    2481       24681     200
1363157995052     13826544101  5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4                4     0     264  0     200
1363157991076     13926435656  20-10-7A-28-CC-0A:CMCC  120.196.100.99                    2     4     132  1512       200
1363154400022     13926251106  5C-0E-8B-8B-B1-50:CMCC  120.197.40.4                4     0     240  0     200
1363157993044     18211575961  94-71-AC-CD-E6-18:CMCC-EASY     120.196.100.99      iface.qiyi.com 视频网站       15    12       1527       2106       200
1363157995074     84138413       5C-0E-8B-8C-E8-20:7DaysInn     120.197.40.4  122.72.52.12         20    16    4116       1432       200
1363157993055     13560439658  C4-17-FE-BA-DE-D9:CMCC       120.196.100.99                    18    15    1116 954  200
1363157995033     15920133257  5C-0E-8B-C7-BA-20:CMCC 120.197.40.4  sug.so.360.cn  信息安全       20    20    3156       2936       200
1363157983019     13719199419  68-A1-B7-03-07-B1:CMCC-EASY      120.196.100.82                    4     0     240  0     200
1363157984041     13660577991  5C-0E-8B-92-5C-20:CMCC-EASY      120.197.40.4  s19.cnzz.com  站点统计       24    9       6960       690  200
1363157973098     15013685858  5C-0E-8B-C7-F7-90:CMCC  120.197.40.4  rank.ie.sogou.com  搜索引擎       28    27    3659       3538       200
1363157986029     15989002119  E8-99-C4-4E-93-E0:CMCC-EASY      120.196.100.99      www.umeng.com   站点统计       3       3     1938       180  200
1363157992093     13560439658  C4-17-FE-BA-DE-D9:CMCC       120.196.100.99                    15    9     918  4938       200
1363157986041     13480253104  5C-0E-8B-C7-FC-80:CMCC-EASY     120.197.40.4                3     3     180  180  200
1363157984040     13602846565  5C-0E-8B-8B-B6-00:CMCC  120.197.40.4  2052.flash2-http.qq.com       综合门户       15    12       1938       2910       200
1363157995093     13922314466  00-FD-07-A2-EC-BA:CMCC       120.196.100.82      img.qfc.cn             12    12    3008       3720       200
1363157982040     13502468823  5C-0A-5B-6A-0B-D4:CMCC-EASY    120.196.100.99      y0.ifengimg.com    综合门户       57       102  7335       110349    200
1363157986072     18320173382  84-25-DB-4F-10-1A:CMCC-EASY     120.196.100.99      input.shouji.sogou.com  搜索引擎       21    18    9531       2412       200
1363157990043     13925057413  00-1F-64-E1-E6-9A:CMCC  120.196.100.55      t3.baidu.com   搜索引擎       69    63    11058       48243     200
1363157988072     13760778710  00-FD-07-A4-7B-08:CMCC  120.196.100.82                    2     2     120  120  200
1363157985079     13823070001  20-7C-8F-70-68-1F:CMCC   120.196.100.99                    6     3     360  180  200
1363157985069     13600217502  00-1F-64-E2-E8-B1:CMCC  120.196.100.55                    18    138  1080       186852    200


 

 

0 0
原创粉丝点击