User Defined Hadoop DataType
来源:互联网 发布:时尚女装淘宝店铺推荐 编辑:程序博客网 时间:2024/05/21 04:41
User Defined Hadoop DataType
目录
- User Defined Hadoop DataType
- 目录
- 需求
- 实现
- 运行
需求
有时候 Hadoop 内置的数据类型不能满足我们的要求,这个时候就需要自定义类型了。
假设输入文件是很多电话号码,每行一个:
13612345678
13051812535
13051812535
13912345677
13412345678
要求按照如下格式输出
13412345678 is subscribed from ***China Mobile***, appearing 1 times
其中的 China Mobile 和 1,都是算出来的。
实现
需要一个电话号码类 TelNo,需要实现 WritableComparable 接口。
// TelNo.javapackage com.stephen.hadoop;import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import org.apache.hadoop.io.WritableComparable;public class TelNo implements WritableComparable<TelNo>{ private String no; private String operator; private Integer times; private transient final int BEGINPOS = 0; private transient final int ENDPOS = 3; public TelNo() {} @Override public void write(DataOutput out) throws IOException { out.writeUTF(no); out.writeUTF(operator); out.writeInt(times); } @Override public void readFields(DataInput in) throws IOException { no = in.readUTF(); operator = in.readUTF(); times = in.readInt(); } @Override public int compareTo(TelNo o) { return this.no.compareTo(o.getNo()); } public boolean equals(Object o) { if( !(o instanceof TelNo)) { return false; } TelNo other = (TelNo) o; return this.no.compareTo(other.getNo()) == 0; } public int hashCode() { return no.hashCode(); } public Integer getTimes() { return times; } public void setTimes(Integer times) { this.times = times; } public void setNo(String no) { this.no = no; } public String getNo() { return no; } public String getOperator() { String header = no.substring(BEGINPOS, ENDPOS); if (header.compareTo("130") >= 0) { if (header.compareTo("135") <= 0) { operator = "***China Mobile***"; } else if (header.compareTo("137") <= 0) { operator = "***China Unicom***"; } else if (header.compareTo("139") <= 0) { operator = "***China Telecom***"; } else { operator = "***Invalid Operator***"; } } return operator; } @Override public String toString() { return "is subscribed from " + getOperator() + ", appearing " + times + " times"; }}
MapReduce 实现如下(代码中通过 setPartitionerClass 注册了 Partitioner,但默认只有 1 个 Reducer 时它不起作用;需用 -D mapreduce.job.reduces 指定多个 Reducer 才会分区,见下文"运行"一节)
// TelNoCategorizerTool.javapackage com.stephen.hadoop;import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Partitioner;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;public class TelNoCategorizerTool extends Configured implements Tool { public static class TelNoMapper extends Mapper<LongWritable, Text, Text, LongWritable> { private Text telno = new Text(); private final static LongWritable one = new LongWritable(1); public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String newkey = value.toString(); telno.set(newkey); context.write(telno, one); } } public static class TelNoReducer extends Reducer<Text, LongWritable, Text, TelNo> { private TelNo telNo = new TelNo(); public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (LongWritable val : values) { sum += val.get(); } telNo.setNo(key.toString()); telNo.setTimes(sum); context.write(key, telNo); } } public static class OperatorPartitioner<K, V> extends Partitioner<K, V> { private static final List<String> mobileNumList = new ArrayList<>(); private static final List<String> unicomNumList = new ArrayList<>(); private static final List<String> telecomNumList = new ArrayList<>(); static { mobileNumList.add("130"); mobileNumList.add("131"); 
mobileNumList.add("132"); mobileNumList.add("133"); mobileNumList.add("134"); mobileNumList.add("135"); unicomNumList.add("136"); unicomNumList.add("137"); telecomNumList.add("138"); telecomNumList.add("139"); } @Override public int getPartition(K key, V value, int numReduceTasks) { String telNoHead = key.toString().substring(0, 3); if (mobileNumList.contains(telNoHead)) { return 1; } else if (unicomNumList.contains(telNoHead)) { return 2; } else if (telecomNumList.contains(telNoHead)) { return 3; } else { return 0; } } } @Override public int run(String[] args) throws Exception { Configuration conf = this.getConf(); Job job = Job.getInstance(conf, "Telno Categorizer"); job.setJarByClass(TelNoCategorizerTool.class); job.setMapperClass(TelNoMapper.class); job.setReducerClass(TelNoReducer.class); // 只对 Mapper 生效 job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); /** * 这两个方法对 Mapper 和 Reducer 都生效 * 所以要在上面单独指定 Mapper 的Key 和 Value 的格式 * 没有 setReduceOutputKeyClass...方法 */ job.setOutputKeyClass(Text.class); job.setOutputValueClass(TelNo.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); job.setPartitionerClass(OperatorPartitioner.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; } public static void main(String[] args) throws Exception { int exitCode = ToolRunner.run(new Configuration(), new TelNoCategorizerTool(), args); System.exit(exitCode); }}
运行
执行一下:
hadoop jar TelNoCategorizerTool.jar com.stephen.hadoop.TelNoCategorizerTool /user/stephen/input/ /user/stephen/output
查看结果:
hadoop fs -cat /user/stephen/output/part-r-00000#output13051812535 is subscribed from ***China Mobile***, appearing 2 times13412345678 is subscribed from ***China Mobile***, appearing 1 times13612345678 is subscribed from ***China Unicom***, appearing 1 times13912345677 is subscribed from ***China Telecom***, appearing 1 times
如果想要分区:
hadoop jar TelNoCategorizerTool.jar com.stephen.hadoop.TelNoCategorizerTool -D mapreduce.job.reduces=4 /user/stephen/input/ /user/stephen/output
能看到 3 个文件(使用了 LazyOutputFormat,不会输出空记录),分别包含了分区后的记录。
hadoop fs -ls /user/stephen/output/#output-rw-r--r-- 1 root supergroup 0 2016-08-26 13:48 /user/stephen/output/_SUCCESS-rw-r--r-- 1 root supergroup 144 2016-08-26 13:48 /user/stephen/output/part-r-00001-rw-r--r-- 1 root supergroup 72 2016-08-26 13:48 /user/stephen/output/part-r-00002-rw-r--r-- 1 root supergroup 73 2016-08-26 13:48 /user/stephen/output/part-r-00003
3 个文件的内容合并起来就是之前的 part-r-00000 的内容。
0 0
- User Defined Hadoop DataType
- 13.4 User-defined conversions
- User-defined conversions
- User Defined Window Messages
- [PHP] User Defined Extension
- UDF (User-defined Function)
- python User-defined Exceptions
- User Defined Functions
- gdb user defined command
- User Defined Runtime Attributes
- C++ User-defined Datatypes
- User Defined Runtime Attributes
- User Defined Runtime Attributes
- User Defined Runtime Attributes
- User Defined Runtime Attributes
- User Defined Runtime Attributes
- NOKIA tune ringtones: user defined
- MFC handle user defined message
- 在 SOHU BLOG 开一个空间
- 杂七杂八 洋酒不习惯
- Spring整合redis的错误
- BZOJ4510——[Usaco2016 Jan]Radio Contact
- 悼念贝娜齐尔·布托
- User Defined Hadoop DataType
- 看不下去的《集结号》
- JS(JavaScript) 与 JSP(Java server page) 的区别
- 说说大清国的铁帽子王
- java.lang.reflect.InvocationTargetException 异常(Json转化时抛出)
- 吉林大学唯唐敖庆一人尔
- Filter的生命周期
- you and me ,from one world, we are family
- 周鸿祎博客:谷歌Chrome不是浏览器 说的不错