Summary of the Chapter 4 code in Hadoop in Action (rewritten for the Hadoop 0.20 API)
Most of the code in Hadoop in Action (I am using what appears to be the first edition) is written against Hadoop 0.18.3. While working through the first part of Chapter 4 today, I rewrote the examples against the Hadoop 0.20.2 API. Hadoop 0.20 changes a few things relative to 0.18. The most useful change is the introduction of the context object: its most direct effect is that the Context replaces the OutputCollector and Reporter objects that map() and reduce() used to take, so key/value pairs are now emitted with context.write() instead of outputCollector.collect(). I do not yet fully understand the deeper consequences, but the book discusses them. The new map() and reduce() methods can also throw one more exception, InterruptedException (see the code below).
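To make the change concrete, here is a minimal side-by-side sketch of the same map() written against the two APIs (my own illustration rather than code from the book; the LongWritable/Text type parameters are just placeholders):

// Old API (org.apache.hadoop.mapred), the Hadoop 0.18 style:
public static class OldMapClass extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        output.collect(new Text("key"), value);   // emit through OutputCollector
    }
}

// New API (org.apache.hadoop.mapreduce), the Hadoop 0.20 style:
public static class NewMapClass extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {   // InterruptedException is the extra exception
        context.write(new Text("key"), value);           // emit through the Context object
    }
}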
The basic skeleton under the new API is as follows:
public class xxx extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "MyJob");
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        // set the following parameters to suit your own job
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new xxx(), args);
        System.exit(res);
    }
}
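For contrast, the driver in the old API is organized around JobConf and JobClient rather than Job. A rough sketch from memory (not code from the book; it assumes the org.apache.hadoop.mapred versions of FileInputFormat, FileOutputFormat and the input/output format classes):

public int run(String[] args) throws Exception {
    // Old-API driver: configure with JobConf, submit with JobClient
    JobConf job = new JobConf(getConf(), MyJob.class);
    job.setJobName("MyJob");
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormat(TextInputFormat.class);     // note: setInputFormat, not setInputFormatClass
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    JobClient.runJob(job);   // blocks until the job finishes
    return 0;
}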
1. Reading the patent citation data and inverting it

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyJob extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // each input line has the form xxx,yyy where yyy is the cited patent number
            String[] citation = value.toString().split(",");
            // emit (yyy, xxx): yyy is cited by xxx
            context.write(new Text(citation[1]), new Text(citation[0]));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // concatenate all citing patents into one comma-separated list
            String csv = "";
            for (Text val : values) {
                if (csv.length() > 0) csv += ",";
                csv += val.toString();
            }
            context.write(key, new Text(csv));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "MyJob");
        // job.setJarByClass(MyJob.class); -- it ran even without this. Why?
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);    // input data format
        job.setOutputFormatClass(TextOutputFormat.class);  // output data format
        job.setOutputKeyClass(Text.class);    // output key type (also the map output key type unless setMapOutputKeyClass is called)
        job.setOutputValueClass(Text.class);  // output value type (also the map output value type unless setMapOutputValueClass is called)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MyJob(), args);
        System.exit(res);
    }
}
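One small aside on the reducer above: building the comma-separated list with String += copies the whole string on every append, which gets slow for patents with very long citation lists. A StringBuilder variant (my own tweak, not from the book) avoids that:

public static class Reduce extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // StringBuilder appends in place instead of re-copying the whole string each time
        StringBuilder csv = new StringBuilder();
        for (Text val : values) {
            if (csv.length() > 0) {
                csv.append(',');
            }
            csv.append(val.toString());
        }
        context.write(key, new Text(csv.toString()));
    }
}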
2. Counting how many times each patent is cited
Starting from the program above, only the map/reduce part needs to change; in fact the Mapper can stay exactly the same and only the Reducer below is swapped in (it also needs import java.util.Iterator, and strictly speaking job.setOutputValueClass should then be IntWritable.class).
public static class Reduce extends Reducer<Text, Text, Text, IntWritable> {
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Iterable has no method that returns the total count directly,
        // so walk its iterator and count the elements
        Iterator<Text> it = values.iterator();
        int count = 0;
        while (it.hasNext()) {
            it.next();
            count++;
        }
        context.write(key, new IntWritable(count));
    }
}

The output:

1	2
10000	1
100000	1
1000006	1
1000007	1
1000011	1
1000017	1
1000026	1
1000033	2
1000043	1
1000044	2
1000045	1
1000046	2
1000049	1
1000051	1
1000054	1
1000065	1
1000067	3
1000070	2
1000073	2
1000076	2
1000083	2
1000084	2
1000086	4

The first column is the patent number and the second is the number of times it was cited. The very first output line is actually "CITED"<tab>1 (produced from the header row of the input file); that line should be deleted, otherwise the next experiment cannot proceed, even though the code itself is correct.
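Going back to the counting reduce() for a moment: although Iterable has no size() method, it does work directly with the enhanced for loop, so the explicit iterator is not strictly needed. An equivalent sketch (my own variation):

public static class Reduce extends Reducer<Text, Text, Text, IntWritable> {
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (Text val : values) {
            count++;   // each value is one citing patent, so just count them
        }
        context.write(key, new IntWritable(count));
    }
}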
3. The final statistics program

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class CitationHistogram extends Configured implements Tool {

    /* The input of this program should be the output of the previous one,
       whose key and value are separated by a tab (\t). */
    public static class MapClass extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
        // the input key is a LongWritable: the byte offset of each line
        private final static IntWritable uno = new IntWritable(1);
        private final static IntWritable cit = new IntWritable();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] s = value.toString().split("\t");
            cit.set(Integer.parseInt(s[1]));   // s[1] is the number of times a patent is cited
            context.write(cit, uno);           // key = citation count, value = 1
        }
    }

    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // gather up the scattered 1s and total them
            int count = 0;
            Iterator<IntWritable> it = values.iterator();
            while (it.hasNext()) {
                it.next();
                count++;
            }
            context.write(key, new IntWritable(count));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "CitationHistogram");
        job.setJarByClass(CitationHistogram.class);
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        // job.setInputFormatClass(KeyValueTextInputFormat.class);
        // The new (mapreduce) API in this version has no KeyValueTextInputFormat, even though
        // the tab-separated input above would suit it perfectly, so fall back to TextInputFormat.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CitationHistogram(), args);
        System.exit(res);
    }
}
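Related to the earlier note about the leftover "CITED" line: rather than editing the intermediate file by hand, the map() above could simply skip any record that does not look like a numeric patent number followed by a numeric count. A defensive sketch (my own variation, not in the book):

public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String[] s = value.toString().split("\t");
    // a real record is <numeric patent number> \t <numeric count>;
    // anything else (e.g. the header-derived "CITED" line) is silently dropped
    if (s.length != 2 || !s[0].matches("\\d+") || !s[1].matches("\\d+")) {
        return;
    }
    cit.set(Integer.parseInt(s[1]));   // s[1] is the citation count
    context.write(cit, uno);           // key = citation count, value = 1
}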
The output is:

1	921127
2	552246
3	380319
4	278438
5	210814
6	163149
7	127941
....
631	1
633	1
654	1
658	1
678	1
716	1
779	1

The first line means that over 900K patents were cited exactly once, and so on down the list.