使用 Hortonworks Sandbox 练习 Hadoop 和 MapReduce
来源:互联网 发布:数据库里如何切换用户 编辑:程序博客网 时间:2024/05/17 17:41
最近在上 Coursera 的云计算系列课程。在 Cloud Applications 课程里,需要提交编写 MapReduce 的练习作业。便捷模拟 Hadoop 环境的虚拟机是 Hortonworks Sandbox。
开机之后可以SSH登陆,127.0.0.1:2222
# 添加环境变量
export HADOOP_CLASSPATH=$JAVA_HOME/lib/tools.jar
# 编译
hadoop com.sun.tools.javac.Main TopTitleStatistics.java -d build
# 打包 jar
jar -cvf TopTitleStatistics.jar -C build/ ./
# 执行
hadoop jar TopTitleStatistics.jar TopTitleStatistics -D stopwords=/mp2/misc/stopwords.txt -D delimiters=/mp2/misc/delimiters.txt -D N=5 /mp2/titles /mp2/C-output
# 查看输出结果
hadoop fs -cat /mp2/C-output/part* | head -n 100
# 删除输出和编译结果(如果要重新运行,必须删除输出)
hadoop fs -rm -r /mp2/C-output
rm -rf ./build/* ./TopTitleStatistics.jar
附源码
import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.ArrayWritable;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.util.Arrays;import java.util.List;import java.util.StringTokenizer;import java.util.TreeSet;/* * TopTitles.java */// >>> Don't Changepublic class TopTitles extends Configured implements Tool { public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new TopTitles(), args); System.exit(res); } @Override public int run(String[] args) throws Exception { Configuration conf = this.getConf(); FileSystem fs = FileSystem.get(conf); Path tmpPath = new Path("/mp2/tmp"); fs.delete(tmpPath, true); Job jobA = Job.getInstance(conf, "Title Count"); jobA.setOutputKeyClass(Text.class); jobA.setOutputValueClass(IntWritable.class); jobA.setMapperClass(TitleCountMap.class); jobA.setReducerClass(TitleCountReduce.class); FileInputFormat.setInputPaths(jobA, new Path(args[0])); FileOutputFormat.setOutputPath(jobA, tmpPath); jobA.setJarByClass(TopTitles.class); jobA.waitForCompletion(true); Job jobB = Job.getInstance(conf, "Top Titles"); 
jobB.setOutputKeyClass(Text.class); jobB.setOutputValueClass(IntWritable.class); jobB.setMapOutputKeyClass(NullWritable.class); jobB.setMapOutputValueClass(TextArrayWritable.class); jobB.setMapperClass(TopTitlesMap.class); jobB.setReducerClass(TopTitlesReduce.class); jobB.setNumReduceTasks(1); FileInputFormat.setInputPaths(jobB, tmpPath); FileOutputFormat.setOutputPath(jobB, new Path(args[1])); jobB.setInputFormatClass(KeyValueTextInputFormat.class); jobB.setOutputFormatClass(TextOutputFormat.class); jobB.setJarByClass(TopTitles.class); return jobB.waitForCompletion(true) ? 0 : 1; } public static String readHDFSFile(String path, Configuration conf) throws IOException{ Path pt=new Path(path); FileSystem fs = FileSystem.get(pt.toUri(), conf); FSDataInputStream file = fs.open(pt); BufferedReader buffIn=new BufferedReader(new InputStreamReader(file)); StringBuilder everything = new StringBuilder(); String line; while( (line = buffIn.readLine()) != null) { everything.append(line); everything.append("\n"); } return everything.toString(); } public static class TextArrayWritable extends ArrayWritable { public TextArrayWritable() { super(Text.class); } public TextArrayWritable(String[] strings) { super(Text.class); Text[] texts = new Text[strings.length]; for (int i = 0; i < strings.length; i++) { texts[i] = new Text(strings[i]); } set(texts); } }// <<< Don't Change public static class TitleCountMap extends Mapper<Object, Text, Text, IntWritable> { List<String> stopWords; String delimiters; @Override protected void setup(Context context) throws IOException,InterruptedException { Configuration conf = context.getConfiguration(); String stopWordsPath = conf.get("stopwords"); String delimitersPath = conf.get("delimiters"); this.stopWords = Arrays.asList(readHDFSFile(stopWordsPath, conf).split("\n")); this.delimiters = readHDFSFile(delimitersPath, conf); } @Override public void map(Object key, Text value, Context context) throws IOException, InterruptedException { // TODO String 
line = value.toString(); StringTokenizer st = new StringTokenizer(line, delimiters); while (st.hasMoreTokens()) { String word = (st.nextToken()).trim().toLowerCase(); if (!stopWords.contains(word)) { context.write(new Text(word), new IntWritable(1)); } } } } public static class TitleCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> { @Override public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { // TODO int sum = 0; for (IntWritable val: values) { sum += val.get(); } context.write(key, new IntWritable(sum)); } } public static class TopTitlesMap extends Mapper<Text, Text, NullWritable, TextArrayWritable> { Integer N; // TODO /* * add TreeSet. Item in treeset are sorted acsended * sorted by KEY automatically */ TreeSet<Pair<Integer,String>> titleCountMap = new TreeSet<Pair<Integer,String>>(); @Override protected void setup(Context context) throws IOException,InterruptedException { Configuration conf = context.getConfiguration(); this.N = conf.getInt("N", 10); } @Override public void map(Text key, Text value, Context context) throws IOException, InterruptedException { // TODO String word = key.toString(); Integer count = Integer.parseInt(value.toString()); titleCountMap.add(new Pair<Integer,String>(count,word)); if (titleCountMap.size() > N) { //remove too much items, no more than N (default 10) titleCountMap.remove(titleCountMap.first()); } } @Override protected void cleanup(Context context) throws IOException, InterruptedException { // TODO //When mapper is nearly finish, method cleanup() is called for (Pair<Integer,String> item: titleCountMap) { String[] strings = {item.second, item.first.toString()}; TextArrayWritable val = new TextArrayWritable(strings); context.write(NullWritable.get(), val); } } } public static class TopTitlesReduce extends Reducer<NullWritable, TextArrayWritable, Text, IntWritable> { Integer N; // TODO TreeSet<Pair<Integer,String>> titleCountMap = new 
TreeSet<Pair<Integer,String>>(); @Override protected void setup(Context context) throws IOException,InterruptedException { Configuration conf = context.getConfiguration(); this.N = conf.getInt("N", 10); } @Override public void reduce(NullWritable key, Iterable<TextArrayWritable> values, Context context) throws IOException, InterruptedException { // TODO //Because Mapper's output key is a NullWritable, all of the output will send to a single reducer for (TextArrayWritable val:values) { Text[] pair = (Text[]) val.toArray(); String word = pair[0].toString(); Integer count = Integer.parseInt(pair[1].toString()); titleCountMap.add(new Pair<Integer,String>(count,word)); } if (titleCountMap.size() > N) { titleCountMap.remove(titleCountMap.first()); } for (Pair<Integer,String> item:titleCountMap) { Text word = new Text(item.second); IntWritable count = new IntWritable(item.first); context.write(word, count); } } }}// >>> Don't Changeclass Pair<A extends Comparable<? super A>, B extends Comparable<? super B>> implements Comparable<Pair<A, B>> { public final A first; public final B second; public Pair(A first, B second) { this.first = first; this.second = second; } public static <A extends Comparable<? super A>, B extends Comparable<? super B>> Pair<A, B> of(A first, B second) { return new Pair<A, B>(first, second); } @Override public int compareTo(Pair<A, B> o) { int cmp = o == null ? 1 : (this.first).compareTo(o.first); return cmp == 0 ? (this.second).compareTo(o.second) : cmp; } @Override public int hashCode() { return 31 * hashcode(first) + hashcode(second); } private static int hashcode(Object o) { return o == null ? 
0 : o.hashCode(); } @Override public boolean equals(Object obj) { if (!(obj instanceof Pair)) return false; if (this == obj) return true; return equal(first, ((Pair<?, ?>) obj).first) && equal(second, ((Pair<?, ?>) obj).second); } private boolean equal(Object o1, Object o2) { return o1 == o2 || (o1 != null && o1.equals(o2)); } @Override public String toString() { return "(" + first + ", " + second + ')'; }}// <<< Don't Change
0 0
- 使用Hortonworks Sanbox 练习 Hadoop 和 MapReduce
- hortonworks sanbox搭建hadoop2的学习环境
- hortonworks hadoop相关安装
- 企业级三大hadoop-Cloudera、Hortonworks和MapR
- 使用Hadoop MapReduce 排序
- 使用Hortonworks的Hadoop发行版(hdp)在Windows系统上安装Hadoop集群
- MapReduce练习二:ChainMapper和ChainReducer的使用
- hadoop之MapReduce练习-二次排序
- Centos7下Hortonworks的Ambari-server和Hadoop集群平台重装.
- (译文)Cloudera、Hortonworks 和 MapR —— Hadoop商业发行版的对比分析
- Cloudera、Hortonworks 和 MapR —— Hadoop商业发行版的对比分析
- Hadoop—MapReduce进行数据查询和实现推简单荐系统---练习7
- Hadoop第7周练习—MapReduce进行数据查询和实现推简单荐系统
- 使用hadoop MapReduce进行排序
- 使用hadoop MapReduce进行排序
- Hadoop MapReduce进阶 使用Chain
- 使用Hadoop MapReduce 进行排序
- 使用Hadoop MapReduce 进行排序
- VC获取并修改计算机屏幕分辨率
- 设计模式--解释器模式
- http://blog.csdn.net/renfufei/article/details/41647937
- Android中Fragment碎片解析
- SVN学习笔记3 -- SVN及相关安装
- 使用Hortonworks Sanbox 练习 Hadoop 和 MapReduce
- 项目只输入ip即可访问
- iOS7之后JavaScript与Objective-C之间的通信
- CLRS第九章思考题
- 如何查询本机IP
- Android 开发环境下载地址 -- 百度网盘 adt-bundle android-studio sdk adt 下载
- 气急败坏
- godbc中使用mssql的小实例
- CATransition动画基本用法