使用 Hortonworks Sandbox 练习 Hadoop 和 MapReduce

最近在上 Coursera 的云计算系列课程。在 Cloud Application 这门课里,需要编写并提交 MapReduce 练习作业。可以便捷地模拟 Hadoop 环境的虚拟机是 Hortonworks Sandbox。


# 说明:以下命令以 TopTitleStatistics 作业为例;运行其他作业(例如下文的 TopTitles)时,替换相应的类名和输出路径即可

# 添加环境变量
export HADOOP_CLASSPATH=$JAVA_HOME/lib/tools.jar

# 编译
hadoop com.sun.tools.javac.Main TopTitleStatistics.java -d build

# 打包jar
jar -cvf TopTitleStatistics.jar -C build/ ./

# 执行
hadoop jar TopTitleStatistics.jar TopTitleStatistics -D stopwords=/mp2/misc/stopwords.txt -D delimiters=/mp2/misc/delimiters.txt -D N=5 /mp2/titles /mp2/C-output

# 查看输出结果
hadoop fs -cat /mp2/C-output/part* | head -n 100

# 删除输出和编译结果(如果要重新运行,必须删除输出)
hadoop fs -rm -r /mp2/C-output
rm -rf ./build/* ./TopTitleStatistics.jar


import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.ArrayWritable;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.util.Arrays;import java.util.List;import java.util.StringTokenizer;import java.util.TreeSet;/* * TopTitles.java */// >>> Don't Changepublic class TopTitles extends Configured implements Tool {    public static void main(String[] args) throws Exception {        int res = ToolRunner.run(new Configuration(), new TopTitles(), args);        System.exit(res);    }    @Override    public int run(String[] args) throws Exception {        Configuration conf = this.getConf();        FileSystem fs = FileSystem.get(conf);        Path tmpPath = new Path("/mp2/tmp");        fs.delete(tmpPath, true);        Job jobA = Job.getInstance(conf, "Title Count");        jobA.setOutputKeyClass(Text.class);        jobA.setOutputValueClass(IntWritable.class);        jobA.setMapperClass(TitleCountMap.class);        jobA.setReducerClass(TitleCountReduce.class);        FileInputFormat.setInputPaths(jobA, new Path(args[0]));        FileOutputFormat.setOutputPath(jobA, tmpPath);        
jobA.setJarByClass(TopTitles.class);        jobA.waitForCompletion(true);        Job jobB = Job.getInstance(conf, "Top Titles");        jobB.setOutputKeyClass(Text.class);        jobB.setOutputValueClass(IntWritable.class);        jobB.setMapOutputKeyClass(NullWritable.class);        jobB.setMapOutputValueClass(TextArrayWritable.class);        jobB.setMapperClass(TopTitlesMap.class);        jobB.setReducerClass(TopTitlesReduce.class);        jobB.setNumReduceTasks(1);        FileInputFormat.setInputPaths(jobB, tmpPath);        FileOutputFormat.setOutputPath(jobB, new Path(args[1]));        jobB.setInputFormatClass(KeyValueTextInputFormat.class);        jobB.setOutputFormatClass(TextOutputFormat.class);        jobB.setJarByClass(TopTitles.class);        return jobB.waitForCompletion(true) ? 0 : 1;    }    public static String readHDFSFile(String path, Configuration conf) throws IOException{        Path pt=new Path(path);        FileSystem fs = FileSystem.get(pt.toUri(), conf);        FSDataInputStream file = fs.open(pt);        BufferedReader buffIn=new BufferedReader(new InputStreamReader(file));        StringBuilder everything = new StringBuilder();        String line;        while( (line = buffIn.readLine()) != null) {            everything.append(line);            everything.append("\n");        }        return everything.toString();    }    public static class TextArrayWritable extends ArrayWritable {        public TextArrayWritable() {            super(Text.class);        }        public TextArrayWritable(String[] strings) {            super(Text.class);            Text[] texts = new Text[strings.length];            for (int i = 0; i < strings.length; i++) {                texts[i] = new Text(strings[i]);            }            set(texts);        }    }// <<< Don't Change    public static class TitleCountMap extends Mapper<Object, Text, Text, IntWritable> {        List<String> stopWords;        String delimiters;        @Override        protected void 
setup(Context context) throws IOException,InterruptedException {            Configuration conf = context.getConfiguration();            String stopWordsPath = conf.get("stopwords");            String delimitersPath = conf.get("delimiters");            this.stopWords = Arrays.asList(readHDFSFile(stopWordsPath, conf).split("\n"));            this.delimiters = readHDFSFile(delimitersPath, conf);        }        @Override        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {        // TODO            String line = value.toString();            StringTokenizer st = new StringTokenizer(line, delimiters);            while (st.hasMoreTokens()) {                String word = (st.nextToken()).trim().toLowerCase();                if (!stopWords.contains(word)) {                    context.write(new Text(word), new IntWritable(1));                }            }        }    }    public static class TitleCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {        @Override        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {            // TODO            int sum = 0;            for (IntWritable val: values) {                sum += val.get();            }            context.write(key, new IntWritable(sum));        }    }    public static class TopTitlesMap extends Mapper<Text, Text, NullWritable, TextArrayWritable> {        Integer N;        // TODO        /*         * add TreeSet. 
Item in treeset are sorted acsended         * sorted by KEY automatically         */        TreeSet<Pair<Integer,String>> titleCountMap = new TreeSet<Pair<Integer,String>>();        @Override        protected void setup(Context context) throws IOException,InterruptedException {            Configuration conf = context.getConfiguration();            this.N = conf.getInt("N", 10);        }        @Override        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {            // TODO            String word = key.toString();            Integer count = Integer.parseInt(value.toString());            titleCountMap.add(new Pair<Integer,String>(count,word));            if (titleCountMap.size() > N) {  //remove too much items, no more than N (default 10)                titleCountMap.remove(titleCountMap.first());            }        }        @Override        protected void cleanup(Context context) throws IOException, InterruptedException {            // TODO            //When mapper is nearly finish, method cleanup() is called            for (Pair<Integer,String> item: titleCountMap) {                String[] strings = {item.second, item.first.toString()};                TextArrayWritable val = new TextArrayWritable(strings);                context.write(NullWritable.get(), val);            }        }    }    public static class TopTitlesReduce extends Reducer<NullWritable, TextArrayWritable, Text, IntWritable> {        Integer N;        // TODO        TreeSet<Pair<Integer,String>> titleCountMap = new TreeSet<Pair<Integer,String>>();        @Override        protected void setup(Context context) throws IOException,InterruptedException {            Configuration conf = context.getConfiguration();            this.N = conf.getInt("N", 10);        }        @Override        public void reduce(NullWritable key, Iterable<TextArrayWritable> values, Context context) throws IOException, InterruptedException {            // TODO            
//Because Mapper's output key is a NullWritable, all of the output will send to a single reducer            for (TextArrayWritable val:values) {                Text[] pair = (Text[]) val.toArray();                String word = pair[0].toString();                Integer count = Integer.parseInt(pair[1].toString());                titleCountMap.add(new Pair<Integer,String>(count,word));            }            if (titleCountMap.size() > N) {                titleCountMap.remove(titleCountMap.first());            }            for (Pair<Integer,String> item:titleCountMap) {                Text word = new Text(item.second);                IntWritable count = new IntWritable(item.first);                context.write(word, count);            }        }    }}// >>> Don't Changeclass Pair<A extends Comparable<? super A>,        B extends Comparable<? super B>>        implements Comparable<Pair<A, B>> {    public final A first;    public final B second;    public Pair(A first, B second) {        this.first = first;        this.second = second;    }    public static <A extends Comparable<? super A>,            B extends Comparable<? super B>>    Pair<A, B> of(A first, B second) {        return new Pair<A, B>(first, second);    }    @Override    public int compareTo(Pair<A, B> o) {        int cmp = o == null ? 1 : (this.first).compareTo(o.first);        return cmp == 0 ? (this.second).compareTo(o.second) : cmp;    }    @Override    public int hashCode() {        return 31 * hashcode(first) + hashcode(second);    }    private static int hashcode(Object o) {        return o == null ? 
0 : o.hashCode();    }    @Override    public boolean equals(Object obj) {        if (!(obj instanceof Pair))            return false;        if (this == obj)            return true;        return equal(first, ((Pair<?, ?>) obj).first)                && equal(second, ((Pair<?, ?>) obj).second);    }    private boolean equal(Object o1, Object o2) {        return o1 == o2 || (o1 != null && o1.equals(o2));    }    @Override    public String toString() {        return "(" + first + ", " + second + ')';    }}// <<< Don't Change
