HDPCD-Java Review Notes (8) - Lab


Java lab booklet


Adding a Combiner

The combiner greatly decreases the number of key/value pairs transferred across the network between the mappers and reducers.

Add the following line to WordCount's main() method:


job.setCombinerClass(IntSumReducer.class);  
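
For context, a minimal WordCount driver with the combiner registered might look like the sketch below. TokenizerMapper and IntSumReducer are the classes from the standard Hadoop WordCount example; treat the surrounding setup as an assumption, not the lab's exact driver:

Job job = Job.getInstance(new Configuration(), "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);   // pre-aggregate counts on the map side
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);

IntSumReducer works as a combiner because its input and output types are identical (Text, IntWritable) and summing is associative and commutative.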


Computing the Average of a Collection of Numbers

The MapReduce job computes and outputs, for each of the 50 states and the District of Columbia, the average of the year-2000 median household incomes of its counties.


Abbeville, SC,45001,6581,7471,6787,195278,302280,29673,40460,3042,3294
Acadia, LA,22001,13658,15450,16308,338561,618949,24788,40061,5686,5975
Accomack, VA,51001,9401,11507,10857,238824,444818,25404,38656,4720,5319


Notice the first value in each row is a county name, followed by the state. The third value is a unique ID for the county. The remaining values represent median incomes from various years. For example, the 10th value in each row is the median household income for that county for the year 2000. This is the column you are going to compute the average of for each state.
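
Concretely, the mapper shown below turns each of the three sample rows into one (state, "income,1") pair:

// Abbeville, SC,... -> ("SC", "40460,1")
// Acadia, LA,...    -> ("LA", "40061,1")
// Accomack, VA,...  -> ("VA", "38656,1")

The trailing ",1" carries a record count, so the combiner can pre-aggregate sums and counts without losing the information needed for a correct average.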

package average;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AverageJob extends Configured implements Tool {

    public static class AveragePartitioner extends Partitioner<Text, Text> {
        @Override
        public int getPartition(Text key, Text value, int numPartitions) {
            if (numPartitions == 1) {
                return 0;
            }
            // Mask off the sign bit so a negative hash code still maps to a valid partition.
            return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public enum Counters { MAP, COMBINE, REDUCE }

    public static class AverageMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text outputKey = new Text();
        private Text outputValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = StringUtils.split(value.toString(), '\\', ',');
            for (int i = 0; i < words.length; i++) {
                // State column.
                if (i == 1) {
                    outputKey.set(words[i].trim());
                }
                // Median household income column for the year 2000.
                if (i == 9) {
                    outputValue.set(words[i].trim() + ",1");
                }
            }
            context.getCounter(Counters.MAP).increment(1);
            context.write(outputKey, outputValue);
        }
    }

    public static class AverageCombiner extends Reducer<Text, Text, Text, Text> {
        private Text outputValue = new Text();
        private long sum = 0;
        private int count = 0;

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String[] tmp;
            for (Text value : values) {
                tmp = StringUtils.split(value.toString(), ',');
                sum += Long.parseLong(tmp[0]);
                count += Integer.parseInt(tmp[1]);
            }
            // Emit the partial sum and count, then reset for the next key group.
            outputValue.set(sum + "," + count);
            context.getCounter(Counters.COMBINE).increment(1);
            context.write(key, outputValue);
            sum = 0;
            count = 0;
        }
    }

    public static class AverageReducer extends Reducer<Text, Text, Text, DoubleWritable> {
        private DoubleWritable outputValue = new DoubleWritable();
        private double sum = 0;
        private int count = 0;

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String[] tmp;
            for (Text value : values) {
                tmp = StringUtils.split(value.toString(), ',');
                sum += Long.parseLong(tmp[0]);
                count += Integer.parseInt(tmp[1]);
            }
            outputValue.set(sum / count);
            context.getCounter(Counters.REDUCE).increment(1);
            context.write(key, outputValue);
            sum = 0;
            count = 0;
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, "AverageJob");
        job.setJarByClass(AverageJob.class);

        Path out = new Path("counties/output");
        FileInputFormat.setInputPaths(job, "counties");
        FileOutputFormat.setOutputPath(job, out);
        out.getFileSystem(conf).delete(out, true);

        job.setMapperClass(AverageMapper.class);
        job.setReducerClass(AverageReducer.class);
        job.setCombinerClass(AverageCombiner.class);
        job.setPartitionerClass(AveragePartitioner.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setNumReduceTasks(5);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int result = 0;
        try {
            result = ToolRunner.run(new Configuration(), new AverageJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}


Writing a Custom Partitioner

The Average MapReduce job runs with five reducers, and the custom partitioner spreads the key/value pairs roughly evenly across them.

public static class AveragePartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        if (numPartitions == 1) {
            return 0;
        }
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

job.setNumReduceTasks(5);
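
The & Integer.MAX_VALUE mask is what keeps the returned partition valid: a Text key's hashCode() can be negative, and in Java the % operator returns a negative result for a negative dividend. A small sketch of the arithmetic:

// Why the mask matters: hashCode() may be negative, and Java's %
// preserves the dividend's sign.
int hash = new org.apache.hadoop.io.Text("SC").hashCode();  // may be negative for some keys
int bad  = hash % 5;                          // invalid partition whenever hash < 0
int good = (hash & Integer.MAX_VALUE) % 5;    // sign bit cleared: always in 0..4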


Writing a Custom Output Format

A MapReduce job that outputs the growth (or loss) of stock dividends.

exchange,stock_symbol,date,dividends
NYSE,AIT,2009-11-12,0.15
NYSE,AIT,2009-08-12,0.15
NYSE,AIT,2009-05-13,0.15
NYSE,AIT,2009-02-11,0.15

package customsort;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DividendOutputFormat extends FileOutputFormat<NullWritable, DividendChange> {

    @Override
    public RecordWriter<NullWritable, DividendChange> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        // One output file per task, named <jobName>_<partition>.
        int partition = job.getTaskAttemptID().getTaskID().getId();
        Path outputDir = FileOutputFormat.getOutputPath(job);
        Path filename = new Path(outputDir.getName() + Path.SEPARATOR
                + job.getJobName() + "_" + partition);
        FileSystem fs = filename.getFileSystem(job.getConfiguration());
        FSDataOutputStream dos = fs.create(filename);
        return new DividendRecordWriter(dos);
    }
}
package customsort;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class DividendRecordWriter extends RecordWriter<NullWritable, DividendChange> {

    public final String SEPARATOR = ",";
    private DataOutputStream out;

    public DividendRecordWriter(DataOutputStream out) {
        this.out = out;
    }

    @Override
    public void write(NullWritable key, DividendChange value)
            throws IOException, InterruptedException {
        // Emit one CSV line per record: symbol,date,change
        StringBuilder result = new StringBuilder();
        result.append(value.getSymbol());
        result.append(SEPARATOR);
        result.append(value.getDate());
        result.append(SEPARATOR);
        result.append(value.getChange());
        result.append("\n");
        out.write(result.toString().getBytes());
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        out.close();
    }
}
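
Used on its own, the custom format would be wired into a driver with a single call, shown below; the DividendJob driver that follows actually comments this line out in favor of MultipleOutputs:

job.setOutputFormatClass(DividendOutputFormat.class);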
package customsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class DividendJob extends Configured implements Tool {

    public static class DividendGrowthMapper extends Mapper<LongWritable, Text, Stock, DoubleWritable> {
        private Stock outputKey = new Stock();
        private DoubleWritable outputValue = new DoubleWritable();
        private final String EXCHANGE = "exchange";

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = StringUtils.split(value.toString(), '\\', ',');
            // Skip the header row.
            if (EXCHANGE.equals(words[0])) {
                return;
            }
            outputKey.setSymbol(words[1]);
            outputKey.setDate(words[2]);
            outputValue.set(Double.parseDouble(words[3]));
            context.write(outputKey, outputValue);
        }
    }

    public static class StockPartitioner extends Partitioner<Stock, DoubleWritable> {
        @Override
        public int getPartition(Stock key, DoubleWritable value, int numReduceTasks) {
            // Partition by the first letter of the symbol so every record
            // for a given symbol reaches the same reducer.
            char firstLetter = key.getSymbol().trim().charAt(0);
            return (firstLetter - 'A') % numReduceTasks;
        }
    }

    public static class DividendGrowthReducer extends Reducer<Stock, DoubleWritable, NullWritable, DividendChange> {
        private NullWritable outputKey = NullWritable.get();
        private DividendChange outputValue = new DividendChange();
        private MultipleOutputs<NullWritable, DividendChange> mos;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            mos = new MultipleOutputs<NullWritable, DividendChange>(context);
        }

        @Override
        protected void reduce(Stock key, Iterable<DoubleWritable> values, Context context)
                throws IOException, InterruptedException {
            double previousDividend = 0.0;
            for (DoubleWritable dividend : values) {
                double currentDividend = dividend.get();
                double growth = currentDividend - previousDividend;
                if (Math.abs(growth) > 0.000001) {
                    outputValue.setSymbol(key.getSymbol());
                    outputValue.setDate(key.getDate());
                    outputValue.setChange(growth);
                    if (growth > 0) {
                        mos.write("positive", outputKey, outputValue, "pos");
                    } else {
                        mos.write("negative", outputKey, outputValue, "neg");
                    }
                    // context.write(outputKey, outputValue);  // replaced by MultipleOutputs above
                    previousDividend = currentDividend;
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, "DividendJob");
        job.setJarByClass(DividendJob.class);

        Path out = new Path("growth");
        FileInputFormat.setInputPaths(job, new Path("dividends"));
        FileOutputFormat.setOutputPath(job, out);
        out.getFileSystem(conf).delete(out, true);

        job.setMapperClass(DividendGrowthMapper.class);
        job.setReducerClass(DividendGrowthReducer.class);
        job.setPartitionerClass(StockPartitioner.class);
        job.setGroupingComparatorClass(StockGroupComparator.class);

        job.setInputFormatClass(TextInputFormat.class);
        // Replaced by MultipleOutputs below:
        // job.setOutputFormatClass(DividendOutputFormat.class);
        MultipleOutputs.addNamedOutput(job, "positive", TextOutputFormat.class,
                NullWritable.class, DividendChange.class);
        MultipleOutputs.addNamedOutput(job, "negative", TextOutputFormat.class,
                NullWritable.class, DividendChange.class);
        // Drop the default empty part files.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(DividendChange.class);
        job.setMapOutputKeyClass(Stock.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        job.setNumReduceTasks(3);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int result = 0;
        try {
            result = ToolRunner.run(new Configuration(), new DividendJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}
package customsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DividendChange implements Writable {
    private String symbol;
    private String date;
    private double change;

    public String getSymbol() { return symbol; }
    public void setSymbol(String symbol) { this.symbol = symbol; }
    public String getDate() { return date; }
    public void setDate(String date) { this.date = date; }
    public double getChange() { return change; }
    public void setChange(double change) { this.change = change; }

    @Override
    public String toString() {
        return symbol + "\t" + date + "\t" + change;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(symbol);
        out.writeUTF(date);
        out.writeDouble(change);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        symbol = in.readUTF();
        date = in.readUTF();
        change = in.readDouble();
    }
}
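
Because write() and readFields() must mirror each other field for field, a quick round-trip check is a useful sanity test. A minimal standalone sketch (not part of the lab):

package customsort;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class DividendChangeRoundTrip {
    public static void main(String[] args) throws IOException {
        DividendChange original = new DividendChange();
        original.setSymbol("AIT");
        original.setDate("2009-02-11");
        original.setChange(0.15);

        // Serialize with write(), then deserialize into a fresh instance.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        DividendChange copy = new DividendChange();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);   // AIT	2009-02-11	0.15
    }
}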
package customsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Stock implements WritableComparable<Stock> {
    private String symbol;
    private String date;

    public String getSymbol() { return symbol; }
    public void setSymbol(String symbol) { this.symbol = symbol; }
    public String getDate() { return date; }
    public void setDate(String date) { this.date = date; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(symbol);
        out.writeUTF(date);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        symbol = in.readUTF();
        date = in.readUTF();
    }

    @Override
    public int compareTo(Stock stock) {
        // Sort by symbol first, then by date (the secondary sort).
        int response = this.symbol.compareTo(stock.symbol);
        if (response != 0) {
            return response;
        }
        return this.date.compareTo(stock.date);
    }
}
package customsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class StockGroupComparator extends WritableComparator {

    protected StockGroupComparator() {
        super(Stock.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group by symbol only, so one reduce() call sees every dividend
        // for a symbol, delivered in date order by the sort comparator.
        Stock lhs = (Stock) a;
        Stock rhs = (Stock) b;
        return lhs.getSymbol().compareTo(rhs.getSymbol());
    }
}
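
Putting the pieces together, here is a trace of the four AIT sample rows through the job. Stock.compareTo sorts the records by symbol and then date, while StockGroupComparator groups by symbol alone, so a single reduce() call receives all four dividends in ascending date order:

// reduce group "AIT", values in date order: 0.15, 0.15, 0.15, 0.15
//   2009-02-11: growth = 0.15 - 0.0  = 0.15 -> written to the "positive" output
//   2009-05-13: growth = 0.15 - 0.15 = 0.0  -> skipped (below the 0.000001 threshold)
//   2009-08-12, 2009-11-12: likewise skipped
// Resulting line (TextOutputFormat omits the NullWritable key):
// AIT	2009-02-11	0.15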


