HDPCD-Java Study Notes (9) - Lab


Java lab booklet


Sorting Using a Composite Key

For the complete code, refer to the previous set of notes.

Define a Custom Key Class -- Stock. Refer to the previous set of notes.

Write a Custom Partitioner -- StockPartitioner. Its getPartition method partitions by the first letter of the stock symbol:


char firstLetter = key.getSymbol().trim().charAt(0);
return (firstLetter - 'A') % numReduceTasks;
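Putting that together, the full partitioner class might look like the following sketch. The map output value type (DoubleWritable here) and the package name are assumptions, since the actual types are defined in the previous notes:

package customsort; // hypothetical package name

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class StockPartitioner extends Partitioner<Stock, DoubleWritable> {

    @Override
    public int getPartition(Stock key, DoubleWritable value, int numReduceTasks) {
        // Route every key whose symbol starts with the same letter to the
        // same reducer; assumes uppercase A-Z stock symbols.
        char firstLetter = key.getSymbol().trim().charAt(0);
        return (firstLetter - 'A') % numReduceTasks;
    }
}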

Define a Custom Value Class -- DividendChange (the Reducer is going to output a custom value type that you define).

Add a toString method to DividendChange that looks like the following:

·        @Override

·        public String toString() {

·        return symbol + "\t" + date +"\t" + change;

}
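Since the Reducer emits DividendChange, the class would typically implement Writable. A minimal sketch of the rest of the class, with the String/String/double field types inferred from the toString above (the setter names and package are assumptions):

package customsort; // hypothetical package name

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DividendChange implements Writable {

    private String symbol;
    private String date;
    private double change;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(symbol);
        out.writeUTF(date);
        out.writeDouble(change);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read fields in the same order they were written
        symbol = in.readUTF();
        date = in.readUTF();
        change = in.readDouble();
    }

    // Setters used by the Reducer to populate the output value (assumed names)
    public void setSymbol(String symbol) { this.symbol = symbol; }
    public void setDate(String date) { this.date = date; }
    public void setChange(double change) { this.change = change; }

    @Override
    public String toString() {
        return symbol + "\t" + date + "\t" + change;
    }
}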


The Stock key contains both a symbol and a date, so by default keys with the same symbol but different dates are not grouped together during the shuffle/sort phase. In this step, you define a group comparator so that stocks with the same symbol are grouped into a single reduce call.

Write a Group Comparator -- StockGroupComparator, which extends WritableComparator (sketched below).
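A minimal sketch, assuming the Stock key class from the previous notes: the comparator compares only the symbol, so all dates for one stock arrive in a single reduce() call. It would be registered in the driver with job.setGroupingComparatorClass(StockGroupComparator.class).

package customsort; // hypothetical package name

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class StockGroupComparator extends WritableComparator {

    protected StockGroupComparator() {
        // true = instantiate Stock objects for the comparator to deserialize into
        super(Stock.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Stock s1 = (Stock) a;
        Stock s2 = (Stock) b;
        // Group by symbol only; the date part of the key is ignored
        return s1.getSymbol().compareTo(s2.getSymbol());
    }
}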


How CombineFileInputFormat Works

For WordCountMapper and WordCountReducer, refer to the WordCount program. CombineFileInputFormat packs multiple small files into a single input split so that one map task can process many files; the custom RecordReader below simply delegates to a LineRecordReader for one file (identified by its index) within the combined split.

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class MyCombinedFilesInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader instantiates one MyCombinedFilesRecordReader
        // per file in the combined split, passing each file's index.
        return new CombineFileRecordReader((CombineFileSplit) split, context,
                MyCombinedFilesRecordReader.class);
    }

    public static class MyCombinedFilesRecordReader extends RecordReader<LongWritable, Text> {

        private int index;
        private LineRecordReader reader;

        public MyCombinedFilesRecordReader(CombineFileSplit split,
                TaskAttemptContext context, Integer index) {
            this.index = index;
            reader = new LineRecordReader();
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Carve the index-th file out of the combined split and delegate
            // all actual reading to a standard LineRecordReader.
            CombineFileSplit cfsplit = (CombineFileSplit) split;
            FileSplit fileSplit = new FileSplit(cfsplit.getPath(index),
                    cfsplit.getOffset(index),
                    cfsplit.getLength(index),
                    cfsplit.getLocations());
            reader.initialize(fileSplit, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return reader.nextKeyValue();
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            return reader.getCurrentKey();
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return reader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return reader.getProgress();
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}

package wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountJob extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "WordCountJob");
        Configuration conf = job.getConfiguration();
        job.setJarByClass(getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        out.getFileSystem(conf).delete(out, true);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Combine small files instead of the default one-split-per-file:
        //job.setInputFormatClass(TextInputFormat.class);
        job.setInputFormatClass(MyCombinedFilesInputFormat.class);
        conf.set(FileInputFormat.SPLIT_MAXSIZE, "50000");

        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int result = 0;
        try {
            result = ToolRunner.run(new Configuration(), new WordCountJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}
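Note that FileInputFormat.SPLIT_MAXSIZE resolves to the mapreduce.input.fileinputformat.split.maxsize property; setting it to 50000 caps each combined split at roughly 50 KB, so CombineFileInputFormat packs many small files into each map task rather than launching one task per file.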

Demo: Processing Multiple Inputs


ids_states.txt

1,CA
4,SD
1,NY
6,CO

names_ids.txt

4 Rich
5 Barry
12 George
1 Ulf
2 Danielle
9 Tom
3 Manish
6 Mark

package demo;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MultipleInputFiles extends Configured implements Tool {

    // Parses tab-delimited names_ids.txt: id \t name
    public static class NamesMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text outputValue = new Text();
        private Text outputKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String currentLine = value.toString();
            String[] words = StringUtils.split(currentLine, '\\', '\t');
            outputKey.set(words[0]);
            outputValue.set(words[1]);
            context.write(outputKey, outputValue);
        }
    }

    // Parses comma-delimited ids_states.txt: id,state
    public static class StatesMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text outputValue = new Text();
        private Text outputKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String currentLine = value.toString();
            String[] words = StringUtils.split(currentLine, '\\', ',');
            outputKey.set(words[0]);
            outputValue.set(words[1]);
            context.write(outputKey, outputValue);
        }
    }

    // Concatenates all values (names and states) that share an id
    public static class MultiInputReducer extends Reducer<Text, Text, Text, Text> {
        private Text outputValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder output = new StringBuilder();
            for (Text value : values) {
                output.append(value.toString() + ",");
            }
            outputValue.set(output.toString());
            context.write(key, outputValue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "MultipleInputFilesJob");
        Configuration conf = job.getConfiguration();
        job.setJarByClass(getClass());

        // Each input path gets its own Mapper class
        Path names = new Path("multiinputs/names_ids.txt");
        Path states = new Path("multiinputs/ids_states.txt");
        MultipleInputs.addInputPath(job, names, TextInputFormat.class, NamesMapper.class);
        MultipleInputs.addInputPath(job, states, TextInputFormat.class, StatesMapper.class);

        Path out = new Path("multiinputs/output");
        out.getFileSystem(conf).delete(out, true);
        FileOutputFormat.setOutputPath(job, out);

        job.setReducerClass(MultiInputReducer.class);
        job.setNumReduceTasks(1);

        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int result = 0;
        try {
            result = ToolRunner.run(new Configuration(), new MultipleInputFiles(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}
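For reference, with the two input files above and a single reducer, the output resembles the following. Text keys sort lexicographically, the order of values within each line is not guaranteed, and the trailing comma comes from the append loop:

1	CA,NY,Ulf,
12	George,
2	Danielle,
3	Manish,
4	SD,Rich,
5	Barry,
6	CO,Mark,
9	Tom,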

Lab: Writing a Custom InputFormat


exchange,stock_symbol,date,stock_price_open,stock_price_high,stock_price_low,stock_price_close,stock_volume,stock_price_adj_close
NYSE,JEF,2010-02-08,25.40,25.49,24.78,24.82,1134300,24.82
NYSE,JEF,2010-02-05,24.91,25.19,24.08,25.01,1765200,25.01
NYSE,JEF,2010-02-04,26.01,26.20,24.85,24.85,1414400,24.85


package average;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MovingAveragePreprocessor extends Configured implements Tool {

    // StockInputFormat delivers already-parsed Stock/StockPrices pairs,
    // so the mapper only projects out the closing price.
    public static class PreprocessorMapper extends Mapper<Stock, StockPrices, Stock, DoubleWritable> {
        private DoubleWritable outputValue = new DoubleWritable();

        @Override
        protected void map(Stock key, StockPrices value, Context context)
                throws IOException, InterruptedException {
            outputValue.set(value.getClose());
            context.write(key, outputValue);
        }
    }

    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, "MovingAveragePreprocessor");
        job.setJarByClass(MovingAveragePreprocessor.class);

        Path out = new Path("closingprices");
        FileInputFormat.setInputPaths(job, "stocks");
        FileOutputFormat.setOutputPath(job, out);
        out.getFileSystem(conf).delete(out, true);

        job.setMapperClass(PreprocessorMapper.class);
        job.setReducerClass(Reducer.class); // identity reducer

        job.setInputFormatClass(StockInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Stock.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setMapOutputKeyClass(Stock.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        job.setNumReduceTasks(1);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        int result = 0;
        try {
            result = ToolRunner.run(new Configuration(), new MovingAveragePreprocessor(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}
package average;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.StringUtils;

public class StockInputFormat extends FileInputFormat<Stock, StockPrices> {

    public static class StockReader extends RecordReader<Stock, StockPrices> {
        private Stock key = new Stock();
        private StockPrices value = new StockPrices();
        private LineReader in;
        private long start;
        private long end;
        private long currentPos;
        private Text line = new Text();

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) split;
            Configuration configuration = context.getConfiguration();
            Path path = fileSplit.getPath();
            FSDataInputStream is = path.getFileSystem(configuration).open(path);
            in = new LineReader(is, configuration);
            start = fileSplit.getStart();
            end = start + fileSplit.getLength();
            is.seek(start);
            if (start != 0) {
                // A split that starts mid-file would normally skip its first
                // (partial) line here:
                //start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
            }
            currentPos = start;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (currentPos > end) {
                return false;
            }
            currentPos += in.readLine(line);
            if (line.getLength() == 0) {
                return false;
            }
            // Skip the CSV header line
            if (line.toString().startsWith("exchange")) {
                currentPos += in.readLine(line);
            }
            // exchange,symbol,date,open,high,low,close,volume,adj_close
            String[] values = StringUtils.split(line.toString(), ',');
            key.setSymbol(values[1]);
            key.setDate(values[2]);
            value.setOpen(Double.parseDouble(values[3]));
            value.setHigh(Double.parseDouble(values[4]));
            value.setLow(Double.parseDouble(values[5]));
            value.setClose(Double.parseDouble(values[6]));
            value.setVolume(Integer.parseInt(values[7]));
            value.setAdjustedClose(Double.parseDouble(values[8]));
            return true;
        }

        @Override
        public Stock getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        @Override
        public StockPrices getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            // Cast to float: integer division would report 0 until the end
            return (float) (currentPos - start) / (end - start);
        }

        @Override
        public void close() throws IOException {
            in.close();
        }
    }

    @Override
    public RecordReader<Stock, StockPrices> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new StockReader();
    }
}
package average;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Stock implements WritableComparable<Stock> {

    private String symbol;
    private String date;

    @Override
    public void readFields(DataInput in) throws IOException {
        symbol = in.readUTF();
        date = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(symbol);
        out.writeUTF(date);
    }

    public String getSymbol() {
        return symbol;
    }

    public void setSymbol(String symbol) {
        this.symbol = symbol;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    @Override
    public int compareTo(Stock arg0) {
        // Sort by symbol first, then by date within a symbol
        int response = symbol.compareTo(arg0.symbol);
        if (response == 0) {
            response = date.compareTo(arg0.date);
        }
        return response;
    }

    @Override
    public String toString() {
        return symbol + "\t" + date;
    }
}
package average;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class StockPrices implements Writable {

    private double open, high, low, close, adjustedClose;
    private int volume;

    @Override
    public void readFields(DataInput in) throws IOException {
        open = in.readDouble();
        high = in.readDouble();
        low = in.readDouble();
        close = in.readDouble();
        adjustedClose = in.readDouble();
        volume = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeDouble(open);
        out.writeDouble(high);
        out.writeDouble(low);
        out.writeDouble(close);
        out.writeDouble(adjustedClose);
        out.writeInt(volume);
    }

    public double getOpen() { return open; }
    public void setOpen(double open) { this.open = open; }
    public double getHigh() { return high; }
    public void setHigh(double high) { this.high = high; }
    public double getLow() { return low; }
    public void setLow(double low) { this.low = low; }
    public double getClose() { return close; }
    public void setClose(double close) { this.close = close; }
    public double getAdjustedClose() { return adjustedClose; }
    public void setAdjustedClose(double adjustedClose) { this.adjustedClose = adjustedClose; }
    public int getVolume() { return volume; }
    public void setVolume(int volume) { this.volume = volume; }
}