MapReduce Design Patterns-chapter 2
来源:互联网 发布:淘宝网名怎么改 编辑:程序博客网 时间:2024/06/05 11:22
CHAPTER 2:Summarization Patterns
求每个用户发表评论的最早时间、最晚时间以及评论总数
public class MinMaxCountTuple implements Writable { private Date min = new Date(); private Date max = new Date(); private long count = 0; private final static SimpleDateFormat frmt = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS"); public Date getMin() { return min; } public void setMin(Date min) { this.min = min; } public Date getMax() { return max; } public void setMax(Date max) { this.max = max; } public long getCount() { return count; } public void setCount(long count) { this.count = count; } public void readFields(DataInput in) throws IOException { // Read the data out in the order it is written, // creating new Date objects from the UNIX timestamp min = new Date(in.readLong()); max = new Date(in.readLong()); count = in.readLong(); } public void write(DataOutput out) throws IOException {// Write the data out in the order it is read, // using the UNIX timestamp to represent the Date out.writeLong(min.getTime()); out.writeLong(max.getTime()); out.writeLong(count); } public String toString() { return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count; }}
public static class MinMaxCountMapper extends Mapper<Object, Text, Text, MinMaxCountTuple> { // Our output key and value Writables private Text outUserId = new Text(); private MinMaxCountTuple outTuple = new MinMaxCountTuple(); // This object will format the creation date string into a Date object private final static SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = transformXmlToMap(value.toString()); // Grab the "CreationDate" field since it is what we are finding // the min and max value of String strDate = parsed.get("CreationDate"); // Grab the “UserID” since it is what we are grouping byString userId = parsed.get("UserId"); // Parse the string into a Date object Date creationDate = frmt.parse(strDate); // Set the minimum and maximum date values to the creationDate outTuple.setMin(creationDate); outTuple.setMax(creationDate); // Set the comment count to 1 outTuple.setCount(1); // Set our user ID as the output key outUserId.set(userId); // Write out the hour and the average comment length context.write(outUserId, outTuple); }}
public static class MinMaxCountReducer extends Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> { // Our output value Writable private MinMaxCountTuple result = new MinMaxCountTuple(); public void reduce(Text key, Iterable<MinMaxCountTuple> values, Context context) throws IOException, InterruptedException { // Initialize our result result.setMin(null); result.setMax(null); result.setCount(0); int sum = 0; // Iterate through all input values for this key for (MinMaxCountTuple val : values) { // If the value's min is less than the result's min // Set the result's min to value's if (result.getMin() == null ||val.getMin().compareTo(result.getMin()) < 0) { result.setMin(val.getMin()); } // If the value's max is more than the result's max // Set the result's max to value's if (result.getMax() == null || val.getMax().compareTo(result.getMax()) > 0) { result.setMax(val.getMax()); } // Add to our sum the count for value sum += val.getCount(); } // Set our count to the number of input values result.setCount(sum); context.write(key, result); }}可以使用combiner,与reduce 类似
求各小时内评论内容长度的平均值
map的输出为 {小时-(数量,平均值)}
public static class AverageMapper extends Mapper<Object, Text, IntWritable, CountAverageTuple> { private IntWritable outHour = new IntWritable(); private CountAverageTuple outCountAverage = new CountAverageTuple(); private final static SimpleDateFormat frmt = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS"); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = transformXmlToMap(value.toString()); // Grab the "CreationDate" field, // since it is what we are grouping by String strDate = parsed.get("CreationDate"); // Grab the comment to find the length String text = parsed.get("Text"); // get the hour this comment was posted in Date creationDate = frmt.parse(strDate); outHour.set(creationDate.getHours()); // get the comment length outCountAverage.setCount(1); outCountAverage.setAverage(text.length()); // write out the hour with the comment length context.write(outHour, outCountAverage); }}
reduce中进行求整体平均
public static class AverageReducer extends Reducer<IntWritable, CountAverageTuple, IntWritable, CountAverageTuple> { private CountAverageTuple result = new CountAverageTuple(); public void reduce(IntWritable key, Iterable<CountAverageTuple> values, Context context) throws IOException, InterruptedException { float sum = 0; float count = 0; // Iterate through all input values for this key for (CountAverageTuple val : values) { sum += val.getCount() * val.getAverage(); count += val.getCount(); } result.setCount(count); result.setAverage(sum / count); context.write(key, result); }}
combiner的内容与reduce一致
求各小时中评论长度的中位数与标准差
方法一:无法利用combiner
map的输出为时间和评论长度
public static class MedianStdDevMapper extends Mapper<Object, Text, IntWritable, IntWritable> { private IntWritable outHour = new IntWritable(); private IntWritable outCommentLength = new IntWritable(); private final static SimpleDateFormat frmt = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS"); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = transformXmlToMap(value.toString()); // Grab the "CreationDate" field, // since it is what we are grouping by String strDate = parsed.get("CreationDate"); // Grab the comment to find the length String text = parsed.get("Text"); // get the hour this comment was posted in Date creationDate = frmt.parse(strDate); outHour.set(creationDate.getHours()); // set the comment length outCommentLength.set(text.length()); // write out the user ID with min max dates and count context.write(outHour, outCommentLength); }}
reduce求中位数和标准差
public static class MedianStdDevReducer extends Reducer<IntWritable, IntWritable, IntWritable, MedianStdDevTuple> { private MedianStdDevTuple result = new MedianStdDevTuple(); private ArrayList<Float> commentLengths = new ArrayList<Float>(); public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { float sum = 0; float count = 0; commentLengths.clear(); result.setStdDev(0); // Iterate through all input values for this key for (IntWritable val : values) { commentLengths.add((float) val.get()); sum += val.get(); ++count; } // sort commentLengths to calculate median Collections.sort(commentLengths); // if commentLengths is an even value, average middle two elements if (count % 2 == 0) { result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths.get((int) count / 2)) / 2.0f); } else { // else, set median to middle value result.setMedian(commentLengths.get((int) count / 2)); } // calculate standard deviation float mean = sum / count; float sumOfSquares = 0.0f; for (Float f : commentLengths) { sumOfSquares += (f - mean) * (f - mean); } result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1))); context.write(key, result); }}
方法二:可以利用combiner
map的输出为{小时-(长度,1)}
public static class MedianStdDevReducer extends Reducer<IntWritable, SortedMapWritable, IntWritable, MedianStdDevTuple> { private MedianStdDevTuple result = new MedianStdDevTuple(); private TreeMap<Integer, Long> commentLengthCounts = new TreeMap<Integer, Long>(); public void reduce(IntWritable key, Iterable<SortedMapWritable> values, Context context) throws IOException, InterruptedException {String strDate = parsed.get("CreationDate"); // Grab the comment to find the length String text = parsed.get("Text"); // Get the hour this comment was posted in Date creationDate = frmt.parse(strDate); outHour.set(creationDate.getHours()); commentLength.set(text.length()); SortedMapWritable outCommentLength = new SortedMapWritable(); outCommentLength.put(commentLength, ONE); // Write out the user ID with min max dates and count context.write(outHour, outCommentLength); }}
combiner后的结果为{小时-(长度,次数)}
reduce中求中位数和标准差
public static class MedianStdDevReducer extends Reducer<IntWritable, SortedMapWritable, IntWritable, MedianStdDevTuple> { private MedianStdDevTuple result = new MedianStdDevTuple(); private TreeMap<Integer, Long> commentLengthCounts = new TreeMap<Integer, Long>(); public void reduce(IntWritable key, Iterable<SortedMapWritable> values, Context context) throws IOException, InterruptedException {float sum = 0; long totalComments = 0; commentLengthCounts.clear(); result.setMedian(0); result.setStdDev(0); for (SortedMapWritable v : values) { for (Entry<WritableComparable, Writable> entry : v.entrySet()) { int length = ((IntWritable) entry.getKey()).get(); long count = ((LongWritable) entry.getValue()).get(); totalComments += count; sum += length * count; Long storedCount = commentLengthCounts.get(length); if (storedCount == null) { commentLengthCounts.put(length, count); } else { commentLengthCounts.put(length, storedCount + count); } } } long medianIndex = totalComments / 2L; long previousComments = 0; long comments = 0; int prevKey = 0; for (Entry<Integer, Long> entry : commentLengthCounts.entrySet()) { comments = previousComments + entry.getValue(); if (previousComments ≤ medianIndex && medianIndex < comments) { if (totalComments % 2 == 0 && previousComments == medianIndex) { result.setMedian((float) (entry.getKey() + prevKey) / 2.0f); } else { result.setMedian(entry.getKey()); } break; } previousComments = comments; prevKey = entry.getKey(); } // calculate standard deviation float mean = sum / totalComments; float sumOfSquares = 0.0f; for (Entry<Integer, Long> entry : commentLengthCounts.entrySet()) { sumOfSquares += (entry.getKey() - mean) * (entry.getKey() - mean) * entry.getValue(); }result.setStdDev((float) Math.sqrt(sumOfSquares / (totalComments - 1))); context.write(key, result); }}
倒排索引
map的输出为{链接-文档}public static class WikipediaExtractor extends Mapper<Object, Text, Text, Text> { private Text link = new Text(); private Text outkey = new Text(); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = MRDPUtils.transformXmlToMap(value .toString()); // Grab the necessary XML attributes String txt = parsed.get("Body"); String posttype = parsed.get("PostTypeId"); String row_id = parsed.get("Id");// if the body is null, or the post is a question (1), skip if (txt == null || (posttype != null && posttype.equals("1"))) { return; } // Unescape the HTML because the SO data is escaped. txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase()); link.set(getWikipediaURL(txt)); outkey.set(row_id); context.write(link, outkey); }}
reduce中执行文档串的append
public static class Concatenator extends Reducer<Text,Text,Text,Text> { private Text result = new Text(); public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { StringBuilder sb = new StringBuilder(); boolean first = true; for (Text id : values) { if (first) { first = false; } else { sb.append(" "); } sb.append(id.toString()); } result.set(sb.toString()); context.write(key, result); }}可以利用combiner,功能与reduce类似
利用Counter执行计数,统计每个州的人数
public static class CountNumUsersByStateMapper extends Mapper<Object, Text, NullWritable, NullWritable> { public static final String STATE_COUNTER_GROUP = "State"; public static final String UNKNOWN_COUNTER = "Unknown"; public static final String NULL_OR_EMPTY_COUNTER = "Null or Empty"; private String[] statesArray = new String[] { "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SF", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY" }; private HashSet<String> states = new HashSet<String>( Arrays.asList(statesArray));public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = MRDPUtils.transformXmlToMap(value .toString()); // Get the value for the Location attribute String location = parsed.get("Location"); // Look for a state abbreviation code if the // location is not null or empty if (location != null && !location.isEmpty()) { // Make location uppercase and split on white space String[] tokens = location.toUpperCase().split("\\s"); // For each token boolean unknown = true; for (String state : tokens) { // Check if it is a state if (states.contains(state)) { // If so, increment the state's counter by 1 // and flag it as not unknown context.getCounter(STATE_COUNTER_GROUP, state) .increment(1); unknown = false; break; } } // If the state is unknown, increment the UNKNOWN_COUNTER counter if (unknown) { context.getCounter(STATE_COUNTER_GROUP, UNKNOWN_COUNTER) .increment(1); } } else { // If it is empty or null, increment the // NULL_OR_EMPTY_COUNTER counter by 1 context.getCounter(STATE_COUNTER_GROUP, NULL_OR_EMPTY_COUNTER).increment(1); } }}
...int code = job.waitForCompletion(true) ? 0 : 1;if (code == 0) { for (Counter counter : job.getCounters().getGroup( CountNumUsersByStateMapper.STATE_COUNTER_GROUP)) { System.out.println(counter.getDisplayName() + "\t" + counter.getValue()); }}// Clean up empty output directoryFileSystem.get(conf).delete(outputDir, true);System.exit(code);
map没有输出,只是更新组中对应counter的计数值,在内部机制中JobTracker会将各个TaskTracker中的counter求和。不用reduce
- MapReduce Design Patterns-chapter 2
- MapReduce Design Patterns-chapter 3
- MapReduce Design Patterns-chapter 4
- MapReduce Design Patterns-chapter 5
- MapReduce Design Patterns-chapter 6
- MapReduce Design Patterns-chapter 7
- MapReduce Design Patterns(chapter 2 (part 2))(三)
- MapReduce Design Patterns(chapter 2 (part 1))(二)
- MapReduce Design Patterns(chapter 2 (part 3))(四)
- MapReduce Design Patterns(chapter 3 (part 2))(六)
- MapReduce Design Patterns(chapter 4 (part 2))(八)
- MapReduce Design Patterns(chapter 5 (part 2))(十)
- MapReduce Design Patterns(chapter 6 (part 2))(十二)
- MapReduce Design Patterns(chapter 7 (part 2))(十四)
- MapReduce Design Patterns(chapter 1)(一)
- (转)MapReduce Design Patterns(chapter 1)(一)
- MapReduce Design Patterns(chapter 1)(一)简介
- MapReduce Design Patterns
- UVA 103 Stacking Boxes 套箱子 DAG最长路 dp记忆化搜索
- 结构体
- 2013年9月16日星期一(DEMO8_8,多边形)
- GDB详解
- linux安装rsh
- MapReduce Design Patterns-chapter 2
- 分辨率与观赏距离和屏幕尺寸的关系
- Understand Linux Shell and Basic Shell Scripting Language Tips-Part I
- The Tips of Success(成功的建议)
- Liferay使用心得 (转载)
- EasyUI-treegrid-拖拽的实现(drag and drop)
- Liferay中Portal.properties常用配置参数
- 同源策略
- 远程对象工厂设计模式