mapreduce编程实例(4)-求中位数和标准差
来源:互联网 发布:sqlyog怎么导入sql文件 编辑:程序博客网 时间:2024/05/16 19:46
这个实例要解决的问题是:计算一天中每个小时内,网站新增评论长度的中位数和标准差。代码如下:
- package mrdp.ch2;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Map;
import mrdp.utils.MRDPUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
- public class MedianStdDevDriver {
- public static class SOMedianStdDevMapper extends
- Mapper<Object, Text, IntWritable, IntWritable> {
- private IntWritable outHour = new IntWritable();
- private IntWritable outCommentLength = new IntWritable();
- private final static SimpleDateFormat frmt = new SimpleDateFormat(
- "yyyy-MM-dd'T'HH:mm:ss.SSS");
- @SuppressWarnings("deprecation")
- @Override
- public void map(Object key, Text value, Context context)
- throws IOException, InterruptedException {
- // Parse the input string into a nice map
- Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());
- // Grab the "CreationDate" field,
- // since it is what we are grouping by
- String strDate = parsed.get("CreationDate");
- // Grab the comment to find the length
- String text = parsed.get("Text");
- // .get will return null if the key is not there
- if (strDate == null || text == null) {
- // skip this record
- return;
- }
- try {
- // get the hour this comment was posted in
- Date creationDate = frmt.parse(strDate);
- outHour.set(creationDate.getHours());
- // get the comment length
- outCommentLength.set(text.length());
- // write out the user ID with min max dates and count
- context.write(outHour, outCommentLength);
- } catch (ParseException e) {
- System.err.println(e.getMessage());
- return;
- }
- }
- }
- public static class SOMedianStdDevReducer extends
- Reducer<IntWritable, IntWritable, IntWritable, MedianStdDevTuple> {
- private MedianStdDevTuple result = new MedianStdDevTuple();
- private ArrayList<Float> commentLengths = new ArrayList<Float>();
- @Override
- public void reduce(IntWritable key, Iterable<IntWritable> values,
- Context context) throws IOException, InterruptedException {
- float sum = 0;
- float count = 0;
- commentLengths.clear();
- result.setStdDev(0);
- // Iterate through all input values for this key
- for (IntWritable val : values) {
- commentLengths.add((float) val.get());
- sum += val.get();
- ++count;
- }
- // sort commentLengths to calculate median
- Collections.sort(commentLengths);
- // if commentLengths is an even value, average middle two elements
- if (count % 2 == 0) {
- result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths
- .get((int) count / 2)) / 2.0f);
- } else {
- // else, set median to middle value
- result.setMedian(commentLengths.get((int) count / 2));
- }
- // calculate standard deviation
- float mean = sum / count;
- float sumOfSquares = 0.0f;
- for (Float f : commentLengths) {
- sumOfSquares += (f - mean) * (f - mean);
- }
- result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1)));
- context.write(key, result);
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- String[] otherArgs = new GenericOptionsParser(conf, args)
- .getRemainingArgs();
- if (otherArgs.length != 2) {
- System.err.println("Usage: MedianStdDevDriver <in> <out>");
- System.exit(2);
- }
- Job job = new Job(conf,
- "StackOverflow Comment Length Median StdDev By Hour");
- job.setJarByClass(MedianStdDevDriver.class);
- job.setMapperClass(SOMedianStdDevMapper.class);
- job.setReducerClass(SOMedianStdDevReducer.class);
- job.setMapOutputKeyClass(IntWritable.class);
- job.setMapOutputValueClass(IntWritable.class);
- job.setOutputKeyClass(IntWritable.class);
- job.setOutputValueClass(MedianStdDevTuple.class);
- FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
- FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
- System.exit(job.waitForCompletion(true) ? 0 : 1);
- }
- public static class MedianStdDevTuple implements Writable {
- private float median = 0;
- private float stddev = 0f;
- public float getMedian() {
- return median;
- }
- public void setMedian(float median) {
- this.median = median;
- }
- public float getStdDev() {
- return stddev;
- }
- public void setStdDev(float stddev) {
- this.stddev = stddev;
- }
- @Override
- public void readFields(DataInput in) throws IOException {
- median = in.readFloat();
- stddev = in.readFloat();
- }
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeFloat(median);
- out.writeFloat(stddev);
- }
- @Override
- public String toString() {
- return median + "\t" + stddev;
- }
- }
- }
标准差是直接按照数学定义计算的(这里求的是样本标准差,分母为 n-1)。
计算结果如下:
- jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output3/part-r-00000
- 0 145.5 158.66512
- 1 218.0 150.04599
- 2 139.0 148.84734
- 3 200.0 158.28148
- 4 139.5 158.62466
- 5 122.5 167.31377
- 6 199.5 160.57263
- 7 238.0 175.86475
- 8 253.5 164.08226
- 9 232.0 167.5952
- 10 200.0 157.11778
- 11 179.0 144.3936
- 12 172.0 148.96738
- 13 229.0 134.17366
- 14 207.0 147.26193
- 15 224.0 147.52689
- 16 143.0 130.6711
- 17 177.0 158.20508
- 18 199.0 159.31636
- 19 175.5 147.4742
- 20 169.0 138.74756
- 21 164.0 141.22824
- 22 152.5 122.51671
- 23 145.0 160.20476
0 0
- mapreduce编程实例(4)-求中位数和标准差
- mapreduce编程实例(4)-求中位数和标准差
- MapReduce编程基础(二)——数值概要(计算中位数、标准差)
- MapReduce编程基础(二)——数值概要(计算中位数、标准差)[内存优化]
- mapreduce编程实例(2)-求最大值和最小值
- mapreduce编程实例(2)-求最大值和最小值
- oracle 求方差和标准差
- matlab求方差和标准差
- MapReduce编程实例(一)-求平均数
- mapreduce编程实例(3)-求平均值
- mapreduce编程实例(3)-求平均值
- mapreduce编程实例(6)-求TOP 10
- MapReduce编程实例(一)-求平均数
- mapreduce编程实例(7)-求所有用户ID
- mapreduce编程:求平均值
- MapReduce编程实例
- mapreduce python编程实例
- MapReduce编程实例
- mapreduce编程实例(3)-求平均值
- 数据传输客户端 - 电脑端 - 基于BAT
- Linux环境下段错误的产生原因及调试方法小结
- oracle 编译无效对象
- Codeforces #247 (Div. 2) A. Black Square
- mapreduce编程实例(4)-求中位数和标准差
- 嵌入式linux上QT标准键盘输入的实现
- CMOS Sensor一些术语及其图像质量的调试点
- 2014 Beijing Invitational
- platform
- Instagram使用的那些开源伙伴们
- Java中影响方法调用性能的因素
- Cython基础--Cython入门
- LeetCode: Permutations II [046]