使用自定义数据类型实现评论数时间、评论总数计数(mapreduce)
来源:互联网 发布:java读懂项目代码 编辑:程序博客网 时间:2024/05/17 03:31
给定每条评论的时间、用户id等信息,计算每个用户最早一次和最近一次发表评论的时间,以及该用户的评论总数。
1.数据输入格式
2.代码
package mrdp.ch2;import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.Date;import java.util.Map;import mrdp.ch2.MRDPUtils;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.io.Writable;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;public class MinMaxCountDriver { //mapper modulepublic static class SOMinMaxCountMapper extendsMapper<Object, Text, Text, MinMaxCountTuple> {// Our output key and value Writables定义输出的key与valueprivate Text outUserId = new Text();//输出类型为自定义数据类型MinMaxCountTuple类private MinMaxCountTuple outTuple = new MinMaxCountTuple();// This object will format the creation date string into a Date object//将时间数据转为时间类型 指定了时间格式private final static SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");@Overridepublic void map(Object key, Text value, Context context)throws IOException, InterruptedException {// Parse the input string into a nice map//解析行数据为map 对应python的字典类型Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());// Grab the "CreationDate" field since it is what we are finding// the min and max value ofString strDate = parsed.get("CreationDate");//parsed是map 也就是字典类型 由键获得数值// Grab the “UserID” since it is what we are grouping byString userId = parsed.get("UserId");//获得//get will return null if the key is not thereif (strDate == null || userId == null) {// skip this record 第一行数据是空 跳过该数据 map返回值为空return;}try {// Parse the string into a Date objectDate creationDate = frmt.parse(strDate);// Set the minimum and maximum date values to the 
creationDateoutTuple.setMin(creationDate);//outTuple为自定义数据类型outTuple.setMax(creationDate);// Set the comment count to 1outTuple.setCount(1);// Set our user ID as the output keyoutUserId.set(userId);// Write out the user ID with min max dates and countcontext.write(outUserId, outTuple);} catch (ParseException e) {// An error occurred parsing the creation Date string// skip this record}}} //reducer modulepublic static class SOMinMaxCountReducer extendsReducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {private MinMaxCountTuple result = new MinMaxCountTuple();@Overridepublic void reduce(Text key, Iterable<MinMaxCountTuple> values,Context context) throws IOException, InterruptedException {// Initialize our result result也是自定义数据类型 MinMaxCountTupleresult.setMin(null);result.setMax(null);int sum = 0;// Iterate through all input values for this keyfor (MinMaxCountTuple val : values) {// If the value's min is less than the result's min// Set the result's min to value'sif (result.getMin() == null|| val.getMin().compareTo(result.getMin()) < 0) {result.setMin(val.getMin());}// If the value's max is less than the result's max// Set the result's max to value'sif (result.getMax() == null|| val.getMax().compareTo(result.getMax()) > 0) {result.setMax(val.getMax());}// Add to our sum the count for valsum += val.getCount();}// Set our count to the number of input valuesresult.setCount(sum);context.write(key, result);}} //main functionpublic static void main(String[] args) throws Exception {Configuration conf = new Configuration();//构建 map//String[] otherArgs = new GenericOptionsParser(conf, args)//.getRemainingArgs();//input and out put//if (otherArgs.length != 2) {//System.err.println("Usage: MinMaxCountDriver <in> <out>");//System.exit(2);//}Job job = new Job(conf, "StackOverflow Comment Date Min Max Count");//FileInputFormat.setInputPaths(job, new 
Path("G:/eclipsewokspace/MapReduceDesignPatternData/examplesData/MinMaxCount/inputComments.xml"));FileInputFormat.setInputPaths(job, new Path("F:/HDFSinputfile/inputComments.xml"));FileOutputFormat.setOutputPath(job, new Path("F:/HDFSoutputfile/MinMaxCountResult"));job.setJarByClass(MinMaxCountDriver.class);job.setMapperClass(SOMinMaxCountMapper.class);job.setCombinerClass(SOMinMaxCountReducer.class);job.setReducerClass(SOMinMaxCountReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(MinMaxCountTuple.class);//FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);} //自定义数据类型public static class MinMaxCountTuple implements Writable {private Date min = new Date();private Date max = new Date();private long count = 0;private final static SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");public Date getMin() {return min;}public void setMin(Date min) {this.min = min;}public Date getMax() {return max;}public void setMax(Date max) {this.max = max;}public long getCount() {return count;}public void setCount(long count) {this.count = count;} //实现接口中方法@Overridepublic void readFields(DataInput in) throws IOException {min = new Date(in.readLong());max = new Date(in.readLong());count = in.readLong();} //实现接口中方法@Overridepublic void write(DataOutput out) throws IOException {out.writeLong(min.getTime());out.writeLong(max.getTime());out.writeLong(count);}@Overridepublic String toString() {return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count;}}}
辅助类
package mrdp.ch2;import java.util.HashMap;import java.util.Map;public class MRDPUtils {public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3","p4", "p6" };// This helper function parses the stackoverflow into a Map for us.//java 集合 返回map类型 对应python中的字典数据类型public static Map<String, String> transformXmlToMap(String xml) {Map<String, String> map = new HashMap<String, String>();try {String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\"");//分隔字符串for (int i = 0; i < tokens.length - 1; i += 2) {String key = tokens[i].trim();String val = tokens[i + 1];map.put(key.substring(0, key.length() - 1), val);}} catch (StringIndexOutOfBoundsException e) {System.err.println(xml);}return map;}}
3.结果
0 0
- 使用自定义数据类型实现评论数时间、评论总数计数(mapreduce)
- 使用UITableView实现新闻评论
- 评论
- 评论
- 评论
- 评论
- 评论
- 评论
- 评论
- 评论
- 评论
- Django自定义标签实现多级评论
- Dede评论数调用
- 在时间电影评论
- 使用javascript方式获取多说评论插件的文章评论数,转发数
- 自定义评论Dialog
- 自定义RatingBar,五星评论
- Django使用forms来实现评论功能
- 通过gitcheckout到本地然后通过pod添加框架后出现.h文件找不到问题的解决办法
- Ubuntu 常用命令记载
- 值得推荐的C/C++框架和库
- Redis Windows 安装使用及数据存储
- Swift 学习笔记 - 类(1) 定义一个类
- 使用自定义数据类型实现评论数时间、评论总数计数(mapreduce)
- 缓层对比
- hadoop的性能来源
- Android - UriMatcher ContentUris
- Redis 命令速查表
- 记录001
- UIBackgroundTaskIdentifier当进入后台后,继续完成long_running_task
- 【我的产品观】开发wangEditor一年总结
- springMVC 访问 静态资源