MapReduce Programming Examples (1): Word Count


Starting today I am going to work through the MapReduce examples in the book MapReduce Design Patterns. I find it an excellent book for learning MapReduce programming: once you have worked through it, you should be able to handle just about any MapReduce problem you run into. Here is the first example, a program that counts word frequencies in a file named comment.xml. Let's go straight to the code.

// Parse an XML record and store its attributes in a map.
package mrdp.utils;

import java.util.HashMap;
import java.util.Map;

public class MRDPUtils {
	public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3",
			"p4", "p6" };

	// This helper function parses the StackOverflow record into a Map for us.
	public static Map<String, String> transformXmlToMap(String xml) {
		Map<String, String> map = new HashMap<String, String>();
		try {
			// Strip the leading "<row " and the trailing " />", then split
			// on double quotes so keys and values alternate in the array.
			String[] tokens = xml.trim().substring(5, xml.trim().length() - 3)
					.split("\"");

			for (int i = 0; i < tokens.length - 1; i += 2) {
				String key = tokens[i].trim();
				String val = tokens[i + 1];
				// Drop the trailing '=' from the key before storing it.
				map.put(key.substring(0, key.length() - 1), val);
			}
		} catch (StringIndexOutOfBoundsException e) {
			// Malformed record: log it and return whatever was parsed.
			System.err.println(xml);
		}

		return map;
	}
}
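Before moving on to the job itself, here is a minimal sketch of what this helper produces for a single record. The TransformDemo class and the sample row are illustrative additions of mine, not code from the book; real rows in the Stack Overflow dump use the same <row attr="value" ... /> layout.

package mrdp.utils;

import java.util.Map;

public class TransformDemo {
	public static void main(String[] args) {
		// Illustrative comment row in the Stack Overflow dump format.
		String row = "<row Id=\"123\" Text=\"can&#39;t stop\" />";

		// substring(5, length - 3) strips "<row " and " />"; splitting on
		// '"' then alternates between attribute names and values.
		Map<String, String> parsed = MRDPUtils.transformXmlToMap(row);

		System.out.println(parsed.get("Id"));   // prints: 123
		System.out.println(parsed.get("Text")); // prints: can&#39;t stop (still HTML-escaped)
	}
}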

// Main program
package mrdp.ch1;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Map;

import mrdp.utils.MRDPUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.commons.lang.StringEscapeUtils;

public class CommentWordCount {

	public static class SOWordCountMapper extends
			Mapper<Object, Text, Text, IntWritable> {

		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();

		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			// Parse the input string into a nice map
			Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
					.toString());

			// Grab the "Text" field, since that is what we are counting over
			String txt = parsed.get("Text");

			// .get will return null if the key is not there
			if (txt == null) {
				// skip this record
				return;
			}

			// Unescape the HTML because the SO data is escaped.
			txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase());

			// Remove some annoying punctuation
			txt = txt.replaceAll("'", ""); // remove single quotes (e.g., can't)
			txt = txt.replaceAll("[^a-zA-Z]", " "); // replace the rest with a space

			// Tokenize the string, then send the tokens away
			StringTokenizer itr = new StringTokenizer(txt);
			while (itr.hasMoreTokens()) {
				word.set(itr.nextToken());
				context.write(word, one);
			}
		}
	}

	public static class IntSumReducer extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		public void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			// Sum all the counts emitted for this word
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: CommentWordCount <in> <out>");
			System.exit(2);
		}

		@SuppressWarnings("deprecation")
		Job job = new Job(conf, "StackOverflow Comment Word Count");
		job.setJarByClass(CommentWordCount.class);
		job.setMapperClass(SOWordCountMapper.class);
		// The reducer doubles as a combiner to pre-aggregate map output
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
The program itself needs little explanation: anyone who knows Java and has seen the classic word count example will follow it. The only mildly tricky part is the text cleanup in the mapper, which lowercases the comment, unescapes HTML entities, strips single quotes, and replaces every remaining non-letter with a space (see the sketch below).
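A minimal standalone trace of that cleanup chain. The sample string and the CleanupDemo class are illustrative additions of mine, not code from the book:

import org.apache.commons.lang.StringEscapeUtils;

public class CleanupDemo {
	public static void main(String[] args) {
		// HTML-escaped comment text, as it would appear in the dump.
		String txt = "Can&#39;t stop! Won&#39;t stop!";

		// Same three steps as SOWordCountMapper.map():
		txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase()); // -> "can't stop! won't stop!"
		txt = txt.replaceAll("'", "");                           // -> "cant stop! wont stop!"
		txt = txt.replaceAll("[^a-zA-Z]", " ");                  // -> "cant stop  wont stop "

		// StringTokenizer on this string yields: cant, stop, wont, stop
		System.out.println(txt);
	}
}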

I debug my MapReduce programs in Eclipse, so to run this one you need to fill in the program arguments in the run configuration with your own HDFS addresses. For example, my arguments are:

hdfs://localhost:8010/user/jpan/comments.xml  hdfs://localhost:8010/user/jpan/output1
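If you would rather run it outside Eclipse, packaging the classes into a jar and submitting it with the standard hadoop jar command should work the same way (the jar name mrdp.jar below is just an assumption for illustration):

hadoop jar mrdp.jar mrdp.ch1.CommentWordCount hdfs://localhost:8010/user/jpan/comments.xml hdfs://localhost:8010/user/jpan/output1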

The test data is available at http://pan.baidu.com/s/1c0xP6Dy. It contains the comment.xml file as well as some other files that we will use later.

The MapReduce Design Patterns book itself can be found at http://pan.baidu.com/s/1jGt96Hg

