A MapReduce Program with Command-Line Parameters

With the help of a colleague at work, I wrote a word count program. Compared with the example on the Hadoop website, it accepts several command-line parameters that the user can set, and it reports processing progress in real time (reflected in the form of token counts).

There are only two classes in total. The first is the configuration interface Settings:

package wordCount;

public interface Settings {
    public static final String HELP_OPTION = "help";
    public static final String PATH_INDICATOR = "path";
    public static final String STRING_INDICATOR = "string";
    public static final String INTEGER_INDICATOR = "int";
    public static final String INPUT_OPTION = "input";
    public static final String OUTPUT_OPTION = "output";
    public static final String FILTER_OPTION = "filter";
    public static final String TYPE_OPTION = "type";
    public static final String MAPPER_OPTION = "mapper";
    public static final String REDUCER_OPTION = "reducer";

    public static final int DEFAULT_NUMBER_OF_MAPPERS = 1;
    public static final int DEFAULT_NUMBER_OF_REDUCERS = 1;
    public static final String DEFAULT_TYPE = "news";
}

The other is the main program:

package wordCount;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class Count extends Configured implements Tool {
    static final Logger sLogger = Logger.getLogger(Count.class);

    private static final String OPTION = "user_news";

    // Counters used to report progress: TOKEN is bumped once per token
    // mapped, UNIQUE_TOKEN once per distinct word reduced.
    private static enum MyCounter {
        TOKEN, UNIQUE_TOKEN
    }

    public static class CountMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
                reporter.incrCounter(MyCounter.TOKEN, 1);
            }
        }
    }

    public static class CountReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
            reporter.incrCounter(MyCounter.UNIQUE_TOKEN, 1);
        }
    }

    // Configures and submits the job using the parsed command-line values.
    public void count(String inputPath, String outputPath, String filterFile,
            String user_news, int numberOfMappers, int numberOfReducers)
            throws Exception {
        Path confPath = null;
        if (filterFile != null && filterFile.length() > 0) {
            confPath = new Path(filterFile);
        }

        sLogger.info("Tool: " + Count.class.getSimpleName());
        sLogger.info(" - input path: " + inputPath);
        sLogger.info(" - output path: " + outputPath);
        sLogger.info(" - number of mappers: " + numberOfMappers);
        sLogger.info(" - number of reducers: " + numberOfReducers);
        sLogger.info(" - filter path: " + filterFile);
        sLogger.info(" - type: " + user_news);

        JobConf conf = new JobConf(Count.class);
        conf.setJobName("FZX " + Count.class.getSimpleName());
        conf.set(OPTION, user_news);

        // Preconditions.checkArgument(fs.exists(confPath),
        // "Missing term index files...");
        if (confPath != null) {
            DistributedCache.addCacheFile(confPath.toUri(), conf);
        }

        conf.setNumMapTasks(numberOfMappers);
        conf.setNumReduceTasks(numberOfReducers);

        conf.setMapperClass(CountMapper.class);
        conf.setReducerClass(CountReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // conf.setInputFormat(DeprecatedLzoTextInputFormat.class);
        // conf.setInputFormat(LzoTextInputFormat.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        // FileOutputFormat.setCompressOutput(conf, true);

        long startTime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    }

    public int run(String[] args) throws Exception {
        Options options = new Options();

        options.addOption(Settings.HELP_OPTION, false, "print the help message");
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("input file(s) or directory")
                .isRequired().create(Settings.INPUT_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("filter file(s)")
                .create(Settings.FILTER_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("output directory").isRequired()
                .create(Settings.OUTPUT_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR)
                .hasArg().withDescription("number of mappers (default - "
                        + Settings.DEFAULT_NUMBER_OF_MAPPERS + ")")
                .create(Settings.MAPPER_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR)
                .hasArg().withDescription("number of reducers (default - "
                        + Settings.DEFAULT_NUMBER_OF_REDUCERS + ")")
                .create(Settings.REDUCER_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.STRING_INDICATOR)
                .hasArg().withDescription("type (default - "
                        + Settings.DEFAULT_TYPE + ")")
                .create(Settings.TYPE_OPTION));

        String inputPath = null;
        String outputPath = null;
        String filterPath = null;
        String type = Settings.DEFAULT_TYPE;
        int numberOfMappers = Settings.DEFAULT_NUMBER_OF_MAPPERS;
        int numberOfReducers = Settings.DEFAULT_NUMBER_OF_REDUCERS;

        CommandLineParser parser = new GnuParser();
        HelpFormatter formatter = new HelpFormatter();
        try {
            CommandLine line = parser.parse(options, args);

            if (line.hasOption(Settings.INPUT_OPTION)) {
                inputPath = line.getOptionValue(Settings.INPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to "
                        + Settings.INPUT_OPTION + " not initialized...");
            }

            if (line.hasOption(Settings.OUTPUT_OPTION)) {
                outputPath = line.getOptionValue(Settings.OUTPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to "
                        + Settings.OUTPUT_OPTION + " not initialized...");
            }

            if (line.hasOption(Settings.FILTER_OPTION)) {
                filterPath = line.getOptionValue(Settings.FILTER_OPTION);
            }

            if (line.hasOption(Settings.MAPPER_OPTION)) {
                numberOfMappers = Integer.parseInt(line
                        .getOptionValue(Settings.MAPPER_OPTION));
            }

            if (line.hasOption(Settings.REDUCER_OPTION)) {
                numberOfReducers = Integer.parseInt(line
                        .getOptionValue(Settings.REDUCER_OPTION));
            }

            if (line.hasOption(Settings.TYPE_OPTION)) {
                type = line.getOptionValue(Settings.TYPE_OPTION);
            }
        } catch (ParseException pe) {
            System.err.println(pe.getMessage());
            formatter.printHelp(Count.class.getName(), options);
            System.exit(0);
        } catch (NumberFormatException nfe) {
            System.err.println(nfe.getMessage());
            System.exit(0);
        }

        // Delete the output directory if it exists already
        FileSystem fs = FileSystem.get(new JobConf(Count.class));
        fs.delete(new Path(outputPath), true);

        try {
            count(inputPath, outputPath, filterPath, type, numberOfMappers,
                    numberOfReducers);
        } finally {
            // fs.delete(new Path(outputPath), true);
        }

        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Count(), args);
        System.exit(res);
    }
}
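
One thing worth pointing out: count() ships the filter file to every task via DistributedCache.addCacheFile, but CountMapper above never actually reads it. Below is a minimal sketch of how a mapper could consume it, assuming the filter file holds one word per line that should be skipped; the class name FilteringMapper and that stop-word interpretation are my own illustration, not part of the original program. It needs java.io.BufferedReader, java.io.FileReader, java.util.HashSet, and java.util.Set on top of the imports already listed.

    // Hypothetical variant of CountMapper that loads the cached filter
    // file in configure() and skips any token that appears in it.
    public static class FilteringMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private Set<String> filter = new HashSet<String>();

        @Override
        public void configure(JobConf conf) {
            try {
                // Files registered with DistributedCache.addCacheFile()
                // show up here as local paths on each task node.
                Path[] cached = DistributedCache.getLocalCacheFiles(conf);
                if (cached != null && cached.length > 0) {
                    BufferedReader reader = new BufferedReader(
                            new FileReader(cached[0].toString()));
                    String token;
                    while ((token = reader.readLine()) != null) {
                        filter.add(token.trim());
                    }
                    reader.close();
                }
            } catch (IOException e) {
                throw new RuntimeException("Failed to load filter file", e);
            }
        }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (filter.contains(token)) {
                    continue; // skip filtered words
                }
                word.set(token);
                output.collect(word, one);
                reporter.incrCounter(MyCounter.TOKEN, 1);
            }
        }
    }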

CountMapper performs the map step and CountReducer performs the reduce step. The reporter.incrCounter calls are what make the progress visible in real time: the TOKEN and UNIQUE_TOKEN counters update in the job's counter display while it runs, which is what "reflected in the form of tokens" refers to above.
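
For reference, an invocation might look like the following; the jar name and HDFS paths are placeholders. The option names come from the Settings interface, and since they are parsed with GnuParser they take a single leading dash. Only -input and -output are required; the rest fall back to the defaults in Settings.

    hadoop jar wordcount.jar wordCount.Count \
        -input /user/me/input \
        -output /user/me/output \
        -mapper 10 -reducer 2 \
        -type news \
        -filter /user/me/filter.txt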

