## 引言 ##
## 实验 ##

package com.luchi.wordcount;import java.io.BufferedReader;import java.io.FileReader;import java.io.IOException;import java.net.URI;import java.util.ArrayList;import java.util.HashSet;import java.util.List;import java.util.Set;import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Counter;import org.apache.hadoop.mapreduce.Counters;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;import org.apache.hadoop.util.StringUtils;public class WordCount2 {  public static class TokenizerMapper       extends Mapper<Object, Text, Text, IntWritable>{    static enum CountersEnum { INPUT_WORDS }    private final static IntWritable one = new IntWritable(1);    private Text word = new Text();    private boolean caseSensitive;    private Set<String> patternsToSkip = new HashSet<String>();    private Configuration conf;    private BufferedReader fis;    @Override    public void setup(Context context) throws IOException,        InterruptedException {      conf = context.getConfiguration();      caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);      if (conf.getBoolean("wordcount.skip.patterns", true)) {        URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();        for (URI patternsURI : patternsURIs) {          Path patternsPath = new Path(patternsURI.getPath());          String patternsFileName = patternsPath.getName().toString();          parseSkipFile(patternsFileName);        }      }    }    private void parseSkipFile(String fileName) {      try {        fis = new BufferedReader(new FileReader(fileName));        String pattern = null;        while 
((pattern = fis.readLine()) != null) {          patternsToSkip.add(pattern);        }      } catch (IOException ioe) {        System.err.println("Caught exception while parsing the cached file '"            + StringUtils.stringifyException(ioe));      }    }    @Override    public void map(Object key, Text value, Context context                    ) throws IOException, InterruptedException {      String line = (caseSensitive) ?          value.toString() : value.toString().toLowerCase();      for (String pattern : patternsToSkip) {        line = line.replaceAll(pattern, "");      }      StringTokenizer itr = new StringTokenizer(line);      while (itr.hasMoreTokens()) {        word.set(itr.nextToken());        context.write(word, one);        Counter counter = context.getCounter(CountersEnum.class.getName(),        CountersEnum.INPUT_WORDS.toString());        counter.increment(1);      }    }  }  public static class IntSumReducer       extends Reducer<Text,IntWritable,Text,IntWritable> {    private IntWritable result = new IntWritable();    public void reduce(Text key, Iterable<IntWritable> values,                       Context context                       ) throws IOException, InterruptedException {      int sum = 0;      for (IntWritable val : values) {        sum += val.get();      }      result.set(sum);      context.write(key, result);    }  }  public static void main(String[] args) throws Exception {    Configuration conf = new Configuration();    GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);    String[] remainingArgs = optionParser.getRemainingArgs();    if (!(remainingArgs.length != 2 || remainingArgs.length != 4)) {      System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");      System.exit(2);    }    Job job = Job.getInstance(conf, "word count2");    job.setJarByClass(WordCount2.class);    job.setMapperClass(TokenizerMapper.class);    job.setCombinerClass(IntSumReducer.class);    
job.setReducerClass(IntSumReducer.class);    job.setOutputKeyClass(Text.class);    job.setOutputValueClass(IntWritable.class);    List<String> otherArgs = new ArrayList<String>();    for (int i=0; i < remainingArgs.length; ++i) {      if ("-skip".equals(remainingArgs[i])) {        job.addCacheFile(new Path(remainingArgs[++i]).toUri());        job.getConfiguration().setBoolean("wordcount.skip.patterns", true);      } else {        otherArgs.add(remainingArgs[i]);      }    }    FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));    int e =job.waitForCompletion(true) ? 0 : 1;    Counters counters =job.getCounters();    Counter c=counters.findCounter("com.luchi.wordcount.WordCount2$TokenizerMapper$CountersEnum", "INPUT_WORDS");    System.out.println(c.getValue());    System.exit(e);  }}


    // Let Hadoop consume its generic options (-D, -files, ...) first;
    // whatever is left over belongs to this application.
    Configuration conf = new Configuration();
    GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
    String[] remainingArgs = optionParser.getRemainingArgs();
    // Exactly 2 application arguments (<in> <out>) or 4 (plus "-skip <file>").
    // The previous test, !(len != 2 || len != 4), could never be true, so the
    // usage message was never printed.
    if (remainingArgs.length != 2 && remainingArgs.length != 4) {
      System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
      System.exit(2);
    }

这里面使用了GenericOptionsParser 这个类,这个类主要是为了解析命令行参数,我们来看下其API说明:

GenericOptionsParser is a utility to parse command line arguments generic to the Hadoop framework. GenericOptionsParser recognizes several standard command line arguments, enabling applications to easily specify a namenode, a jobtracker, additional configuration resources etc.

大概意思就是,GenericOptionsParser 这个类是Hadoop框架用来解析命令参数的类,这个类的主要作用是配置一些Hadoop运行期间的参数。其主要解析的参数列表是:

    -conf <configuration file>                      specify a configuration file
    -D <property=value>                             use value for given property
    -fs <local|namenode:port>                       specify a namenode
    -jt <local|jobtracker:port>                     specify a job tracker
    -files <comma separated list of files>          specify comma separated files to be copied to the map reduce cluster
    -libjars <comma separated list of jars>         specify comma separated jar files to include in the classpath.
    -archives <comma separated list of archives>    specify comma separated archives to be unarchived on the compute machines.

这个类的getRemainingArgs() 方法是为了获取不在上述配置列表里面的命令参数,也就是非Hadoop设置参数。其方法的说明是:

Returns an array of Strings containing only application-specific arguments.


// Separate "-skip <file>" from the positional <in> <out> arguments.
for (int i = 0; i < remainingArgs.length; ++i) {
  if ("-skip".equals(remainingArgs[i])) {
    // "-skip" must be followed by the pattern file path; without this check
    // remainingArgs[++i] would run past the end of the array.
    if (i + 1 >= remainingArgs.length) {
      System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
      System.exit(2);
    }
    // Register the pattern file as a cache file and flag it for the mapper.
    job.addCacheFile(new Path(remainingArgs[++i]).toUri());
    job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
  } else {
    otherArgs.add(remainingArgs[i]);
  }
}

这段代码解析的是应用程序参数(Application args),也就是通过 optionParser.getRemainingArgs() 获取的参数。这里有一点不同:如果参数里面有 -skip 选项,那么 -skip 后面的 pattern.txt 文件路径就会被加入到 Hadoop 的 cacheFile 里面。关于 cacheFile,API 的说明是:

The framework will copy the necessary files to the slave node before any tasks for the job are executed on that node. Its efficiency stems from the fact that the files are only copied once per job and the ability to cache archives which are un-archived on the slaves.


public void setup(Context context) throws IOException,        InterruptedException {      conf = context.getConfiguration();      caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);      if (conf.getBoolean("wordcount.skip.patterns", true)) {        URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();        for (URI patternsURI : patternsURIs) {          Path patternsPath = new Path(patternsURI.getPath());          String patternsFileName = patternsPath.getName().toString();          parseSkipFile(patternsFileName);        }      }    }    private void parseSkipFile(String fileName) {      try {        fis = new BufferedReader(new FileReader(fileName));        String pattern = null;        while ((pattern = fis.readLine()) != null) {          patternsToSkip.add(pattern);        }      } catch (IOException ioe) {        System.err.println("Caught exception while parsing the cached file '"            + StringUtils.stringifyException(ioe));      }



        // Count one more input word; the counter is addressed by the enum's
        // class name (group) and the constant's name (counter).
        String counterGroup = CountersEnum.class.getName();
        String counterName = CountersEnum.INPUT_WORDS.toString();
        context.getCounter(counterGroup, counterName).increment(1);


    // Run the job to completion and map success/failure to an exit code.
    boolean succeeded = job.waitForCompletion(true);
    int e = succeeded ? 0 : 1;
    // Read the word counter back by its group and counter names and print it.
    Counter c = job.getCounters().findCounter(
        "com.luchi.wordcount.WordCount2$TokenizerMapper$CountersEnum",
        "INPUT_WORDS");
    System.out.println(c.getValue());


Job in state DEFINE instead of RUNNING


0 0