Hadoop 编程初步认识

来源：互联网发布：steam数据扫号编辑：程序博客网时间：2024/05/20 02:53

Mapper 类有四个泛型类型参数，分别是输入键、输入值、输出键和输出值的类型。
Hadoop 提供一套“可优化网络序列化传输”的基本类型，在org.apache.hadoop.io 包里。
LongWritable 相当于Java Long 类型。
Text 相当于Java String类型。
IntWritable 相当于 Java Integer 类型。

新API放在org.apache.hadoop.mapreduce包内。

旧API放在org.apache.hadoop.mapred包内。

package wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Word-count MapReduce job written against the new
 * ({@code org.apache.hadoop.mapreduce}) API.
 *
 * <p>Usage: {@code wordcount <in> [<in>...] <out>}. Every argument except the
 * last is added as an input path; the last argument is the output path.
 */
public class WordCount {

  /**
   * Splits each input value into whitespace-separated tokens and emits
   * {@code (token, 1)} for every token.
   *
   * <p>NOTE(review): the original author observed that changing the input key
   * type from {@code Object} to {@code NullWritable} fails at runtime.
   * Presumably this is because no input format is set in {@link #main}, so the
   * default one supplies {@code LongWritable} keys, which cannot be cast to
   * {@code NullWritable} - confirm against the Hadoop documentation.
   */
  public static class TokenizerMapper
      extends Mapper<Object, Text, Text, IntWritable> {

    /** Constant count of 1 emitted for every token. */
    private static final IntWritable one = new IntWritable(1);

    /** Reused output key; its contents are overwritten for each token. */
    private final Text word = new Text();

    /**
     * Tokenizes one input record and writes a {@code (token, 1)} pair per token.
     *
     * @param key     input key; not used by this mapper
     * @param value   the text to tokenize
     * @param context sink that receives the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Text.toString() converts the Text value to a String. StringTokenizer
      // implements Enumeration and, by default, splits on whitespace.
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        // set() replaces the contents of the reused Text instance; it is a
        // setter, not a constructor.
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  /**
   * Sums all counts that share a key and emits {@code (key, sum)}.
   *
   * <p>The framework collects the mapper outputs, sorts and groups them by
   * key, and then calls {@code reduce} once per key with that key's values
   * exposed as an {@link Iterable}. This class is also registered as the
   * combiner in {@link #main}.
   */
  public static class IntSumReducer
      extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Reused output value; overwritten with the sum for each key. */
    private final IntWritable result = new IntWritable();

    /**
     * Adds up every value for {@code key} and writes the total.
     *
     * @param key     the word being counted
     * @param values  the counts collected for {@code key}
     * @param context sink that receives the {@code (key, sum)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        // IntWritable.get() returns the wrapped int.
        sum += val.get();
      }
      // set() stores the int in the reused IntWritable; it is a setter, not a
      // constructor.
      result.set(sum);
      context.write(key, result);
    }
  }

  /**
   * Configures and submits the word-count job, then exits with status 0 if the
   * job succeeded and 1 otherwise. Exits with status 2 after printing a usage
   * message if fewer than two arguments are given.
   *
   * @param args one or more input paths followed by exactly one output path
   * @throws Exception if job setup or execution fails
   */
  public static void main(String[] args) throws Exception {
    // At least one input path plus the output path is required; without this
    // check an empty args array fails with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: wordcount <in> [<in>...] <out>");
      System.exit(2);
    }

    Configuration conf = new Configuration();
    // Job.getInstance replaces the deprecated Job(Configuration, String)
    // constructor.
    Job job = Job.getInstance(conf, "word count");

    // Hadoop uses this class to locate the jar that contains the job.
    job.setJarByClass(WordCount.class);

    // Mapper, combiner and reducer classes.
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // Types of the job's output key and value.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Every argument but the last is an input path; several may be given.
    for (int i = 0; i < args.length - 1; ++i) {
      FileInputFormat.addInputPath(job, new Path(args[i]));
    }
    // The last argument is the output path; it should not exist before the run.
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

    // waitForCompletion submits the job and blocks until it finishes. Passing
    // true instead of false would print progress to the console. A successful
    // job returns true, which maps to exit status 0.
    System.exit(job.waitForCompletion(false) ? 0 : 1);
  }
}

Hadoop 命令

hadoop fs -put ~/file/file* /input

// 把本地磁盘 ~/file/ 目录下匹配 file* 的文件上传到 HDFS 的 /input 目录。

hadoop jar share/hadoopxxxx/xxxx.jar wordcount /input /output

// 运行jar中的wordcount 其中HDFS中的input为输入，output为输出。

Linux 命令

cat file1 file2 > file // 将file1与file2的文件合并输出在新建文件file中

cat file1 - > file // - 表示标准输入：先输出file1的内容，再把随后从键盘输入的内容一并写入file。输入以Ctrl+D（EOF）结束；Ctrl+Z 在Linux下是挂起进程，而不是结束输入。

0 0