MapReduce操作HBase

来源：互联网发布：医学英文文献数据库编辑：程序博客网时间：2024/05/05 23:49

这两天看了一下HBase的基本操作，然后又重温了下Hadoop的MapReduce的基本操作(虽然之前看的也是一般般，理解不是很深)。本来打算昨晚完成两件事情的：1、使用map任务读出HDFS上的文件，并把它导入到HBase中；2、使用map任务读出HBase中的数据，并使用reduce输出到文件中。

我的数据使用如下：（只是很少的数据）

首先贴代码，然后再说明问题吧：

package org.fansy.date830;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.hbase.HBaseConfiguration;import org.apache.hadoop.hbase.client.Put;import org.apache.hadoop.hbase.io.ImmutableBytesWritable;import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;import org.apache.hadoop.hbase.util.Bytes;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.util.GenericOptionsParser;public class JobOne {/** * use map job to read file data and then import the data to HBase * start:22:44 * test:OK * end: 22:59 */public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = HBaseConfiguration.create();    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();    if(otherArgs.length != 2) {      System.err.println("Wrong number of arguments: " + otherArgs.length);      System.err.println("Usage: <input> <tablename>");      System.exit(-1);    }        Job job=new Job(conf,"import data to hbase");    job.setJarByClass(JobOne.class);job.setMapperClass(MapperClass.class);job.setMapOutputKeyClass(ImmutableBytesWritable.class);        job.setMapOutputValueClass(Put.class);TableMapReduceUtil.initTableReducerJob(args[1], null, job);job.setNumReduceTasks(0);FileInputFormat.setInputPaths(job, args[0]);System.exit(job.waitForCompletion(true) ? 
0 : 1);}public static class MapperClass extends Mapper<LongWritable,Text,ImmutableBytesWritable,Put>{public void map(LongWritable key,Text line,Context context)throws IOException,InterruptedException{String[] values=line.toString().split(",");if(values.length!=4){ // if there are not four args,then returnreturn ;}  byte [] row = Bytes.toBytes(values[0]);      byte [] family = Bytes.toBytes(values[1]);      byte [] qualifier = Bytes.toBytes(values[2]);      byte [] value = Bytes.toBytes(values[3]);     Put put=new Put(row);     put.add(family,qualifier,value);     // first do not add the next line to test whether needs it     // the result turns out that this should be added     context.write(new ImmutableBytesWritable(row),put);}}}

我参考了网上的一篇文章：http://www.cnblogs.com/liqizhou/archive/2012/05/17/2504279.html，这篇文章上面使用了map任务读取数据，然后在reduce阶段把数据导入到HBase中，我觉得如果用map任务可以完成的话就不用reduce就行了吧。在上面的代码中要注意一点，其中的context.write(...)一句一定要加上，不然不会有结果导入到HBase中，但在网上的这篇文章中的reduce任务没有context.write(...)，不知道可否（不过个人对mapreduce的操作看的也是一般般）。

然后在 hbase shell中查看导入的数据如下：

然后就是问题二了，昨晚没有解决，搞到差不多12点，还没搞定，我都有点晕晕沉沉的了，所以就睡了，今早早起搞定它。

先贴代码：

package org.fansy.date830;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.hbase.HBaseConfiguration;//import org.apache.hadoop.hbase.client.Put;import org.apache.hadoop.hbase.client.Result;import org.apache.hadoop.hbase.client.Scan;import org.apache.hadoop.hbase.io.ImmutableBytesWritable;import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;import org.apache.hadoop.hbase.mapreduce.TableMapper;import org.apache.hadoop.hbase.util.Bytes;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;//import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;//import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;//import org.fansy.date830.JobOne.MapperClass;public class JobTwo {/** * use map job to read from HBase,and use reduce job to output the data to a file * start:23:00 *  * end: 2012/08/31 09:30 */public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = HBaseConfiguration.create();    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();    if(otherArgs.length != 1) {      System.err.println("Wrong number of arguments: " + otherArgs.length);      System.err.println("Usage: <output> ");      System.exit(-1);    }        Job job=new Job(conf,"read data from hbase and import it to a file");    Scan scan =new Scan();    job.setJarByClass(JobTwo.class);//job.setMapOutputKeyClass(Text.class);    //   job.setMapOutputValueClass(Text.class);        TableMapReduceUtil.initTableMapperJob("testtable".getBytes(), scan,MapperClass.class, Text.class,Text.class,job);job.setReducerClass(ReducerClass.class);//FileInputFormat.setInputPaths(job, args[0]);FileOutputFormat.setOutputPath(job, new Path(args[0]));System.exit(job.waitForCompletion(true) ? 
0 : 1);}public static class MapperClass extends TableMapper<Text,Text>{public  void map(ImmutableBytesWritable row,Result result,Context context)throws IOException,InterruptedException{//String newrowq=row.toString();String newrow=Bytes.toString(result.getRow());String newvalue=null;if(result.containsColumn("f1".getBytes(), "age".getBytes())){ newvalue=Bytes.toString(result.getValue("f1".getBytes(), "age".getBytes())); context.write(new Text(newrow), new Text(newvalue));}if(result.containsColumn("f1".getBytes(), "name".getBytes())){ newvalue=Bytes.toString(result.getValue("f1".getBytes(), "name".getBytes())); context.write(new Text(newrow), new Text(newvalue));}}}public static class ReducerClass extends Reducer<Text,Text,Text,Text>{public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{StringBuffer str=new StringBuffer();for(Text val:values){str.append(val.toString());}context.write(key, new Text(str.toString()));}}}

首先说下我遇到的问题吧，

一：

 TableMapReduceUtil.initTableMapperJob("testtable".getBytes(), scan,MapperClass.class, Text.class,Text.class,job);

这一句，昨晚不知道很多参数的意义，所以老是说map的输出的格式不对，后来认真看了API才知道，原来是参数设置错误，弄得我昨晚晕晕的；

二：

context.write(new Text(newrow), new Text(newvalue));

本来这句是在map函数的最后面的，但是运行结果出来后发现map的输入输出records都是8，这样就不对了，应该是16条records才对，然后我就改为上面的代码了，然后map的输入仍然是8，但是输出变为16了，所以我猜测，如果是相同的row的话，map应该是把它当作一条记录了。

最后贴上输出的文件：

本人也是刚开始学习HBase，以前有弄过Hadoop的mapreduce，但是不是很深入，很多都不了解，希望以后可以有更深入的了解。

分享，成长