HBase Source Code Analysis: HFileOutputFormat
I'm just starting to learn HBase. I'm not sure this is the right way to go about it, but let's begin by reading the source code.

If the output of a Hadoop MapReduce job needs to be imported into HBase, it is best to write it out as HFiles first and then bulk-load them, because HFile is HBase's internal storage format.

The corresponding source code is:
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);

  // Overridden from FileOutputFormat. The record values are HBase KeyValue cells.
  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Get the path of the temporary output file.
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs come from hbase-*.xml; if a parameter is not set there,
    // the default given here is used.
    final long maxsize = conf.getLong("hbase.hregion.max.filesize", 268435456);
    final int blocksize =
        conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", 65536);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String compression = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of families to writers and how much has been output on the writer;
      // a TreeMap keyed by the family's bytes serves as the container.
      private final Map<byte [], WriterLength> writers =
          new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY; // initially empty
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis()); // current time

      public void write(ImmutableBytesWritable row, KeyValue kv)
          throws IOException {
        long length = kv.getLength();
        byte [] family = kv.getFamily(); // the column family of this cell
        WriterLength wl = this.writers.get(family);
        // Roll to a new writer when there is none yet, or when the current file
        // has hit the size limit and we are at a row boundary.
        if (wl == null || ((length + wl.written) >= maxsize) &&
            Bytes.compareTo(this.previousRow, 0, this.previousRow.length,
                kv.getBuffer(), kv.getRowOffset(), kv.getRowLength()) != 0) {
          // Get a new writer.
          Path basedir = new Path(outputdir, Bytes.toString(family));
          if (wl == null) {
            wl = new WriterLength();
            this.writers.put(family, wl);
            if (this.writers.size() > 1) throw new IOException("One family only");
            // If wl == null, first file in family. Ensure family dir exists.
            if (!fs.exists(basedir)) fs.mkdirs(basedir); // create the family directory
          }
          wl.writer = getNewWriter(wl.writer, basedir);
          LOG.info("Writer=" + wl.writer.getPath() +
              ((wl.written == 0)? "": ", wrote=" + wl.written));
          wl.written = 0;
        }
        kv.updateLatestStamp(this.now); // fill in the timestamp if the cell uses LATEST_TIMESTAMP
        wl.writer.append(kv);
        wl.written += length;
        // Copy the row so we know when a row transition happens.
        this.previousRow = kv.getRow();
      }

      private HFile.Writer getNewWriter(final HFile.Writer writer,
          final Path familydir)
          throws IOException {
        close(writer);
        return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
            blocksize, compression, KeyValue.KEY_COMPARATOR);
      }

      private void close(final HFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, // bulk-load timestamp
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (Map.Entry<byte [], WriterLength> e: this.writers.entrySet()) {
          close(e.getValue().writer);
        }
      }
    };
  }

  static class WriterLength {
    long written = 0;
    HFile.Writer writer = null;
  }

  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }

  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }
    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);
    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);
    // Write the actual file.
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);
    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }

  // Aren't these settings made in the driver's main()? (They are made here instead.)
  public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);
    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }
    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());
    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);
    URI cacheUri;
    try {
      cacheUri = new URI(partitionsPath.toString() + "#" +
          TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);
    LOG.info("Incremental table output configured.");
  }
}
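When the map output value class is KeyValue, configureIncrementalLoad() installs KeyValueSortReducer. This matters because HFile.Writer.append() requires cells in ascending key order, so the reducer must sort all KeyValues of a row before emitting them. A minimal sketch of such a sort reducer follows (my reconstruction for illustration; see org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer for the real implementation):

import java.io.IOException;
import java.util.TreeSet;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class SortReducerSketch
    extends Reducer<ImmutableBytesWritable, KeyValue, ImmutableBytesWritable, KeyValue> {
  protected void reduce(ImmutableBytesWritable row, Iterable<KeyValue> kvs, Context context)
      throws IOException, InterruptedException {
    // Buffer and sort all cells of this row; HFile.Writer.append()
    // rejects out-of-order keys.
    TreeSet<KeyValue> sorted = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
    for (KeyValue kv : kvs) {
      // Hadoop reuses the KeyValue instance between iterations, so copy its bytes.
      byte[] b = new byte[kv.getLength()];
      System.arraycopy(kv.getBuffer(), kv.getOffset(), b, 0, kv.getLength());
      sorted.add(new KeyValue(b));
    }
    for (KeyValue kv : sorted) {
      context.write(row, kv);
    }
  }
}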
1. Create the HBase table t1:

hbase(main):157:0* create 't1','f1'
0 row(s) in 1.3280 seconds
hbase(main):158:0> scan 't1'
ROW                          COLUMN+CELL
0 row(s) in 1.2770 seconds
HBaseHFileMapper.java
package com.test.hfile;

import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class HBaseHFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {
    private ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The map output key is just the byte offset of the input line;
        // the whole line is passed through unchanged as the value.
        immutableBytesWritable.set(Bytes.toBytes(key.get()));
        context.write(immutableBytesWritable, value);
    }
}
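One thing to note (my observation, not from the original post): this mapper keys each record by the byte offset of the input line rather than by the HBase row key. That works here because t1 has a single region, so there is only one reduce partition, but TotalOrderPartitioner partitions by the map output key, so for a multi-region table you would key by the actual row, parsed from the same row:family:qualifier:value line format the reducer below uses. A sketch of that variant of map():

    // Variant (illustrative): emit the real row key so TotalOrderPartitioner
    // can route each row to the reducer matching its region.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(":");
        if (fields.length < 4) return; // skip malformed lines
        immutableBytesWritable.set(Bytes.toBytes(fields[0])); // fields[0] = row key
        context.write(immutableBytesWritable, value);
    }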
HBaseHFileReducer.java

package com.test.hfile;

import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class HBaseHFileReducer extends Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text text : values) {
            String value = text.toString();
            if (value != null && !"".equals(value)) { // skip empty lines
                // Each input line becomes a single KeyValue cell.
                KeyValue kv = createKeyValue(value);
                if (kv != null)
                    context.write(key, kv);
            }
        }
    }

    // str has the format row:family:qualifier:value -- a simple simulation.
    private KeyValue createKeyValue(String str) {
        String[] strs = str.split(":");
        if (strs.length < 4)
            return null;
        String row = strs[0];
        String family = strs[1];
        String qualifier = strs[2];
        String value = strs[3];
        return new KeyValue(Bytes.toBytes(row), Bytes.toBytes(family),
                Bytes.toBytes(qualifier), System.currentTimeMillis(), Bytes.toBytes(value));
    }
}
HbaseHFileDriver.java

package com.test.hfile;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class HbaseHFileDriver {
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = new Job(conf, "testhbasehfile");
        job.setJarByClass(HbaseHFileDriver.class); // the jar containing the driver class
        job.setMapperClass(com.test.hfile.HBaseHFileMapper.class);
        job.setReducerClass(com.test.hfile.HBaseHFileReducer.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Lazy: the paths are hardcoded here. In a real application they should
        // come from the command line (otherArgs) instead.
        FileInputFormat.addInputPath(job, new Path("/home/yinjie/input"));
        FileOutputFormat.setOutputPath(job, new Path("/home/yinjie/output"));

        Configuration HBASE_CONFIG = new Configuration();
        HBASE_CONFIG.set("hbase.zookeeper.quorum", "localhost");
        HBASE_CONFIG.set("hbase.zookeeper.property.clientPort", "2181");
        HBaseConfiguration cfg = new HBaseConfiguration(HBASE_CONFIG);

        String tableName = "t1";
        HTable htable = new HTable(cfg, tableName);
        // Sets the partitioner, output format, reducer, and partition file (see above).
        HFileOutputFormat.configureIncrementalLoad(job, htable);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
[root@localhost input]# cat hbasedata.txt
r1:f1:c1:value1
r2:f1:c2:value2
r3:f1:c3:value3
Submit the job to Hadoop:

[root@localhost job]# hadoop jar /home/yinjie/job/hbasetest.jar com.test.hfile.HbaseHFileDriver -libjars /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar
After the job finishes, inspect the output directory:

[root@localhost input]# hadoop fs -ls /home/yinjie/output
Found 2 items
drwxr-xr-x - root supergroup 0 2011-08-28 21:02 /home/yinjie/output/_logs
drwxr-xr-x - root supergroup 0 2011-08-28 21:03 /home/yinjie/output/f1
Next, use bulk load to import the data into HBase. The completebulkload tool takes the HFile output directory and the target table name:

[root@localhost job]# hadoop jar /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar completebulkload /home/yinjie/output t1
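The same import can also be triggered from Java via LoadIncrementalHFiles, the class behind the completebulkload tool. A minimal sketch (my addition; cfg and htable refer to the Configuration and HTable created in the driver above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadSketch {
    // Programmatic equivalent of the completebulkload command above.
    public static void load(Configuration cfg, HTable htable) throws Exception {
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(cfg);
        loader.doBulkLoad(new Path("/home/yinjie/output"), htable);
    }
}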
After the import completes, scan table t1 to verify:

hbase(main):166:0> scan 't1'
ROW                          COLUMN+CELL
r1                           column=f1:c1, timestamp=1314591150788, value=value1
r2                           column=f1:c2, timestamp=1314591150814, value=value2
r3                           column=f1:c3, timestamp=1314591150815, value=value3
3 row(s) in 0.0210 seconds

The data has been imported!
The example in the second half of this article comes from the "炽天使" blog; please keep this attribution: http://3199782.blog.51cto.com/3189782/652244