HBase Source Code Analysis: HFileOutputFormat

Original article: HBase Source Code Analysis - HFileOutputFormat, by 天若有情

 

I am just starting to learn HBase. I am not sure whether this is the right approach, but let's begin by reading the source code...

If the output of a Hadoop MapReduce job needs to go into HBase, it is best to write it in HFile format first and then import the HFiles into HBase, because HFile is HBase's internal storage format.
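Before diving into the source, here is a minimal sketch of the end-to-end flow this class enables. The paths, table name, and class name below are placeholders, the mapper is deliberately left out, and the API is the HBase 0.90.x one used throughout this post, so treat it as an outline of the full example that follows rather than a finished program:

package com.test.hfile; // hypothetical package, mirroring the example later in this post

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class BulkLoadFlowSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "hfile-bulkload-sketch");   // placeholder job name
        job.setJarByClass(BulkLoadFlowSketch.class);

        // 1. Declare the map output types expected by HFileOutputFormat.
        //    NOTE: a real job must also call job.setMapperClass(...) with a mapper
        //    that emits these types (see the worked example later in this post).
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        FileInputFormat.addInputPath(job, new Path("/tmp/input"));    // placeholder paths
        FileOutputFormat.setOutputPath(job, new Path("/tmp/hfiles"));

        // 2. Let HFileOutputFormat choose the reducer, partitioner and output format
        //    so the generated HFiles line up with the table's regions.
        HTable table = new HTable(conf, "my_table");                  // placeholder table
        HFileOutputFormat.configureIncrementalLoad(job, table);

        // 3. Once the job succeeds, move the HFiles into the regions.
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(conf).doBulkLoad(new Path("/tmp/hfiles"), table);
        }
    }
}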

The corresponding source code is:

package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);
  // Overrides getRecordWriter inherited from FileOutputFormat
  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context) // what exactly is the KeyValue here?
  throws IOException, InterruptedException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs. are from hbase-*.xml; if a parameter is not set, the default is used
    final long maxsize = conf.getLong("hbase.hregion.max.filesize", 268435456);
    final int blocksize =
      conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", 65536);
    // Invented config.  Add to hbase-*.xml if other than default compression.
    final String compression = conf.get("hfile.compression",
      Compression.Algorithm.NONE.getName());

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of families to writers and how much has been output on the writer.
      // Each column family is mapped to its own writer; a TreeMap is the container.
      private final Map<byte [], WriterLength> writers =
        new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY; // empty
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis()); // current time

      public void write(ImmutableBytesWritable row, KeyValue kv) // write one cell
      throws IOException {
        long length = kv.getLength();
        byte [] family = kv.getFamily(); // get the column family
        WriterLength wl = this.writers.get(family);
        if (wl == null || ((length + wl.written) >= maxsize) &&
            Bytes.compareTo(this.previousRow, 0, this.previousRow.length,
              kv.getBuffer(), kv.getRowOffset(), kv.getRowLength()) != 0) {
          // Get a new writer.
          Path basedir = new Path(outputdir, Bytes.toString(family));
          if (wl == null) {
            wl = new WriterLength();
            this.writers.put(family, wl);
            if (this.writers.size() > 1) throw new IOException("One family only");
            // If wl == null, first file in family.  Ensure family dir exists.
            if (!fs.exists(basedir)) fs.mkdirs(basedir); // create the family directory
          }
          wl.writer = getNewWriter(wl.writer, basedir);
          LOG.info("Writer=" + wl.writer.getPath() +
            ((wl.written == 0)? "": ", wrote=" + wl.written));
          wl.written = 0;
        }
        kv.updateLatestStamp(this.now);
        wl.writer.append(kv);
        wl.written += length;
        // Copy the row so we know when a row transition.
        this.previousRow = kv.getRow();
      }


      private HFile.Writer getNewWriter(final HFile.Writer writer,
          final Path familydir)
      throws IOException {
        close(writer);
        return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
          blocksize, compression, KeyValue.KEY_COMPARATOR);
      }

      private void close(final HFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, // bulk load timestamp
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
      throws IOException, InterruptedException {
        for (Map.Entry<byte [], WriterLength> e: this.writers.entrySet()) {
          close(e.getValue().writer);
        }
      }
    };
  }


  static class WriterLength {
    long written = 0;
    HFile.Writer writer = null;
  }


  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
  throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
      new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }


  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0
    TreeSet<ImmutableBytesWritable> sorted =
      new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);

    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }


  public static void configureIncrementalLoad(Job job, HTable table) throws IOException { // aren't these set in the main function?
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);
    URI cacheUri;
    try {
      cacheUri = new URI(partitionsPath.toString() + "#" +
          TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    LOG.info("Incremental table output configured.");
  }

}
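One point worth pausing on before the example: writePartitions drops the empty start key of the first region, so a table with N regions yields N-1 split points and therefore N reduce partitions, one per region. The toy sketch below is my own illustration of how row keys map onto those partitions; it is not the actual TotalOrderPartitioner code, which reads the partitions SequenceFile written above and does the lookup far more efficiently:

import org.apache.hadoop.hbase.util.Bytes;

// Toy illustration only -- not the real TotalOrderPartitioner implementation.
public class SplitPointSketch {
    public static int partitionFor(byte[] rowKey, byte[][] splitPoints) {
        int partition = 0;
        for (byte[] split : splitPoints) {
            if (Bytes.compareTo(rowKey, split) < 0) {
                break;              // rowKey sorts before this split point
            }
            partition++;            // rowKey >= this split point: move one partition up
        }
        return partition;           // N-1 split points -> N partitions
    }

    public static void main(String[] args) {
        // Three regions with start keys "", "m", "t" -> split points {"m", "t"}.
        byte[][] splits = { Bytes.toBytes("m"), Bytes.toBytes("t") };
        System.out.println(partitionFor(Bytes.toBytes("apple"), splits)); // 0
        System.out.println(partitionFor(Bytes.toBytes("nut"),   splits)); // 1
        System.out.println(partitionFor(Bytes.toBytes("zebra"), splits)); // 2
    }
}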
Here is an example:

1. Create the HBase table t1

hbase(main):157:0* create 't1','f1'
0 row(s) in 1.3280 seconds

hbase(main):158:0> scan 't1'
ROW                   COLUMN+CELL
0 row(s) in 1.2770 seconds
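If you prefer to create the table from Java rather than the shell, a minimal sketch using the 0.90.x client API would look like this (the class name CreateT1 is mine, not part of the original post):

package com.test.hfile; // hypothetical helper class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateT1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(conf);
        // Table t1 with a single column family f1, matching the shell command above.
        HTableDescriptor desc = new HTableDescriptor("t1");
        desc.addFamily(new HColumnDescriptor("f1"));
        admin.createTable(desc);
    }
}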
2. Write the MapReduce job
HBaseHFileMapper.java
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class HBaseHFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {
    private ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
    @Override
    protected void map(LongWritable key, Text value,
            org.apache.hadoop.mapreduce.Mapper.Context context)
            throws IOException, InterruptedException {
        immutableBytesWritable.set(Bytes.toBytes(key.get())); // the map output key is still derived from the input key (the line offset)
        context.write(immutableBytesWritable, value);
    }
}
HBaseHFileReducer.java
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class HBaseHFileReducer extends Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
    protected void reduce(ImmutableBytesWritable key, Iterable<Text> values,
            Context context)
            throws IOException, InterruptedException {
        String value = "";
        while (values.iterator().hasNext())
        {
            value = values.iterator().next().toString();
            if (value != null && !"".equals(value)) // skip empty values
            {
                KeyValue kv = createKeyValue(value.toString()); // the KeyValue is built from the value string
                if (kv != null)
                    context.write(key, kv);
            }
        }
    }
    // str has the format row:family:qualifier:value -- a simple simulation
    private KeyValue createKeyValue(String str)
    {
        String[] strs = str.split(":");
        if (strs.length < 4)
            return null;
        String row = strs[0];
        String family = strs[1];
        String qualifier = strs[2];
        String value = strs[3];
        return new KeyValue(Bytes.toBytes(row), Bytes.toBytes(family), Bytes.toBytes(qualifier), System.currentTimeMillis(), Bytes.toBytes(value));
    }
}
HbaseHFileDriver.java
package com.test.hfile;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class HbaseHFileDriver {
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = new Job(conf, "testhbasehfile");
        job.setJarByClass(HbaseHFileDriver.class); // main class

        job.setMapperClass(com.test.hfile.HBaseHFileMapper.class);
        job.setReducerClass(com.test.hfile.HBaseHFileReducer.class);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Being lazy: the paths are hardcoded here. In a real application they
        // should come from the command line instead.
        FileInputFormat.addInputPath(job, new Path("/home/yinjie/input"));
        FileOutputFormat.setOutputPath(job, new Path("/home/yinjie/output"));

        Configuration HBASE_CONFIG = new Configuration();
        HBASE_CONFIG.set("hbase.zookeeper.quorum", "localhost");
        HBASE_CONFIG.set("hbase.zookeeper.property.clientPort", "2181");
        HBaseConfiguration cfg = new HBaseConfiguration(HBASE_CONFIG);
        String tableName = "t1";
        HTable htable = new HTable(cfg, tableName);
        HFileOutputFormat.configureIncrementalLoad(job, htable);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The /home/yinjie/input directory contains a file hbasedata.txt with the following content:
[root@localhost input]# cat hbasedata.txt
r1:f1:c1:value1
r2:f1:c2:value2
r3:f1:c3:value3
Package the job into a jar; my exported jar is at /home/yinjie/job/hbasetest.jar.
Submit the job to Hadoop:

[root@localhost job]# hadoop jar /home/yinjie/job/hbasetest.jar com.test.hfile.HbaseHFileDriver -libjars /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar
After the job finishes, inspect the output directory:

[root@localhost input]# hadoop fs -ls /home/yinjie/output
Found 2 items
drwxr-xr-x   - root supergroup          0 2011-08-28 21:02 /home/yinjie/output/_logs
drwxr-xr-x   - root supergroup          0 2011-08-28 21:03 /home/yinjie/output/f1
OK, a directory named after the column family f1 has been generated.

Next, use bulk load to import the data into HBase:

[root@localhost job]# hadoop jar /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar completebulkload /home/yinjie/output t1
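The same step can also be performed from Java via LoadIncrementalHFiles. Here is a minimal sketch on the 0.90.x API; the wrapper class BulkLoadT1 is hypothetical, while the output path and table name are the ones used in this example:

package com.test.hfile; // hypothetical wrapper, not part of the original post

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadT1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "t1");
        // Point the loader at the HFileOutputFormat output directory produced above.
        new LoadIncrementalHFiles(conf).doBulkLoad(new Path("/home/yinjie/output"), table);
    }
}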

Once the import finishes, scan the HBase table t1 to verify:

hbase(main):166:0> scan 't1'
ROW                              COLUMN+CELL
 r1                              column=f1:c1, timestamp=1314591150788, value=value1
 r2                              column=f1:c2, timestamp=1314591150814, value=value2
 r3                              column=f1:c3, timestamp=1314591150815, value=value3
3 row(s) in 0.0210 seconds

The data has been imported!
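For completeness, the same verification can be scripted with the Java client; again a small sketch assuming the 0.90.x API (the class name VerifyT1 is mine):

package com.test.hfile; // hypothetical verification helper

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyT1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "t1");
        ResultScanner scanner = table.getScanner(new Scan());
        try {
            for (Result r : scanner) {
                // Print each row key followed by its cells.
                System.out.println(Bytes.toString(r.getRow()) + " -> " + r);
            }
        } finally {
            scanner.close();
        }
    }
}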

The example in the second half of this article comes from the "炽天使" blog; please keep this attribution: http://3199782.blog.51cto.com/3189782/652244
