HBase Source Code Analysis: HFileOutputFormat
I'm just starting to learn HBase. I'm not sure this is the right way to go about it, but let's begin by reading the source code.

If the output of a Hadoop MapReduce job needs to be imported into HBase, it is best to write it out as HFiles first and then bulk-load them, because HFile is HBase's internal storage format.

The corresponding source code is:
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);

  // Overridden from FileOutputFormat. The record values are HBase KeyValue cells.
  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Get the path of the temporary output file.
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs come from hbase-*.xml; if a parameter is not set there,
    // the default given here is used.
    final long maxsize = conf.getLong("hbase.hregion.max.filesize", 268435456);
    final int blocksize =
        conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", 65536);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String compression = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of families to writers and how much has been output on the writer;
      // a TreeMap keyed by the family's bytes serves as the container.
      private final Map<byte [], WriterLength> writers =
          new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY; // initially empty
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis()); // current time

      public void write(ImmutableBytesWritable row, KeyValue kv)
          throws IOException {
        long length = kv.getLength();
        byte [] family = kv.getFamily(); // the column family of this cell
        WriterLength wl = this.writers.get(family);
        // Roll to a new writer when there is none yet, or when the current file
        // has hit the size limit and we are at a row boundary.
        if (wl == null || ((length + wl.written) >= maxsize) &&
            Bytes.compareTo(this.previousRow, 0, this.previousRow.length,
                kv.getBuffer(), kv.getRowOffset(), kv.getRowLength()) != 0) {
          // Get a new writer.
          Path basedir = new Path(outputdir, Bytes.toString(family));
          if (wl == null) {
            wl = new WriterLength();
            this.writers.put(family, wl);
            if (this.writers.size() > 1) throw new IOException("One family only");
            // If wl == null, first file in family. Ensure family dir exists.
            if (!fs.exists(basedir)) fs.mkdirs(basedir); // create the family directory
          }
          wl.writer = getNewWriter(wl.writer, basedir);
          LOG.info("Writer=" + wl.writer.getPath() +
              ((wl.written == 0)? "": ", wrote=" + wl.written));
          wl.written = 0;
        }
        kv.updateLatestStamp(this.now); // fill in the timestamp if the cell uses LATEST_TIMESTAMP
        wl.writer.append(kv);
        wl.written += length;
        // Copy the row so we know when a row transition happens.
        this.previousRow = kv.getRow();
      }

      private HFile.Writer getNewWriter(final HFile.Writer writer,
          final Path familydir)
          throws IOException {
        close(writer);
        return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
            blocksize, compression, KeyValue.KEY_COMPARATOR);
      }

      private void close(final HFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, // bulk-load timestamp
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (Map.Entry<byte [], WriterLength> e: this.writers.entrySet()) {
          close(e.getValue().writer);
        }
      }
    };
  }

  static class WriterLength {
    long written = 0;
    HFile.Writer writer = null;
  }

  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }

  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }
    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);
    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);
    // Write the actual file.
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);
    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }

  // Aren't these settings made in the driver's main()? (They are made here instead.)
  public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);
    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }
    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());
    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);
    URI cacheUri;
    try {
      cacheUri = new URI(partitionsPath.toString() + "#" +
          TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);
    LOG.info("Incremental table output configured.");
  }
}
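When the map output value class is KeyValue, configureIncrementalLoad() installs KeyValueSortReducer. This matters because HFile.Writer.append() requires cells in ascending key order, so the reducer must sort all KeyValues of a row before emitting them. A minimal sketch of such a sort reducer follows (my reconstruction for illustration; see org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer for the real implementation):

import java.io.IOException;
import java.util.TreeSet;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class SortReducerSketch
    extends Reducer<ImmutableBytesWritable, KeyValue, ImmutableBytesWritable, KeyValue> {
  protected void reduce(ImmutableBytesWritable row, Iterable<KeyValue> kvs, Context context)
      throws IOException, InterruptedException {
    // Buffer and sort all cells of this row; HFile.Writer.append()
    // rejects out-of-order keys.
    TreeSet<KeyValue> sorted = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
    for (KeyValue kv : kvs) {
      // Hadoop reuses the KeyValue instance between iterations, so copy its bytes.
      byte[] b = new byte[kv.getLength()];
      System.arraycopy(kv.getBuffer(), kv.getOffset(), b, 0, kv.getLength());
      sorted.add(new KeyValue(b));
    }
    for (KeyValue kv : sorted) {
      context.write(row, kv);
    }
  }
}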
1. Create the HBase table t1:

hbase(main):157:0* create 't1','f1'
0 row(s) in 1.3280 seconds
hbase(main):158:0> scan 't1'
ROW                          COLUMN+CELL
0 row(s) in 1.2770 seconds
HBaseHFileMapper.java
package com.test.hfile;

import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class HBaseHFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Text> {
    private ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The map output key is just the byte offset of the input line;
        // the whole line is passed through unchanged as the value.
        immutableBytesWritable.set(Bytes.toBytes(key.get()));
        context.write(immutableBytesWritable, value);
    }
}
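One thing to note (my observation, not from the original post): this mapper keys each record by the byte offset of the input line rather than by the HBase row key. That works here because t1 has a single region, so there is only one reduce partition, but TotalOrderPartitioner partitions by the map output key, so for a multi-region table you would key by the actual row, parsed from the same row:family:qualifier:value line format the reducer below uses. A sketch of that variant of map():

    // Variant (illustrative): emit the real row key so TotalOrderPartitioner
    // can route each row to the reducer matching its region.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(":");
        if (fields.length < 4) return; // skip malformed lines
        immutableBytesWritable.set(Bytes.toBytes(fields[0])); // fields[0] = row key
        context.write(immutableBytesWritable, value);
    }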
HBaseHFileReducer.java

package com.test.hfile;

import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class HBaseHFileReducer extends Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text text : values) {
            String value = text.toString();
            if (value != null && !"".equals(value)) { // skip empty lines
                // Each input line becomes a single KeyValue cell.
                KeyValue kv = createKeyValue(value);
                if (kv != null)
                    context.write(key, kv);
            }
        }
    }

    // str has the format row:family:qualifier:value -- a simple simulation.
    private KeyValue createKeyValue(String str) {
        String[] strs = str.split(":");
        if (strs.length < 4)
            return null;
        String row = strs[0];
        String family = strs[1];
        String qualifier = strs[2];
        String value = strs[3];
        return new KeyValue(Bytes.toBytes(row), Bytes.toBytes(family),
                Bytes.toBytes(qualifier), System.currentTimeMillis(), Bytes.toBytes(value));
    }
}
HbaseHFileDriver.java

package com.test.hfile;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class HbaseHFileDriver {
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = new Job(conf, "testhbasehfile");
        job.setJarByClass(HbaseHFileDriver.class); // the jar containing the driver class
        job.setMapperClass(com.test.hfile.HBaseHFileMapper.class);
        job.setReducerClass(com.test.hfile.HBaseHFileReducer.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Lazy: the paths are hardcoded here. In a real application they should
        // come from the command line (otherArgs) instead.
        FileInputFormat.addInputPath(job, new Path("/home/yinjie/input"));
        FileOutputFormat.setOutputPath(job, new Path("/home/yinjie/output"));

        Configuration HBASE_CONFIG = new Configuration();
        HBASE_CONFIG.set("hbase.zookeeper.quorum", "localhost");
        HBASE_CONFIG.set("hbase.zookeeper.property.clientPort", "2181");
        HBaseConfiguration cfg = new HBaseConfiguration(HBASE_CONFIG);

        String tableName = "t1";
        HTable htable = new HTable(cfg, tableName);
        // Sets the partitioner, output format, reducer, and partition file (see above).
        HFileOutputFormat.configureIncrementalLoad(job, htable);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
[root@localhost input]# cat hbasedata.txt
r1:f1:c1:value1
r2:f1:c2:value2
r3:f1:c3:value3
Submit the job to Hadoop:

[root@localhost job]# hadoop jar /home/yinjie/job/hbasetest.jar com.test.hfile.HbaseHFileDriver -libjars /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar
After the job finishes, inspect the output directory:

[root@localhost input]# hadoop fs -ls /home/yinjie/output
Found 2 items
drwxr-xr-x - root supergroup 0 2011-08-28 21:02 /home/yinjie/output/_logs
drwxr-xr-x - root supergroup 0 2011-08-28 21:03 /home/yinjie/output/f1
Next, use bulk load to import the data into HBase. The completebulkload tool takes the HFile output directory and the target table name:

[root@localhost job]# hadoop jar /home/yinjie/hbase-0.90.3/hbase-0.90.3.jar completebulkload /home/yinjie/output t1
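The same import can also be triggered from Java via LoadIncrementalHFiles, the class behind the completebulkload tool. A minimal sketch (my addition; cfg and htable refer to the Configuration and HTable created in the driver above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadSketch {
    // Programmatic equivalent of the completebulkload command above.
    public static void load(Configuration cfg, HTable htable) throws Exception {
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(cfg);
        loader.doBulkLoad(new Path("/home/yinjie/output"), htable);
    }
}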
After the import completes, scan table t1 to verify:

hbase(main):166:0> scan 't1'
ROW                          COLUMN+CELL
r1                           column=f1:c1, timestamp=1314591150788, value=value1
r2                           column=f1:c2, timestamp=1314591150814, value=value2
r3                           column=f1:c3, timestamp=1314591150815, value=value3
3 row(s) in 0.0210 seconds

The data has been imported!
The example in the second half of this article comes from the "炽天使" blog; please keep this attribution: http://3199782.blog.51cto.com/3189782/652244