Generating HFiles and Bulk Loading Them into HBase


I. Generating HFiles with MapReduce

package insert.tools.hfile;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TestHFileToHBase {

    public static class TestHFileToHBaseMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input lines are tab-separated: rowkey \t value
            String[] values = value.toString().split("\t", 2);
            byte[] row = Bytes.toBytes(values[0]);
            ImmutableBytesWritable k = new ImmutableBytesWritable(row);
            KeyValue kvProtocol = new KeyValue(row, "PROTOCOLID".getBytes(),
                    "PROTOCOLID".getBytes(), values[1].getBytes());
            context.write(k, kvProtocol);

            // KeyValue kvSrcip = new KeyValue(row, "SRCIP".getBytes(),
            //         "SRCIP".getBytes(), values[1].getBytes());
            // context.write(k, kvSrcip);
            // HFileOutputFormat.getRecordWriter
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = HBaseConfiguration.create();
        Job job = new Job(conf, "TestHFileToHBase");
        job.setJarByClass(TestHFileToHBase.class);

        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);

        job.setMapperClass(TestHFileToHBaseMapper.class);
        job.setReducerClass(KeyValueSortReducer.class);
        // job.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.HFileOutputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        // job.setNumReduceTasks(4);
        // job.setPartitionerClass(org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner.class);

        // HBaseAdmin admin = new HBaseAdmin(conf);
        HTable table = new HTable(conf, "hua");

        HFileOutputFormat.configureIncrementalLoad(job, table);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
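For HFileOutputFormat.configureIncrementalLoad(job, table) to work, the target table ("hua" in this example) must already exist with the column family the mapper writes to, because the partitioner is built from the table's region boundaries. Below is a minimal sketch, assuming the table and family names from the example above and the HBase admin API of that era (the class name CreateHuaTable is made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateHuaTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(conf);
        // Create the target table with the "PROTOCOLID" family if it is missing.
        if (!admin.tableExists("hua")) {
            HTableDescriptor desc = new HTableDescriptor("hua");
            desc.addFamily(new HColumnDescriptor("PROTOCOLID"));
            admin.createTable(desc);
        }
    }
}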


II. The Improved HFileOutputFormat

The HFileOutputFormat in the HBase source can only generate HFiles for a single column family per job. The improved HFileOutputFormat below can generate HFiles for multiple column families in the same job. Lines between the add / -add markers are additions to the original source; lines between the revise / -revise markers are modifications of the original source (the replaced code is kept as comments). Reference: https://review.cloudera.org/r/1272/diff/1/?file=17977#file17977line93

/**
 * Copyright 2009 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package insert.tools.hfile;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.base.Preconditions;

/**
 * Writes HFiles. Passed KeyValues must arrive in order. Currently, can only
 * write files to a single column family at a time. Multiple column families
 * requires coordinating keys cross family. Writes current time as the sequence
 * id for the file. Sets the major compacted attribute on created hfiles.
 *
 * @see KeyValueSortReducer
 */
public class HFileOutputFormat extends
        FileOutputFormat<ImmutableBytesWritable, KeyValue> {
    static Log LOG = LogFactory.getLog(HFileOutputFormat.class);

    public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(
            final TaskAttemptContext context) throws IOException,
            InterruptedException {
        // Get the path of the temporary output file
        final Path outputPath = FileOutputFormat.getOutputPath(context);
        final Path outputdir = new FileOutputCommitter(outputPath, context)
                .getWorkPath();
        Configuration conf = context.getConfiguration();
        final FileSystem fs = outputdir.getFileSystem(conf);
        // These configs. are from hbase-*.xml
        // revise
        // final long maxsize = conf.getLong("hbase.hregion.max.filesize",
        //         268435456);
        // final int blocksize = conf.getInt("hfile.min.blocksize.size", 65536);
        final long maxsize = conf.getLong("hbase.hregion.max.filesize",
                HConstants.DEFAULT_MAX_FILE_SIZE);
        final int blocksize = conf.getInt("hfile.min.blocksize.size",
                HFile.DEFAULT_BLOCKSIZE);
        // -revise
        // Invented config. Add to hbase-*.xml if other than default
        // compression.
        final String compression = conf.get("hfile.compression",
                Compression.Algorithm.NONE.getName());

        return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
            // Map of families to writers and how much has been output on the
            // writer.
            private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                    Bytes.BYTES_COMPARATOR);
            private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
            private final byte[] now = Bytes.toBytes(System
                    .currentTimeMillis());
            // add
            private boolean rollRequested = false;
            // -add

            public void write(ImmutableBytesWritable row, KeyValue kv)
                    throws IOException {
                // add
                // null input == user explicitly wants to flush
                if (row == null && kv == null) {
                    rollWriters();
                    return;
                }
                byte[] rowKey = kv.getRow();
                // -add
                long length = kv.getLength();
                byte[] family = kv.getFamily();
                WriterLength wl = this.writers.get(family);
                // revise
                // if (wl == null
                //         || ((length + wl.written) >= maxsize)
                //         && Bytes.compareTo(this.previousRow, 0,
                //                 this.previousRow.length, kv.getBuffer(),
                //                 kv.getRowOffset(), kv.getRowLength()) != 0) {
                //     // Get a new writer.
                //     Path basedir = new Path(outputdir, Bytes.toString(family));
                //     if (wl == null) {
                //         wl = new WriterLength();
                //         this.writers.put(family, wl);
                //         if (this.writers.size() > 1)
                //             throw new IOException("One family only");
                //         // If wl == null, first file in family. Ensure family
                //         // dir exits.
                //         if (!fs.exists(basedir))
                //             fs.mkdirs(basedir);
                //     }
                //     wl.writer = getNewWriter(wl.writer, basedir);
                //     LOG.info("Writer="
                //             + wl.writer.getPath()
                //             + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                //     wl.written = 0;
                // }
                // If this is a new column family, verify that the directory
                // exists
                if (wl == null) {
                    fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
                }
                // If any of the HFiles for the column families has reached
                // maxsize, we need to roll all the writers
                if (wl != null && wl.written + length >= maxsize) {
                    this.rollRequested = true;
                }
                // This can only happen once a row is finished though
                if (rollRequested
                        && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                    rollWriters();
                }
                // create a new HLog writer, if necessary
                if (wl == null || wl.writer == null) {
                    wl = getNewWriter(family);
                }
                // we now have the proper HLog writer. full steam ahead
                // -revise
                kv.updateLatestStamp(this.now);
                wl.writer.append(kv);
                wl.written += length;
                // Copy the row so we know when a row transition.
                // revise
                // this.previousRow = kv.getRow();
                this.previousRow = rowKey;
                // -revise
            }

            // revise
            // /*
            //  * Create a new HFile.Writer. Close current if there is one.
            //  *
            //  * @param writer
            //  *
            //  * @param familydir
            //  *
            //  * @return A new HFile.Writer.
            //  *
            //  * @throws IOException
            //  */
            // private HFile.Writer getNewWriter(final HFile.Writer writer,
            //         final Path familydir) throws IOException {
            //     close(writer);
            //     return new HFile.Writer(fs, StoreFile.getUniqueFile(fs,
            //             familydir), blocksize, compression,
            //             KeyValue.KEY_COMPARATOR);
            // }
            private void rollWriters() throws IOException {
                for (WriterLength wl : this.writers.values()) {
                    if (wl.writer != null) {
                        LOG.info("Writer="
                                + wl.writer.getPath()
                                + ((wl.written == 0) ? "" : ", wrote="
                                        + wl.written));
                        close(wl.writer);
                    }
                    wl.writer = null;
                    wl.written = 0;
                }
                this.rollRequested = false;
            }

            /*
             * Create a new HFile.Writer.
             *
             * @param family
             *
             * @return A WriterLength, containing a new HFile.Writer.
             *
             * @throws IOException
             */
            private WriterLength getNewWriter(byte[] family)
                    throws IOException {
                WriterLength wl = new WriterLength();
                Path familydir = new Path(outputdir, Bytes.toString(family));
                wl.writer = new HFile.Writer(fs, StoreFile.getUniqueFile(fs,
                        familydir), blocksize, compression,
                        KeyValue.KEY_COMPARATOR);
                this.writers.put(family, wl);
                return wl;
            }
            // -revise

            private void close(final HFile.Writer w) throws IOException {
                if (w != null) {
                    w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
                            Bytes.toBytes(System.currentTimeMillis()));
                    w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                            Bytes.toBytes(context.getTaskAttemptID().toString()));
                    w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
                            Bytes.toBytes(true));
                    w.close();
                }
            }

            // revise
            // public void close(TaskAttemptContext c) throws IOException,
            //         InterruptedException {
            //     for (Map.Entry<byte[], WriterLength> e : this.writers
            //             .entrySet()) {
            //         close(e.getValue().writer);
            //     }
            // }
            public void close(TaskAttemptContext c) throws IOException,
                    InterruptedException {
                for (WriterLength wl : this.writers.values()) {
                    close(wl.writer);
                }
            }
            // -revise
        };
    }

    /*
     * Data structure to hold a Writer and amount of data written on it.
     */
    static class WriterLength {
        long written = 0;
        HFile.Writer writer = null;
    }

    /**
     * Return the start keys of all of the regions in this table, as a list of
     * ImmutableBytesWritable.
     */
    private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
            throws IOException {
        byte[][] byteKeys = table.getStartKeys();
        ArrayList<ImmutableBytesWritable> ret = new ArrayList<ImmutableBytesWritable>(
                byteKeys.length);
        for (byte[] byteKey : byteKeys) {
            ret.add(new ImmutableBytesWritable(byteKey));
        }
        return ret;
    }

    /**
     * Write out a SequenceFile that can be read by TotalOrderPartitioner that
     * contains the split points in startKeys.
     *
     * @param partitionsPath
     *            output path for SequenceFile
     * @param startKeys
     *            the region start keys
     */
    private static void writePartitions(Configuration conf,
            Path partitionsPath, List<ImmutableBytesWritable> startKeys)
            throws IOException {
        Preconditions.checkArgument(!startKeys.isEmpty(), "No regions passed");

        // We're generating a list of split points, and we don't ever
        // have keys < the first region (which has an empty start key)
        // so we need to remove it. Otherwise we would end up with an
        // empty reducer with index 0
        TreeSet<ImmutableBytesWritable> sorted = new TreeSet<ImmutableBytesWritable>(
                startKeys);

        ImmutableBytesWritable first = sorted.first();
        Preconditions.checkArgument(
                first.equals(HConstants.EMPTY_BYTE_ARRAY),
                "First region of table should have empty start key. Instead has: %s",
                Bytes.toStringBinary(first.get()));
        sorted.remove(first);

        // Write the actual file
        FileSystem fs = partitionsPath.getFileSystem(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
                partitionsPath, ImmutableBytesWritable.class,
                NullWritable.class);

        try {
            for (ImmutableBytesWritable startKey : sorted) {
                writer.append(startKey, NullWritable.get());
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Configure a MapReduce Job to perform an incremental load into the given
     * table. This
     * <ul>
     * <li>Inspects the table to configure a total order partitioner</li>
     * <li>Uploads the partitions file to the cluster and adds it to the
     * DistributedCache</li>
     * <li>Sets the number of reduce tasks to match the current number of
     * regions</li>
     * <li>Sets the output key/value class to match HFileOutputFormat's
     * requirements</li>
     * <li>Sets the reducer up to perform the appropriate sorting (either
     * KeyValueSortReducer or PutSortReducer)</li>
     * </ul>
     * The user should be sure to set the map output value class to either
     * KeyValue or Put before running this function.
     */
    public static void configureIncrementalLoad(Job job, HTable table)
            throws IOException {
        Configuration conf = job.getConfiguration();
        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);
        job.setOutputFormatClass(HFileOutputFormat.class);

        // Based on the configured map output class, set the correct reducer to
        // properly sort the incoming values.
        // TODO it would be nice to pick one or the other of these formats.
        if (KeyValue.class.equals(job.getMapOutputValueClass())) {
            job.setReducerClass(KeyValueSortReducer.class);
        } else if (Put.class.equals(job.getMapOutputValueClass())) {
            job.setReducerClass(PutSortReducer.class);
        } else {
            LOG.warn("Unknown map output value type:"
                    + job.getMapOutputValueClass());
        }

        LOG.info("Looking up current regions for table " + table);
        List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
        LOG.info("Configuring " + startKeys.size() + " reduce partitions "
                + "to match current region count");
        job.setNumReduceTasks(startKeys.size());

        Path partitionsPath = new Path(job.getWorkingDirectory(),
                "partitions_" + System.currentTimeMillis());
        LOG.info("Writing partition information to " + partitionsPath);

        FileSystem fs = partitionsPath.getFileSystem(conf);
        writePartitions(conf, partitionsPath, startKeys);
        partitionsPath.makeQualified(fs);
        URI cacheUri;
        try {
            cacheUri = new URI(partitionsPath.toString() + "#"
                    + TotalOrderPartitioner.DEFAULT_PATH);
        } catch (URISyntaxException e) {
            throw new IOException(e);
        }
        DistributedCache.addCacheFile(cacheUri, conf);
        DistributedCache.createSymlink(conf);

        LOG.info("Incremental table output configured.");
    }
}


III. Notes on Generating HFiles with MapReduce

1. Whether the map or the reduce stage produces the final output, the output key/value types must be <ImmutableBytesWritable, KeyValue> or <ImmutableBytesWritable, Put>.

2. A map/reduce output value type of KeyValue corresponds to KeyValueSortReducer, and Put corresponds to PutSortReducer (see the sketch after this list).

3. In the MR example, job.setOutputFormatClass(HFileOutputFormat.class) uses the improved HFileOutputFormat, which can generate HFiles for multiple column families in one job; the version in the HBase source can only handle a single column family per job.

4. In the MR example, HFileOutputFormat.configureIncrementalLoad(job, table) configures the job automatically. The total-order partitioner (TotalOrderPartitioner) first sorts the keys globally and then assigns them to reducers, guaranteeing that the key ranges handled by different reducers never overlap.

This is required because when the HFiles are loaded into HBase, the keys within each Region must be in strictly sorted order.

5. The HFiles generated by the MR job are stored on HDFS, with one subdirectory per column family under the output path. When the HFiles are later bulk-loaded into HBase, they are effectively moved into HBase's Regions, so the column-family subdirectories under the output path end up empty.
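For reference, here is a minimal sketch of the Put variant mentioned in note 2. The class name and input layout are assumptions for this example; with Put as the map output value class, configureIncrementalLoad selects PutSortReducer automatically.

package insert.tools.hfile;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical Put-based mapper: input lines are "rowkey \t value".
public class PutMapper extends
        Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t", 2);
        byte[] row = Bytes.toBytes(fields[0]);
        Put put = new Put(row);
        put.add(Bytes.toBytes("PROTOCOLID"), Bytes.toBytes("PROTOCOLID"),
                Bytes.toBytes(fields[1]));
        context.write(new ImmutableBytesWritable(row), put);
    }
}

In the driver this would also require calling job.setMapOutputValueClass(Put.class) before HFileOutputFormat.configureIncrementalLoad(job, table).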

IV. Bulk Loading HFiles into HBase

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;

public class TestLoadIncrementalHFileToHBase {

    // private static final byte[] TABLE = Bytes.toBytes("hua");
    // private static final byte[] QUALIFIER = Bytes.toBytes("PROTOCOLID");
    // private static final byte[] FAMILY = Bytes.toBytes("PROTOCOLID");

    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        // byte[] TABLE = Bytes.toBytes("hua");
        byte[] TABLE = Bytes.toBytes(args[0]);
        HTable table = new HTable(conf, TABLE);
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
        loader.doBulkLoad(new Path(args[1]), table);
        // loader.doBulkLoad(new Path("/hua/testHFileResult/"), table);
    }
}


V. Notes on Bulk Loading HFiles into HBase

1. The generated HFiles are loaded with the doBulkLoad method of HBase's LoadIncrementalHFiles. In the example above, the first command-line argument is the table name and the second is the path to the HFiles (the output path of the MR job above). The column families can also be loaded one at a time into the corresponding column families of the target table.

2. Related links on bulk loading:

http://hbase.apache.org/docs/r0.89.20100726/bulk-loads.html

http://hbase.apache.org/docs/r0.20.6/api/org/apache/hadoop/hbase/mapreduce/package-summary.html#bulk

http://genius-bai.javaeye.com/blog/641927

3. Bulk loading can be done either from code or from a script. There are two ways to load from code; one is:

hadoop jar hbase-VERSION.jar completebulkload /myoutput mytable;

The other is the TestLoadIncrementalHFileToHBase class shown above.

Script-based loading: jruby $HBASE_HOME/bin/loadtable.rb hbase-mytable hadoop-hbase-hfile-outputdir