Generating HFiles with MapReduce and Bulk Loading Them into HBase


Original reference:

http://shitouer.cn/2013/02/hbase-hfile-bulk-load/


You may need some dependency JARs, which can be downloaded here: http://download.csdn.net/detail/q79969786/6933683

The main changes made to the original code are as follows:

package com.upa.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class HFileGenerator {

    public static class HFileMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line: rowkey,family,qualifier,value (any extra fields are ignored).
            String line = value.toString();
            String[] items = line.split(",", -1);
            ImmutableBytesWritable rowkey = new ImmutableBytesWritable(
                    Bytes.toBytes(items[0]));
            KeyValue kv = new KeyValue(Bytes.toBytes(items[0]),
                    Bytes.toBytes(items[1]), Bytes.toBytes(items[2]),
                    System.currentTimeMillis(), Bytes.toBytes(items[3]));
            context.write(rowkey, kv);
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = new Job(conf, "HFile bulk load test");
        job.setJarByClass(HFileGenerator.class);

        job.setMapperClass(HFileMapper.class);
        job.setReducerClass(KeyValueSortReducer.class);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);

        FileInputFormat.addInputPath(job, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(dfsArgs[1]));

        // Configures the output format, reducer and partitioner according to the
        // region boundaries of the target table.
        HFileOutputFormat.configureIncrementalLoad(job, ConnectionUtil.getTable());

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
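
The mapper above assumes every input line has at least four comma-separated fields; a malformed line would fail the task with an ArrayIndexOutOfBoundsException. If the input is not guaranteed to be clean, a defensive variant of map() along the following lines may be worth considering (a sketch only, not part of the original post; the counter names are made up):

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] items = value.toString().split(",", -1);
        // Skip malformed lines instead of failing the whole job; count them for later inspection.
        if (items.length < 4 || items[0].isEmpty()) {
            context.getCounter("HFileGenerator", "malformed_lines").increment(1);
            return;
        }
        ImmutableBytesWritable rowkey = new ImmutableBytesWritable(Bytes.toBytes(items[0]));
        KeyValue kv = new KeyValue(Bytes.toBytes(items[0]), Bytes.toBytes(items[1]),
                Bytes.toBytes(items[2]), System.currentTimeMillis(), Bytes.toBytes(items[3]));
        context.write(rowkey, kv);
    }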

A ConnectionUtil class was added:

package com.upa.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;

public class ConnectionUtil {

    public static Configuration getConfiguration() {
        // Picks up hbase-site.xml from the classpath.
        Configuration config = HBaseConfiguration.create();
        return config;
    }

    public static HTable getTable() {
        try {
            return new HTable(getConfiguration(), "test_hfile");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}
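
On newer HBase releases (1.0 and later) HTable is deprecated in favor of the Connection API. If this example is ported to such a version, a utility along these lines could replace ConnectionUtil (an untested sketch; the HFileOutputFormat/configureIncrementalLoad calls above would also need to move to their newer HFileOutputFormat2 counterparts):

package com.upa.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;

// Sketch only: a Connection-based replacement for ConnectionUtil on HBase 1.0+.
public class ConnectionUtil2 {

    public static Configuration getConfiguration() {
        return HBaseConfiguration.create();
    }

    public static Connection getConnection() throws IOException {
        return ConnectionFactory.createConnection(getConfiguration());
    }

    public static Table getTable(Connection connection) throws IOException {
        return connection.getTable(TableName.valueOf("test_hfile"));
    }
}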

The loader class was not modified:

package com.upa.hbase;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.util.GenericOptionsParser;

public class HFileLoader {

    public static void main(String[] args) throws Exception {
        String[] dfsArgs = new GenericOptionsParser(
                ConnectionUtil.getConfiguration(), args).getRemainingArgs();
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(
                ConnectionUtil.getConfiguration());
        // Moves the generated HFiles under dfsArgs[0] into the table's regions.
        loader.doBulkLoad(new Path(dfsArgs[0]), ConnectionUtil.getTable());
    }
}
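
A custom loader class is not strictly necessary: HBase ships the same functionality as a command-line tool. Depending on the HBase version and classpath setup, something like the following should perform the same bulk load (using the output directory and table name from the example further below):

$ hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /user/hadoop/test_hbase/hfile test_hfile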


Next, create the target HBase table (with column family 't'):

hbase(main):025:0> create 'test_hfile', 't'
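
Note that configureIncrementalLoad creates one reducer per region of the target table, so a freshly created table with a single region means a single reducer and no reduce-side parallelism. If the rowkey distribution is known in advance, pre-splitting the table helps; for example (split points here are made up for illustration):

hbase(main):026:0> create 'test_hfile', 't', SPLITS => ['3', '6']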



Upload the test data to HDFS:

$ hadoop fs -put test_hfile.txt test_hbase/test_hfile

The format looks roughly like this (the mapper only uses the first four comma-separated fields, so the line below becomes row '1', column 't:45', value 'wer'):

1,t,45,wer,fg,we5r,t,sdfsd,fsd,fwesr,t,wrwer,sdfsdf,sdf,sd,tfg,sd,werr,t,wer,wesdfdsf,sdfd,retrwer,t,werd546,fgh,g,df,ga,ds


Next, run the HFile generation job followed by the load:

$ hadoop jar etl.jar com.upa.hbase.HFileGenerator /user/hadoop/test_hbase/test_hfile /user/hadoop/test_hbase/hfile
$ hadoop jar etl.jar com.upa.hbase.HFileLoader /user/hadoop/test_hbase/hfile
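
Two practical notes: the output directory (/user/hadoop/test_hbase/hfile here) must not exist before the first job runs, since FileOutputFormat refuses to overwrite an existing path. Once the loader finishes, a quick scan in the HBase shell confirms the data landed (example only):

hbase(main):026:0> scan 'test_hfile', {LIMIT => 5}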



