HBase Programming Based on MapReduce


A quick first exercise: saving MapReduce output into HBase, the large-scale distributed database. The example computes the page-view (PV) count for each URL. Because the input is stored in RCFile format, the hive-exec jar has to be shipped with the job, and the HBase jar has to be loaded as well; if the cluster administrators have already dropped both jars into hadoop/lib on every node, this step can be skipped. Without further ado, here is the code:

package test.hbase;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.test.dm.common.RCFileInputFormat;

public class URLCountHbase {

    // Mapper: reads RCFile rows and emits <url, 1>; the URL sits in column index 4.
    public static class HBaseMap extends
            Mapper<LongWritable, BytesRefArrayWritable, Text, IntWritable> {

        private IntWritable i = new IntWritable(1);

        @Override
        protected void map(LongWritable key, BytesRefArrayWritable value,
                Context context) throws IOException, InterruptedException {
            byte[] url = value.get(4).getBytesCopy();
            context.write(new Text(url), i);
        }
    }

    // Reducer: sums the counts per URL and writes one Put into the HBase table.
    public static class HBaseReduce extends
            TableReducer<Text, IntWritable, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            // Row key = URL; the count goes into the type:count column as a String.
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.add(Bytes.toBytes("type"), Bytes.toBytes("count"),
                    Bytes.toBytes(String.valueOf(sum)));
            context.write(NullWritable.get(), put);
        }
    }

    // (Re)creates the output table with a single column family "type".
    public static void createHbaseTable(String tablename) throws IOException {
        HTableDescriptor htd = new HTableDescriptor(tablename);
        HColumnDescriptor col = new HColumnDescriptor("type");
        htd.addFamily(col);
        Configuration config = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(config);
        if (admin.tableExists(tablename)) {
            System.out.println("table exists, trying to recreate table");
            admin.disableTable(tablename);
            admin.deleteTable(tablename);
        }
        System.out.println("create new table:" + tablename);
        admin.createTable(htd);
    }

    public static void main(String args[]) throws Exception {
        String tablename = "urlcount";
        Configuration conf = new Configuration();

        // Ship the hive-exec and hbase jars with the job via the distributed cache.
        final FileSystem fs = FileSystem.getLocal(conf);
        final HashSet<String> localfiles = new HashSet<String>();
        localfiles.add("/opt/hadoop/hive-0.8.1/lib/hive-exec-0.8.1.jar");
        localfiles.add("/opt/hadoop/hbase/hbase-0.92.1.jar");
        final HashSet<String> files = new HashSet<String>();
        for (String s : localfiles) {
            files.add(URLCountHbase.convertPath(s, fs));
        }
        URLCountHbase.cacheJars(conf, files);

        conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
        createHbaseTable(tablename);

        Job job = new Job(conf, "URLCount table with " + args[0]);
        job.setJarByClass(URLCountHbase.class);
        job.setNumReduceTasks(3);
        job.setReducerClass(HBaseReduce.class);
        job.setMapperClass(HBaseMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.setInputFormatClass(RCFileInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Qualifies a local path so it can go into the tmpjars property.
    private static String convertPath(String path, FileSystem fs) {
        final Path p = new Path(path);
        return p.makeQualified(fs).toString();
    }

    // Appends the given jar URLs to the job's "tmpjars" property (distributed cache).
    private static void cacheJars(Configuration job, Set<String> localUrls)
            throws IOException {
        if (localUrls.isEmpty()) {
            return;
        }
        final String tmpjars = job.get("tmpjars");
        final StringBuilder sb = new StringBuilder();
        if (null != tmpjars) {
            sb.append(tmpjars);
            sb.append(",");
        }
        sb.append(org.apache.hadoop.util.StringUtils.arrayToString(
                localUrls.toArray(new String[0])));
        job.set("tmpjars", sb.toString());
    }
}
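
One note on convertPath()/cacheJars(): they just append fully qualified jar paths to the job's tmpjars property, which is exactly what the distributed cache reads. HBase also ships a utility that does this for its own jar and dependencies. A minimal sketch of that alternative (my own assumption, not part of the original job; the hive-exec jar would still need to be added by hand or via -libjars):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class DependencyJarsSketch {
    public static Job newJob() throws IOException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "urlcount");
        job.setJarByClass(DependencyJarsSketch.class);
        // Sketch only: adds the HBase jar and its dependencies (ZooKeeper, etc.)
        // found on the client classpath to tmpjars, same mechanism as cacheJars().
        TableMapReduceUtil.addDependencyJars(job);
        return job;
    }
}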


 

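Once the job completes, the quickest check is scan 'urlcount' in the hbase shell. The counts can also be read back with the plain client API; a minimal sketch, assuming the cluster's hbase-site.xml is on the classpath and using a made-up URL as the row key:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class ReadUrlCount {
    public static void main(String[] args) throws IOException {
        Configuration config = HBaseConfiguration.create();
        HTable table = new HTable(config, "urlcount");
        // The reducer used the URL as the row key and stored the PV count as a
        // String under the type:count column.
        Get get = new Get(Bytes.toBytes("http://example.com/index.html")); // hypothetical URL
        Result result = table.get(get);
        byte[] raw = result.getValue(Bytes.toBytes("type"), Bytes.toBytes("count"));
        System.out.println("pv = " + (raw == null ? "not found" : Bytes.toString(raw)));
        table.close();
    }
}

The value comes back as the String written by the reducer, so Bytes.toString() is the right way to decode it.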