使用mapreduce操作Hbase

来源:互联网 发布:新奇小礼物知乎 编辑:程序博客网 时间:2024/06/10 10:55

前面用了java api操作了hbase,这里就用mapreduce来操作hbase。
需求:对下面的表中的数据进行统计

hbase(main):006:0> scan 'words'ROW                                           COLUMN+CELL                                                                                                                          1                                            column=info:word, timestamp=1509345324263, value=hadoop,hdfs,mapreduce,hive,hbase                                                    2                                            column=info:word, timestamp=1509345324263, value=hdfs,hive,hbase,storm,kafka                                                         3                                            column=info:word, timestamp=1509345324263, value=hbase,storm,kafka,spark,mahout 

要求统计结果放入到另外一张表words2中,格式如下

hbase(main):010:0> scan 'words2'ROW                                           COLUMN+CELL                                                                                                                          hadoop                                       column=info:word, timestamp=1509345777813, value=1                                                                                   hbase                                        column=info:word, timestamp=1509345777813, value=3                                                                                   hdfs                                         column=info:word, timestamp=1509345777813, value=2                                                                                   hive                                         column=info:word, timestamp=1509345777813, value=2                                                                                   kafka                                        column=info:word, timestamp=1509345777813, value=2                                                                                   mahout                                       column=info:word, timestamp=1509345777813, value=1                                                                                   mapreduce                                    column=info:word, timestamp=1509345777813, value=1                                                                                   spark                                        column=info:word, timestamp=1509345777813, value=1                                                                                   storm                                        column=info:word, timestamp=1509345777813, value=2 

注:上面的都是自己做完之后复制的。
1、建立数据来源表‘words’,包含一个列族‘info’
向表中添加数据,在列族中放入列‘word’,并将短文数据放入该列中,如此插入多行,行键为不同的数据即可

2、建立输出表‘words2’,包含一个列族‘info’

3、通过Mr操作Hbase的‘words’表,对‘info:word’中的单词做统计,并将统计结果写入‘words2’表的‘info:word’中,行键为单词

注释都写在程序中就不做解释了

/** * 使用mr操作hbase来进行单词统计 * @author 12706 * */public class MrMachineHBase {    //创建hbase配置对象    static Configuration config = null;    static{         config = HBaseConfiguration.create();         //设置参数         config.set("hbase.zookeeper.quorum", "mini1,mini2,mini3");         config.set("hbase.zookeeper.property.clientPort", "2181");    }    //创建表信息    public static final String TABLE_NAME = "words";    public static final String TABLE_NAME_ = "words2";    public static final String FAMILY = "info";    public static final String COLUMN = "word";    //初始化表    public static void init(){        HTable table = null;        HBaseAdmin admin = null;        try{            admin = new HBaseAdmin(config);            //判断表是否存在            if(admin.tableExists(TABLE_NAME)){                //删除表                admin.disableTable(TABLE_NAME);                admin.deleteTable(TABLE_NAME);            }            if(admin.tableExists(TABLE_NAME_)){                //删除表                admin.disableTable(TABLE_NAME_);                admin.deleteTable(TABLE_NAME_);            }            //创建表描述类            TableName tableName = TableName.valueOf(TABLE_NAME);            HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);            //创建列族描述类            HColumnDescriptor columnDescriptor = new HColumnDescriptor(FAMILY);            //列族描述类加入到表描述类中            tableDescriptor.addFamily(columnDescriptor);            //创建表            admin.createTable(tableDescriptor);            TableName tableName2 = TableName.valueOf(TABLE_NAME_);            HTableDescriptor tableDescriptor2 = new HTableDescriptor(tableName2);            //创建列族描述类            HColumnDescriptor columnDescriptor2 = new HColumnDescriptor(FAMILY);            //列族描述类加入到表描述类中            tableDescriptor2.addFamily(columnDescriptor2);            //创建表            admin.createTable(tableDescriptor2);            table = new HTable(config, TABLE_NAME);            //自动刷出            table.setAutoFlush(false);        
    //一次缓存大小            table.setWriteBufferSize(1000);            /*             * 向表中插入数据             */            List<Put> putList = new ArrayList<>();            Put put = new Put(Bytes.toBytes("1"));            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),                     Bytes.toBytes("hadoop,hdfs,mapreduce,hive,hbase"));            putList.add(put);            Put put2 = new Put(Bytes.toBytes("2"));            put2.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),                     Bytes.toBytes("hdfs,hive,hbase,storm,kafka"));            putList.add(put2);            Put put3 = new Put(Bytes.toBytes("3"));            put3.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),                    Bytes.toBytes("hbase,storm,kafka,spark,mahout"));            putList.add(put3);            table.put(putList);            //提交            table.flushCommits();        }catch(Exception e){            e.printStackTrace();        }finally {            if(table!=null){                try {                    table.close();                } catch (IOException e) {                    e.printStackTrace();                }            }        }    }    /**     * Text 输出key类型     * IntWritable  输出value类型     * 一次读取一行数据(一个rowkey对应一行)     * @author 12706     *     */    static class WordCountMapper extends TableMapper<Text, IntWritable>{        static Text k = new Text();        static IntWritable v = new IntWritable(1);        //key:rowkey    value:对应的一行的result        @Override        protected void map(ImmutableBytesWritable key, Result value,                Context context)throws IOException, InterruptedException {            //获取info:word的value值            //如:hadoop,hdfs,mapreduce,hive,hbase            byte[] line = value.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));            //按逗号切割hadoop   hdfs    mapreduce   hive    hbase            String[] wordz = Bytes.toString(line).split(",");            //循环输出word和1            for (String w : wordz) {        
        k.set(w);                //写出                context.write(k, v);            }        }    }    /**     * Text:传入的key类型     * IntWritable:传入的value类型     * ImmutableBytesWritable:输出类型,rowkey类型     * @author 12706     *     */    static class WordCountReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable>{        @Override        protected void reduce(Text key, Iterable<IntWritable> values,                Context context)throws IOException, InterruptedException {            int count = 0;            for (IntWritable intWritable : values) {                count += intWritable.get();            }            //设置rowkey为单词            Put put = new Put(Bytes.toBytes(key.toString()));            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN), Bytes.toBytes(String.valueOf(count)));            //写到hbase,需要指定rowkey、put            context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);        }    }    public static void main(String[] args) throws Exception {        //初始化表        init();        //创建job        Job job = Job.getInstance(config);//job        job.setJarByClass(MrMachineHBase.class);//主类        //创建scan        Scan scan = new Scan();        //可以指定查询某一列        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));        //创建查询hbase的mapper,设置表名、scan、mapper类、mapper的输出key、mapper的输出value        TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, WordCountMapper.class,Text.class, IntWritable.class, job);        //创建写入hbase的reducer,指定表名、reducer类、job reduce不设置输出默认跟mapper的输出一致        TableMapReduceUtil.initTableReducerJob(TABLE_NAME_, WordCountReducer.class, job);        System.exit(job.waitForCompletion(true) ? 0 : 1);    }}

将工程打包上传到hadoop集群
开始测试

[root@mini1 ~]# hadoop jar hbase.jar com.scu.hbase.MrMachineHBase

执行完之后,hbase客户端命令行查看

hbase(main):011:0> listTABLE                                                                                                                                                                             user1                                                                                                                                                                             words                                                                                                                                                                             words2                                                                                                                                                                            3 row(s) in 0.0550 seconds=> ["user1", "words", "words2"]hbase(main):012:0> scan 'words'ROW                                           COLUMN+CELL                                                                                                                          1                                            column=info:word, timestamp=1509345735252, value=hadoop,hdfs,mapreduce,hive,hbase                                                    2                                            column=info:word, timestamp=1509345735252, value=hdfs,hive,hbase,storm,kafka                                                         3                                            column=info:word, timestamp=1509345735252, value=hbase,storm,kafka,spark,mahout                                                     3 row(s) in 0.1610 secondshbase(main):013:0> scan 'words2'ROW                                           COLUMN+CELL                                                                                                                          hadoop                                       column=info:word, timestamp=1509345777813, value=1                                                                                   hbase                                        
column=info:word, timestamp=1509345777813, value=3                                                                                   hdfs                                         column=info:word, timestamp=1509345777813, value=2                                                                                   hive                                         column=info:word, timestamp=1509345777813, value=2                                                                                   kafka                                        column=info:word, timestamp=1509345777813, value=2                                                                                   mahout                                       column=info:word, timestamp=1509345777813, value=1                                                                                   mapreduce                                    column=info:word, timestamp=1509345777813, value=1                                                                                   spark                                        column=info:word, timestamp=1509345777813, value=1                                                                                   storm                                        column=info:word, timestamp=1509345777813, value=2                                                                                  9 row(s) in 0.0860 seconds
原创粉丝点击