Using MapReduce to Operate HBase
In the previous post we operated HBase through the Java API; this time we will operate HBase with MapReduce.
Requirement: run a word count over the data in the table below.
hbase(main):006:0> scan 'words'
ROW          COLUMN+CELL
 1           column=info:word, timestamp=1509345324263, value=hadoop,hdfs,mapreduce,hive,hbase
 2           column=info:word, timestamp=1509345324263, value=hdfs,hive,hbase,storm,kafka
 3           column=info:word, timestamp=1509345324263, value=hbase,storm,kafka,spark,mahout
The counts must be written to a second table, words2, in the following format:
hbase(main):010:0> scan 'words2'
ROW          COLUMN+CELL
 hadoop      column=info:word, timestamp=1509345777813, value=1
 hbase       column=info:word, timestamp=1509345777813, value=3
 hdfs        column=info:word, timestamp=1509345777813, value=2
 hive        column=info:word, timestamp=1509345777813, value=2
 kafka       column=info:word, timestamp=1509345777813, value=2
 mahout      column=info:word, timestamp=1509345777813, value=1
 mapreduce   column=info:word, timestamp=1509345777813, value=1
 spark       column=info:word, timestamp=1509345777813, value=1
 storm       column=info:word, timestamp=1509345777813, value=2
Note: the shell output above was copied from my own session after completing the exercise.
1. Create the source table 'words' with a single column family 'info'. Then insert the data: put each piece of text into the column 'word' of that family, one row per text, each with a distinct row key (a manual HBase shell equivalent is sketched right after this list).
2. Create the output table 'words2', also with a single column family 'info'.
3. Use MR to read the 'words' table, count the words found in 'info:word', and write each total into 'info:word' of the 'words2' table, using the word itself as the row key.
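The init() method in the program below creates both tables and loads the three sample rows programmatically, so steps 1 and 2 are automated. If you would rather prepare the tables by hand first, a minimal HBase shell sketch would look like this (the commands simply mirror what init() does; the row keys and values come from the sample data above):

create 'words', 'info'
put 'words', '1', 'info:word', 'hadoop,hdfs,mapreduce,hive,hbase'
put 'words', '2', 'info:word', 'hdfs,hive,hbase,storm,kafka'
put 'words', '3', 'info:word', 'hbase,storm,kafka,spark,mahout'
create 'words2', 'info'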
All of the explanation is in the code comments, so I won't walk through it separately.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

/**
 * Word count over an HBase table using MapReduce.
 * @author 12706
 */
public class MrMachineHBase {

    // HBase configuration object
    static Configuration config = null;
    static {
        config = HBaseConfiguration.create();
        // Point the client at the ZooKeeper quorum
        config.set("hbase.zookeeper.quorum", "mini1,mini2,mini3");
        config.set("hbase.zookeeper.property.clientPort", "2181");
    }

    // Table metadata
    public static final String TABLE_NAME = "words";
    public static final String TABLE_NAME_ = "words2";
    public static final String FAMILY = "info";
    public static final String COLUMN = "word";

    // Initialize both tables and load the sample data
    public static void init() {
        HTable table = null;
        HBaseAdmin admin = null;
        try {
            admin = new HBaseAdmin(config);
            // Drop the tables if they already exist
            if (admin.tableExists(TABLE_NAME)) {
                admin.disableTable(TABLE_NAME);
                admin.deleteTable(TABLE_NAME);
            }
            if (admin.tableExists(TABLE_NAME_)) {
                admin.disableTable(TABLE_NAME_);
                admin.deleteTable(TABLE_NAME_);
            }

            // Build the source table descriptor, add the column family, create the table
            TableName tableName = TableName.valueOf(TABLE_NAME);
            HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
            HColumnDescriptor columnDescriptor = new HColumnDescriptor(FAMILY);
            tableDescriptor.addFamily(columnDescriptor);
            admin.createTable(tableDescriptor);

            // Same for the output table
            TableName tableName2 = TableName.valueOf(TABLE_NAME_);
            HTableDescriptor tableDescriptor2 = new HTableDescriptor(tableName2);
            HColumnDescriptor columnDescriptor2 = new HColumnDescriptor(FAMILY);
            tableDescriptor2.addFamily(columnDescriptor2);
            admin.createTable(tableDescriptor2);

            table = new HTable(config, TABLE_NAME);
            // Buffer puts on the client instead of flushing one at a time
            table.setAutoFlush(false);
            table.setWriteBufferSize(1000);

            // Insert the three sample rows
            List<Put> putList = new ArrayList<>();
            Put put = new Put(Bytes.toBytes("1"));
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hadoop,hdfs,mapreduce,hive,hbase"));
            putList.add(put);
            Put put2 = new Put(Bytes.toBytes("2"));
            put2.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hdfs,hive,hbase,storm,kafka"));
            putList.add(put2);
            Put put3 = new Put(Bytes.toBytes("3"));
            put3.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes("hbase,storm,kafka,spark,mahout"));
            putList.add(put3);
            table.put(putList);
            // Flush the buffered puts to the server
            table.flushCommits();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (table != null) {
                try {
                    table.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Text: mapper output key type.
     * IntWritable: mapper output value type.
     * The mapper is invoked once per row (one rowkey = one call).
     * @author 12706
     */
    static class WordCountMapper extends TableMapper<Text, IntWritable> {
        static Text k = new Text();
        static IntWritable v = new IntWritable(1);

        // key: the rowkey; value: the Result holding that row's cells
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            // Read the info:word value, e.g. "hadoop,hdfs,mapreduce,hive,hbase"
            byte[] line = value.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));
            // Split on commas: hadoop hdfs mapreduce hive hbase
            String[] wordz = Bytes.toString(line).split(",");
            // Emit (word, 1) for every word
            for (String w : wordz) {
                k.set(w);
                context.write(k, v);
            }
        }
    }

    /**
     * Text: reducer input key type.
     * IntWritable: reducer input value type.
     * ImmutableBytesWritable: output type, the rowkey.
     * @author 12706
     */
    static class WordCountReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable intWritable : values) {
                count += intWritable.get();
            }
            // The word itself becomes the rowkey
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN),
                    Bytes.toBytes(String.valueOf(count)));
            // Write to HBase: the rowkey plus the Put
            context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
        }
    }

    public static void main(String[] args) throws Exception {
        // Recreate the tables and load the sample data
        init();
        // Set up the job
        Job job = Job.getInstance(config);
        job.setJarByClass(MrMachineHBase.class);
        // Build a scan restricted to the one column we need
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN));
        // Wire up the HBase-reading mapper: table name, scan, mapper class,
        // mapper output key type, mapper output value type, job
        TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, WordCountMapper.class,
                Text.class, IntWritable.class, job);
        // Wire up the HBase-writing reducer: table name, reducer class, job.
        // If the reducer output types are not set, they default to the mapper's.
        TableMapReduceUtil.initTableReducerJob(TABLE_NAME_, WordCountReducer.class, job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Package the project into a jar and upload it to the Hadoop cluster.
Run the test:
[root@mini1 ~]# hadoop jar hbase.jar com.scu.hbase.MrMachineHBase
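A note on the environment (an assumption on my part, not something shown in the original run): hadoop jar can only submit this job if the HBase client jars are visible to Hadoop. If the submission fails with a ClassNotFoundException for HBase classes, a common fix is to export the HBase classpath before submitting:

export HADOOP_CLASSPATH=$(hbase classpath)
hadoop jar hbase.jar com.scu.hbase.MrMachineHBase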
After the job finishes, verify the results from the HBase shell:
hbase(main):011:0> list
TABLE
user1
words
words2
3 row(s) in 0.0550 seconds

=> ["user1", "words", "words2"]

hbase(main):012:0> scan 'words'
ROW          COLUMN+CELL
 1           column=info:word, timestamp=1509345735252, value=hadoop,hdfs,mapreduce,hive,hbase
 2           column=info:word, timestamp=1509345735252, value=hdfs,hive,hbase,storm,kafka
 3           column=info:word, timestamp=1509345735252, value=hbase,storm,kafka,spark,mahout
3 row(s) in 0.1610 seconds

hbase(main):013:0> scan 'words2'
ROW          COLUMN+CELL
 hadoop      column=info:word, timestamp=1509345777813, value=1
 hbase       column=info:word, timestamp=1509345777813, value=3
 hdfs        column=info:word, timestamp=1509345777813, value=2
 hive        column=info:word, timestamp=1509345777813, value=2
 kafka       column=info:word, timestamp=1509345777813, value=2
 mahout      column=info:word, timestamp=1509345777813, value=1
 mapreduce   column=info:word, timestamp=1509345777813, value=1
 spark       column=info:word, timestamp=1509345777813, value=1
 storm       column=info:word, timestamp=1509345777813, value=2
9 row(s) in 0.0860 seconds
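To spot-check a single word instead of scanning the whole table, you can also fetch by row key; judging from the scan above, this should come back with value=3:

get 'words2', 'hbase'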