MapReduce Operations on HBase (3)


Reading data from HBase, processing it with MapReduce, and writing the results back to HBase

1. Analysis of the blog example

The business requirement is this: find authors who share an interest. We define it simply: if two authors have used the same article tag, we consider them to share an interest, and we save the analysis result back to HBase. In addition to the blog table introduced earlier, we add a new table, tag_friend, whose RowKey is the tag and whose value is the list of authors, roughly like this:

RowKey: <tag>     Column: person:nicknames = <comma-separated author nicknames>

Only the Column data relevant to the analysis is shown below (the rest is omitted). The relevant blog data is:

RowKey   author:nickname   article:tags
1        djb               HBase,NoSQL,Hadoop
10       dyq               Hadoop
100      yzx               hbase,nosql

Analyzed with MapReduce according to the business requirement described above, this data should produce the following result in tag_friend:

RowKey   person:nicknames
hadoop   djb,dyq
hbase    djb,yzx
nosql    djb,yzx
The actual computation proceeds as follows:

Map: for each blog row, emit one (lower-cased tag, nickname) pair per tag:
  row 1   -> (hbase, djb), (nosql, djb), (hadoop, djb)
  row 10  -> (hadoop, dyq)
  row 100 -> (hbase, yzx), (nosql, yzx)
Shuffle/sort: the pairs are grouped by tag:
  hadoop -> [djb, dyq]
  hbase  -> [djb, yzx]
  nosql  -> [djb, yzx]
Reduce: the nicknames in each group are joined with commas and written to tag_friend as person:nicknames.

Code implementation:

package mapred;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;

public class TestHBaseMapReduce {

    /** Reads each blog row and emits one (tag, nickname) pair per tag. */
    static class Mapper extends TableMapper<ImmutableBytesWritable, ImmutableBytesWritable> {
        @Override
        public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
            ImmutableBytesWritable value = null;
            String[] tags = null;
            for (KeyValue kv : values.list()) {
                if ("author".equals(Bytes.toString(kv.getFamily()))
                        && "nickname".equals(Bytes.toString(kv.getQualifier()))) {
                    value = new ImmutableBytesWritable(kv.getValue());
                }
                if ("article".equals(Bytes.toString(kv.getFamily()))
                        && "tags".equals(Bytes.toString(kv.getQualifier()))) {
                    tags = Bytes.toString(kv.getValue()).split(",");
                }
            }
            // Skip rows that are missing the nickname or the tags column
            if (value == null || tags == null) {
                return;
            }
            for (int i = 0; i < tags.length; i++) {
                // Lower-case the tag so "HBase" and "hbase" group together
                ImmutableBytesWritable key = new ImmutableBytesWritable(Bytes.toBytes(tags[i].toLowerCase()));
                try {
                    context.write(key, value);
                } catch (InterruptedException e) {
                    throw new IOException(e);
                }
            }
        }
    }

    /** Joins all nicknames for a tag and writes them to tag_friend as person:nicknames. */
    static class Reducer extends
            TableReducer<ImmutableBytesWritable, ImmutableBytesWritable, ImmutableBytesWritable> {
        @Override
        public void reduce(ImmutableBytesWritable key, Iterable<ImmutableBytesWritable> values, Context context)
                throws IOException, InterruptedException {
            String friends = "";
            for (ImmutableBytesWritable val : values) {
                friends += (friends.length() > 0 ? "," : "") + Bytes.toString(val.get());
            }
            Put put = new Put(key.get());
            put.add(Bytes.toBytes("person"), Bytes.toBytes("nicknames"), Bytes.toBytes(friends));
            context.write(key, put);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.10.4.55");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        Job job = new Job(conf, "HBase_FindFriend");
        job.setJarByClass(TestHBaseMapReduce.class);

        // Create the blog and tag_friend tables and load sample data into blog
        new TestHBaseMapReduce().initDate(conf);

        // Scan only the two columns the analysis needs
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("author"), Bytes.toBytes("nickname"));
        scan.addColumn(Bytes.toBytes("article"), Bytes.toBytes("tags"));
        TableMapReduceUtil.initTableMapperJob("blog", scan, TestHBaseMapReduce.Mapper.class,
                ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
        TableMapReduceUtil.initTableReducerJob("tag_friend", TestHBaseMapReduce.Reducer.class, job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** (Re)creates the blog and tag_friend tables and loads the sample blog rows. */
    private void initDate(Configuration conf) throws IOException {
        HBaseAdmin hBaseAdmin = new HBaseAdmin(conf);
        String tableName = "blog";
        if (hBaseAdmin.tableExists(tableName)) {
            System.out.println(tableName + " exists, deleting...");
            hBaseAdmin.disableTable(tableName);
            hBaseAdmin.deleteTable(tableName);
        }
        HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
        tableDescriptor.addFamily(new HColumnDescriptor("article"));
        tableDescriptor.addFamily(new HColumnDescriptor("author"));
        hBaseAdmin.createTable(tableDescriptor);

        HTablePool pool = new HTablePool(conf, 1000);
        HTable table = (HTable) pool.getTable(tableName);

        Put put1 = new Put("1".getBytes());
        put1.add("article".getBytes(), "content".getBytes(), "HBase is the Hadoop DataBase.".getBytes());
        put1.add("article".getBytes(), "tags".getBytes(), "HBase,NoSQL,Hadoop".getBytes());
        put1.add("article".getBytes(), "title".getBytes(), "HBase First Hadoop".getBytes());
        put1.add("author".getBytes(), "name".getBytes(), "jiabing".getBytes());
        put1.add("author".getBytes(), "nickname".getBytes(), "djb".getBytes());
        Put put2 = new Put("10".getBytes());
        put2.add("article".getBytes(), "tags".getBytes(), "Hadoop".getBytes());
        put2.add("author".getBytes(), "nickname".getBytes(), "dyq".getBytes());
        Put put3 = new Put("100".getBytes());
        put3.add("article".getBytes(), "tags".getBytes(), "hbase,nosql".getBytes());
        put3.add("author".getBytes(), "nickname".getBytes(), "yzx".getBytes());
        table.put(put1);
        table.put(put2);
        table.put(put3);

        String tableName2 = "tag_friend";
        HTableDescriptor htd = new HTableDescriptor(tableName2);
        HColumnDescriptor col = new HColumnDescriptor("person");
        htd.addFamily(col);
        if (hBaseAdmin.tableExists(tableName2)) {
            System.out.println("table exists, trying recreate table!");
            hBaseAdmin.disableTable(tableName2);
            hBaseAdmin.deleteTable(tableName2);
        }
        System.out.println("create new table : " + tableName2);
        hBaseAdmin.createTable(htd);
        hBaseAdmin.close();
    }
}

The contents of the source table and the result table are as follows:

hbase(main):012:0> scan 'blog'
ROW              COLUMN+CELL
 1               column=article:content, timestamp=1378353404057, value=HBase is the Hadoop DataBase.
 1               column=article:tags, timestamp=1378353404057, value=HBase,NoSQL,Hadoop
 1               column=article:title, timestamp=1378353404057, value=HBase First Hadoop
 1               column=author:name, timestamp=1378353404057, value=jiabing
 1               column=author:nickname, timestamp=1378353404057, value=djb
 10              column=article:tags, timestamp=1378353404067, value=Hadoop
 10              column=author:nickname, timestamp=1378353404067, value=dyq
 100             column=article:tags, timestamp=1378353404072, value=hbase,nosql
 100             column=author:nickname, timestamp=1378353404072, value=yzx
3 row(s) in 1.3360 seconds

hbase(main):013:0> scan 'tag_friend'
ROW              COLUMN+CELL
 hadoop          column=person:nicknames, timestamp=1378353421350, value=djb,dyq
 hbase           column=person:nicknames, timestamp=1378353421350, value=djb,yzx
 nosql           column=person:nicknames, timestamp=1378353421350, value=djb,yzx
3 row(s) in 1.1460 seconds

hbase(main):014:0>
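The same check can be done from Java rather than the shell. Below is a minimal sketch that scans tag_friend with the same 0.94-style client API used in the examples above; the class name ScanTagFriend is only for illustration.

package mapred;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class ScanTagFriend {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.10.4.55");
        conf.set("hbase.zookeeper.property.clientPort", "2181");

        HTable table = new HTable(conf, "tag_friend");
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("person"), Bytes.toBytes("nicknames"));
        ResultScanner scanner = table.getScanner(scan);
        try {
            // Print each tag together with its comma-separated nicknames
            for (Result r : scanner) {
                for (KeyValue kv : r.raw()) {
                    System.out.println(Bytes.toString(r.getRow()) + " -> " + Bytes.toString(kv.getValue()));
                }
            }
        } finally {
            scanner.close();
            table.close();
        }
    }
}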


2. Example code for copying an HBase table

package mapred;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HTablePool;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.mapreduce.Job;

public class TableCopy {

    /** Map-only copy: every KeyValue of a scanned row is repacked into a Put for the target table. */
    static class CopyMapper extends TableMapper<ImmutableBytesWritable, Put> {
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            // Collect the KeyValues of the scanned row
            List<KeyValue> kvs = value.list();
            System.out.println(new String(key.get()));
            Put p = new Put(key.get());
            // Load every KeyValue into the Put
            for (KeyValue kv : kvs) {
                if (kv != null) {
                    p.add(kv);
                }
            }
            // Emit the Put; with zero reduce tasks it is written straight to blog_copy
            context.write(key, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.10.4.55");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        Job job = new Job(conf, "Table Copy");
        job.setJarByClass(TableCopy.class);

        // Create the blog and blog_copy tables and load sample data into blog
        new TableCopy().initDate(conf);

        // Large scanner caching and no block caching: typical settings for a full-table MR scan
        Scan sc = new Scan();
        sc.setCaching(10000);
        sc.setCacheBlocks(false);
        job.setNumReduceTasks(0);
        TableMapReduceUtil.initTableMapperJob("blog", sc, CopyMapper.class,
                ImmutableBytesWritable.class, Put.class, job);
        // Passing null as the reducer class makes this a map-phase write to blog_copy
        TableMapReduceUtil.initTableReducerJob("blog_copy", null, job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** (Re)creates the blog and blog_copy tables and loads the sample blog rows. */
    private void initDate(Configuration conf) throws IOException {
        HBaseAdmin hBaseAdmin = new HBaseAdmin(conf);
        String tableName = "blog";
        if (hBaseAdmin.tableExists(tableName)) {
            System.out.println(tableName + " exists, deleting...");
            hBaseAdmin.disableTable(tableName);
            hBaseAdmin.deleteTable(tableName);
        }
        HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
        tableDescriptor.addFamily(new HColumnDescriptor("article"));
        tableDescriptor.addFamily(new HColumnDescriptor("author"));
        hBaseAdmin.createTable(tableDescriptor);

        HTablePool pool = new HTablePool(conf, 1000);
        HTable table = (HTable) pool.getTable(tableName);

        Put put1 = new Put("1".getBytes());
        put1.add("article".getBytes(), "content".getBytes(), "HBase is the Hadoop DataBase.".getBytes());
        put1.add("article".getBytes(), "tags".getBytes(), "HBase,NoSQL,Hadoop".getBytes());
        put1.add("article".getBytes(), "title".getBytes(), "HBase First Hadoop".getBytes());
        put1.add("author".getBytes(), "name".getBytes(), "jiabing".getBytes());
        put1.add("author".getBytes(), "nickname".getBytes(), "djb".getBytes());
        Put put2 = new Put("10".getBytes());
        put2.add("article".getBytes(), "tags".getBytes(), "Hadoop".getBytes());
        put2.add("author".getBytes(), "nickname".getBytes(), "dyq".getBytes());
        Put put3 = new Put("100".getBytes());
        put3.add("article".getBytes(), "tags".getBytes(), "hbase,nosql".getBytes());
        put3.add("author".getBytes(), "nickname".getBytes(), "yzx".getBytes());
        table.put(put1);
        table.put(put2);
        table.put(put3);

        String tableName2 = "blog_copy";
        HTableDescriptor htd = new HTableDescriptor(tableName2);
        htd.addFamily(new HColumnDescriptor("article"));
        htd.addFamily(new HColumnDescriptor("author"));
        if (hBaseAdmin.tableExists(tableName2)) {
            System.out.println("table exists, trying recreate table!");
            hBaseAdmin.disableTable(tableName2);
            hBaseAdmin.deleteTable(tableName2);
        }
        System.out.println("create new table : " + tableName2);
        hBaseAdmin.createTable(htd);
        hBaseAdmin.close();
    }
}

The source table and the copied table contain the following data:

hbase(main):002:0> scan 'blog'
ROW              COLUMN+CELL
 1               column=article:content, timestamp=1378364676891, value=HBase is the Hadoop DataBase.
 1               column=article:tags, timestamp=1378364676891, value=HBase,NoSQL,Hadoop
 1               column=article:title, timestamp=1378364676891, value=HBase First Hadoop
 1               column=author:name, timestamp=1378364676891, value=jiabing
 1               column=author:nickname, timestamp=1378364676891, value=djb
 10              column=article:tags, timestamp=1378364676901, value=Hadoop
 10              column=author:nickname, timestamp=1378364676901, value=dyq
 100             column=article:tags, timestamp=1378364676906, value=hbase,nosql
 100             column=author:nickname, timestamp=1378364676906, value=yzx
3 row(s) in 0.2810 seconds

hbase(main):003:0> scan 'blog_copy'
ROW              COLUMN+CELL
 1               column=article:content, timestamp=1378364804365, value=HBase is the Hadoop DataBase.
 1               column=article:tags, timestamp=1378364804365, value=HBase,NoSQL,Hadoop
 1               column=article:title, timestamp=1378364804365, value=HBase First Hadoop
 1               column=author:name, timestamp=1378364804365, value=jiabing
 1               column=author:nickname, timestamp=1378364804365, value=djb
 10              column=article:tags, timestamp=1378364804414, value=Hadoop
 10              column=author:nickname, timestamp=1378364804414, value=dyq
 100             column=article:tags, timestamp=1378364804420, value=hbase,nosql
 100             column=author:nickname, timestamp=1378364804420, value=yzx
3 row(s) in 0.1040 seconds

hbase(main):004:0>


3. Summary

1. Use TableMapper to read from an HBase table.

2. One way to write to a table is with TableMapReduceUtil.initTableReducerJob. Output can be written either from the map phase or from the reduce phase; the difference is whether the reducer class passed in is null (map-phase writes, as in the TableCopy example) or an actual Reducer (reduce-phase writes, as in the TestHBaseMapReduce example).

3. The other way to write to a table is to call HBase's native client API, i.e., HTable.put. This approach is suited to writing small amounts of data or already-aggregated results; a minimal sketch follows below.
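For point 3, here is a minimal standalone sketch in the same client API style as the examples above. The class name DirectPutExample is illustrative; the row and value are taken from the tag_friend result shown earlier.

package mapred;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class DirectPutExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.10.4.55");
        conf.set("hbase.zookeeper.property.clientPort", "2181");

        // Write one pre-computed result row directly, without running a MapReduce job
        HTable table = new HTable(conf, "tag_friend");
        Put put = new Put(Bytes.toBytes("hadoop"));
        put.add(Bytes.toBytes("person"), Bytes.toBytes("nicknames"), Bytes.toBytes("djb,dyq"));
        table.put(put);
        table.close();
    }
}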




