MapReduce on HBase: A Word-Count Demo


1. Windows-side setup: create a project in Eclipse and import the jars from the lib directory of the HBase installation, or let Maven pull the dependencies in.
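If Maven is used, the HBase client artifacts live under the org.apache.hbase group. A minimal sketch of the two dependencies this demo needs (hbase-server supplies the org.apache.hadoop.hbase.mapreduce classes in 1.x; the version below is a placeholder and must match your cluster):

<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-client</artifactId>
  <version>1.2.6</version>
</dependency>
<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-server</artifactId>
  <version>1.2.6</version>
</dependency>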

2. Linux-side setup: Hadoop, HBase, and ZooKeeper must already be installed and running. To run HBase-dependent jobs on Hadoop, the jars under HBase's lib directory have to be on Hadoop's classpath. A simple way to do this is to add the following shell snippet to Hadoop's hadoop-env.sh; it appends every jar under $HBASE_HOME/lib to HADOOP_CLASSPATH:

for f in $HBASE_HOME/lib/*.jar; do
  if [ "$HADOOP_CLASSPATH" ]; then
    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
  else
    export HADOOP_CLASSPATH=$f
  fi
done
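Newer HBase releases can compute this classpath for you: bin/hbase mapredcp prints the minimal set of jars a MapReduce job needs, so (assuming your version ships that command) a one-liner achieves the same effect:

export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$("$HBASE_HOME"/bin/hbase mapredcp)

Either way, the goal is the same: Hadoop's JVMs must be able to load the HBase client classes.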

3. Restart Hadoop, then start ZooKeeper and HBase.
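With a standard install the restart looks roughly like the following (script names and locations depend on your distribution, and zkServer.sh must be run on every ZooKeeper node):

stop-hbase.sh && stop-yarn.sh && stop-dfs.sh   # stop everything first
start-dfs.sh
start-yarn.sh
zkServer.sh start
start-hbase.sh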

4. Write the Java code:

package HbaseTest1;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class HbaseMr {

    // Basic configuration
    static Configuration conf = null;
    static {
        conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
    }

    // Table metadata
    public static final String tableName1 = "t1";     // source table
    public static final String family1 = "content1";  // column family
    public static final String col1 = "info1";        // column qualifier
    public static final String tableName2 = "t2";     // result table

    /**
     * Initialize the table structure and sample data.
     */
    public static void initTable() {
        HBaseAdmin admin = null;
        HTable table = null;
        try {
            admin = new HBaseAdmin(conf);
            // Drop each table if it already exists (checked separately, so a
            // missing table does not make the delete of the other one fail)
            if (admin.tableExists(tableName1)) {
                admin.disableTable(tableName1);
                admin.deleteTable(tableName1);
            }
            if (admin.tableExists(tableName2)) {
                admin.disableTable(tableName2);
                admin.deleteTable(tableName2);
            }
            // Create table 1
            HTableDescriptor desc1 = new HTableDescriptor(tableName1);
            HColumnDescriptor family = new HColumnDescriptor(family1);
            desc1.addFamily(family);
            admin.createTable(desc1);
            // Create table 2
            HTableDescriptor desc2 = new HTableDescriptor(tableName2);
            desc2.addFamily(family);
            admin.createTable(desc2);
            // Insert sample data
            table = new HTable(conf, tableName1);
            table.setAutoFlush(false);                 // with true, every put is committed immediately
            table.setWriteBufferSize(5 * 1024 * 1024); // flush automatically at ~5 MB (the argument is in bytes)
            List<Put> pList = new ArrayList<Put>();
            for (int i = 0; i < 6; i++) {
                Put p1 = new Put(Bytes.toBytes(i + "")); // the row key is i
                p1.add(family1.getBytes(), col1.getBytes(), ("aaa bbb ccc aaa ddd" + i).getBytes());
                pList.add(p1);
            }
            table.put(pList); // batch insert
            table.flushCommits();
            pList.clear();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (table != null) {
                try {
                    table.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * MyMapper extends TableMapper<Text, IntWritable>.
     */
    public static class MyMapper extends TableMapper<Text, IntWritable> {
        private Text word = new Text();
        private IntWritable one = new IntWritable(1);

        // ImmutableBytesWritable is the row-key type; value is one row's Result
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            // Read the cell family1:col1 from the current row
            String words = Bytes.toStringBinary(value.getValue(Bytes.toBytes(family1), Bytes.toBytes(col1)));
            // Split the string and emit each word with a count of 1
            String[] split = words.split(" ");
            for (int i = 0; i < split.length; i++) {
                word.set(split[i]);
                context.write(word, one);
            }
        }
    }

    /**
     * MyReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable>.
     * ImmutableBytesWritable is the output key type, i.e. the row key.
     */
    public static class MyReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get(); // accumulate the counts
            }
            // Write each counted word into t2; the word itself is the row key
            Put put = new Put(Bytes.toBytes(key.toString()));
            // family, qualifier, value
            put.add(Bytes.toBytes(family1), Bytes.toBytes(col1), Bytes.toBytes(sum + ""));
            // The target table was already set in main(), so only row key and Put are needed here
            context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        initTable(); // create the tables and load the sample rows
        Job job = Job.getInstance(conf);
        job.setJarByClass(HbaseMr.class);
        Scan scan = new Scan();
        // Restrict the scan to the one column we need
        scan.addColumn(Bytes.toBytes(family1), Bytes.toBytes(col1));
        // Wire up the table-scanning mapper
        TableMapReduceUtil.initTableMapperJob(tableName1, scan, MyMapper.class, Text.class, IntWritable.class, job);
        // Wire up the table-writing reducer
        TableMapReduceUtil.initTableReducerJob(tableName2, MyReducer.class, job);
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
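Note that HBaseAdmin and HTable were deprecated in HBase 1.x and removed from the public API in 2.0. For newer clusters, here is a minimal, untested sketch of the same table setup using the Connection-based client API (the table and family names mirror the constants above):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class ModernInitTable {
    public static void main(String[] args) throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3");
        // try-with-resources closes the connection, admin, and table for us
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            TableName t1 = TableName.valueOf("t1");
            if (admin.tableExists(t1)) {
                admin.disableTable(t1);
                admin.deleteTable(t1);
            }
            HTableDescriptor desc = new HTableDescriptor(t1);
            desc.addFamily(new HColumnDescriptor("content1"));
            admin.createTable(desc);
            try (Table table = conn.getTable(t1)) {
                Put p = new Put(Bytes.toBytes("0")); // row key "0", as in the demo
                p.addColumn(Bytes.toBytes("content1"), Bytes.toBytes("info1"),
                        Bytes.toBytes("aaa bbb ccc aaa ddd0"));
                table.put(p);
            }
        }
    }
}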

5. Package the project into a jar; when exporting, set the class containing the main function as the jar's entry point.

6. Copy the packaged jar to the Hadoop environment and run it:

[root@CentOS hadoop-2.6.0]# hadoop jar hbcount.jar 
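If the jar's manifest does not name a main class, hadoop jar also accepts it as the first argument after the jar (using this demo's package and class name):

[root@CentOS hadoop-2.6.0]# hadoop jar hbcount.jar HbaseTest1.HbaseMr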

..........

17/11/16 11:21:09 INFO mapreduce.Job:  map 0% reduce 0%
17/11/16 11:21:21 INFO mapreduce.Job:  map 100% reduce 0%
17/11/16 11:21:43 INFO mapreduce.Job:  map 100% reduce 100%

........

Execution finished.

7. Open the HBase shell and inspect the two tables.

[root@slave1 bin]# hbase shell

List all tables:

hbase(main):001:0> list

t1                                                                                        
t2                                                                                        
t3                                                                                        
t4                                                                                        
test                                                                                      
test1                                                                                     
6 row(s) in 2.4150 seconds

=> ["t1", "t2", "t3", "t4", "test", "test1"]

Scan t1:

hbase(main):002:0> scan 't1'

ROW                     COLUMN+CELL                                                       
 0                      column=content1:info1, timestamp=1510800127451, value=aaa bbb ccc 
                        aaa ddd0                                                          
 1                      column=content1:info1, timestamp=1510800127564, value=aaa bbb ccc 
                        aaa ddd1                                                          
 2                      column=content1:info1, timestamp=1510800127578, value=aaa bbb ccc 
                        aaa ddd2                                                          
 3                      column=content1:info1, timestamp=1510800127588, value=aaa bbb ccc 
                        aaa ddd3                                                          
 4                      column=content1:info1, timestamp=1510800127597, value=aaa bbb ccc 
                        aaa ddd4                                                          
 5                      column=content1:info1, timestamp=1510800127610, value=aaa bbb ccc 
                        aaa ddd5                                                          
6 row(s) in 0.4620 seconds

Scan t2:

hbase(main):002:0> scan 't2'
ROW                     COLUMN+CELL                                                       
 aaa                    column=content1:info1, timestamp=1510800219206, value=12          
 bbb                    column=content1:info1, timestamp=1510800219206, value=6           
 ccc                    column=content1:info1, timestamp=1510800219206, value=6           
 ddd0                   column=content1:info1, timestamp=1510800219206, value=1           
 ddd1                   column=content1:info1, timestamp=1510800219206, value=1           
 ddd2                   column=content1:info1, timestamp=1510800219206, value=1           
 ddd3                   column=content1:info1, timestamp=1510800219206, value=1           
 ddd4                   column=content1:info1, timestamp=1510800219206, value=1           
 ddd5                   column=content1:info1, timestamp=1510800219206, value=1           
9 row(s) in 0.0770 seconds

The results check out: aaa appears 12 times (twice in each of the 6 rows), bbb and ccc 6 times each (once per row), and each dddN exactly once.
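To spot-check a single word instead of scanning the whole table, a get on the row key works too (continuing the shell session above):

hbase(main):003:0> get 't2', 'aaa'

It should return the single content1:info1 cell with value 12.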