spark操作HBASE
来源:互联网 发布:埋点数据 编辑:程序博客网 时间:2024/05/17 22:49
转:http://www.cnblogs.com/seaspring/p/5631112.html
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.client._
import org.apache.spark.SparkContext
import scala.collection.JavaConverters._

/**
 * HBase 1.0.0 new-style API: basic CRUD (create table, Put, Get, Scan, Delete)
 * example. Requires a running HBase/ZooKeeper at `master:2181`.
 */
object HBaseNewAPI {
  def main(args: Array[String]): Unit = {
    // SparkContext is created for parity with the companion example; it is
    // not used by the plain HBase-client calls below.
    val sc = new SparkContext("local", "SparkHBase")
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "master")

    // Connection creation is heavyweight; the Connection is thread-safe and
    // is the entry point for all HBase operations.
    val conn = ConnectionFactory.createConnection(conf)
    try {
      // Admin (replacement for the old HBaseAdmin) must be closed after use;
      // the original code leaked it.
      val admin = conn.getAdmin
      val userTable = TableName.valueOf("user")
      try {
        // (Re)create the `user` table with a single column family `basic`.
        val tableDescr = new HTableDescriptor(userTable)
        tableDescr.addFamily(new HColumnDescriptor("basic".getBytes))
        println("Creating table `user`. ")
        if (admin.tableExists(userTable)) {
          // HBase requires disabling a table before it can be deleted.
          admin.disableTable(userTable)
          admin.deleteTable(userTable)
        }
        admin.createTable(tableDescr)
        println("Done!")
      } finally {
        admin.close()
      }

      val table = conn.getTable(userTable)
      try {
        // Insert one row with key id001. Column and value are specified via
        // addColumn (the old Put.add method is deprecated).
        val p = new Put("id001".getBytes)
        p.addColumn("basic".getBytes, "name".getBytes, "wuchong".getBytes)
        table.put(p)

        // Read the row back.
        val g = new Get("id001".getBytes)
        val result = table.get(g)
        val value = Bytes.toString(result.getValue("basic".getBytes, "name".getBytes))
        println("GET id001 :" + value)

        // Scan the basic:name column across the whole table.
        val s = new Scan()
        s.addColumn("basic".getBytes, "name".getBytes)
        val scanner = table.getScanner(s)
        try {
          // .asScala (JavaConverters) replaces the deprecated implicit
          // JavaConversions iteration.
          for (r <- scanner.asScala) {
            println("Found row: " + r)
            println("Found value: " + Bytes.toString(r.getValue("basic".getBytes, "name".getBytes)))
          }
        } finally {
          // Always release the scanner's server-side resources.
          scanner.close()
        }

        // Delete the row; the API mirrors Put.
        val d = new Delete("id001".getBytes)
        d.addColumn("basic".getBytes, "name".getBytes)
        table.delete(d)
      } finally {
        if (table != null) table.close()
      }
    } finally {
      conn.close()
    }
  }
}
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.client._

/**
 * Writing an RDD into HBase (saveAsHadoopDataset) and reading it back as an
 * RDD (newAPIHadoopRDD). Requires a running HBase/ZooKeeper at `master:2181`.
 */
object SparkOnHBase {

  /**
   * Serialize a Scan into the Base64-encoded protobuf string that
   * TableInputFormat expects in its SCAN configuration property.
   */
  def convertScanToString(scan: Scan): String = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "SparkOnHBase")
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "master")

    // ====== Save RDD to HBase ========
    // Step 1: JobConf setup — write via the old-API TableOutputFormat.
    val jobConf = new JobConf(conf, this.getClass)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "user")

    // Step 2: map RDD tuples to table rows.
    // The HBase table schema looks like:
    //   *row    cf:col_1    cf:col_2
    // while in Spark we work with tuples such as (1,"lilei",14), (2,"hanmei",18).
    // We must turn RDD[(uid: Int, name: String, age: Int)] into
    // RDD[(ImmutableBytesWritable, Put)]; `convert` does that mapping.
    def convert(triple: (Int, String, Int)) = {
      val p = new Put(Bytes.toBytes(triple._1))
      p.addColumn(Bytes.toBytes("basic"), Bytes.toBytes("name"), Bytes.toBytes(triple._2))
      p.addColumn(Bytes.toBytes("basic"), Bytes.toBytes("age"), Bytes.toBytes(triple._3))
      (new ImmutableBytesWritable, p)
    }

    // Step 3: obtain the source data and convert it.
    val rawData = List((1, "lilei", 14), (2, "hanmei", 18), (3, "someone", 38))
    val localData = sc.parallelize(rawData).map(convert)

    // Step 4: saveAsHadoopDataset writes the RDD into HBase.
    localData.saveAsHadoopDataset(jobConf)
    // =================================

    // ====== Load RDD from HBase ========
    // newAPIHadoopRDD reads HBase rows directly as an RDD[K, V].
    // Table to query:
    conf.set(TableInputFormat.INPUT_TABLE, "user")
    // Filter: keep rows whose age is >= 18 (GREATER_OR_EQUAL — the original
    // comment incorrectly said "greater than").
    val scan = new Scan()
    scan.setFilter(new SingleColumnValueFilter("basic".getBytes, "age".getBytes,
      CompareOp.GREATER_OR_EQUAL, Bytes.toBytes(18)))
    conf.set(TableInputFormat.SCAN, convertScanToString(scan))
    val usersRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    // Mark the RDD cached BEFORE the first action so count() populates the
    // cache instead of re-scanning HBase (the original cached after count()).
    usersRDD.cache()
    val count = usersRDD.count()
    println("Users RDD Count:" + count)
    // Print every surviving row.
    usersRDD.foreach { case (_, result) =>
      val key = Bytes.toInt(result.getRow)
      val name = Bytes.toString(result.getValue("basic".getBytes, "name".getBytes))
      val age = Bytes.toInt(result.getValue("basic".getBytes, "age".getBytes))
      println("Row key:" + key + " Name:" + name + " Age:" + age)
    }
    // =================================
  }
}
0 0
- Spark操作hbase
- spark 操作 hbase
- spark操作hbase
- spark操作hbase
- Spark操作Hbase
- spark操作hbase
- spark操作HBASE
- Spark学习笔记-HBase操作
- spark操作hbase中的数据
- scala spark hbase 操作案例
- spark操作hbase的javaapi
- spark操作读取hbase实例
- spark 操作hbase及mysql
- spark-hbase数据操作心得
- spark hbase
- spark hbase
- Spark&hbase
- spark hbase hbase-rdd
- C#反射 Assembly的简单应用
- 单例模式
- 关于object类的clone方法浅克隆与深度克隆
- 非对称加密---RSA算法
- spring 和MyBatis配置异常
- spark操作HBASE
- 工厂方法模式
- Assembly(c#中简单说明
- 跨域、跨子域,跨服务器读取session
- 微博在狂奔,为何微博电商却走向落寞
- 造轮子,还是造键盘?
- SQL优化
- 抽象工厂模式
- 2017趋势科技校园招聘北邮宣讲会+笔试