Reading HBase with Spark

The program below reads an HBase table in full, drops rows whose column families are empty, rebuilds each remaining row as a Put, and writes the result into a second table. The input and output table names are taken from the command line.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.JavaConverters._

object WriteHbase {

  def main(args: Array[String]): Unit = {
    // Build the HBase configuration; the table names come from the arguments
    val conf = HBaseConfiguration.create()
    val tablenamein = args(0)
    val tablenameout = args(1)
    // Table to read from
    conf.set(TableInputFormat.INPUT_TABLE, tablenamein)
    // Table to write to
    conf.set(TableOutputFormat.OUTPUT_TABLE, tablenameout)

    val sparkConf = new SparkConf()
    sparkConf.setAppName("read and write for hbase")
    sparkConf.setMaster("local[3]")
    val sc = new SparkContext(sparkConf)

    // The Job carries the output format and the output table name
    val newAPIJobConfiguration1 = Job.getInstance(conf)
    newAPIJobConfiguration1.getConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablenameout)
    newAPIJobConfiguration1.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // Full scan of the input table
    val rdd = sc.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result]
    )

    // Drop rows with an empty column family, then update each record
    // and convert it into the write format
    val final_rdd = rdd.filter(checkNotEmptyKs).map(forDatas)

    // Filter the nulls out of the converted result
    val save_rdd = final_rdd.filter(checkNull)

    // Finally write back to HBase
    save_rdd.saveAsNewAPIHadoopDataset(newAPIJobConfiguration1.getConfiguration)
    sc.stop()
  }

  // Purpose: filter out rows whose column families are empty
  def checkNotEmptyKs(f: (ImmutableBytesWritable, Result)): Boolean = {
    val r = f._2
    // getFamilyMap returns null for a completely empty Result, hence the Option guard
    Seq("f", "h", "mtdt").forall { family =>
      Option(r.getFamilyMap(Bytes.toBytes(family))).exists(m => !m.isEmpty)
    }
  }

  // Purpose: read one record, apply the update, then turn it into a write operation
  def forDatas(f: (ImmutableBytesWritable, Result)): (ImmutableBytesWritable, Put) = {
    val r = f._2 // the Result
    val put: Put = new Put(r.getRow)
    // Copy every cell of the three families into the Put; the
    // toString/toBytes round trip is the place to transform the value
    Seq("f", "h", "mtdt").foreach { fam =>
      val family = Bytes.toBytes(fam)
      r.getFamilyMap(family).asScala.foreach { kv =>
        val value = Bytes.toString(kv._2)
        put.addColumn(family, kv._1, Bytes.toBytes(value)) // put.add(...) on HBase < 1.0
      }
    }
    if (put.isEmpty) null else (new ImmutableBytesWritable(), put)
  }

  // checkNull purpose: filter the null records out of the final result
  def checkNull(f: (ImmutableBytesWritable, Put)): Boolean = f != null
}
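The code relies on an hbase-site.xml with the cluster's connection settings being on the classpath. When it is not, the same settings can be set on the configuration directly. A minimal sketch, assuming a ZooKeeper quorum on three hosts named zk1, zk2 and zk3 (placeholder names):

// Placeholder host names; hbase.zookeeper.quorum and
// hbase.zookeeper.property.clientPort are the standard HBase client keys
conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")
conf.set("hbase.zookeeper.property.clientPort", "2181")

These lines would go right after HBaseConfiguration.create(), before the SparkContext is built.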
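A full-table scan is the simplest read but also the most expensive one. TableInputFormat accepts scan-narrowing settings through the same configuration object, so the read can be restricted to a row-key range or a single column family before newAPIHadoopRDD is called. A sketch, where the row keys "row-a" and "row-m" and the family "f" are placeholder values:

// Restrict the scan instead of reading the whole table
conf.set(TableInputFormat.SCAN_ROW_START, "row-a")
conf.set(TableInputFormat.SCAN_ROW_STOP, "row-m")
conf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "f")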