Kafka:无丢失将kafka的值读取到hbase

来源:互联网 发布:免费下载东京淘宝商城 编辑:程序博客网 时间:2024/05/16 12:47

目录

1、需求

2、代码及结构

1、需求

功能:将kafka数据读取到HBASE中
步骤:
       1、从kafka提取topicName在zookeeper中的offset
       2、循环读取kafka中topicName中的offset且对比zookeeper中的offset
       3、将数据提前做好Scala映射类
       4、将提取的数据转为DF
       5、通过Phoenix存储到Hbase中

备注:博文中使用到了Zookeeper记录kafka的offset,请仔细阅读下面博客,理解会更加深入
http://blog.csdn.net/silentwolfyh/article/details/52985171
ZookeeperCurator框架应用和常用命令

2、代码及结构

这里写图片描述

package com.donews.kafka2hbase

import com.donews.utils.{KafkaClusterHelper, ZookeeperHelper, WebLog}
import kafka.common.TopicAndPartition
import kafka.serializer.StringDecoder
import org.apache.commons.cli.{GnuParser, HelpFormatter, Options}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory

import scala.util.control.NonFatal

/**
  * Created by yuhui on 16-09-23.
  *
  * Copies Kafka messages into HBase (through Phoenix) without losing data.
  *
  * Steps:
  *   1. Load the offsets recorded in ZooKeeper for the requested topics.
  *   2. Compare them with the latest offsets reported by Kafka and build bounded offset ranges.
  *   3. Parse each message into the [[com.donews.utils.WebLog]] mapping class.
  *   4. Convert the parsed records to a DataFrame.
  *   5. Save the DataFrame to the Phoenix table WEB_LOG.
  *   6. Only after the save succeeded, write the new offsets back to ZooKeeper.
  *
  * Phoenix DDL of the target table:
  * {{{
  * CREATE TABLE WEB_LOG(appkey VARCHAR not null, day VARCHAR, timestamp VARCHAR not null,
  *   cookie VARCHAR, short_cookie VARCHAR, request_method VARCHAR, status Integer,
  *   http_referer VARCHAR, http_user_agent VARCHAR, http_x_forwarded_for VARCHAR,
  *   http_url VARCHAR, to_target VARCHAR, duration Integer, event VARCHAR, is_new Integer,
  *   page_id VARCHAR,
  *   CONSTRAINT pk PRIMARY KEY (appkey, day, timestamp, cookie, short_cookie)) SALT_BUCKETS = 20;
  * }}}
  */
object StreamingHbase {
  val LOG = LoggerFactory.getLogger(StreamingHbase.getClass)

  /** Maximum number of offsets (messages) read from a single partition in one batch. */
  private val MaxOffsetsPerPartition: Long = 1024L * 1024

  /** Pause between two polling rounds once the backlog has been drained (one hour). */
  private val PollIntervalMs: Long = 1000L * 60 * 60

  /** Pause before retrying after a failed batch. */
  private val ErrorBackoffMs: Long = 1000L * 5

  /**
    * Entry point.
    *
    * Options: `-l/--local` runs Spark with a local master, `-s/--source` is the comma-separated
    * list of Kafka topics (required), `-h/--help` prints usage and exits.
    *
    * The method never returns normally: it keeps polling Kafka until the JVM is stopped or a
    * fatal error / interrupt propagates.
    */
  def main(args: Array[String]): Unit = {
    val options = new Options()
      .addOption("l", "local", false, "配置为本地模式运行模式")
      .addOption("s", "source", true, "数据源,Kafka的Topic名称")
      .addOption("h", "help", false, "打印帮助信息")

    val cmdLine = new GnuParser().parse(options, args)

    if (cmdLine.hasOption("h")) {
      new HelpFormatter().printHelp("StreamingHbase", options)
      System.exit(0)
    }

    val isLocal = cmdLine.hasOption("local")

    // getOptionValue returns null when -s was not supplied; fail with usage instead of an NPE.
    val source = Option(cmdLine.getOptionValue("source")).getOrElse {
      System.err.println("Missing required option: -s/--source <topic[,topic...]>")
      new HelpFormatter().printHelp("StreamingHbase", options)
      sys.exit(1)
    }

    val topicsSet = source
      .split(",")
      .map(_.trim)
      .filterNot(_.isEmpty)
      .toSet
    if (topicsSet.isEmpty) {
      System.err.println("Option -s/--source does not contain any topic name")
      sys.exit(1)
    }
    // Pass the set as an slf4j argument so the {} placeholder is actually substituted.
    LOG.info("topicSet:{}===》 ", topicsSet)

    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "slave01:9092,slave02:9092,slave03:9092",
      "auto.offset.reset" -> "smallest"
    )
    val kafkaHelper = new KafkaClusterHelper(kafkaParams)

    // Outside local mode the configuration is left untouched (app name / master are expected
    // to be supplied by the launcher).
    val conf =
      if (isLocal) {
        new SparkConf()
          .setAppName("StreamingHbase")
          .setMaster("local[4]")
          .set("spark.local.dir", "/data/tmp/")
      } else {
        new SparkConf
      }
    val sc = new SparkContext(conf)

    val blockSize = 1024 * 1024 * 128 // 128MB
    sc.hadoopConfiguration.setInt("dfs.blocksize", blockSize)
    sc.hadoopConfiguration.setInt("parquet.block.size", blockSize)

    val ctx = new SQLContext(sc)
    import ctx.implicits._
    ctx.setConf("spark.sql.parquet.mergeSchema", "true")

    /**
      * Reads one bounded batch from Kafka, saves it through Phoenix and then stores the new
      * offsets in ZooKeeper.
      *
      * @return true when at least one partition still has more than
      *         [[MaxOffsetsPerPartition]] unread offsets, i.e. another batch should follow
      *         immediately.
      */
    def consumeBatch(): Boolean = {
      // Offsets stored in ZooKeeper; partitions without a stored offset fall back to the value
      // chosen by auto.offset.reset ("smallest" here).
      val offsets = ZookeeperHelper.loadOffsets(
        topicsSet, kafkaHelper.getFromOffsets(kafkaParams, topicsSet))
      // Latest offsets currently available on the partition leaders.
      val latestOffsets =
        KafkaClusterHelper.checkErrors(kafkaHelper.getLatestLeaderOffsets(offsets.keySet))

      val backlogRemaining = offsets.exists { case (tp, fromOffset) =>
        latestOffsets(tp).offset - fromOffset > MaxOffsetsPerPartition
      }

      // Cap every partition at MaxOffsetsPerPartition offsets per batch.
      val offsetRanges = offsets.keys.map { tp =>
        val fromOffset = offsets(tp)
        val untilOffset = math.min(fromOffset + MaxOffsetsPerPartition, latestOffsets(tp).offset)
        OffsetRange(tp, fromOffset, untilOffset)
      }.toArray

      val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
        sc, kafkaParams, offsetRanges)
      // NOTE(review): each count()/save() below re-reads the same offset ranges from Kafka.
      LOG.info(s"rdd===================》${rdd.count()}条记录")

      // Parse the JSON payloads and expose them as a temporary table.
      rdd.map { case (_, v) => v }
        .map(WebLog.fromJson)
        .filter(message => message != null)
        .toDF().registerTempTable("kafkaTable")

      // Upper-case aliases so the DataFrame column names match the Phoenix column names.
      val resultDF = ctx.sql("select appkey AS APPKEY ,day AS DAY , timestamp AS TIMESTAMP,cookie AS COOKIE,short_cookie AS SHORT_COOKIE,request_method AS REQUEST_METHOD," +
        "status AS STATUS,http_referer AS HTTP_REFERER,http_user_agent AS HTTP_USER_AGENT,http_x_forwarded_for AS HTTP_X_FORWARDED_FOR," +
        "http_url AS HTTP_URL,to_target AS TO_TARGET ,duration AS DURATION,event AS EVENT,is_new AS IS_NEW,page_id AS PAGE_ID from kafkaTable")

      // NOTE(review): the zkUrl hosts are separated by ';' — confirm this is the separator the
      // deployed Phoenix version expects.
      resultDF.write.mode(SaveMode.Overwrite).options(
        Map("table" -> "WEB_LOG", "zkUrl" -> "slave01:2181;slave02:2181;slave03:2181")
      ).format("org.apache.phoenix.spark").save()
      LOG.info(s"插入===================》${resultDF.count()}条记录")

      // Offsets are persisted only after the save succeeded, so a failed batch is re-read.
      // Deleting the ZooKeeper path forces a full re-import.
      val nextOffsets = offsetRanges
        .map(x => (TopicAndPartition(x.topic, x.partition), x.untilOffset))
        .toMap
      ZookeeperHelper.storeOffsets(nextOffsets)
      LOG.info("nextOffsets:" + nextOffsets.toString())

      backlogRemaining
    }

    while (true) {
      var hasMore = false
      do {
        hasMore =
          try {
            consumeBatch()
          } catch {
            // Retry the batch after a short pause; fatal errors and interrupts propagate.
            case NonFatal(e) =>
              LOG.error(e.getMessage, e)
              LOG.info("sleep on error for 5000ms")
              Thread.sleep(ErrorBackoffMs)
              true
          }
      } while (hasMore)

      // Backlog drained: wait one hour before polling again. The sleep lives outside the
      // catch-up loop so that a backlog is consumed batch after batch without hourly pauses.
      Thread.sleep(PollIntervalMs)
    }
  }
}
package com.donews.utils

/**
  * Created by yuhui on 16-6-29.
  * Copied from the spark-kafka source.
  */
import java.util.Properties

import kafka.api._
import kafka.common.{ErrorMapping, OffsetAndMetadata, OffsetMetadataAndError, TopicAndPartition}
import kafka.consumer.{ConsumerConfig, SimpleConsumer}
import org.apache.spark.SparkException

import scala.collection.mutable.ArrayBuffer
import scala.util.{Random, Try}
import scala.util.control.NonFatal

/**
  * Convenience methods for interacting with a Kafka cluster.
  *
  * Most methods return `Either[Err, T]`: `Left` carries every error collected while trying the
  * brokers, `Right` carries the result. Use [[KafkaClusterHelper.checkErrors]] to turn a `Left`
  * into a thrown `SparkException`.
  *
  * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
  *                    configuration parameters</a>.
  *                    Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
  *                    NOT zookeeper servers, specified in host1:port1,host2:port2 form
  */
class KafkaClusterHelper(val kafkaParams: Map[String, String]) extends Serializable {

  import KafkaClusterHelper.{Err, LeaderOffset, SimpleConsumerConfig}

  // ConsumerConfig isn't serializable, so it is rebuilt lazily from kafkaParams when needed.
  @transient private var _config: SimpleConsumerConfig = null

  /** Consumer configuration derived from `kafkaParams`, created on first access. */
  def config: SimpleConsumerConfig = this.synchronized {
    if (_config == null) {
      _config = SimpleConsumerConfig(kafkaParams)
    }
    _config
  }

  /** Opens a `SimpleConsumer` to the given broker using the timeouts/buffers from [[config]]. */
  def connect(host: String, port: Int): SimpleConsumer =
    new SimpleConsumer(host, port, config.socketTimeoutMs,
      config.socketReceiveBufferBytes, config.clientId)

  /** Opens a `SimpleConsumer` to the current leader of the given partition. */
  def connectLeader(topic: String, partition: Int): Either[Err, SimpleConsumer] =
    findLeader(topic, partition).right.map(hp => connect(hp._1, hp._2))

  // Metadata api
  // scalastyle:off
  // https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-MetadataAPI
  // scalastyle:on

  /**
    * Asks the seed brokers (in random order) for the leader of one partition.
    *
    * @return `Right((host, port))` of the leader, or `Left` with the errors collected so far
    *         when no broker reported a leader.
    */
  def findLeader(topic: String, partition: Int): Either[Err, (String, Int)] = {
    val req = TopicMetadataRequest(TopicMetadataRequest.CurrentVersion,
      0, config.clientId, Seq(topic))
    val errs = new Err
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp: TopicMetadataResponse = consumer.send(req)
      resp.topicsMetadata.find(_.topic == topic).flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.find(_.partitionId == partition)
      }.foreach { pm: PartitionMetadata =>
        pm.leader.foreach { leader =>
          // Non-local return: leaves findLeader as soon as one broker knows the leader.
          return Right((leader.host, leader.port))
        }
      }
    }
    Left(errs)
  }

  /**
    * Resolves the leader of every requested partition.
    *
    * @return `Right` only when a leader was found for all of `topicAndPartitions`; otherwise
    *         `Left` naming the partitions without a leader.
    */
  def findLeaders(
                   topicAndPartitions: Set[TopicAndPartition]
                 ): Either[Err, Map[TopicAndPartition, (String, Int)]] = {
    val topics = topicAndPartitions.map(_.topic)
    val response = getPartitionMetadata(topics).right
    val answer = response.flatMap { tms: Set[TopicMetadata] =>
      val leaderMap = tms.flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.flatMap { pm: PartitionMetadata =>
          val tp = TopicAndPartition(tm.topic, pm.partitionId)
          if (topicAndPartitions(tp)) {
            pm.leader.map { l =>
              tp -> (l.host -> l.port)
            }
          } else {
            None
          }
        }
      }.toMap

      if (leaderMap.keys.size == topicAndPartitions.size) {
        Right(leaderMap)
      } else {
        val missing = topicAndPartitions.diff(leaderMap.keySet)
        val err = new Err
        err.append(new SparkException(s"Couldn't find leaders for ${missing}"))
        Left(err)
      }
    }
    answer
  }

  /** Lists every partition of the given topics. */
  def getPartitions(topics: Set[String]): Either[Err, Set[TopicAndPartition]] = {
    getPartitionMetadata(topics).right.map { r =>
      r.flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.map { pm: PartitionMetadata =>
          TopicAndPartition(tm.topic, pm.partitionId)
        }
      }
    }
  }

  /**
    * Fetches topic metadata from the seed brokers (in random order).
    *
    * @return the metadata of the first broker response that contains no per-topic error code;
    *         otherwise `Left` with one `SparkException` per failing topic and broker.
    */
  def getPartitionMetadata(topics: Set[String]): Either[Err, Set[TopicMetadata]] = {
    val req = TopicMetadataRequest(
      TopicMetadataRequest.CurrentVersion, 0, config.clientId, topics.toSeq)
    val errs = new Err
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp: TopicMetadataResponse = consumer.send(req)
      val respErrs = resp.topicsMetadata.filter(m => m.errorCode != ErrorMapping.NoError)

      if (respErrs.isEmpty) {
        return Right(resp.topicsMetadata.toSet)
      } else {
        respErrs.foreach { m =>
          val cause = ErrorMapping.exceptionFor(m.errorCode)
          val msg = s"Error getting partition metadata for '${m.topic}'. Does the topic exist?"
          errs.append(new SparkException(msg, cause))
        }
      }
    }
    Left(errs)
  }

  /** Latest offset of every requested partition, as reported by its leader. */
  def getLatestLeaderOffsets(
                              topicAndPartitions: Set[TopicAndPartition]
                            ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
    getLeaderOffsets(topicAndPartitions, OffsetRequest.LatestTime)

  /** Earliest offset of every requested partition, as reported by its leader. */
  def getEarliestLeaderOffsets(
                                topicAndPartitions: Set[TopicAndPartition]
                              ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
    getLeaderOffsets(topicAndPartitions, OffsetRequest.EarliestTime)

  /** Single-offset variant of the three-argument overload: keeps the first offset returned. */
  def getLeaderOffsets(
                        topicAndPartitions: Set[TopicAndPartition],
                        before: Long
                      ): Either[Err, Map[TopicAndPartition, LeaderOffset]] = {
    getLeaderOffsets(topicAndPartitions, before, 1).right.map { r =>
      r.map { kv =>
        // mapValues isnt serializable, see SI-7005
        kv._1 -> kv._2.head
      }
    }
  }

  /** Inverts a map: each value is mapped to all keys that pointed to it. */
  private def flip[K, V](m: Map[K, V]): Map[V, Seq[K]] =
    m.groupBy(_._2).map { kv =>
      kv._1 -> kv._2.keys.toSeq
    }

  /**
    * Requests up to `maxNumOffsets` offsets before `before` from the leader of every partition.
    *
    * @return `Right` once offsets for all requested partitions were collected; otherwise `Left`
    *         with the collected errors plus a summary of the missing partitions.
    */
  def getLeaderOffsets(
                        topicAndPartitions: Set[TopicAndPartition],
                        before: Long,
                        maxNumOffsets: Int
                      ): Either[Err, Map[TopicAndPartition, Seq[LeaderOffset]]] = {
    findLeaders(topicAndPartitions).right.flatMap { tpToLeader =>
      // Group partitions by leader so that each leader is queried once.
      val leaderToTp: Map[(String, Int), Seq[TopicAndPartition]] = flip(tpToLeader)
      val leaders = leaderToTp.keys
      var result = Map[TopicAndPartition, Seq[LeaderOffset]]()
      val errs = new Err
      withBrokers(leaders, errs) { consumer =>
        val partitionsToGetOffsets: Seq[TopicAndPartition] =
          leaderToTp((consumer.host, consumer.port))
        val reqMap = partitionsToGetOffsets.map { tp: TopicAndPartition =>
          tp -> PartitionOffsetRequestInfo(before, maxNumOffsets)
        }.toMap
        val req = OffsetRequest(reqMap)
        val resp = consumer.getOffsetsBefore(req)
        val respMap = resp.partitionErrorAndOffsets
        partitionsToGetOffsets.foreach { tp: TopicAndPartition =>
          respMap.get(tp).foreach { por: PartitionOffsetsResponse =>
            if (por.error == ErrorMapping.NoError) {
              if (por.offsets.nonEmpty) {
                result += tp -> por.offsets.map { off =>
                  LeaderOffset(consumer.host, consumer.port, off)
                }
              } else {
                errs.append(new SparkException(
                  s"Empty offsets for ${tp}, is ${before} before log beginning?"))
              }
            } else {
              errs.append(ErrorMapping.exceptionFor(por.error))
            }
          }
        }
        if (result.keys.size == topicAndPartitions.size) {
          return Right(result)
        }
      }
      val missing = topicAndPartitions.diff(result.keySet)
      errs.append(new SparkException(s"Couldn't find leader offsets for ${missing}"))
      Left(errs)
    }
  }

  // Consumer offset api
  // scalastyle:off
  // https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
  // scalastyle:on

  // this 0 here indicates api version, in this case the original ZK backed api.
  private def defaultConsumerApiVersion: Short = 0

  /** Requires Kafka >= 0.8.1.1 */
  def getConsumerOffsets(
                          groupId: String,
                          topicAndPartitions: Set[TopicAndPartition]
                        ): Either[Err, Map[TopicAndPartition, Long]] =
    getConsumerOffsets(groupId, topicAndPartitions, defaultConsumerApiVersion)

  /** Committed offsets of `groupId`, stripped of their metadata. */
  def getConsumerOffsets(
                          groupId: String,
                          topicAndPartitions: Set[TopicAndPartition],
                          consumerApiVersion: Short
                        ): Either[Err, Map[TopicAndPartition, Long]] = {
    getConsumerOffsetMetadata(groupId, topicAndPartitions, consumerApiVersion).right.map { r =>
      r.map { kv =>
        kv._1 -> kv._2.offset
      }
    }
  }

  /** Requires Kafka >= 0.8.1.1 */
  def getConsumerOffsetMetadata(
                                 groupId: String,
                                 topicAndPartitions: Set[TopicAndPartition]
                               ): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] =
    getConsumerOffsetMetadata(groupId, topicAndPartitions, defaultConsumerApiVersion)

  /**
    * Fetches the committed offsets (with metadata) of `groupId` from the seed brokers, trying
    * brokers in random order until every requested partition has an answer.
    */
  def getConsumerOffsetMetadata(
                                 groupId: String,
                                 topicAndPartitions: Set[TopicAndPartition],
                                 consumerApiVersion: Short
                               ): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] = {
    var result = Map[TopicAndPartition, OffsetMetadataAndError]()
    val req = OffsetFetchRequest(groupId, topicAndPartitions.toSeq, consumerApiVersion)
    val errs = new Err
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp = consumer.fetchOffsets(req)
      val respMap = resp.requestInfo
      // Only look at partitions that earlier brokers did not already answer.
      val needed = topicAndPartitions.diff(result.keySet)
      needed.foreach { tp: TopicAndPartition =>
        respMap.get(tp).foreach { ome: OffsetMetadataAndError =>
          if (ome.error == ErrorMapping.NoError) {
            result += tp -> ome
          } else {
            errs.append(ErrorMapping.exceptionFor(ome.error))
          }
        }
      }
      if (result.keys.size == topicAndPartitions.size) {
        return Right(result)
      }
    }
    val missing = topicAndPartitions.diff(result.keySet)
    errs.append(new SparkException(s"Couldn't find consumer offsets for ${missing}"))
    Left(errs)
  }

  /** Requires Kafka >= 0.8.1.1 */
  def setConsumerOffsets(
                          groupId: String,
                          offsets: Map[TopicAndPartition, Long]
                        ): Either[Err, Map[TopicAndPartition, Short]] =
    setConsumerOffsets(groupId, offsets, defaultConsumerApiVersion)

  /** Commits plain offsets for `groupId` by wrapping them in `OffsetAndMetadata`. */
  def setConsumerOffsets(
                          groupId: String,
                          offsets: Map[TopicAndPartition, Long],
                          consumerApiVersion: Short
                        ): Either[Err, Map[TopicAndPartition, Short]] = {
    val meta = offsets.map { kv =>
      kv._1 -> OffsetAndMetadata(kv._2)
    }
    setConsumerOffsetMetadata(groupId, meta, consumerApiVersion)
  }

  /** Requires Kafka >= 0.8.1.1 */
  def setConsumerOffsetMetadata(
                                 groupId: String,
                                 metadata: Map[TopicAndPartition, OffsetAndMetadata]
                               ): Either[Err, Map[TopicAndPartition, Short]] =
    setConsumerOffsetMetadata(groupId, metadata, defaultConsumerApiVersion)

  /**
    * Commits offsets (with metadata) for `groupId`, trying the seed brokers in random order
    * until every partition was committed without error.
    *
    * @return `Right` mapping each partition to its (no-error) status code, or `Left` with the
    *         collected errors plus a summary of the partitions that could not be committed.
    */
  def setConsumerOffsetMetadata(
                                 groupId: String,
                                 metadata: Map[TopicAndPartition, OffsetAndMetadata],
                                 consumerApiVersion: Short
                               ): Either[Err, Map[TopicAndPartition, Short]] = {
    var result = Map[TopicAndPartition, Short]()
    val req = OffsetCommitRequest(groupId, metadata, consumerApiVersion)
    val errs = new Err
    val topicAndPartitions = metadata.keySet
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp = consumer.commitOffsets(req)
      val respMap = resp.commitStatus
      val needed = topicAndPartitions.diff(result.keySet)
      needed.foreach { tp: TopicAndPartition =>
        respMap.get(tp).foreach { err: Short =>
          if (err == ErrorMapping.NoError) {
            result += tp -> err
          } else {
            errs.append(ErrorMapping.exceptionFor(err))
          }
        }
      }
      if (result.keys.size == topicAndPartitions.size) {
        return Right(result)
      }
    }
    val missing = topicAndPartitions.diff(result.keySet)
    errs.append(new SparkException(s"Couldn't set offsets for ${missing}"))
    Left(errs)
  }

  /**
    * Runs `fn` against each broker in turn, appending non-fatal failures to `errs` and always
    * closing the consumer. Callers stop the iteration early with a non-local `return` inside
    * `fn`; that control-flow throwable is not matched by `NonFatal`, so it passes through the
    * catch below while the `finally` still closes the consumer.
    */
  private def withBrokers(brokers: Iterable[(String, Int)], errs: Err)
                         (fn: SimpleConsumer => Any): Unit = {
    brokers.foreach { hp =>
      var consumer: SimpleConsumer = null
      try {
        consumer = connect(hp._1, hp._2)
        fn(consumer)
      } catch {
        case NonFatal(e) =>
          errs.append(e)
      } finally {
        if (consumer != null) {
          consumer.close()
        }
      }
    }
  }

  /**
    * Starting offsets for every partition of `topics`: the earliest available offsets when
    * `auto.offset.reset` is "smallest" (case-insensitive), the latest offsets otherwise.
    *
    * @throws SparkException when partitions or offsets could not be fetched.
    */
  def getFromOffsets(kafkaParams: Map[String, String], topics: Set[String]): Map[TopicAndPartition, Long] = {
    val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    val result = for {
      topicPartitions <- getPartitions(topics).right
      leaderOffsets <- (if (reset == Some("smallest")) {
        getEarliestLeaderOffsets(topicPartitions)
      } else {
        getLatestLeaderOffsets(topicPartitions)
      }).right
    } yield {
      leaderOffsets.map { case (tp, lo) =>
        (tp, lo.offset)
      }
    }
    KafkaClusterHelper.checkErrors(result)
  }
}

object KafkaClusterHelper {
  /** Accumulator for the errors met while trying several brokers. */
  type Err = ArrayBuffer[Throwable]

  /** If the result is right, return it, otherwise throw SparkException */
  def checkErrors[T](result: Either[Err, T]): T = {
    result.fold(
      errs => throw new SparkException(errs.mkString("\n")),
      ok => ok
    )
  }

  /** An offset together with the broker (host, port) that reported it. */
  case class LeaderOffset(host: String, port: Int, offset: Long)

  /**
    * High-level kafka consumers connect to ZK.  ConsumerConfig assumes this use case.
    * Simple consumers connect directly to brokers, but need many of the same configs.
    * This subclass won't warn about missing ZK params, or presence of broker params.
    */
  class SimpleConsumerConfig private(brokers: String, originalProps: Properties)
    extends ConsumerConfig(originalProps) {
    /**
      * Brokers parsed from the comma-separated `host:port` list. Surrounding whitespace is
      * ignored; an entry that is not exactly `<host>:<numeric port>` raises a SparkException.
      */
    val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp =>
      hp.trim.split(":") match {
        case Array(host, port) if host.nonEmpty && Try(port.toInt).isSuccess =>
          (host, port.toInt)
        case _ =>
          throw new SparkException(s"Broker not in the correct format of <host>:<port> [$brokers]")
      }
    }
  }

  object SimpleConsumerConfig {
    /**
      * Make a consumer config without requiring group.id or zookeeper.connect,
      * since communicating with brokers also needs common settings such as timeout
      */
    def apply(kafkaParams: Map[String, String]): SimpleConsumerConfig = {
      // These keys are from other pre-existing kafka configs for specifying brokers, accept either
      val brokers = kafkaParams.get("metadata.broker.list")
        .orElse(kafkaParams.get("bootstrap.servers"))
        .getOrElse(throw new SparkException(
          "Must specify metadata.broker.list or bootstrap.servers"))

      val props = new Properties()
      kafkaParams.foreach { case (key, value) =>
        // prevent warnings on parameters ConsumerConfig doesn't know about
        if (key != "metadata.broker.list" && key != "bootstrap.servers") {
          props.put(key, value)
        }
      }

      // ConsumerConfig insists on these keys; empty values are enough for a simple consumer.
      Seq("zookeeper.connect", "group.id").foreach { s =>
        if (!props.containsKey(s)) {
          props.setProperty(s, "")
        }
      }

      new SimpleConsumerConfig(brokers, props)
    }
  }

  /** Small manual check of how the `auto.offset.reset` value is compared. */
  def main(args: Array[String]): Unit = {
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "spark-slave03:9092,spark-slave04:9092,spark-slave05:9092",
      "auto.offset.reset" -> "smallest"
    )
    val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    println(reset == Some("smallest"))
  }
}
package com.donews.utils

import java.util.Properties

import com.typesafe.config.{Config, ConfigFactory}

/**
  * Application settings read from the configuration returned by `ConfigFactory.load()`.
  *
  * Every setting is a `lazy val`, so a missing key only fails when that setting is first used.
  */
object WebConfig {
  private val conf: Config = ConfigFactory.load()

  /** Value of `kafka.metadata.broker.list`. */
  lazy val KAFKA_BROKER_LIST: String = conf.getString("kafka.metadata.broker.list")

  /** Value of `zookeeper.connect`. */
  lazy val ZOOKEEPER_CONNECT: String = conf.getString("zookeeper.connect")

  /** Value of `Hbase.url`. */
  lazy val HBASE_URL: String = conf.getString("Hbase.url")

  /** Value of `zookeeper.offset`. */
  lazy val ZOOKEEPER_OFFSET: String = conf.getString("zookeeper.offset")

  /**
    * Flattens a config into `java.util.Properties`.
    *
    * @param config configuration whose entries are copied
    * @return properties holding one entry per config path, with the unwrapped value
    */
  private def propsFromConfig(config: Config): Properties = {
    // Explicit converters instead of the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    val props = new Properties()
    config.entrySet().asScala.foreach { entry =>
      props.put(entry.getKey, entry.getValue.unwrapped())
    }
    props
  }
}
package com.donews.utils

import java.util.Properties

import com.typesafe.config.{Config, ConfigFactory}

// NOTE(review): this listing repeats the WebConfig object shown in the previous listing of the
// article; only one copy of the object belongs in the project.

/**
  * Application settings read from the configuration returned by `ConfigFactory.load()`.
  *
  * Every setting is a `lazy val`, so a missing key only fails when that setting is first used.
  */
object WebConfig {
  private val conf: Config = ConfigFactory.load()

  /** Value of `kafka.metadata.broker.list`. */
  lazy val KAFKA_BROKER_LIST: String = conf.getString("kafka.metadata.broker.list")

  /** Value of `zookeeper.connect`. */
  lazy val ZOOKEEPER_CONNECT: String = conf.getString("zookeeper.connect")

  /** Value of `Hbase.url`. */
  lazy val HBASE_URL: String = conf.getString("Hbase.url")

  /** Value of `zookeeper.offset`. */
  lazy val ZOOKEEPER_OFFSET: String = conf.getString("zookeeper.offset")

  /**
    * Flattens a config into `java.util.Properties`.
    *
    * @param config configuration whose entries are copied
    * @return properties holding one entry per config path, with the unwrapped value
    */
  private def propsFromConfig(config: Config): Properties = {
    // Explicit converters instead of the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    val props = new Properties()
    config.entrySet().asScala.foreach { entry =>
      props.put(entry.getKey, entry.getValue.unwrapped())
    }
    props
  }
}
package com.donews.utils

import com.fasterxml.jackson.annotation.JsonIgnoreProperties
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.slf4j.LoggerFactory

import scala.util.control.NonFatal

/**
  * Created by yuhui on 16-9-20
  *
  * One web-log record consumed from Kafka. JSON properties that have no matching field are
  * ignored during deserialization. `appkey` and `day` are mutable; `appkey` is defaulted by
  * [[WebLog.fromJson]] when the message does not carry one.
  */
@JsonIgnoreProperties(ignoreUnknown = true)
case class WebLog(
                   var appkey: String,
                   timestamp: String,
                   cookie: String,
                   short_cookie: String,
                   request_method: String,
                   status: java.lang.Integer,
                   http_referer: String,
                   http_user_agent: String,
                   http_x_forwarded_for: String,
                   http_url: String,
                   to_target: String,
                   duration: java.lang.Integer,
                   event: String,
                   is_new: java.lang.Integer,
                   page_id: String,
                   var day: String
                 )

/**
  * Parses and filters the raw lines consumed from Kafka and turns them into [[WebLog]] objects.
  */
object WebLog {
  private val LOG = LoggerFactory.getLogger(WebLog.getClass)

  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)

  /** Application key assigned to records that arrive without one. */
  private val DefaultAppKey = "donews_website"

  private def isNullOrEmpty(s: String): Boolean = s == null || s.isEmpty

  /**
    * Parses one JSON message.
    *
    * @param value raw JSON text, may be null
    * @return the parsed record, or null when `value` is null, is the JSON literal `null`,
    *         cannot be parsed, or lacks a non-empty `timestamp` or `cookie`. Callers are
    *         expected to filter the nulls out.
    */
  def fromJson(value: String): WebLog = {
    if (value == null) {
      null
    } else {
      try {
        // readValue yields null for the JSON literal `null`; treat it like any rejected record.
        val obj = mapper.readValue(value, classOf[WebLog])
        if (obj == null) {
          null
        } else {
          if (isNullOrEmpty(obj.appkey)) {
            obj.appkey = DefaultAppKey
          }
          if (isNullOrEmpty(obj.timestamp) || isNullOrEmpty(obj.cookie)) null else obj
        }
      } catch {
        // Malformed messages are logged and dropped; fatal errors and interrupts propagate.
        case NonFatal(e) =>
          LOG.info(e.getMessage, e)
          null
      }
    }
  }
}
package com.donews.utils

import java.nio.charset.StandardCharsets

import kafka.common.TopicAndPartition
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._

/**
  * Created by yuhui on 16-6-8.
  *
  * Stores and loads Kafka offsets in ZooKeeper through a Curator client.
  *
  * All paths are relative to the client's namespace ("webstatistic_test"). Offsets live at
  * `/kafkaOffsets/<topic>/<partition>`, with the offset written as decimal text.
  */
object ZookeeperHelper {
  val LOG = LoggerFactory.getLogger(ZookeeperHelper.getClass)

  /** Curator client, started when this object is first initialised. */
  val client = {
    val client = CuratorFrameworkFactory
      .builder
      .connectString(WebConfig.ZOOKEEPER_CONNECT)
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .namespace("webstatistic_test")
      .build()
    client.start()
    client
  }

  /** Root node (inside the namespace) under which the offsets are kept. */
  private val KafkaOffsetPath = "/kafkaOffsets"

  /** Creates `path`, including missing parents, when it does not exist yet. */
  def ensurePathExists(path: String): Unit = {
    if (client.checkExists().forPath(path) == null) {
      client.create().creatingParentsIfNeeded().forPath(path)
    }
  }

  /**
    * Loads the stored offsets of the given topics.
    *
    * @param topicSet      topics whose stored offsets should be read
    * @param defaultOffset offsets used for partitions that have nothing stored
    * @return `defaultOffset` overridden by every offset found in ZooKeeper
    */
  def loadOffsets(topicSet: Set[String], defaultOffset: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
    ensurePathExists(KafkaOffsetPath)
    val offsets = for {
      // t: each topic node below /kafkaOffsets
      t <- client.getChildren.forPath(KafkaOffsetPath).asScala
      if topicSet.contains(t)
      // p: each partition node below /kafkaOffsets/<topic>
      p <- client.getChildren.forPath(s"$KafkaOffsetPath/$t").asScala
    } yield {
      // The node data is the offset as decimal text.
      val data = client.getData.forPath(s"$KafkaOffsetPath/$t/$p")
      val offset = new String(data, StandardCharsets.UTF_8).toLong
      (TopicAndPartition(t, p.toInt), offset)
    }
    defaultOffset ++ offsets.toMap
  }

  /**
    * Writes one node per topic/partition containing the offset as decimal text, creating the
    * nodes when necessary.
    */
  def storeOffsets(offsets: Map[TopicAndPartition, Long]): Unit = {
    ensurePathExists(KafkaOffsetPath)
    for ((tp, offset) <- offsets) {
      val data = String.valueOf(offset).getBytes(StandardCharsets.UTF_8)
      val path = s"$KafkaOffsetPath/${tp.topic}/${tp.partition}"
      ensurePathExists(path)
      client.setData().forPath(path, data)
    }
  }
}
0 0
原创粉丝点击