Kafka: reading Kafka data into HBase with no data loss
Contents
1. Requirements
2. Code

1. Requirements
Goal: load Kafka data into HBase.
Steps:
1. Read the offsets recorded for topicName in Zookeeper.
2. Repeatedly read the latest offsets for topicName from Kafka and compare them with the offsets stored in Zookeeper.
3. Map the records onto a Scala case class prepared in advance.
4. Convert the extracted data into a DataFrame.
5. Write the DataFrame to HBase through Phoenix.
Note: this post stores the Kafka offsets in Zookeeper. Reading the following post first will make the rest much easier to follow:
http://blog.csdn.net/silentwolfyh/article/details/52985171
Zookeeper / Curator framework usage and common commands
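Put together, the five steps boil down to the loop below. This is only a condensed sketch of one pass of the full StreamingHbase job listed in the next section; the helpers it calls (ZookeeperHelper, KafkaClusterHelper) are also listed there.

  // Condensed sketch of one pass of the zero-loss loop (full code in section 2).
  val fromOffsets = ZookeeperHelper.loadOffsets(topicsSet,
    kafkaHelper.getFromOffsets(kafkaParams, topicsSet))                 // step 1: offsets from Zookeeper
  val latest = KafkaClusterHelper.checkErrors(
    kafkaHelper.getLatestLeaderOffsets(fromOffsets.keySet))             // step 2: latest offsets from Kafka
  val ranges = fromOffsets.keys.map { tp =>
    OffsetRange(tp, fromOffsets(tp),
      Math.min(fromOffsets(tp) + 1024 * 1024, latest(tp).offset))       // cap the size of one batch
  }.toArray
  val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
    sc, kafkaParams, ranges)                                            // read exactly the capped range
  // steps 3-5: map to WebLog, convert to a DataFrame, save through Phoenix ...
  ZookeeperHelper.storeOffsets(ranges.map(r =>
    (TopicAndPartition(r.topic, r.partition), r.untilOffset)).toMap)    // commit offsets only after the write

Because the offsets are committed to Zookeeper only after the Phoenix write has succeeded, a crash simply causes the same offset range to be read again on the next pass.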
2. Code and structure
StreamingHbase (job entry point):

package com.donews.kafka2hbase

import com.donews.utils.{KafkaClusterHelper, ZookeeperHelper, WebLog}
import kafka.common.TopicAndPartition
import kafka.serializer.StringDecoder
import org.apache.commons.cli.{GnuParser, HelpFormatter, Options}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory

/**
 * Created by yuhui on 16-09-23.
 *
 * Goal: load Kafka data into HBase.
 * Steps:
 *   1. Read the offsets recorded for topicName in Zookeeper.
 *   2. Repeatedly read the latest offsets for topicName from Kafka and compare them with the Zookeeper offsets.
 *   3. Map the records onto a Scala case class prepared in advance.
 *   4. Convert the extracted data into a DataFrame.
 *   5. Write the DataFrame to HBase through Phoenix.
 *
 * HBase (Phoenix) DDL:
 *   CREATE TABLE WEB_LOG(appkey VARCHAR not null, day VARCHAR, timestamp VARCHAR not null,
 *     cookie VARCHAR, short_cookie VARCHAR, request_method VARCHAR, status INTEGER,
 *     http_referer VARCHAR, http_user_agent VARCHAR, http_x_forwarded_for VARCHAR,
 *     http_url VARCHAR, to_target VARCHAR, duration INTEGER, event VARCHAR, is_new INTEGER,
 *     page_id VARCHAR,
 *     CONSTRAINT pk PRIMARY KEY (appkey, day, timestamp, cookie, short_cookie)) SALT_BUCKETS = 20;
 */
object StreamingHbase {

  val LOG = LoggerFactory.getLogger(StreamingHbase.getClass)

  def main(args: Array[String]): Unit = {
    val options = new Options()
      .addOption("l", "local", false, "run in local mode")
      .addOption("s", "source", true, "data source: Kafka topic name(s)")
      .addOption("h", "help", false, "print this help message")
    val parser = new GnuParser()
    val cmdLine = parser.parse(options, args)
    val processorName = cmdLine.getArgs.headOption.getOrElse("druid")
    val isLocal = cmdLine.hasOption("local")
    val source = cmdLine.getOptionValue("source")
    if (cmdLine.hasOption("h")) {
      val formatter = new HelpFormatter()
      formatter.printHelp("StreamingHbase", options)
      System.exit(0)
    }

    val topicsSet = source
      .split(",")
      .filterNot(_ == null)
      .map(_.trim)
      .filterNot(_.isEmpty)
      .toSet
    LOG.info("topicSet:{}===> " + topicsSet)

    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "slave01:9092,slave02:9092,slave03:9092",
      "auto.offset.reset" -> "smallest"
    )
    val kafkaHelper = new KafkaClusterHelper(kafkaParams)

    var conf = new SparkConf
    if (isLocal) {
      conf = conf.setAppName("StreamingHbase")
        .setMaster("local[4]")
        .set("spark.local.dir", "/data/tmp/")
    }
    val sc = new SparkContext(conf)
    val blockSize = 1024 * 1024 * 128 // 128MB
    sc.hadoopConfiguration.setInt("dfs.blocksize", blockSize)
    sc.hadoopConfiguration.setInt("parquet.block.size", blockSize)
    val ctx = new SQLContext(sc)
    import ctx.implicits._
    ctx.setConf("spark.sql.parquet.mergeSchema", "true")

    // Once inside the while loop, the job reads the offsets from Zookeeper every hour
    // and consumes the corresponding data from Kafka.
    while (true) {
      var hasMore = false
      do {
        try {
          hasMore = false
          // Load the latest offsets from Zookeeper; on the first run fall back to the
          // "smallest" offsets implied by kafkaParams.
          val offsets = ZookeeperHelper.loadOffsets(topicsSet,
            kafkaHelper.getFromOffsets(kafkaParams, topicsSet))
          // LOG.info("offsets.keySet===>" + offsets.toString())

          // Fetch the latest offsets from Kafka.
          val latestOffsets = KafkaClusterHelper.checkErrors(
            kafkaHelper.getLatestLeaderOffsets(offsets.keySet))
          // LOG.info("latestOffsets:" + latestOffsets.toString())

          val offsetRanges = offsets.keys.map { tp =>
            val fromOffset = offsets(tp)
            val latestOffset = latestOffsets(tp).offset
            if (latestOffset - fromOffset > 1024 * 1024) {
              hasMore = true
            }
            // LOG.info("fromOffset:" + fromOffset + "===>latestOffset:" + latestOffset)
            val chaOffset = latestOffset - fromOffset
            // LOG.info("latestOffset - fromOffset:===>" + chaOffset)

            // Cap each pull to about 500 MB worth of messages.
            OffsetRange(tp, fromOffset, Math.min(fromOffset + 1024 * 1024, latestOffset))
          }.toArray

          val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
            sc, kafkaParams, offsetRanges)
          LOG.info("rdd===================> " + rdd.count() + " records")

          // Register the JSON data as a temporary table.
          rdd.map { case (k, v) => v }
            .map(WebLog.fromJson)
            .filter(message => message != null)
            .toDF().registerTempTable("kafkaTable")

          // Turn the result into a DataFrame.
          val resultDF = ctx.sql("select appkey AS APPKEY, day AS DAY, timestamp AS TIMESTAMP, cookie AS COOKIE, short_cookie AS SHORT_COOKIE, request_method AS REQUEST_METHOD, " +
            "status AS STATUS, http_referer AS HTTP_REFERER, http_user_agent AS HTTP_USER_AGENT, http_x_forwarded_for AS HTTP_X_FORWARDED_FOR, " +
            "http_url AS HTTP_URL, to_target AS TO_TARGET, duration AS DURATION, event AS EVENT, is_new AS IS_NEW, page_id AS PAGE_ID from kafkaTable")

          resultDF.write.mode(SaveMode.Overwrite).options(
            Map("table" -> "WEB_LOG",
                "zkUrl" -> "slave01:2181;slave02:2181;slave03:2181")
          ).format("org.apache.phoenix.spark").save()
          LOG.info("inserted ===================> " + resultDF.count() + " records")

          val nextOffsets = offsetRanges
            .map(x => (TopicAndPartition(x.topic, x.partition), x.untilOffset)).toMap
          // Store the offsets back to Zookeeper. The Zookeeper path can be deleted to force
          // the data to be re-read, which guarantees no data is lost.
          ZookeeperHelper.storeOffsets(nextOffsets)
          LOG.info("nextOffsets:" + nextOffsets.toString())

          // Run once per hour.
          Thread.sleep(1000L * 60 * 60)
        } catch {
          // On error, log it, back off briefly and retry.
          case e: Exception =>
            LOG.error(e.getMessage, e)
            LOG.info("sleep on error for 5000ms")
            Thread.sleep(1000L * 5)
            hasMore = true
        }
      } while (hasMore)
    }
  }
}
KafkaClusterHelper (copied from the Spark Kafka source; broker metadata and offset utilities):

package com.donews.utils

/**
 * Created by yuhui on 16-6-29.
 * copy from spark-kafka source
 */

import java.util.Properties

import kafka.api._
import kafka.common.{ErrorMapping, OffsetAndMetadata, OffsetMetadataAndError, TopicAndPartition}
import kafka.consumer.{ConsumerConfig, SimpleConsumer}
import org.apache.spark.SparkException

import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import scala.util.control.NonFatal

/**
 * Convenience methods for interacting with a Kafka cluster.
 *
 * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
 * configuration parameters</a>.
 * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
 * NOT zookeeper servers, specified in host1:port1,host2:port2 form
 */
class KafkaClusterHelper(val kafkaParams: Map[String, String]) extends Serializable {
  import KafkaClusterHelper.{Err, LeaderOffset, SimpleConsumerConfig}

  // ConsumerConfig isn't serializable
  @transient private var _config: SimpleConsumerConfig = null

  def config: SimpleConsumerConfig = this.synchronized {
    if (_config == null) {
      _config = SimpleConsumerConfig(kafkaParams)
    }
    _config
  }

  def connect(host: String, port: Int): SimpleConsumer =
    new SimpleConsumer(host, port, config.socketTimeoutMs,
      config.socketReceiveBufferBytes, config.clientId)

  def connectLeader(topic: String, partition: Int): Either[Err, SimpleConsumer] =
    findLeader(topic, partition).right.map(hp => connect(hp._1, hp._2))

  // Metadata api
  // scalastyle:off
  // https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-MetadataAPI
  // scalastyle:on

  def findLeader(topic: String, partition: Int): Either[Err, (String, Int)] = {
    val req = TopicMetadataRequest(TopicMetadataRequest.CurrentVersion,
      0, config.clientId, Seq(topic))
    val errs = new Err
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp: TopicMetadataResponse = consumer.send(req)
      resp.topicsMetadata.find(_.topic == topic).flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.find(_.partitionId == partition)
      }.foreach { pm: PartitionMetadata =>
        pm.leader.foreach { leader =>
          return Right((leader.host, leader.port))
        }
      }
    }
    Left(errs)
  }

  def findLeaders(
      topicAndPartitions: Set[TopicAndPartition]
    ): Either[Err, Map[TopicAndPartition, (String, Int)]] = {
    val topics = topicAndPartitions.map(_.topic)
    val response = getPartitionMetadata(topics).right
    val answer = response.flatMap { tms: Set[TopicMetadata] =>
      val leaderMap = tms.flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.flatMap { pm: PartitionMetadata =>
          val tp = TopicAndPartition(tm.topic, pm.partitionId)
          if (topicAndPartitions(tp)) {
            pm.leader.map { l =>
              tp -> (l.host -> l.port)
            }
          } else {
            None
          }
        }
      }.toMap
      if (leaderMap.keys.size == topicAndPartitions.size) {
        Right(leaderMap)
      } else {
        val missing = topicAndPartitions.diff(leaderMap.keySet)
        val err = new Err
        err.append(new SparkException(s"Couldn't find leaders for ${missing}"))
        Left(err)
      }
    }
    answer
  }

  def getPartitions(topics: Set[String]): Either[Err, Set[TopicAndPartition]] = {
    getPartitionMetadata(topics).right.map { r =>
      r.flatMap { tm: TopicMetadata =>
        tm.partitionsMetadata.map { pm: PartitionMetadata =>
          TopicAndPartition(tm.topic, pm.partitionId)
        }
      }
    }
  }

  def getPartitionMetadata(topics: Set[String]): Either[Err, Set[TopicMetadata]] = {
    val req = TopicMetadataRequest(
      TopicMetadataRequest.CurrentVersion, 0, config.clientId, topics.toSeq)
    val errs = new Err
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp: TopicMetadataResponse = consumer.send(req)
      val respErrs = resp.topicsMetadata.filter(m => m.errorCode != ErrorMapping.NoError)
      if (respErrs.isEmpty) {
        return Right(resp.topicsMetadata.toSet)
      } else {
        respErrs.foreach { m =>
          val cause = ErrorMapping.exceptionFor(m.errorCode)
          val msg = s"Error getting partition metadata for '${m.topic}'. Does the topic exist?"
          errs.append(new SparkException(msg, cause))
        }
      }
    }
    Left(errs)
  }

  // Fetch the latest leader offsets from Kafka
  def getLatestLeaderOffsets(
      topicAndPartitions: Set[TopicAndPartition]
    ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
    getLeaderOffsets(topicAndPartitions, OffsetRequest.LatestTime)

  def getEarliestLeaderOffsets(
      topicAndPartitions: Set[TopicAndPartition]
    ): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
    getLeaderOffsets(topicAndPartitions, OffsetRequest.EarliestTime)

  def getLeaderOffsets(
      topicAndPartitions: Set[TopicAndPartition],
      before: Long
    ): Either[Err, Map[TopicAndPartition, LeaderOffset]] = {
    getLeaderOffsets(topicAndPartitions, before, 1).right.map { r =>
      r.map { kv =>
        // mapValues isnt serializable, see SI-7005
        kv._1 -> kv._2.head
      }
    }
  }

  private def flip[K, V](m: Map[K, V]): Map[V, Seq[K]] =
    m.groupBy(_._2).map { kv =>
      kv._1 -> kv._2.keys.toSeq
    }

  def getLeaderOffsets(
      topicAndPartitions: Set[TopicAndPartition],
      before: Long,
      maxNumOffsets: Int
    ): Either[Err, Map[TopicAndPartition, Seq[LeaderOffset]]] = {
    findLeaders(topicAndPartitions).right.flatMap { tpToLeader =>
      val leaderToTp: Map[(String, Int), Seq[TopicAndPartition]] = flip(tpToLeader)
      val leaders = leaderToTp.keys
      var result = Map[TopicAndPartition, Seq[LeaderOffset]]()
      val errs = new Err
      withBrokers(leaders, errs) { consumer =>
        val partitionsToGetOffsets: Seq[TopicAndPartition] =
          leaderToTp((consumer.host, consumer.port))
        val reqMap = partitionsToGetOffsets.map { tp: TopicAndPartition =>
          tp -> PartitionOffsetRequestInfo(before, maxNumOffsets)
        }.toMap
        val req = OffsetRequest(reqMap)
        val resp = consumer.getOffsetsBefore(req)
        val respMap = resp.partitionErrorAndOffsets
        partitionsToGetOffsets.foreach { tp: TopicAndPartition =>
          respMap.get(tp).foreach { por: PartitionOffsetsResponse =>
            if (por.error == ErrorMapping.NoError) {
              if (por.offsets.nonEmpty) {
                result += tp -> por.offsets.map { off =>
                  LeaderOffset(consumer.host, consumer.port, off)
                }
              } else {
                errs.append(new SparkException(
                  s"Empty offsets for ${tp}, is ${before} before log beginning?"))
              }
            } else {
              errs.append(ErrorMapping.exceptionFor(por.error))
            }
          }
        }
        if (result.keys.size == topicAndPartitions.size) {
          return Right(result)
        }
      }
      val missing = topicAndPartitions.diff(result.keySet)
      errs.append(new SparkException(s"Couldn't find leader offsets for ${missing}"))
      Left(errs)
    }
  }

  // Consumer offset api
  // scalastyle:off
  // https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
  // scalastyle:on

  // this 0 here indicates api version, in this case the original ZK backed api.
  private def defaultConsumerApiVersion: Short = 0

  /** Requires Kafka >= 0.8.1.1 */
  def getConsumerOffsets(
      groupId: String,
      topicAndPartitions: Set[TopicAndPartition]
    ): Either[Err, Map[TopicAndPartition, Long]] =
    getConsumerOffsets(groupId, topicAndPartitions, defaultConsumerApiVersion)

  def getConsumerOffsets(
      groupId: String,
      topicAndPartitions: Set[TopicAndPartition],
      consumerApiVersion: Short
    ): Either[Err, Map[TopicAndPartition, Long]] = {
    getConsumerOffsetMetadata(groupId, topicAndPartitions, consumerApiVersion).right.map { r =>
      r.map { kv =>
        kv._1 -> kv._2.offset
      }
    }
  }

  /** Requires Kafka >= 0.8.1.1 */
  def getConsumerOffsetMetadata(
      groupId: String,
      topicAndPartitions: Set[TopicAndPartition]
    ): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] =
    getConsumerOffsetMetadata(groupId, topicAndPartitions, defaultConsumerApiVersion)

  def getConsumerOffsetMetadata(
      groupId: String,
      topicAndPartitions: Set[TopicAndPartition],
      consumerApiVersion: Short
    ): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] = {
    var result = Map[TopicAndPartition, OffsetMetadataAndError]()
    val req = OffsetFetchRequest(groupId, topicAndPartitions.toSeq, consumerApiVersion)
    val errs = new Err
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp = consumer.fetchOffsets(req)
      val respMap = resp.requestInfo
      val needed = topicAndPartitions.diff(result.keySet)
      needed.foreach { tp: TopicAndPartition =>
        respMap.get(tp).foreach { ome: OffsetMetadataAndError =>
          if (ome.error == ErrorMapping.NoError) {
            result += tp -> ome
          } else {
            errs.append(ErrorMapping.exceptionFor(ome.error))
          }
        }
      }
      if (result.keys.size == topicAndPartitions.size) {
        return Right(result)
      }
    }
    val missing = topicAndPartitions.diff(result.keySet)
    errs.append(new SparkException(s"Couldn't find consumer offsets for ${missing}"))
    Left(errs)
  }

  /** Requires Kafka >= 0.8.1.1 */
  def setConsumerOffsets(
      groupId: String,
      offsets: Map[TopicAndPartition, Long]
    ): Either[Err, Map[TopicAndPartition, Short]] =
    setConsumerOffsets(groupId, offsets, defaultConsumerApiVersion)

  def setConsumerOffsets(
      groupId: String,
      offsets: Map[TopicAndPartition, Long],
      consumerApiVersion: Short
    ): Either[Err, Map[TopicAndPartition, Short]] = {
    val meta = offsets.map { kv =>
      kv._1 -> OffsetAndMetadata(kv._2)
    }
    setConsumerOffsetMetadata(groupId, meta, consumerApiVersion)
  }

  /** Requires Kafka >= 0.8.1.1 */
  def setConsumerOffsetMetadata(
      groupId: String,
      metadata: Map[TopicAndPartition, OffsetAndMetadata]
    ): Either[Err, Map[TopicAndPartition, Short]] =
    setConsumerOffsetMetadata(groupId, metadata, defaultConsumerApiVersion)

  def setConsumerOffsetMetadata(
      groupId: String,
      metadata: Map[TopicAndPartition, OffsetAndMetadata],
      consumerApiVersion: Short
    ): Either[Err, Map[TopicAndPartition, Short]] = {
    var result = Map[TopicAndPartition, Short]()
    val req = OffsetCommitRequest(groupId, metadata, consumerApiVersion)
    val errs = new Err
    val topicAndPartitions = metadata.keySet
    withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
      val resp = consumer.commitOffsets(req)
      val respMap = resp.commitStatus
      val needed = topicAndPartitions.diff(result.keySet)
      needed.foreach { tp: TopicAndPartition =>
        respMap.get(tp).foreach { err: Short =>
          if (err == ErrorMapping.NoError) {
            result += tp -> err
          } else {
            errs.append(ErrorMapping.exceptionFor(err))
          }
        }
      }
      if (result.keys.size == topicAndPartitions.size) {
        return Right(result)
      }
    }
    val missing = topicAndPartitions.diff(result.keySet)
    errs.append(new SparkException(s"Couldn't set offsets for ${missing}"))
    Left(errs)
  }

  // Try a call against potentially multiple brokers, accumulating errors
  private def withBrokers(brokers: Iterable[(String, Int)], errs: Err)
                         (fn: SimpleConsumer => Any): Unit = {
    brokers.foreach { hp =>
      var consumer: SimpleConsumer = null
      try {
        consumer = connect(hp._1, hp._2)
        fn(consumer)
      } catch {
        case NonFatal(e) =>
          errs.append(e)
      } finally {
        if (consumer != null) {
          consumer.close()
        }
      }
    }
  }

  // Fetch the starting offsets from Kafka (earliest or latest, depending on auto.offset.reset)
  def getFromOffsets(kafkaParams: Map[String, String],
                     topics: Set[String]): Map[TopicAndPartition, Long] = {
    val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    val result = for {
      topicPartitions <- getPartitions(topics).right
      leaderOffsets <- (if (reset == Some("smallest")) {
        getEarliestLeaderOffsets(topicPartitions)
      } else {
        getLatestLeaderOffsets(topicPartitions)
      }).right
    } yield {
      leaderOffsets.map { case (tp, lo) =>
        (tp, lo.offset)
      }
    }
    KafkaClusterHelper.checkErrors(result)
  }
}

object KafkaClusterHelper {
  type Err = ArrayBuffer[Throwable]

  /** If the result is right, return it, otherwise throw SparkException */
  def checkErrors[T](result: Either[Err, T]): T = {
    result.fold(
      errs => throw new SparkException(errs.mkString("\n")),
      ok => ok
    )
  }

  case class LeaderOffset(host: String, port: Int, offset: Long)

  /**
   * High-level kafka consumers connect to ZK. ConsumerConfig assumes this use case.
   * Simple consumers connect directly to brokers, but need many of the same configs.
   * This subclass won't warn about missing ZK params, or presence of broker params.
   */
  class SimpleConsumerConfig private(brokers: String, originalProps: Properties)
      extends ConsumerConfig(originalProps) {
    val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp =>
      val hpa = hp.split(":")
      if (hpa.size == 1) {
        throw new SparkException(s"Broker not in the correct format of <host>:<port> [$brokers]")
      }
      (hpa(0), hpa(1).toInt)
    }
  }

  object SimpleConsumerConfig {
    /**
     * Make a consumer config without requiring group.id or zookeeper.connect,
     * since communicating with brokers also needs common settings such as timeout
     */
    def apply(kafkaParams: Map[String, String]): SimpleConsumerConfig = {
      // These keys are from other pre-existing kafka configs for specifying brokers, accept either
      val brokers = kafkaParams.get("metadata.broker.list")
        .orElse(kafkaParams.get("bootstrap.servers"))
        .getOrElse(throw new SparkException(
          "Must specify metadata.broker.list or bootstrap.servers"))
      val props = new Properties()
      kafkaParams.foreach { case (key, value) =>
        // prevent warnings on parameters ConsumerConfig doesn't know about
        if (key != "metadata.broker.list" && key != "bootstrap.servers") {
          props.put(key, value)
        }
      }
      Seq("zookeeper.connect", "group.id").foreach { s =>
        if (!props.containsKey(s)) {
          props.setProperty(s, "")
        }
      }
      new SimpleConsumerConfig(brokers, props)
    }
  }

  def main(args: Array[String]) {
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "spark-slave03:9092,spark-slave04:9092,spark-slave05:9092",
      "auto.offset.reset" -> "smallest"
    )
    val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
    println(reset == Some("smallest"))
  }
}
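A minimal sketch of how the job above drives this helper; the broker list and topic name are placeholders:

  // Ask the brokers for the starting and latest offsets of one topic.
  val params = Map(
    "metadata.broker.list" -> "slave01:9092,slave02:9092,slave03:9092",
    "auto.offset.reset"    -> "smallest")
  val helper = new KafkaClusterHelper(params)
  val from   = helper.getFromOffsets(params, Set("donews_website"))   // earliest, because "smallest"
  val latest = KafkaClusterHelper.checkErrors(helper.getLatestLeaderOffsets(from.keySet))
  from.foreach { case (tp, off) => println(s"$tp: $off .. ${latest(tp).offset}") }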
WebConfig (configuration loader):

package com.donews.utils

import java.util.Properties

import com.typesafe.config.{Config, ConfigFactory}

object WebConfig {
  private val conf: Config = ConfigFactory.load()

  lazy val KAFKA_BROKER_LIST = conf.getString("kafka.metadata.broker.list")
  lazy val ZOOKEEPER_CONNECT = conf.getString("zookeeper.connect")
  lazy val HBASE_URL = conf.getString("Hbase.url")
  lazy val ZOOKEEPER_OFFSET = conf.getString("zookeeper.offset")

  /***
   * Load the configuration into a java.util.Properties object
   * @param config
   * @return
   */
  private def propsFromConfig(config: Config): Properties = {
    import scala.collection.JavaConversions._
    val props = new Properties()
    val map: Map[String, Object] = config.entrySet().map({ entry =>
      entry.getKey -> entry.getValue.unwrapped()
    })(collection.breakOut)
    props.putAll(map)
    props
  }
}
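WebConfig reads its values with Typesafe Config, so it expects an application.conf on the classpath containing at least the four keys above. A sketch with placeholder values (the hosts and paths below are assumptions, not taken from the original project):

  kafka.metadata.broker.list = "slave01:9092,slave02:9092,slave03:9092"
  zookeeper.connect          = "slave01:2181,slave02:2181,slave03:2181"
  Hbase.url                  = "slave01,slave02,slave03:2181"
  zookeeper.offset           = "/kafkaOffsets"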
WebLog (JSON-to-case-class mapping):

package com.donews.utils

import com.fasterxml.jackson.annotation.JsonIgnoreProperties
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.slf4j.LoggerFactory

/**
 * Created by yuhui on 16-9-20
 * Maps a Kafka record onto a WebLog object
 */
@JsonIgnoreProperties(ignoreUnknown = true)
case class WebLog(
    var appkey: String,
    timestamp: String,
    cookie: String,
    short_cookie: String,
    request_method: String,
    status: java.lang.Integer,
    http_referer: String,
    http_user_agent: String,
    http_x_forwarded_for: String,
    http_url: String,
    to_target: String,
    duration: java.lang.Integer,
    event: String,
    is_new: java.lang.Integer,
    page_id: String,
    var day: String
)

/***
 * Parses and filters each line consumed from Kafka and returns it as a WebLog object
 */
object WebLog {
  private val LOG = LoggerFactory.getLogger(WebLog.getClass)
  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)

  def fromJson(value: String): WebLog = {
    if (value == null) return null
    try {
      val obj = mapper.readValue(value, classOf[WebLog])
      if (obj.appkey == null || "".equals(obj.appkey)) {
        obj.appkey = "donews_website"
      }
      if (obj.timestamp == null || "".equals(obj.timestamp)
          || obj.cookie == null || "".equals(obj.cookie)) {
        return null
      }
      obj
    } catch {
      case e: Exception =>
        LOG.info(e.getMessage, e)
        null
    }
  }
}
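A small, hypothetical example of what fromJson does with one Kafka record. The JSON line and its values are made up; fields not present in the JSON are simply left null:

  val line = """{"appkey":"","day":"2016-09-23","timestamp":"1474612345678","cookie":"c-001","short_cookie":"s-001","status":200,"is_new":1}"""
  val log = WebLog.fromJson(line)   // empty appkey is defaulted to "donews_website"
  // fromJson returns null when timestamp or cookie is missing, so malformed records
  // are filtered out before they ever reach the Phoenix table.
  println(if (log == null) "dropped" else s"${log.appkey} ${log.timestamp}")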
ZookeeperHelper (offset persistence via Curator):

package com.donews.utils

import kafka.common.TopicAndPartition
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.slf4j.LoggerFactory

import scala.collection.JavaConversions._

/**
 * Created by yuhui on 16-6-8.
 */
object ZookeeperHelper {
  val LOG = LoggerFactory.getLogger(ZookeeperHelper.getClass)

  val client = {
    val client = CuratorFrameworkFactory
      .builder
      .connectString(WebConfig.ZOOKEEPER_CONNECT)
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .namespace("webstatistic_test")
      .build()
    client.start()
    client
  }

  // Create a path in Zookeeper if it does not exist yet
  def ensurePathExists(path: String): Unit = {
    if (client.checkExists().forPath(path) == null) {
      client.create().creatingParentsIfNeeded().forPath(path)
    }
  }

  // Load offsets from Zookeeper
  def loadOffsets(topicSet: Set[String],
                  defaultOffset: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
    val kafkaOffsetPath = s"/kafkaOffsets"
    ensurePathExists(kafkaOffsetPath)
    val offsets = for {
      // t iterates over the child nodes under webstatistic/kafkaOffsets
      t <- client.getChildren.forPath(kafkaOffsetPath)
      if topicSet.contains(t)
      // p iterates over the partition nodes, e.g. /webstatistic/kafkaOffsets/donews_website
      p <- client.getChildren.forPath(s"$kafkaOffsetPath/$t")
    } yield {
      // read the offset stored under each partition node
      val data = client.getData.forPath(s"$kafkaOffsetPath/$t/$p")
      // convert the data to Long
      val offset = java.lang.Long.valueOf(new String(data)).toLong
      (TopicAndPartition(t, Integer.parseInt(p)), offset)
    }
    defaultOffset ++ offsets.toMap
  }

  // Store offsets to Zookeeper
  def storeOffsets(offsets: Map[TopicAndPartition, Long]): Unit = {
    val kafkaOffsetPath = s"/kafkaOffsets"
    if (client.checkExists().forPath(kafkaOffsetPath) == null) {
      client.create().creatingParentsIfNeeded().forPath(kafkaOffsetPath)
    }
    for ((tp, offset) <- offsets) {
      val data = String.valueOf(offset).getBytes
      val path = s"$kafkaOffsetPath/${tp.topic}/${tp.partition}"
      ensurePathExists(path)
      client.setData().forPath(path, data)
    }
  }
}
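A round-trip sketch of the two offset methods; the topic, partition and offset values are made up:

  val tp = TopicAndPartition("donews_website", 0)
  ZookeeperHelper.storeOffsets(Map(tp -> 12345L))
  // loadOffsets overlays whatever is stored in Zookeeper on top of the supplied defaults,
  // so partitions that have no znode yet fall back to the default (here: 0).
  val restored = ZookeeperHelper.loadOffsets(Set("donews_website"), Map(tp -> 0L))
  println(restored(tp))   // 12345 once the znode has been written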