Connecting Spark Streaming to Kafka and preventing duplicate consumption

The example below uses the Kafka 0.10 direct stream with auto-commit disabled and keeps the consumer offsets in ZooKeeper: on startup it reads the stored offset of each partition (clamping it to Kafka's earliest available offset if the stored one has already expired), builds the stream from those offsets, and writes the offsets back to ZooKeeper on every batch, so a restarted job resumes where it left off instead of reconsuming the topic from scratch.
package com.manulife.mbps.behavior.data.stream

import java.util.Properties

import com.cloudera.com.amazonaws.util.json.JSONObject
import com.manulife.mbps.behavior.common.utils.{IpUtil, ValidateUtil}
import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.{ZkClient, ZkConnection}
import org.I0Itec.zkclient.serialize.ZkSerializer
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.{Assign, Subscribe}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Created by nickliu on 7/5/2017.
 */
object SparkStream_kafka {

  def main(args: Array[String]) {
    val topic = //
    val topics = Set(//)
    val group = //
    val zkHosts = //
    val brokers = //

    val sparkConf = new SparkConf().setAppName("Test-SparkDemo-kafka").setMaster("local[3]")
    /** cap the number of messages consumed per second from each partition of the topic */
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "1000")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    /** kafka config */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "zookeeper.set.acl" -> "false",
      "auto.offset.reset" -> "latest", // kafka.api.OffsetRequest.SmallestTimeString
      "enable.auto.commit" -> (false: java.lang.Boolean) // disable auto commit
    )

    /** read the offsets stored in ZooKeeper */
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    /** consumerOffsetDir: the topic path in ZooKeeper, e.g. /consumers/example/offsets/ICPRecommend */
    val consumerOffsetDir = topicDirs.consumerOffsetDir

    /** zookeeper connection */
    val zkClient = new ZkClient(zkHosts, 10000, 10000, new ZkSerializer {
      override def serialize(data: scala.Any): Array[Byte] =
        data.asInstanceOf[String].getBytes("UTF-8")
      override def deserialize(bytes: Array[Byte]): AnyRef =
        if (bytes == null) null else new String(bytes, "UTF-8")
    })
    // connect to ZooKeeper to read and update offsets
    val zkUtils = new ZkUtils(zkClient, new ZkConnection(zkHosts), false)
    val children = zkClient.countChildren(consumerOffsetDir)

    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    var fromOffsets: Map[TopicPartition, Long] = Map()

    if (children > 0) {
      /** get partition leader begin */
      val topicList = List(topic)
      /** fetch topic metadata: broker and partition layout */
      val req = new TopicMetadataRequest(topicList, 0)
      /** low level api interface */
      val getLeaderConsumer = new SimpleConsumer("master", 9092, 10000, 10000, "OffsetLookup")
      val res = getLeaderConsumer.send(req) // TopicMetadataRequest: topic / broker / partition metadata
      val topicMetaOption = res.topicsMetadata.headOption
      /** partitions */
      val partitions = topicMetaOption match {
        case Some(tm) =>
          tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String] // partition id -> leader host
        case None =>
          Map[Int, String]()
      }
      /** get partition leader end */

      /** offset */
      for (i <- 0 until children) {
        val zkPartitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/${i}")
        val tp = new TopicPartition(topic, i)
        val tap = new TopicAndPartition(topic, i)

        /** additional begin */
        val requestMin = OffsetRequest(Map(tap -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1))) // -2, 1
        val getMinOffsetConsumer = new SimpleConsumer(partitions(i), 9092, 10000, 10000, "getMinOffset")
        val kafkaCurMinOffset = getMinOffsetConsumer.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tap).offsets
        var nextOffset = zkPartitionOffset.toLong
        if (kafkaCurMinOffset.length > 0 && nextOffset < kafkaCurMinOffset.head) {
          /** the offset stored in zk is older than Kafka's earliest available offset (the data has expired), so start from the earliest */
          nextOffset = kafkaCurMinOffset.head
        }
        /** additional end */
        fromOffsets += (tp -> nextOffset)
      }

      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
      )
    } else {
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )
    }

    kafkaStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      /** store each partition id and its starting offset in ZooKeeper */
      for (offset <- offsetRanges) {
        val zkPath = s"${topicDirs.consumerOffsetDir}/${offset.partition}"
        zkUtils.updatePersistentPath(zkPath, offset.fromOffset.toString)
      }
      rdd.foreachPartition(message => {
        while (message.hasNext) {
          val msg = message.next()
          println("message: " + msg)
          sendData(msg)
        }
      })
    }

    ssc.start()
    ssc.awaitTermination()
  }

  def sendData(value: ConsumerRecord[String, String]) = {
    val events = value.value().toString
    val js = new JSONObject(events)
    val channel = js.getString("channel")
    var ip = js.getString("ip")
    val user_id = js.getString("user_id")
    val user_id_md5 = js.getString("user_id_md5")
    val plan_code = js.getString("plan_code")
    val oper_type = js.getString("oper_type")
    val category = js.getString("category")
    val time = js.getString("time")
    val mid = js.getString("mid")

    // validate that the IP address is well-formed, then resolve it to a location
    ip = ValidateUtil.ipCheck(ip)
    val IpObject = IpUtil.ipToBean(IpUtil.ipToLong(ip))
    var stat: String = null
    var territory: String = null
    if (IpObject == null) {
      stat = "-"
      territory = "-"
    } else {
      stat = IpObject.getState
      territory = IpObject.getTerritory
    }

    val topic1 = //
    val brokers = //
    val props = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    val kafkaConfig = new ProducerConfig(props)
    val producer = new Producer[String, String](kafkaConfig)

    // enrich the event with the resolved location fields and forward it to the output topic
    val event = new JSONObject()
    event.put("user_id", user_id)
      .put("channel", channel)
      .put("ip", ip)
      .put("user_id_md5", user_id_md5)
      .put("plan_code", plan_code)
      .put("oper_type", oper_type)
      .put("category", category)
      .put("time", time)
      .put("stat", stat)
      .put("territory", territory)
      .put("mid", mid)

    producer.send(new KeyedMessage[String, String](topic1, event.toString))
    println("Message sent: " + event)
    producer.close()
  }
}
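With the spark-streaming-kafka-0-10 integration, the offsets can also be committed back to Kafka itself instead of ZooKeeper. The following is a minimal sketch of that variant, not part of the original post: it assumes the same ssc, topics and kafkaParams defined above (with "enable.auto.commit" still false), and process() is a hypothetical stand-in for whatever per-record handling the job does (such as sendData above).

import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

val stream = KafkaUtils.createDirectStream[String, String](
  ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

stream.foreachRDD { rdd =>
  // capture the offset ranges of this batch before doing anything else with the RDD
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd.foreachPartition { iter =>
    iter.foreach(record => process(record)) // process(): hypothetical per-record handler
  }
  // commit to Kafka only after the batch has been handled, giving at-least-once semantics
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}

The same ordering applies to the ZooKeeper version above: writing offset.untilOffset after the partitions have been processed, rather than offset.fromOffset beforehand, narrows the window in which a restarted job re-reads data it has already handled.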