Spark Streaming connecting to Kafka: preventing duplicate consumption
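With the direct stream API, relying on Kafka's automatically committed offsets makes it easy to re-consume data after the job restarts. The code below therefore manages offsets by hand: enable.auto.commit is turned off, each partition's offset is kept under the consumer group's path in ZooKeeper (/consumers/&lt;group&gt;/offsets/&lt;topic&gt;/&lt;partition&gt;), and on startup the job either resumes from those stored offsets with ConsumerStrategies.Assign (falling back to Kafka's earliest available offset when the stored one has already been purged) or, if no offsets exist yet, starts with a normal Subscribe. Inside foreachRDD the starting offset of every partition in the batch is written back to ZooKeeper, and the records are then processed and forwarded to another topic.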

package com.manulife.mbps.behavior.data.stream

import java.util.Properties

import com.cloudera.com.amazonaws.util.json.JSONObject
import com.manulife.mbps.behavior.common.utils.{IpUtil, ValidateUtil}
import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.{ZkClient, ZkConnection}
import org.I0Itec.zkclient.serialize.ZkSerializer
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.{Assign, Subscribe}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by nickliu on 7/5/2017.
  */
object SparkStream_kafka {

  def main(args: Array[String]) {
    val topic = ""          // TODO: topic name
    val topics = Set(topic)
    val group = ""          // TODO: consumer group id
    val zkHosts = ""        // TODO: ZooKeeper connection string, e.g. "host1:2181"
    val brokers = ""        // TODO: Kafka broker list, e.g. "host1:9092"

    val sparkConf = new SparkConf().setAppName("Test-SparkDemo-kafka").setMaster("local[3]")
    /** Cap the number of messages consumed per second from each partition of the topic */
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "1000")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    /** Kafka config */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "zookeeper.set.acl" -> "false",
      "auto.offset.reset" -> "latest",                    // where to start when no offset is available
      "enable.auto.commit" -> (false: java.lang.Boolean)  // disable auto commit; offsets are managed in ZooKeeper
    )

    /** Read the offsets stored in ZooKeeper */
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    /** consumerOffsetDir: the topic's offset path in ZooKeeper, e.g. /consumers/example/offsets/ICPRecommend */
    val consumerOffsetDir = topicDirs.consumerOffsetDir

    /** ZooKeeper connection */
    val zkClient = new ZkClient(zkHosts, 10000, 10000, new ZkSerializer {
      override def serialize(data: scala.Any): Array[Byte] = data.asInstanceOf[String].getBytes("UTF-8")
      override def deserialize(bytes: Array[Byte]): AnyRef = if (bytes == null) null else new String(bytes, "UTF-8")
    })
    // Used later to write offsets back to ZooKeeper
    val zkUtils = new ZkUtils(zkClient, new ZkConnection(zkHosts), false)
    val children = zkClient.countChildren(consumerOffsetDir)

    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    var fromOffsets: Map[TopicPartition, Long] = Map()

    if (children > 0) {
      /** get partition leader -- begin */
      val topicList = List(topic)
      /** Fetch the topic's metadata: which brokers host which partitions */
      val req = new TopicMetadataRequest(topicList, 0)
      /** low-level (SimpleConsumer) API */
      val getLeaderConsumer = new SimpleConsumer("master", 9092, 10000, 10000, "OffsetLookup")
      val res = getLeaderConsumer.send(req) // topic / broker / partition information
      val topicMetaOption = res.topicsMetadata.headOption
      /** map of partition id -> leader host */
      val partitions = topicMetaOption match {
        case Some(tm) =>
          tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String]
        case None =>
          Map[Int, String]()
      }
      getLeaderConsumer.close()
      /** get partition leader -- end */

      /** Build the starting offset for every partition */
      for (i <- 0 until children) {
        val zkPartitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/${i}")
        val tp = new TopicPartition(topic, i)
        val tap = new TopicAndPartition(topic, i)
        /** additional check -- begin */
        val requestMin = OffsetRequest(Map(tap -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1))) // (-2, 1)
        val getMinOffsetConsumer = new SimpleConsumer(partitions(i), 9092, 10000, 10000, "getMinOffset")
        val kafkaCurMinOffset = getMinOffsetConsumer.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tap).offsets
        getMinOffsetConsumer.close()
        var nextOffset = zkPartitionOffset.toLong
        if (kafkaCurMinOffset.length > 0 && nextOffset < kafkaCurMinOffset.head) {
          /** The offset saved in ZooKeeper is older than Kafka's earliest available offset
            * (that data has already been deleted), so start from Kafka's earliest offset instead */
          nextOffset = kafkaCurMinOffset.head
        }
        /** additional check -- end */
        fromOffsets += (tp -> nextOffset)
      }

      // Offsets exist in ZooKeeper: resume exactly from them
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
      )
    } else {
      // First run, no offsets in ZooKeeper yet: subscribe normally
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )
    }

    kafkaStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      /** Store every partition id and the batch's starting offset in ZooKeeper */
      for (offset <- offsetRanges) {
        val zkPath = s"${topicDirs.consumerOffsetDir}/${offset.partition}"
        zkUtils.updatePersistentPath(zkPath, offset.fromOffset.toString)
      }
      rdd.foreachPartition(
        message => {
          while (message.hasNext) {
            val msg = message.next()
            println("message: " + msg)
            sendData(msg)
          }
        })
    }

    ssc.start()
    ssc.awaitTermination()
  }

  /** Parse one record, enrich it with IP information, and republish it to another topic */
  def sendData(value: ConsumerRecord[String, String]) = {
    val events = value.value().toString
    val js = new JSONObject(events)
    val channel = js.getString("channel")
    var ip = js.getString("ip")
    val user_id = js.getString("user_id")
    val user_id_md5 = js.getString("user_id_md5")
    val plan_code = js.getString("plan_code")
    val oper_type = js.getString("oper_type")
    val category = js.getString("category")
    val time = js.getString("time")
    val mid = js.getString("mid")

    // Check that the IP address is valid
    ip = ValidateUtil.ipCheck(ip)
    val ipObject = IpUtil.ipToBean(IpUtil.ipToLong(ip))
    var stat: String = null
    var territory: String = null
    if (ipObject == null) {
      stat = "-"
      territory = "-"
    } else {
      stat = ipObject.getState
      territory = ipObject.getTerritory
    }

    val topic1 = ""   // TODO: output topic name
    val brokers = ""  // TODO: Kafka broker list
    val props = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    val kafkaConfig = new ProducerConfig(props)
    val producer = new Producer[String, String](kafkaConfig)

    // Prepare the enriched event and send it once per input record
    val event = new JSONObject()
    event.put("user_id", user_id)
      .put("channel", channel)
      .put("ip", ip)
      .put("user_id_md5", user_id_md5)
      .put("plan_code", plan_code)
      .put("oper_type", oper_type)
      .put("category", category)
      .put("time", time)
      .put("stat", stat)
      .put("territory", territory)
      .put("mid", mid)
    producer.send(new KeyedMessage[String, String](topic1, event.toString))
    producer.close()
    println("Message sent: " + event)
  }
}
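The listing above keeps offsets in ZooKeeper, which mirrors how the old 0.8 consumers tracked them. With spark-streaming-kafka-0-10 there is a simpler variant: after processing each batch, commit its offsets back to Kafka itself through CanCommitOffsets.commitAsync, so no ZooKeeper client code is needed. The sketch below shows that variant only, not what the post above does; the broker address, topic, and group id are placeholders and the "processing" step is just a println.

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object KafkaOffsetCommitSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("commit-offsets-to-kafka").setMaster("local[3]"), Seconds(1))

    // Placeholder values; substitute real brokers, group id, and topic.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "broker1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example-group",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean) // commit manually after each batch
    )

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Set("example-topic"), kafkaParams))

    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Process the batch first; commit only after the work has succeeded.
      rdd.foreach(record => println(record.value()))
      // Asynchronously write this batch's offsets back to Kafka (__consumer_offsets).
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

Because the commit happens after the batch is processed, a crash between processing and committing can still replay that one batch, so the guarantee is at-least-once, the same as the ZooKeeper approach above.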
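Separately, sendData builds a producer with the legacy kafka.producer API and constructs a new ProducerConfig and Producer for every record, which is expensive on an executor. Since spark-streaming-kafka-0-10 already pulls in the 0.10 client jars, the same publish step could use org.apache.kafka.clients.producer.KafkaProducer with one shared instance per JVM. A minimal sketch, with a hypothetical object name and placeholder broker list:

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object EventProducer {
  // Placeholder broker list; replace with real "host:port" pairs.
  private val props = new Properties()
  props.put("bootstrap.servers", "broker1:9092")
  props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

  // One producer per JVM (executor); KafkaProducer is thread-safe.
  lazy val producer = new KafkaProducer[String, String](props)

  def send(topic: String, payload: String): Unit =
    producer.send(new ProducerRecord[String, String](topic, payload))
}

With this in place, sendData would call EventProducer.send(topic1, event.toString) instead of creating and closing a producer for each record.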