Spark Streaming Integration with Kafka (2): the Direct Approach

package com.uplooking.bigdata.streaming.p2;

import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;

import java.util.*;

/**
 * Word count over a Kafka topic using Spark Streaming's direct (receiver-less)
 * connector. Created by thinkpad on 2017/4/21.
 */
public class JavaSparkStreamingDirectOps {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[2]");
        conf.setAppName(JavaSparkStreamingDirectOps.class.getSimpleName());
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(2));
//        jsc.checkpoint("E:/test/spark/streaming/ck1");

        /**
         * With the direct approach, Spark Streaming pulls data from Kafka itself and
         * can guarantee that each record is consumed exactly once. ZooKeeper is no
         * longer needed to record which offsets have been consumed; instead, so that
         * a restarted job can resume from the correct offsets, a checkpoint directory
         * should be registered on the StreamingContext to persist them (see the
         * commented-out jsc.checkpoint(...) call above, and the driver-recovery
         * sketch after the listing).
         *
         * For reference, the Javadoc of KafkaUtils.createDirectStream:
         *
         * @param jssc JavaStreamingContext object
         * @param keyClass Class of the keys in the Kafka records
         * @param valueClass Class of the values in the Kafka records
         * @param keyDecoderClass Class of the key decoder
         * @param valueDecoderClass Class type of the value decoder
         * @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
         *   configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
         *   to be set with Kafka broker(s) (NOT zookeeper servers), specified in
         *   host1:port1,host2:port2 form.
         *   If not starting from a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest"
         *   to determine where the stream starts (defaults to "largest")
         * @param topics Names of the topics to consume
         * @tparam K type of Kafka message key
         * @tparam V type of Kafka message value
         * @tparam KD type of Kafka message key decoder
         * @tparam VD type of Kafka message value decoder
         * @return DStream of (Kafka message key, Kafka message value)
         */

        Map<String, String> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "master:9092,slave01:9092,slave02:9092");
        Set<String> topics = new HashSet<>();
        topics.add("spark-kafka");
        /**
         * A note on version compatibility. If you see
         *   java.lang.ClassCastException:
         *   kafka.cluster.BrokerEndPoint cannot be cast to kafka.cluster.Broker
         * the cause is a version mismatch: Spark Streaming 1.6.2 is built against
         * Kafka 0.8.2.1 and does not cope well with other Kafka versions, which is
         * what produces this exception (see the dependency note after the listing).
         */
        JavaPairInputDStream<String, String> kafkaDStream =
                KafkaUtils.createDirectStream(
                        jsc,
                        String.class,
                        String.class,
                        StringDecoder.class,
                        StringDecoder.class,
                        kafkaParams,
                        topics
                );
        // Each stream element is a (key, value) pair; t._2() is the message payload.
        // Split every payload into words on single spaces.
        JavaDStream<String> wordsDStream =
                kafkaDStream.flatMap(t -> Arrays.asList(t._2().split(" ")));
        // Classic word count: map each word to (word, 1), then sum per key
        // within each 2-second batch.
        JavaPairDStream<String, Integer> pairDStream =
                wordsDStream.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        JavaPairDStream<String, Integer> retDStream =
                pairDStream.reduceByKey((v1, v2) -> v1 + v2);
        // Print a sample of each batch's counts to the console.
        retDStream.print();

        // Start the job and block until it is stopped or fails.
        jsc.start();
        jsc.awaitTermination();
    }
}
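
The checkpoint call in the listing is commented out, so the offsets mentioned in the comment are not actually persisted. For real driver recovery the context should be built through JavaStreamingContext.getOrCreate, which runs the factory on a clean start and otherwise restores the context, including the consumed Kafka offsets, from the checkpoint. Below is a minimal sketch under those assumptions, reusing the brokers, topic, and checkpoint path from the listing; the class name is hypothetical, and print() stands in for the word count to keep it short.

import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function0;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import java.util.*;

public class JavaSparkStreamingRecoverableOps {
    public static void main(String[] args) {
        String checkpointDir = "E:/test/spark/streaming/ck1";
        // The factory is only invoked when no checkpoint exists yet.
        Function0<JavaStreamingContext> factory = () -> {
            SparkConf conf = new SparkConf()
                    .setMaster("local[2]")
                    .setAppName("JavaSparkStreamingRecoverableOps");
            JavaStreamingContext ctx = new JavaStreamingContext(conf, Durations.seconds(2));
            ctx.checkpoint(checkpointDir);

            Map<String, String> kafkaParams = new HashMap<>();
            kafkaParams.put("bootstrap.servers", "master:9092,slave01:9092,slave02:9092");
            Set<String> topics = new HashSet<>(Collections.singletonList("spark-kafka"));
            // The DStream and every output operation on it must be defined
            // inside the factory so the graph can be rebuilt from the checkpoint.
            KafkaUtils.createDirectStream(ctx,
                    String.class, String.class,
                    StringDecoder.class, StringDecoder.class,
                    kafkaParams, topics)
                    .print();
            return ctx;
        };
        // On restart the context, including consumed offsets, comes from the checkpoint.
        JavaStreamingContext jsc = JavaStreamingContext.getOrCreate(checkpointDir, factory);
        jsc.start();
        jsc.awaitTermination();
    }
}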

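As for the BrokerEndPoint/Broker ClassCastException noted in the code: the fix is to depend on the Kafka connector that matches the Spark version, which in turn pins the compatible Kafka client (0.8.2.1 for Spark 1.6.2). A Maven fragment for that pairing, assuming a Scala 2.10 build of Spark:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>1.6.2</version>
    <!-- transitively brings in org.apache.kafka:kafka_2.10:0.8.2.1 -->
</dependency>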