Kafka+SparkStreaming+HBase
来源:互联网 发布:淘宝卖家子账号认证 编辑:程序博客网 时间:2024/05/21 07:58
先上pom文件
<!-- Maven dependencies: Kafka 0.10 client/broker libs, Spark 1.6 (Scala 2.10)
     core + streaming + the direct-stream Kafka connector, and the HBase 1.2 client. -->
<dependencies>
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.10</artifactId>
    <version>0.10.0.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>0.10.0.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.6.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>1.6.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>1.6.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.2.0</version>
  </dependency>
</dependencies>
模拟Kafka生产者
导包（注意：原文把 import 与正文挤在了同一行，且 `java.util.Properties` 重复导入了一次）：

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import java.util.Properties
模拟生产者代码
/** Sends the integers 0..100 (101 messages) as string values to the
  * `user_events` topic.
  *
  * NOTE(review): acks=0 means the producer does not wait for any broker
  * acknowledgement — messages can be lost silently; confirm this is intended.
  */
def main(args: Array[String]): Unit = {
  val topic   = "user_events"
  val brokers = "服务器名:9092"

  val props = new Properties()
  props.put("bootstrap.servers", brokers)
  props.put("acks", "0")
  // Kafka parses string config values itself; this avoids the original's
  // `0.asInstanceOf[Integer]` boxing tricks.
  props.put("retries", "0")
  props.put("batch.size", "16384")
  props.put("linger.ms", "1")
  props.put("buffer.memory", "33554432")
  props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

  val producer = new KafkaProducer[String, String](props)
  try {
    // Records carry no key; a partition could also be passed explicitly here.
    for (i <- 0 to 100)
      producer.send(new ProducerRecord[String, String](topic, Integer.toString(i)))
  } finally {
    producer.close() // release buffers/sockets even if a send throws
  }
}
SparkStreaming + HBase
导包
import kafka.common.TopicAndPartition
import kafka.serializer.StringDecoder

import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
数据处理代码
object KafkaSparkStreaming {

  // The HBase connection must live on the object, not as a local in main():
  // a local would be captured by the Spark closures below, and Connection is
  // not serializable ("task not serializable" — see the author's note).
  var hconf = HBaseConfiguration.create()
  hconf.set("hbase.zookeeper.quorum", "rozntgtest8")
  hconf.set("hbase.zookeeper.property.clientPort", "2181")
  hconf.set("mapreduce.task.timeout", "120000000")
  hconf.set("hbase.client.scanner.timeout.period", "600000000")
  hconf.set("hbase.rpc.timeout", "600000000")
  val conn: Connection = ConnectionFactory.createConnection(hconf)

  /** Consumes the `user_events` topic via a Kafka direct stream (5s batches)
    * and writes every message value into HBase table `number`, family `info`.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[1]").setAppName("UserClickCountStat")
    val ssc  = new StreamingContext(conf, Seconds(5))

    val topics  = Set("user_events")
    val brokers = "主机名:9092"
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "value.serializer" -> "org.apache.kafka.common.serialization.StringSerializer",
      "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer")

    val kafkaStream =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    kafkaStream.foreachRDD(rdd => {
      // Driver side: ensure the target table exists. Close the Admin handle
      // instead of leaking a new one every batch (original called conn.getAdmin
      // twice per batch and never closed it).
      val admin = conn.getAdmin
      try {
        if (!admin.tableExists(TableName.valueOf("number"))) {
          val descriptor = new HTableDescriptor(TableName.valueOf("number"))
            .addFamily(new HColumnDescriptor("info"))
          admin.createTable(descriptor)
        }
      } finally {
        admin.close()
      }

      rdd.foreachPartition(partition => {
        // Executor side: one Table handle per partition (the original opened
        // one per record and never closed it), released in finally.
        val table = conn.getTable(TableName.valueOf("number"))
        try {
          partition.foreach { record =>
            // All puts share the constant rowkey "rowkey", so every message
            // lands in the same row, with the message value used as both the
            // column qualifier and the cell value.
            // NOTE(review): confirm a single-row layout is intended.
            val put = new Put(Bytes.toBytes("rowkey"))
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(record._2), Bytes.toBytes(record._2))
            table.put(put)
          }
        } finally {
          table.close()
        }
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
这中间有一个问题：conn 必须作为对象的成员变量。我在测试时把它放在 main 的局部作用域里，结果被 Spark 闭包捕获而无法序列化，报错：task not serializable。
阅读全文