A simple template for computing per-area sales by day with Spark Streaming


Producer side:

import java.util.HashMap

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

import scala.util.Random

/**
  * Created by zengxiaosen on 16/9/26.
  */
/*
To inspect the generated data, run on the command line:
kafka-console-consumer.sh --zookeeper slave1:2181 --topic orderTopic
 */
object OrderProductor {
  def main(args: Array[String]): Unit = {
    val topic = "orderTopic"
    val brokers = "master:9092,slave1:9092"
    val props = new HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // produce 10 orders per second
    while (true) {
      (1 to 10).foreach { messageNum =>
        // fields: area ID, order ID, order amount, order time
        val str = messageNum + "," + Random.nextInt(10) + "," + Math.round(Random.nextDouble() * 100) + "," + DateUtils.getCurrentDateTime
        val message = new ProducerRecord[String, String](topic, null, str)
        producer.send(message)
      }
      Thread.sleep(1000)
    }
  }
}
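Each message produced above is a plain CSV string. As a minimal sketch (my addition, not part of the original post, using a hypothetical sample line), this is how one such line turns into the (date_area, amount) pair that the consumer below builds, assuming the timestamp starts with yyyy-MM-dd:

object OrderLineDemo {
  def main(args: Array[String]): Unit = {
    // sample line: area ID 3, order ID 7, amount 42, order time
    val line = "3,7,42,2016-09-26 10:15:30"
    val arr = line.split(",")
    val key = arr(3).substring(0, 10) + "_" + arr(0) // "2016-09-26_3"
    val amt = arr(2).toInt                           // 42
    println((key, amt))
  }
}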
Consumer side:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by zengxiaosen on 16/9/26.
  */
object AreaAmt {
  // word-count-style aggregation per batch, with state kept across batches
  def main(args: Array[String]): Unit = {
    /*
    For Kafka, the group id controls what happens when several jobs consume the same topic:
    1. each job gets the complete data, and the computations do not interfere with each other;
    2. each job gets part of the data, which effectively load-balances the topic.
    When several jobs share the same group id we are in case 2, otherwise in case 1.
     */
    val zkQuorum = "slave1:2181"
    val group = "g1"
    val topics = "orderTopic" // must match the topic the producer writes to
    val numThreads = 2

    // setMaster needs at least 2 cores: one to receive the stream and one to compute;
    // with only 1 core there are not enough resources to run the computation
    val sparkConf = new SparkConf().setAppName("AreaAmt").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(2)) // one batch every two seconds
    // checkpoint directory, required for stateful operations such as updateStateByKey
    ssc.checkpoint("hdfs://192.168.75.130:8020/user/root/checkpoint/AreaAmt")

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    //val topicMap2 = Map(topics -> 2)

    // each line written to Kafka, delivered batch by batch
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)

    // build the pair DStream we need
    val linerdd = lines.map { row =>
      val arr = row.split(",")
      // aggregate sales by date and area; the key looks like 2016-09-04_Area
      // to drill down to city level, only the key changes; everything else stays the same
      val key = arr(3).substring(0, 10) + "_" + arr(0)
      val amt = arr(2).toInt
      (key, amt)
    }

    val addFunc = (currValues: Seq[Int], preValueState: Option[Int]) => {
      // Spark groups values by key (reduceByKey-style); currValues holds this key's values in the current batch
      val currentCount = currValues.sum
      // the value accumulated so far for this key
      val previousCount = preValueState.getOrElse(0)
      // return the new accumulated value as an Option[Int]
      Some(currentCount + previousCount)
    }

    linerdd.updateStateByKey[Int](addFunc).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
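To make the stateful accumulation concrete, here is a minimal standalone sketch (my addition, not part of the original code) showing how addFunc behaves across two batches for a single key:

object AddFuncDemo {
  // same signature updateStateByKey expects: this batch's values for one key plus the previous state
  val addFunc: (Seq[Int], Option[Int]) => Option[Int] =
    (currValues, preValueState) => Some(currValues.sum + preValueState.getOrElse(0))

  def main(args: Array[String]): Unit = {
    // batch 1: key "2016-09-04_3" sees amounts 10 and 20, no previous state
    println(addFunc(Seq(10, 20), None))  // Some(30)
    // batch 2: the same key sees amount 5, previous state is Some(30)
    println(addFunc(Seq(5), Some(30)))   // Some(35)
  }
}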
About DateUtils:

Write it however you like; here are two templates. Note that for the pipeline above the producer's timestamp must start with yyyy-MM-dd (as in the second template), otherwise the consumer's substring(0, 10) will not yield the date.

import java.util.Calendar
import java.text.SimpleDateFormat

/**
  * Created by zengxiaosen on 16/9/26.
  */
object DateUtils {
  def getCurrentDateTime: String = getCurrentDateTime("K:mm aa")

  def getCurrentDate: String = getCurrentDateTime("EEEE, MMMM d")

  private def getCurrentDateTime(dateTimeFormat: String): String = {
    val dateFormat = new SimpleDateFormat(dateTimeFormat)
    val cal = Calendar.getInstance()
    dateFormat.format(cal.getTime())
  }
}
Another template:

import java.text.SimpleDateFormat
import java.util.Calendar

/**
  * Created by zengxiaosen on 16/9/26.
  */
object DateUtils01 {
  def getCurrentTime(): String = {
    val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val c = Calendar.getInstance()
    sdf.format(c.getTime)
  }

  def main(args: Array[String]): Unit = {
    println("2016-09-04 15:19:09".substring(0, 10))
  }
}
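On Java 8+, a java.time-based variant (my addition, not in the original post) avoids SimpleDateFormat's lack of thread safety while producing the same yyyy-MM-dd HH:mm:ss format the consumer expects:

import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

object DateUtils02 {
  private val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

  // returns e.g. "2016-09-26 10:15:30"
  def getCurrentTime: String = LocalDateTime.now().format(fmt)
}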
