自定义分区partitioner实现数据分区存储

来源：互联网　发布：手机淘宝能修改评价吗　编辑：程序博客网　时间：2024/05/22 15:13

Spark中分区器直接决定了RDD中分区的个数、RDD中每条数据经过Shuffle过程属于哪个分区和Reduce的个数
注意：
(1)只有Key-Value类型的RDD才有分区器（Partitioner），非Key-Value类型的RDD的分区器（partitioner）值为None
(2)每个RDD的分区ID范围为0~numPartitions-1，分区器为每条数据的Key计算出的分区ID决定了该数据属于哪个分区。
参考：http://blog.csdn.net/high2011/article/details/68491115

package com.ljt.spark01.weblog

import java.net.URL

import org.apache.spark.HashPartitioner
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Counts URL hits per course host and stores each host's records in its own
 * partition by means of a custom partitioner ([[HostPartition]]).
 *
 * Pipeline:
 *  1. read the tab-separated access log and emit `(url, 1)` per line;
 *  2. sum the counts per URL;
 *  3. re-key every record by the URL's host;
 *  4. partition by host, keep the three most-visited URLs of each partition;
 *  5. write the result to disk and print it.
 */
object UrlCountPartition {

  /**
   * Job entry point.
   *
   * @param args command-line arguments (unused; input and output paths are fixed)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvUrlCount")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Always release the SparkContext, even when a stage fails.
    try {
      // Split every log line into a (url, 1) tuple.
      val urlOnes = sc.textFile("data/usercount/IT_education.log").map { line =>
        val fields = line.split("\t")
        // Drop the time column; each occurrence of a URL becomes one (url, 1).
        (fields(1), 1)
      }

      // Sum the values of all tuples sharing the same key, e.g.
      // (http://php.itcast.cn/php/course.shtml,459)
      val urlCounts = urlOnes.reduceByKey(_ + _)

      // Use the URL's host as the course identifier, e.g.
      // (php.itcast.cn,(http://php.itcast.cn/php/course.shtml,459))
      val hostKeyed = urlCounts.map { case (url, hits) =>
        val host = new URL(url).getHost
        // A key/value tuple is required so the data can be partitioned by
        // host and then sorted inside each partition.
        (host, (url, hits))
      }.cache() // cache() only marks the RDD for in-memory storage; it is lazy.

      // De-duplicate the hosts to obtain every course category.
      val hosts = hostKeyed.map(_._1).distinct().collect()

      // Instantiate the custom partitioner: one partition per host.
      val hostPartitioner = new HostPartition(hosts)

      // Sort inside every partition and keep the top 3 by hit count.
      val topPerHost = hostKeyed.partitionBy(hostPartitioner)
        .mapPartitions { records =>
          records.toList.sortBy(_._2._2).reverse.take(3).iterator
        }

      topPerHost.saveAsTextFile("data/out/out_partitioner")

      /*
       * Output recorded by the original author:
       * ArrayBuffer(
       *   (net.itcast.cn,(http://net.itcast.cn/net/course.shtml,521)),
       *   (net.itcast.cn,(http://net.itcast.cn/net/video.shtml,521)),
       *   (net.itcast.cn,(http://net.itcast.cn/net/teacher.shtml,512)),
       *   (java.itcast.cn,(http://java.itcast.cn/java/course/cloud.shtml,1028)),
       *   (java.itcast.cn,(http://java.itcast.cn/java/course/javaee.shtml,1000)),
       *   (java.itcast.cn,(http://java.itcast.cn/java/course/base.shtml,543)),
       *   (php.itcast.cn,(http://php.itcast.cn/php/video.shtml,490)),
       *   (php.itcast.cn,(http://php.itcast.cn/php/teacher.shtml,464)),
       *   (php.itcast.cn,(http://php.itcast.cn/php/course.shtml,459)))
       */
      println(topPerHost.collect().toBuffer)
    } finally {
      sc.stop()
    }
  }

}

 package com.ljt.spark01.weblog

import org.apache.spark.Partitioner

import scala.collection.mutable.HashMap

/**
 * Custom partitioner that stores the records of every known host in a
 * partition of their own.
 *
 * Each element of `ins` is assigned the partition index equal to its position
 * in the array; if a host occurs more than once, its last position wins.
 *
 * @param ins the hosts to partition by; its length is the partition count
 */
class HostPartition(ins: Array[String]) extends Partitioner {

  /** Host -> partition index lookup table. */
  val parMap: HashMap[String, Int] = HashMap(ins.zipWithIndex: _*)

  /** Number of partitions: one per element of `ins`. */
  override def numPartitions: Int = ins.length

  /**
   * Resolves the partition a key belongs to.
   *
   * @param key the record key; looked up by its `toString` value
   * @return the index registered for the key, or 0 when the key is `null`
   *         or not one of the hosts passed to the constructor
   */
  override def getPartition(key: Any): Int =
    // Option(...) guards against null keys, which would otherwise throw on toString.
    Option(key).flatMap(k => parMap.get(k.toString)).getOrElse(0)
}

阅读全文

0 0