// Real-time blacklist filtering of ad-billing logs using transform (Java version)
//
// Source: Internet; published by: 创优网络; edited by: 程序博客网; time: 2024/05/22 03:50
package gh.spark.SparkStreaming;


import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;


/**
 * 广告计费日志实时黑名单过滤
 * @author Administrator
 *
 */
public class TransformDemo {
public static void main(String[] args) throws Exception {
SparkConf conf=new SparkConf()
.setAppName("TransformDemo")
.setMaster("local[2]");
JavaStreamingContext jsc=
new JavaStreamingContext(conf,Durations.seconds(5));

//创建一份黑名单
ArrayList<Tuple2<String, Boolean>> blackList = 
new ArrayList<Tuple2<String, Boolean>>();
blackList.add(new Tuple2<String, Boolean>("leo", true));

//注意,定义时需要添加final关键字
final JavaPairRDD<String, Boolean> blackRDD = 
jsc.sparkContext().parallelizePairs(blackList);

JavaReceiverInputDStream<String> linesDtream = 
jsc.socketTextStream("tgmaster", 9999);

//日志格式:date username,比如:2016-11-07 jack
//(jack,2016-11-07 jack)
JavaPairDStream<String, String> mapToPairDtream = linesDtream.mapToPair(new PairFunction<String, String, String>() {
private static final long serialVersionUID = 1L;


public Tuple2<String, String> call(String line) throws Exception {

return new Tuple2<String, String>(line.split(" ")[1], line);
}
});

//实时黑名单过滤
JavaDStream<String> resultDtream = mapToPairDtream.transform(new Function<JavaPairRDD<String,String>, JavaRDD<String>>() {


private static final long serialVersionUID = 1L;


public JavaRDD<String> call(JavaPairRDD<String, String> userClickLogRDD)
throws Exception {
/**
* 在此处,我们使用leftOuterJoin左外连接的方式进行join
* 左外连接之后的结果中既有黑名单人员,又有非黑名单人员
*/
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinRDD = 
userClickLogRDD.leftOuterJoin(blackRDD);

/**
* 接下来进行filter过滤操作
* 将黑名单用户过滤出来
*/
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> filterRDD = joinRDD.filter(new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, Boolean>() {
private static final long serialVersionUID = 1L;


public Boolean call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> log)
throws Exception {
if(log._2._2.isPresent() && log._2._2.get()){
return false; //此时是一个黑名单用户
}
return true;  //非黑名单用户
}
});

//找出非黑名单日志中的用户名username
JavaRDD<String> mapRDD = filterRDD.map(new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, String>() {


private static final long serialVersionUID = 1L;


public String call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> log)
throws Exception {

return log._2._1;  //返回用名username的日志
}
});

return mapRDD;
}
});

resultDtream.print();

jsc.start();
jsc.awaitTermination();
jsc.close();
}
}
// Web-page footer residue from the original post: "0 0" / "原创粉丝点击" (original-post fan clicks).