Spark-kafka-ES (JAVA)一套流程
来源:互联网 发布:网络水军价格淘宝 编辑:程序博客网 时间:2024/06/05 12:04
1、创建myeclipse的 maven项目
2、环境版本:Hadoop 2.6.3、Spark 1.6.3、Elasticsearch 5.4.3、JDK 1.8
3、在 pom.xml 中添加 Maven 依赖(其中 ${spark-version} 和 ${es-version} 需要在 <properties> 中自行定义,例如 1.6.3 和 5.4.3)
<!--
  Dependencies for the Spark Streaming (Kafka -> Elasticsearch) job.
  ${spark-version} and ${es-version} are Maven properties that are not defined in this
  fragment; they must be declared in the enclosing pom.xml's <properties> section.
-->
<dependencies>
  <!-- Spark core (Scala 2.10 build). The log4j 1.x jar and its SLF4J binding are excluded. -->
  <!-- NOTE(review): unlike spark-streaming below, this one is not marked "provided" - confirm this is intended. -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>${spark-version}</version>
    <exclusions>
      <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
      </exclusion>
      <exclusion>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
      </exclusion>
    </exclusions>
  </dependency>

  <!-- Spark Streaming, expected to be supplied by the runtime (scope "provided"). -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>${spark-version}</version>
    <scope>provided</scope>
    <exclusions>
      <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
      </exclusion>
      <exclusion>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
      </exclusion>
    </exclusions>
  </dependency>

  <!-- Kafka integration for Spark Streaming (provides KafkaUtils). -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>${spark-version}</version>
  </dependency>

  <!-- ZooKeeper client, with the same logging exclusions as the Spark artifacts. -->
  <dependency>
    <groupId>org.apache.zookeeper</groupId>
    <artifactId>zookeeper</artifactId>
    <version>3.4.5</version>
    <exclusions>
      <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
      </exclusion>
      <exclusion>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
      </exclusion>
    </exclusions>
  </dependency>

  <!-- Scala 2.10.5 runtime, compiler and reflection libraries, matching the _2.10 artifacts above. -->
  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.10.5</version>
  </dependency>
  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-compiler</artifactId>
    <version>2.10.5</version>
  </dependency>
  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-reflect</artifactId>
    <version>2.10.5</version>
  </dependency>

  <!-- Elasticsearch connector for Spark 1.3+ on Scala 2.10 (provides JavaEsSpark). -->
  <dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-spark-13_2.10</artifactId>
    <version>${es-version}</version>
  </dependency>

  <!-- Logging: Log4j 2 API bridged to SLF4J; the log4j 1.x SLF4J binding is only on the test classpath. -->
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-to-slf4j</artifactId>
    <version>2.8.2</version>
  </dependency>
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.24</version>
    <scope>test</scope>
  </dependency>
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.24</version>
  </dependency>
</dependencies>
4、直接上手java 代码
package com.mobike;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import scala.Tuple2;
public class SparkToES implements Serializable{
public static String yyyy="";
public static String mm="";
public static String dd="";
public static String hh="";
public static String yyyyerr="";
public static String mmerr="";
public static String dderr="";
public static String hherr="";
public static void main(String[] args) {
//设置匹配模式,以空格分隔
final Pattern SPACE = Pattern.compile(" ");
//接收数据的地址和端口
String zkQuorum = "t-kafka1.hdp.mobike.cn:2181";
//话题所在的组
String group = "1";
//话题名称以“,”分隔
String topics = "api_newlog";
//每个话题的分片数
int numThreads = 1;
Date date=new Date();
SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy-MM-dd");
String datef=dateFormat.format(date);
yyyyerr=datef.substring(0,4);
mmerr=datef.substring(5,7);
dderr=datef.substring(8, 10);
hherr=datef.substring(0, 2);
SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount");
sparkConf.set("es.index.auto.create", "true");
sparkConf.set("es.nodes", "10.3.3.83");
//---->如果是连接的远程es节点,该项必须要设置
sparkConf.set("es.port", "9200");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(4000));
jssc.checkpoint("hdfs://mycluster/tmp/"+yyyyerr+"/"+mmerr+"/"+dderr+"/"+hherr+"/"); //设置检查点
//存放话题跟分片的映射关系
Map<String, Integer> topicmap = new HashMap<String, Integer>();
String[] topicsArr = topics.split(",");
int n = topicsArr.length;
for(int i=0;i<n;i++){
topicmap.put(topicsArr[i], numThreads);
}
//从Kafka中获取数据转换成RDD
JavaPairReceiverInputDStream<String, String> lines = KafkaUtils.createStream(jssc, zkQuorum, group, topicmap);
//从话题中过滤所需数据
JavaDStream<String> nn = lines.map(new Function<Tuple2<String, String>, String>() {
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
JavaDStream<Map> a=nn.flatMap(new FlatMapFunction<String, Map>() {
public Iterable<Map> call(String arg0) throws Exception {
// TODO Auto-generated method stub
Map map =new HashMap();
map.put("comments", arg0);
Date date=new Date();
SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
String datef=dateFormat.format(date);
map.put("commentDate", datef);
return Arrays.asList(map);
}
});
/*JavaDStream<String> words = nn.flatMap(new FlatMapFunction<String, String>() {
public Iterable<String> call(String line) {
// String jsonString="{\"userName\":"+line.split(" ")[8];
Map map =new HashMap();
map.put("comments", line.toString());
Date date=new Date();
SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
String datef=dateFormat.format(date);
map.put("commentDate", datef);
return Arrays.asList(map.toString());
}
});
*/
a.foreachRDD(new VoidFunction<JavaRDD<Map>>() {
public void call(JavaRDD<Map> t) throws Exception {
JavaEsSpark.saveToEs(t, "logs/docs");
}
});
//words.dstream().repartition(1).saveAsTextFiles("hdfs://mycluster/tmp/"+yyyyerr+"/"+mmerr+"/"+dderr+"/"+hherr+"/api_newlog", "txt");
a.print();
jssc.start();
jssc.awaitTermination();
}
}
- Spark-kafka-ES (JAVA)一套流程
- spark、es、java精确匹配
- kafka java api 消费入es
- java+scala+zookpeeper+flume+kafka+maven+hadoop+hbase+hive+spark安装流程
- python scala kafka 集成一个流程项目 spark
- spark createDirectStream保存kafka offset(JAVA实现)
- spark createDirectStream保存kafka offset(JAVA实现)
- spark createDirectStream保存kafka offset(JAVA实现)
- spark kafka 零数据丢失-----java版
- spark createDirectStream保存kafka offset(JAVA实现)
- spark createDirectStream保存kafka offset(JAVA实现)
- spark-kafka
- Spark 与 Kafka 集成出错: Apache Spark: java.lang.NoSuchMethodError
- Kafka->Mongodb->Es
- kafka->logstash->es
- 【spark】spark+kafka
- Spark Streaming createDirectStream保存kafka offset(JAVA实现)
- spark streaming 接收 kafka 数据java代码WordCount示例
- 【分布式系统】RPC实验
- Mac 下启动mysql
- 七大查找算法
- Redis之持久化
- 更换SSH2 HOME
- Spark-kafka-ES (JAVA)一套流程
- 算法题:加油站
- Aras Innovator: 把itemtype直接部署在TOC的根目录
- CentOS7.0基础配置
- 36.Struts2_自定义验证器
- 第六章静态查询分析:使用自定义的变量
- 截断一个单链表
- Hadoop中WordCount
- jQuery