Reading and Writing MongoDB from Spark in Java
First, add the mongodb-spark-connector Maven dependency to your project; the Java API documentation at https://docs.mongodb.com/spark-connector/current/java-api/ covers the details. From there you can largely follow that API documentation for the Spark operations themselves; it already shows how to load MongoDB data into an RDD.
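For reference, a typical dependency declaration looks like the following. The Scala suffix (_2.11) and the version number are illustrative assumptions; pick the coordinates that match your Spark and Scala versions from the linked documentation.

    <dependency>
        <!-- version/suffix are examples; check the connector docs for your Spark build -->
        <groupId>org.mongodb.spark</groupId>
        <artifactId>mongo-spark-connector_2.11</artifactId>
        <version>2.2.0</version>
    </dependency>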
Some supplementary code that may be useful is included below.
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoDatabase;
import com.mongodb.spark.MongoConnector;
import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.ReadConfig;
import com.mongodb.spark.config.WriteConfig;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;
import com.mongodb.spark.sql.helpers.StructFields;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.bson.Document;
import org.bson.types.ObjectId;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static java.lang.String.format;
import static java.util.Arrays.asList;
import static java.util.Collections.singletonList;

public final class JavaIntroduction {

    /**
     * Run this main method to see the output of this quick example.
     *
     * @param args takes an optional single argument for the connection string
     * @throws InterruptedException if a latch is interrupted
     */
    public static void main(final String[] args) throws InterruptedException {
        JavaSparkContext jsc = createJavaSparkContext(args);

        // Create a RDD
        JavaRDD<Document> documents = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)).map(
                new Function<Integer, Document>() {
                    @Override
                    public Document call(final Integer i) throws Exception {
                        return Document.parse("{test: " + i + "}");
                    }
                });

        // Saving data from an RDD to MongoDB
        MongoSpark.save(documents);

        // Saving data with a custom WriteConfig
        Map<String, String> writeOverrides = new HashMap<String, String>();
        writeOverrides.put("collection", "spark");
        writeOverrides.put("writeConcern.w", "majority");
        WriteConfig writeConfig = WriteConfig.create(jsc).withOptions(writeOverrides);

        JavaRDD<Document> sparkDocuments = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)).map(
                new Function<Integer, Document>() {
                    @Override
                    public Document call(final Integer i) throws Exception {
                        return Document.parse("{spark: " + i + "}");
                    }
                });

        // Saving data from an RDD to MongoDB
        MongoSpark.save(sparkDocuments, writeConfig);

        // Loading and analyzing data from MongoDB
        JavaMongoRDD<Document> rdd = MongoSpark.load(jsc);
        System.out.println(rdd.count());
        System.out.println(rdd.first().toJson());

        // Loading data with a custom ReadConfig
        Map<String, String> readOverrides = new HashMap<String, String>();
        readOverrides.put("collection", "spark");
        readOverrides.put("readPreference.name", "secondaryPreferred");
        ReadConfig readConfig = ReadConfig.create(jsc).withOptions(readOverrides);
        JavaMongoRDD<Document> customRdd = MongoSpark.load(jsc, readConfig);
        System.out.println(customRdd.count());
        System.out.println(customRdd.first().toJson());

        // Filtering an rdd using an aggregation pipeline before passing data to Spark
        JavaMongoRDD<Document> aggregatedRdd = rdd.withPipeline(
                singletonList(Document.parse("{ $match: { test : { $gt : 5 } } }")));
        System.out.println(aggregatedRdd.count());
        System.out.println(aggregatedRdd.first().toJson());

        // Datasets

        // Drop database
        dropDatabase(getMongoClientURI(args));

        // Add Sample Data
        List<String> characters = asList(
                "{'name': 'Bilbo Baggins', 'age': 50}",
                "{'name': 'Gandalf', 'age': 1000}",
                "{'name': 'Thorin', 'age': 195}",
                "{'name': 'Balin', 'age': 178}",
                "{'name': 'Kíli', 'age': 77}",
                "{'name': 'Dwalin', 'age': 169}",
                "{'name': 'Óin', 'age': 167}",
                "{'name': 'Glóin', 'age': 158}",
                "{'name': 'Fíli', 'age': 82}",
                "{'name': 'Bombur'}"
        );
        MongoSpark.save(jsc.parallelize(characters).map(new Function<String, Document>() {
            @Override
            public Document call(final String json) throws Exception {
                return Document.parse(json);
            }
        }));

        // Load inferring schema
        Dataset<Row> df = MongoSpark.load(jsc).toDF();
        df.printSchema();
        df.show();

        // Declare the Schema via a Java Bean
        SparkSession sparkSession = SparkSession.builder().getOrCreate();
        Dataset<Row> explicitDF = MongoSpark.load(jsc).toDF(Character.class);
        explicitDF.printSchema();

        // SQL
        explicitDF.registerTempTable("characters");
        Dataset<Row> centenarians = sparkSession.sql("SELECT name, age FROM characters WHERE age >= 100");

        // Saving DataFrame
        MongoSpark.write(centenarians).option("collection", "hundredClub").save();
        MongoSpark.load(sparkSession,
                ReadConfig.create(sparkSession).withOption("collection", "hundredClub"),
                Character.class).show();

        // Drop database
        MongoConnector.apply(jsc.sc()).withDatabaseDo(ReadConfig.create(sparkSession),
                new Function<MongoDatabase, Void>() {
                    @Override
                    public Void call(final MongoDatabase db) throws Exception {
                        db.drop();
                        return null;
                    }
                });

        String objectId = "123400000000000000000000";
        List<Document> docs = asList(
                new Document("_id", new ObjectId(objectId)).append("a", 1),
                new Document("_id", new ObjectId()).append("a", 2));
        MongoSpark.save(jsc.parallelize(docs));

        // Set the schema using the ObjectId helper
        StructType schema = DataTypes.createStructType(asList(
                StructFields.objectId("_id", false),
                DataTypes.createStructField("a", DataTypes.IntegerType, false)));

        // Create a dataframe with the helper functions registered
        df = MongoSpark.read(sparkSession).schema(schema)
                .option("registerSQLHelperFunctions", "true").load();

        // Query using the ObjectId string
        df.filter(format("_id = ObjectId('%s')", objectId)).show();
    }

    private static JavaSparkContext createJavaSparkContext(final String[] args) {
        String uri = getMongoClientURI(args);
        dropDatabase(uri);
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("MongoSparkConnectorTour")
                .set("spark.app.id", "MongoSparkConnectorTour")
                .set("spark.mongodb.input.uri", uri)
                .set("spark.mongodb.output.uri", uri);
        return new JavaSparkContext(conf);
    }

    private static String getMongoClientURI(final String[] args) {
        String uri;
        if (args.length == 0) {
            uri = "mongodb://localhost/test.coll"; // default
        } else {
            uri = args[0];
        }
        return uri;
    }

    private static void dropDatabase(final String connectionString) {
        MongoClientURI uri = new MongoClientURI(connectionString);
        new MongoClient(uri).dropDatabase(uri.getDatabase());
    }
}
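One thing to note: the example references a Character JavaBean (in toDF(Character.class) and the hundredClub load) that is not shown above. A minimal sketch, assuming a name/age bean consistent with the sample documents, would look like this. The class deliberately shadows java.lang.Character, so it must live in (or be imported into) the same package as JavaIntroduction:

import java.io.Serializable;

// Minimal JavaBean backing the explicit Dataset schema above.
// Field names must match the document keys ("name", "age").
public final class Character implements Serializable {
    private String name;
    private Integer age;

    public String getName() {
        return name;
    }

    public void setName(final String name) {
        this.name = name;
    }

    public Integer getAge() {
        return age;
    }

    public void setAge(final Integer age) {
        this.age = age;
    }
}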