基于Spark MLlib, 使用java api操作的电影推荐系统(spark1.5.2 jdk1.7)
来源:互联网 发布:it前沿技术 编辑:程序博客网 时间:2024/04/29 17:54
最近在学习Spark Mllib,看了一些它的算法,但不知道算法怎么去应用,网上的实例大部分都是使用Scala语言写的,没有java的代码,从网上找到了一篇基于Spark Mllib,SparkSQL的电影推荐系统 也是使用Scala语言(对其不是很了解,暂时也没有多少时间去学),还好他讲得特别细,关于测试数据他都有下载地址,在这就不多说了。直接把由其改写的java代码附上:
maven依赖如下(可能有的不需要,当时还看了spark的别的方面):
<dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.10</artifactId> <version>1.5.2</version></dependency><!-- <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-sql_2.10</artifactId> <version>1.5.2</version></dependency> --><dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-hive_2.10</artifactId> <version>1.5.2</version></dependency><dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-mllib_2.10</artifactId> <version>1.5.2</version></dependency><dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-client</artifactId> <version>1.2.0</version></dependency>上面有的jar包可能直接下不下来,可以手动安装,或者连接vpn后再下载。
java代码如下:
import java.util.ArrayList;import java.util.List;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.DoubleFunction;import org.apache.spark.api.java.function.Function;import org.apache.spark.api.java.function.Function2;import org.apache.spark.api.java.function.VoidFunction;import org.apache.spark.mllib.recommendation.ALS;import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;import org.apache.spark.mllib.recommendation.Rating;import scala.Tuple2;import scala.Tuple3;/** * Collaborative filtering 协同过滤 alternating least squares (ALS) (交替最小二乘法(ALS) ) * Title. <br> * Description. * <p> * Version: 1.0 * <p> */public class Test {public static void main(String[] args) {// 创建入口对象SparkConf conf = new SparkConf().setAppName("Collaborative Filtering Example").setMaster("local");JavaSparkContext sc = new JavaSparkContext(conf);// 加载评分数据String path = "C:/Users/dulinan/Desktop/movie/ratings.dat";JavaRDD<String> data = sc.textFile(path);// 所有评分数据,由于此数据要分三部分使用,60%用于训练,20%用于验证,最后20%用于测试。将时间戳%10可以得到近似的10等分,用于三部分数据切分JavaRDD<Tuple2<Integer, Rating>> ratingsTrain_KV = data.map(new Function<String, Tuple2<Integer, Rating>>() {@Override public Tuple2<Integer, Rating> call(String line) throws Exception {String[] fields = line.split("::");if (fields.length != 4) {throw new IllegalArgumentException("每一行必须有且只有4个元素");}int userId = Integer.parseInt(fields[0]);int movieId = Integer.parseInt(fields[1]);double rating = Float.parseFloat(fields[2]);int timestamp = (int) (Long.parseLong(fields[3])%10); return new Tuple2<Integer, Rating>(timestamp, new Rating(userId, movieId, rating)); }});System.out.println("get " + ratingsTrain_KV.count() + " ratings from " + ratingsTrain_KV.distinct().count() + "users on " + ratingsTrain_KV.distinct().count() + "movies");// 加载我的评分数据String mypath = 
"C:/Users/dulinan/Desktop/movie/test.dat";JavaRDD<String> mydata = sc.textFile(mypath);JavaRDD<Rating> myRatedData_Rating = mydata.map(new Function<String, Rating>() {@Override public Rating call(String line) throws Exception {String[] fields = line.split("::");if (fields.length != 4) {throw new IllegalArgumentException("每一行必须有且只有4个元素");}int userId = Integer.parseInt(fields[0]);int movieId = Integer.parseInt(fields[1]);double rating = Float.parseFloat(fields[2]); return new Rating(userId, movieId, rating); }});//设置分区数int numPartitions = 3;//将键值小于6(60%)的数据用于训练JavaRDD<Rating> traningData_Rating = JavaPairRDD.fromJavaRDD(ratingsTrain_KV.filter(new Function<Tuple2<Integer,Rating>, Boolean>() {@Override public Boolean call(Tuple2<Integer, Rating> v1) throws Exception { return v1._1 < 6; }})).values().union(myRatedData_Rating).repartition(numPartitions).cache();//将键值大于6小于8(20%)的数据用于验证JavaRDD<Rating> validateData_Rating = JavaPairRDD.fromJavaRDD(ratingsTrain_KV.filter(new Function<Tuple2<Integer,Rating>, Boolean>() {@Overridepublic Boolean call(Tuple2<Integer, Rating> v1) throws Exception {return v1._1 >= 6 && v1._1 < 8;}})).values().repartition(numPartitions).cache();//将键值大于8(20%)的数据用于测试JavaRDD<Rating> testData_Rating = JavaPairRDD.fromJavaRDD(ratingsTrain_KV.filter(new Function<Tuple2<Integer,Rating>, Boolean>() {@Overridepublic Boolean call(Tuple2<Integer, Rating> v1) throws Exception {return v1._1 >= 8;}})).values().cache();System.out.println("training data's num : " + traningData_Rating.count() + " validate data's num : " + validateData_Rating.count() + " test data's num : " + testData_Rating.count());// 为训练设置参数 每种参数设置2个值,三层for循环,一共进行8次训练List<Integer> ranks = new ArrayList<Integer>();ranks.add(8);ranks.add(22);List<Double> lambdas = new ArrayList<Double>();lambdas.add(0.1);lambdas.add(10.0);List<Integer> iters = new ArrayList<Integer>();iters.add(5);iters.add(7);// 初始化最好的模型参数 MatrixFactorizationModel bestModel = null; double bestValidateRnse = Double.MAX_VALUE; int 
bestRank = 0; double bestLambda = -1.0; int bestIter = -1;for (int i = 0; i < ranks.size(); i++) {for (int j = 0; j < lambdas.size(); j++) {for (int k = 0; k < iters.size(); k++) {//训练获得模型MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(traningData_Rating), ranks.get(i), iters.get(i), lambdas.get(i));//通过校验集validateData_Rating获取方差,以便查看此模型的好坏,方差方法定义在最下面double validateRnse = variance(model, validateData_Rating, validateData_Rating.count());System.out.println("validation = " + validateRnse + " for the model trained with rank = " + ranks.get(i) + " lambda = " + lambdas.get(i) + " and numIter" + iters.get(i)); //将最好的模型训练结果所设置的参数进行保存if (validateRnse < bestValidateRnse) {bestModel = model;bestValidateRnse = validateRnse;bestRank = ranks.get(i);bestLambda = lambdas.get(i);bestIter = iters.get(i);}}}}//8次训练后获取最好的模型,根据最好的模型及训练集testData_Rating来获取此方差double testDataRnse = variance(bestModel, testData_Rating, testData_Rating.count());System.out.println("the best model was trained with rank = " + bestRank + " and lambda = " + bestLambda + " and numIter = " + bestIter + " and Rnse on the test data is " + testDataRnse);// 获取测试数据中,分数的平均值final double meanRating = traningData_Rating.union(validateData_Rating).mapToDouble(new DoubleFunction<Rating>() {@Overridepublic double call(Rating t) throws Exception {return t.rating();}}).mean();// 根据平均值来计算旧的方差值double baseLineRnse = Math.sqrt(testData_Rating.mapToDouble(new DoubleFunction<Rating>() {@Overridepublic double call(Rating t) throws Exception {return (meanRating - t.rating()) * (meanRating - t.rating());}}).mean());// 通过模型,数据的拟合度提升了多少double improvent = (baseLineRnse - testDataRnse) / baseLineRnse * 100;System.out.println("the best model improves the baseline by " + improvent + "%");//加载电影数据String moviepath = "C:/Users/dulinan/Desktop/movie/movies.dat";JavaRDD<String> moviedata = sc.textFile(moviepath);// 将电影的id,标题,类型以三元组的形式保存JavaRDD<Tuple3<Integer, String, String>> movieList_Tuple = moviedata.map(new Function<String, 
Tuple3<Integer, String, String>>() {@Override public Tuple3<Integer, String, String> call(String line) throws Exception {String[] fields = line.split("::");if (fields.length != 3) {throw new IllegalArgumentException("Each line must contain 3 fields");}int id = Integer.parseInt(fields[0]);String title = fields[1];String type = fields[2]; return new Tuple3<Integer, String, String>(id, title, type); }});// 将电影的id,标题以二元组的形式保存JavaRDD<Tuple2<Integer, String>> movies_Map = movieList_Tuple.map(new Function<Tuple3<Integer,String,String>, Tuple2<Integer,String>>() {@Override public Tuple2<Integer, String> call(Tuple3<Integer, String, String> v1) throws Exception { return new Tuple2<Integer, String>(v1._1(), v1._2()); }});System.out.println("movies recommond for you:");// 获取我所看过的电影idsfinal List<Integer> movieIds = myRatedData_Rating.map(new Function<Rating, Integer>() {@Override public Integer call(Rating v1) throws Exception { return v1.product(); }}).collect();// 从电影数据中去除我看过的电影数据JavaRDD<Tuple2<Integer, String>> movieIdList = movies_Map.filter(new Function<Tuple2<Integer,String>, Boolean>() {@Override public Boolean call(Tuple2<Integer, String> v1) throws Exception { return !movieIds.contains(v1._1); }});// 封装rating的参数形式,user为0,product为电影id进行封装JavaPairRDD<Integer, Integer> recommondList = JavaPairRDD.fromJavaRDD(movieIdList.map(new Function<Tuple2<Integer,String>, Tuple2<Integer,Integer>>() {@Override public Tuple2<Integer, Integer> call(Tuple2<Integer, String> v1) throws Exception { return new Tuple2<Integer, Integer>(0, v1._1); }}));//通过模型预测出user为0的各product(电影id)的评分,并按照评分进行排序,获取前10个电影idfinal List<Integer> list = bestModel.predict(recommondList).sortBy(new Function<Rating, Double>() {@Override public Double call(Rating v1) throws Exception { return v1.rating(); }}, false, 1).map(new Function<Rating, Integer>() {@Override public Integer call(Rating v1) throws Exception { return v1.product(); }}).take(10);if (list != null && !list.isEmpty()) 
{//从电影数据中过滤出这10部电影,遍历打印movieList_Tuple.filter(new Function<Tuple3<Integer,String,String>, Boolean>() {@Override public Boolean call(Tuple3<Integer, String, String> v1) throws Exception { return list.contains(v1._1()); }}).foreach(new VoidFunction<Tuple3<Integer,String,String>>() {@Override public void call(Tuple3<Integer, String, String> t) throws Exception { System.out.println("nmovie name --> " + t._2() + " nmovie type --> " + t._3()); }});}}//方差的计算方法public static double variance(MatrixFactorizationModel model, JavaRDD<Rating> predictionData, long n) {//将predictionData转化成二元组型式,以便训练使用JavaRDD<Tuple2<Object, Object>> userProducts = predictionData.map(new Function<Rating, Tuple2<Object, Object>>() {public Tuple2<Object, Object> call(Rating r) {return new Tuple2<Object, Object>(r.user(), r.product());}});//通过模型对数据进行预测JavaPairRDD<Tuple2<Integer, Integer>, Double> prediction = JavaPairRDD.fromJavaRDD(model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() { public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) { //System.out.println(r.user()+"..."+r.product()+"..."+r.rating()); return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating()); } }));//预测值和原值内连接JavaRDD<Tuple2<Double, Double>> ratesAndPreds = JavaPairRDD.fromJavaRDD(predictionData.map(new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() { public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) { //System.out.println(r.user() + "..." + r.product() + "..." 
+ r.rating()); return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating()); } })).join(prediction).values();//计算方差并返回结果Double dVar = ratesAndPreds.map(new Function<Tuple2<Double,Double>, Double>() {@Override public Double call(Tuple2<Double, Double> v1) throws Exception { return (v1._1 - v1._2) * (v1._1 - v1._2); }}).reduce(new Function2<Double, Double, Double>() {@Override public Double call(Double v1, Double v2) throws Exception { return v1 + v2; }});return Math.sqrt(dVar / n);}}
1 0
- 基于Spark Mllib, 使用java api操作的电影推荐系统(spark1.5.2 jdk1.7)
- 基于Spark Mllib,SparkSQL的电影推荐系统
- <转>基于Spark Mllib,SparkSQL的电影推荐系统
- 基于Spark Mllib,SparkSQL的电影推荐系统
- 基于Spark Mllib,SparkSQL的电影推荐系统
- 基于Spark Mllib,SparkSQL的电影推荐系统
- 基于Spark Mllib,SparkSQL的电影推荐系统
- Spark MLlib系列(二):基于协同过滤的电影推荐系统
- Spark MLlib系列(二):基于协同过滤的电影推荐系统
- 基于Spark MLlib平台的协同过滤算法---电影推荐系统
- 基于Spark MLlib平台的协同过滤算法---电影推荐系统
- 基于Spark MLlib平台的协同过滤算法---电影推荐系统
- 基于Spark MLlib平台的协同过滤算法---电影推荐系统
- Spark MLlib系列(二):基于协同过滤的电影推荐系统
- Spark MLlib系列(二):基于协同过滤的电影推荐系统
- 基于Spark MLlib平台的协同过滤算法---电影推荐系统
- 基于Spark MLlib平台的协同过滤算法---电影推荐系统
- Spark MLlib系列(二):基于协同过滤的电影推荐系统
- hbase + falcon监控目录大小
- MVP模式在Android项目中的使用
- Android各种知识图(4):Context相关
- AMP(非对称多进程处理模式)和Zynq SoC的OCM
- mysql数据类型
- 基于Spark Mllib, 使用java api操作的电影推荐系统(spark1.5.2 jdk1.7)
- poj1328 Radar Installation —— 贪心
- spring学习(六)—AOP的概念和作用
- SparkStreaming例子
- Struts2 版本不一致导致的问题
- Windows下使用xShell向阿里云ecs服务器上传文件
- 内部类
- C语言external-internal-none链接属性浅析
- 抽象类(核心)