基于Spark Mllib, 使用java api操作的电影推荐系统(spark1.5.2 jdk1.7)

来源：互联网　发布：it前沿技术　编辑：程序博客网　时间：2024/04/29 17:54

最近在学习 Spark MLlib，看了一些它的算法，但不知道这些算法该怎么应用。网上的实例大部分都是用 Scala 语言写的，没有 Java 的代码。后来从网上找到了一篇基于 Spark MLlib 和 Spark SQL 的电影推荐系统文章，同样是用 Scala 语言写的（我对 Scala 不是很了解，暂时也没有多少时间去学），还好作者讲得特别细，测试数据也都给出了下载地址，在这里就不多说了。下面直接把由其改写的 Java 代码附上：

maven依赖如下（可能有的不需要，当时还看了spark的别的方面）：

<dependency>    <groupId>org.apache.spark</groupId>    <artifactId>spark-core_2.10</artifactId>    <version>1.5.2</version></dependency><!-- <dependency>    <groupId>org.apache.spark</groupId>    <artifactId>spark-sql_2.10</artifactId>    <version>1.5.2</version></dependency> --><dependency>    <groupId>org.apache.spark</groupId>    <artifactId>spark-hive_2.10</artifactId>    <version>1.5.2</version></dependency><dependency>    <groupId>org.apache.spark</groupId>    <artifactId>spark-mllib_2.10</artifactId>    <version>1.5.2</version></dependency><dependency>    <groupId>org.apache.hadoop</groupId>    <artifactId>hadoop-client</artifactId>    <version>1.2.0</version></dependency>

上面有的jar包可能直接下不下来，可以手动安装，或者连接vpn后再下载。

java代码如下：

import java.util.ArrayList;import java.util.List;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.DoubleFunction;import org.apache.spark.api.java.function.Function;import org.apache.spark.api.java.function.Function2;import org.apache.spark.api.java.function.VoidFunction;import org.apache.spark.mllib.recommendation.ALS;import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;import org.apache.spark.mllib.recommendation.Rating;import scala.Tuple2;import scala.Tuple3;/** * Collaborative filtering 协同过滤 alternating least squares (ALS) (交替最小二乘法(ALS) ) * Title. <br> * Description. * <p> * Version: 1.0 * <p> */public class Test {public static void main(String[] args) {// 创建入口对象SparkConf conf = new SparkConf().setAppName("Collaborative Filtering Example").setMaster("local");JavaSparkContext sc = new JavaSparkContext(conf);// 加载评分数据String path = "C:/Users/dulinan/Desktop/movie/ratings.dat";JavaRDD<String> data = sc.textFile(path);// 所有评分数据，由于此数据要分三部分使用，60%用于训练，20%用于验证，最后20%用于测试。将时间戳%10可以得到近似的10等分，用于三部分数据切分JavaRDD<Tuple2<Integer, Rating>> ratingsTrain_KV = data.map(new Function<String, Tuple2<Integer, Rating>>() {@Override            public Tuple2<Integer, Rating> call(String line) throws Exception {String[] fields = line.split("::");if (fields.length != 4) {throw new IllegalArgumentException("每一行必须有且只有4个元素");}int userId = Integer.parseInt(fields[0]);int movieId = Integer.parseInt(fields[1]);double rating = Float.parseFloat(fields[2]);int timestamp = (int) (Long.parseLong(fields[3])%10);            return new Tuple2<Integer, Rating>(timestamp, new Rating(userId, movieId, rating));            }});System.out.println("get " + ratingsTrain_KV.count() + " ratings from " + ratingsTrain_KV.distinct().count() + "users on " + ratingsTrain_KV.distinct().count() + "movies");// 加载我的评分数据String mypath = 
"C:/Users/dulinan/Desktop/movie/test.dat";JavaRDD<String> mydata = sc.textFile(mypath);JavaRDD<Rating> myRatedData_Rating = mydata.map(new Function<String, Rating>() {@Override            public Rating call(String line) throws Exception {String[] fields = line.split("::");if (fields.length != 4) {throw new IllegalArgumentException("每一行必须有且只有4个元素");}int userId = Integer.parseInt(fields[0]);int movieId = Integer.parseInt(fields[1]);double rating = Float.parseFloat(fields[2]);            return new Rating(userId, movieId, rating);            }});//设置分区数int numPartitions = 3;//将键值小于6（60%）的数据用于训练JavaRDD<Rating> traningData_Rating = JavaPairRDD.fromJavaRDD(ratingsTrain_KV.filter(new Function<Tuple2<Integer,Rating>, Boolean>() {@Override            public Boolean call(Tuple2<Integer, Rating> v1) throws Exception {            return v1._1 < 6;            }})).values().union(myRatedData_Rating).repartition(numPartitions).cache();//将键值大于6小于8（20%）的数据用于验证JavaRDD<Rating> validateData_Rating = JavaPairRDD.fromJavaRDD(ratingsTrain_KV.filter(new Function<Tuple2<Integer,Rating>, Boolean>() {@Overridepublic Boolean call(Tuple2<Integer, Rating> v1) throws Exception {return v1._1 >= 6 && v1._1 < 8;}})).values().repartition(numPartitions).cache();//将键值大于8（20%）的数据用于测试JavaRDD<Rating> testData_Rating = JavaPairRDD.fromJavaRDD(ratingsTrain_KV.filter(new Function<Tuple2<Integer,Rating>, Boolean>() {@Overridepublic Boolean call(Tuple2<Integer, Rating> v1) throws Exception {return v1._1 >= 8;}})).values().cache();System.out.println("training data's num : " + traningData_Rating.count() + " validate data's num : " + validateData_Rating.count() + " test data's num : " + testData_Rating.count());// 为训练设置参数 每种参数设置2个值，三层for循环，一共进行8次训练List<Integer> ranks = new ArrayList<Integer>();ranks.add(8);ranks.add(22);List<Double> lambdas = new ArrayList<Double>();lambdas.add(0.1);lambdas.add(10.0);List<Integer> iters = new ArrayList<Integer>();iters.add(5);iters.add(7);// 初始化最好的模型参数 MatrixFactorizationModel 
bestModel = null;    double bestValidateRnse = Double.MAX_VALUE;    int bestRank = 0;    double bestLambda = -1.0;    int bestIter = -1;for (int i = 0; i < ranks.size(); i++) {for (int j = 0; j < lambdas.size(); j++) {for (int k = 0; k < iters.size(); k++) {//训练获得模型MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(traningData_Rating), ranks.get(i), iters.get(i), lambdas.get(i));//通过校验集validateData_Rating获取方差，以便查看此模型的好坏，方差方法定义在最下面double validateRnse = variance(model, validateData_Rating, validateData_Rating.count());System.out.println("validation = " + validateRnse + " for the model trained with rank = " + ranks.get(i) + " lambda = " + lambdas.get(i) + " and numIter" + iters.get(i));        //将最好的模型训练结果所设置的参数进行保存if (validateRnse < bestValidateRnse) {bestModel = model;bestValidateRnse = validateRnse;bestRank = ranks.get(i);bestLambda = lambdas.get(i);bestIter = iters.get(i);}}}}//8次训练后获取最好的模型，根据最好的模型及训练集testData_Rating来获取此方差double testDataRnse = variance(bestModel, testData_Rating, testData_Rating.count());System.out.println("the best model was trained with rank = " + bestRank + " and lambda = " + bestLambda                   + " and numIter = " + bestIter + " and Rnse on the test data is " + testDataRnse);// 获取测试数据中，分数的平均值final double meanRating = traningData_Rating.union(validateData_Rating).mapToDouble(new DoubleFunction<Rating>() {@Overridepublic double call(Rating t) throws Exception {return t.rating();}}).mean();// 根据平均值来计算旧的方差值double baseLineRnse = Math.sqrt(testData_Rating.mapToDouble(new DoubleFunction<Rating>() {@Overridepublic double call(Rating t) throws Exception {return (meanRating - t.rating()) * (meanRating - t.rating());}}).mean());// 通过模型，数据的拟合度提升了多少double improvent = (baseLineRnse - testDataRnse) / baseLineRnse * 100;System.out.println("the best model improves the baseline by " + improvent + "%");//加载电影数据String moviepath = "C:/Users/dulinan/Desktop/movie/movies.dat";JavaRDD<String> moviedata = sc.textFile(moviepath);// 
将电影的id，标题，类型以三元组的形式保存JavaRDD<Tuple3<Integer, String, String>> movieList_Tuple = moviedata.map(new Function<String, Tuple3<Integer, String, String>>() {@Override            public Tuple3<Integer, String, String> call(String line) throws Exception {String[] fields = line.split("::");if (fields.length != 3) {throw new IllegalArgumentException("Each line must contain 3 fields");}int id = Integer.parseInt(fields[0]);String title = fields[1];String type = fields[2];            return new Tuple3<Integer, String, String>(id, title, type);            }});// 将电影的id，标题以二元组的形式保存JavaRDD<Tuple2<Integer, String>> movies_Map = movieList_Tuple.map(new Function<Tuple3<Integer,String,String>, Tuple2<Integer,String>>() {@Override            public Tuple2<Integer, String> call(Tuple3<Integer, String, String> v1) throws Exception {            return new Tuple2<Integer, String>(v1._1(), v1._2());            }});System.out.println("movies recommond for you:");// 获取我所看过的电影idsfinal List<Integer> movieIds = myRatedData_Rating.map(new Function<Rating, Integer>() {@Override            public Integer call(Rating v1) throws Exception {            return v1.product();            }}).collect();// 从电影数据中去除我看过的电影数据JavaRDD<Tuple2<Integer, String>> movieIdList  = movies_Map.filter(new Function<Tuple2<Integer,String>, Boolean>() {@Override            public Boolean call(Tuple2<Integer, String> v1) throws Exception {            return !movieIds.contains(v1._1);            }});// 封装rating的参数形式，user为0，product为电影id进行封装JavaPairRDD<Integer, Integer> recommondList = JavaPairRDD.fromJavaRDD(movieIdList.map(new Function<Tuple2<Integer,String>, Tuple2<Integer,Integer>>() {@Override            public Tuple2<Integer, Integer> call(Tuple2<Integer, String> v1) throws Exception {            return new Tuple2<Integer, Integer>(0, v1._1);            }}));//通过模型预测出user为0的各product(电影id)的评分，并按照评分进行排序，获取前10个电影idfinal List<Integer> list = bestModel.predict(recommondList).sortBy(new Function<Rating, Double>() {@Override      
      public Double call(Rating v1) throws Exception {            return v1.rating();            }}, false, 1).map(new Function<Rating, Integer>() {@Override            public Integer call(Rating v1) throws Exception {            return v1.product();            }}).take(10);if (list != null && !list.isEmpty()) {//从电影数据中过滤出这10部电影，遍历打印movieList_Tuple.filter(new Function<Tuple3<Integer,String,String>, Boolean>() {@Override                public Boolean call(Tuple3<Integer, String, String> v1) throws Exception {                return list.contains(v1._1());                }}).foreach(new VoidFunction<Tuple3<Integer,String,String>>() {@Override                public void call(Tuple3<Integer, String, String> t) throws Exception {                System.out.println("nmovie name --> " + t._2() + " nmovie type --> " + t._3());                }});}}//方差的计算方法public static double variance(MatrixFactorizationModel model, JavaRDD<Rating> predictionData, long n) {//将predictionData转化成二元组型式，以便训练使用JavaRDD<Tuple2<Object, Object>> userProducts = predictionData.map(new Function<Rating, Tuple2<Object, Object>>() {public Tuple2<Object, Object> call(Rating r) {return new Tuple2<Object, Object>(r.user(), r.product());}});//通过模型对数据进行预测JavaPairRDD<Tuple2<Integer, Integer>, Double> prediction = JavaPairRDD.fromJavaRDD(model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {        public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {        //System.out.println(r.user()+"..."+r.product()+"..."+r.rating());        return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());        }        }));//预测值和原值内连接JavaRDD<Tuple2<Double, Double>> ratesAndPreds = JavaPairRDD.fromJavaRDD(predictionData.map(new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {        public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {        
//System.out.println(r.user() + "..." + r.product() + "..." + r.rating());        return new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());        }        })).join(prediction).values();//计算方差并返回结果Double dVar = ratesAndPreds.map(new Function<Tuple2<Double,Double>, Double>() {@Override            public Double call(Tuple2<Double, Double> v1) throws Exception {            return (v1._1 - v1._2) * (v1._1 - v1._2);            }}).reduce(new Function2<Double, Double, Double>() {@Override            public Double call(Double v1, Double v2) throws Exception {            return v1 + v2;            }});return Math.sqrt(dVar / n);}}

1 0