Implementing a Collaborative Filtering Algorithm on Hadoop (2)

This part continues from Implementing a Collaborative Filtering Algorithm on Hadoop (1).


The fourth MR job: the MR4 mapper does nothing (it is an identity mapper); the MR4 reducer simply merges, for each itemID, the matching records from MR(3-1) and MR(3-2) (note that this job takes two input paths). Its output looks like this:

101  {107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0} [5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]...
WiKiDriver4.java:

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import static org.fansy.date1012.mahoutinaction.chapter6.sourcecode.WiKiUtils.PATH;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;

public class WiKiDriver4 {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf1 = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err.println("Usage: WiKiDriver4 <in1> <in2> <out>");
            System.exit(2);
        }
        Job job1 = new Job(conf1, "wiki job four");
        job1.setNumReduceTasks(1);
        job1.setJarByClass(WiKiDriver4.class);
        job1.setInputFormatClass(SequenceFileInputFormat.class);
        job1.setMapperClass(WikiMapper4.class);
        job1.setMapOutputKeyClass(IntWritable.class);
        job1.setMapOutputValueClass(VectorOrPrefWritable.class);
        job1.setReducerClass(WiKiReducer4.class);
        job1.setOutputKeyClass(IntWritable.class);
        job1.setOutputValueClass(VectorAndPrefsWritable.class);
        job1.setOutputFormatClass(SequenceFileOutputFormat.class);
        // two input paths: the co-occurrence columns from MR(3-1) and the
        // user preferences from MR(3-2)
        SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
        SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[1]));
        SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[2]));
        if (!job1.waitForCompletion(true)) {
            System.exit(1); // run error then exit
        }
    }
}
WikiMapper4.java (the filename matches the public class name WikiMapper4):

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;

public class WikiMapper4 extends Mapper<IntWritable, VectorOrPrefWritable, IntWritable, VectorOrPrefWritable> {

    // identity mapper: pass every (itemID, vector-or-pref) record straight through
    public void map(IntWritable key, VectorOrPrefWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value);
    }
}
WiKiReducer4.java:

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;
import org.apache.mahout.math.Vector;

public class WiKiReducer4 extends Reducer<IntWritable, VectorOrPrefWritable, IntWritable, VectorAndPrefsWritable> {

    public void reduce(IntWritable key, Iterable<VectorOrPrefWritable> values, Context context)
            throws IOException, InterruptedException {
        List<Long> userfs = new ArrayList<Long>();
        List<Float> prefs = new ArrayList<Float>();
        Vector v = null;
        for (VectorOrPrefWritable value : values) {
            if (value.getVector() != null) {
                // this record is the item's co-occurrence column from MR(3-1)
                v = value.getVector();
            } else {
                // this record is one user's preference for the item from MR(3-2)
                userfs.add(value.getUserID());
                prefs.add(value.getValue());
            }
        }
        context.write(key, new VectorAndPrefsWritable(v, userfs, prefs));
        // System.out.println("key, itemid:" + key.toString() + ", information:" + v + "," + userfs + "," + prefs);
    }
}
The fifth MR job:

map: for each user in each line of MR4's output, multiply that user's preference value by the item co-occurrence vector. For example, for user 3 in the first record above, Vectorforuser3 = [1.0 2.0 2.0 4.0 4.0 3.0 5.0] * 2.5, and the map output is key: 3, value: Vectorforuser3.
The map output should look like this:

alluserids:[5, 1, 4, 2, 3]
userid:5,vector:{107:4.0,106:8.0,105:8.0,104:16.0,103:16.0,102:12.0,101:20.0}
userid:1,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
userid:4,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
userid:2,vector:{107:2.0,106:4.0,105:4.0,104:8.0,103:8.0,102:6.0,101:10.0}
userid:3,vector:{107:2.5,106:5.0,105:5.0,104:10.0,103:10.0,102:7.5,101:12.5}
...
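To make the multiplication concrete, here is a minimal standalone sketch (the class name PartialProductSketch is made up for illustration; it is not part of the job) that reproduces the userid:3 line above with Mahout's Vector.times:

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class PartialProductSketch {
    public static void main(String[] args) {
        // item 101's co-occurrence column, taken from the MR4 sample record above
        Vector col101 = new RandomAccessSparseVector(Integer.MAX_VALUE);
        col101.set(101, 5.0);
        col101.set(102, 3.0);
        col101.set(103, 4.0);
        col101.set(104, 4.0);
        col101.set(105, 2.0);
        col101.set(106, 2.0);
        col101.set(107, 1.0);
        // user 3 rated item 101 with 2.5, so the mapper emits col101 * 2.5
        Vector forUser3 = col101.times(2.5);
        // the entries match the userid:3 vector in the sample output above:
        // 101:12.5, 102:7.5, 103:10.0, 104:10.0, 105:5.0, 106:5.0, 107:2.5
        System.out.println(forUser3);
    }
}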

Combine: for map outputs with the same key (userID), add the corresponding vectors element-wise; the resulting vector sum is that user's predicted score for each item.
The combiner output should look like this:

userid:1,vector:{107:5.0,106:18.0,105:15.5,104:33.5,103:39.0,102:31.5,101:44.0}
userid:2,vector:{107:4.0,106:20.5,105:15.5,104:36.0,103:41.5,102:32.5,101:45.5}
...
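The element-wise addition is just Mahout's Vector.plus. Here is a minimal sketch of the folding the combiner does for one user (CombineSketch and the two partial vectors are made up for illustration):

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class CombineSketch {
    public static void main(String[] args) {
        // two partial product vectors emitted by map() for the same user
        Vector p1 = new RandomAccessSparseVector(Integer.MAX_VALUE);
        p1.set(101, 12.5);
        p1.set(102, 7.5);
        Vector p2 = new RandomAccessSparseVector(Integer.MAX_VALUE);
        p2.set(101, 10.0);
        p2.set(103, 4.5);
        // the combiner folds all partials into one running sum, exactly like
        // partial = partial == null ? v.get() : partial.plus(v.get());
        Vector sum = p1.plus(p2);
        System.out.println(sum); // 101:22.5, 102:7.5, 103:4.5
    }
}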

Reduce: from the combiner output, filter out the items the user has already rated, then sort the remaining items by predicted score in descending order; the result is the list of items recommended to the user.
The final output is:

1  [104:33.5,106:18.0,105:15.5,107:5.0]
2  [106:20.5,105:15.5,107:4.0]
3  [103:26.5,102:20.0,106:17.5]
4  [102:37.0,105:26.0,107:9.5]
5  [107:11.5]
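The descending-order selection is done with a bounded min-heap (a PriorityQueue whose head is the weakest item kept so far), the same pattern WiKiReducer5 below uses. A minimal standalone sketch, using user 1's candidate scores from the output above (TopNSketch and the cutoff n = 2 are made up for illustration):

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;

public class TopNSketch {
    public static void main(String[] args) {
        int n = 2;
        // candidate itemID/score pairs (user 1's unrated items from above)
        double[][] scored = {{104, 33.5}, {106, 18.0}, {105, 15.5}, {107, 5.0}};
        Comparator<double[]> byScore = new Comparator<double[]>() {
            public int compare(double[] a, double[] b) {
                return Double.compare(a[1], b[1]);
            }
        };
        // min-heap of at most n items: peek() is always the weakest kept item
        Queue<double[]> top = new PriorityQueue<double[]>(n + 1, byScore);
        for (double[] item : scored) {
            if (top.size() < n) {
                top.add(item);
            } else if (item[1] > top.peek()[1]) {
                top.add(item);
                top.poll(); // evict the weakest
            }
        }
        // sort descending by score for the final output
        List<double[]> recom = new ArrayList<double[]>(top);
        Collections.sort(recom, Collections.reverseOrder(byScore));
        for (double[] r : recom) {
            System.out.println((int) r[0] + ":" + r[1]); // 104:33.5 then 106:18.0
        }
    }
}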
WiKiDriver5.java:

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import static org.fansy.date1012.mahoutinaction.chapter6.sourcecode.WiKiUtils.PATH;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;

public class WiKiDriver5 {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf1 = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: WiKiDriver5 <in> <out>");
            System.exit(2);
        }
        Job job1 = new Job(conf1, "wiki job five");
        job1.setNumReduceTasks(1);
        job1.setJarByClass(WiKiDriver5.class);
        job1.setInputFormatClass(SequenceFileInputFormat.class);
        job1.setMapperClass(WikiMapper5.class);
        job1.setMapOutputKeyClass(VarLongWritable.class);
        job1.setMapOutputValueClass(VectorWritable.class);
        job1.setCombinerClass(WiKiCombiner5.class);
        job1.setReducerClass(WiKiReducer5.class);
        job1.setOutputKeyClass(VarLongWritable.class);
        job1.setOutputValueClass(RecommendedItemsWritable.class);
        // job1.setOutputFormatClass(SequenceFileOutputFormat.class); // plain text output instead
        SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
        FileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
        if (!job1.waitForCompletion(true)) {
            System.exit(1); // run error then exit
        }
    }
}
WikiMapper5.java (again named after the public class WikiMapper5):

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WikiMapper5 extends Mapper<IntWritable, VectorAndPrefsWritable, VarLongWritable, VectorWritable> {

    public void map(IntWritable key, VectorAndPrefsWritable vectorAndPref, Context context)
            throws IOException, InterruptedException {
        Vector coo = vectorAndPref.getVector();
        List<Long> userIds = vectorAndPref.getUserIDs();
        List<Float> prefValues = vectorAndPref.getValues();
        // for every user who rated this item, emit
        // (userID, prefValue * co-occurrence column)
        for (int i = 0; i < userIds.size(); i++) {
            long userID = userIds.get(i);
            float prefValue = prefValues.get(i);
            Vector par = coo.times(prefValue);
            context.write(new VarLongWritable(userID), new VectorWritable(par));
            // System.out.println(",userid:" + userID + ",vector:" + par);
        }
    }
}
WiKiCombiner5.java:

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import java.io.IOException;

import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WiKiCombiner5 extends Reducer<VarLongWritable, VectorWritable, VarLongWritable, VectorWritable> {

    public void reduce(VarLongWritable key, Iterable<VectorWritable> values, Context context)
            throws IOException, InterruptedException {
        // element-wise sum of all partial product vectors for this user
        Vector partial = null;
        for (VectorWritable v : values) {
            partial = partial == null ? v.get() : partial.plus(v.get());
        }
        context.write(key, new VectorWritable(partial));
        System.out.println("userid:" + key.toString() + ",vector:" + partial);
    }
}
WiKiReducer5.java:

package org.fansy.date1012.mahoutinaction.chapter6.sourcecode;

import static org.fansy.date1012.mahoutinaction.chapter6.sourcecode.WiKiUtils.*;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
import org.apache.mahout.cf.taste.impl.common.FastMap;
import org.apache.mahout.cf.taste.impl.recommender.ByValueRecommendedItemComparator;
import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WiKiReducer5 extends Reducer<VarLongWritable, VectorWritable, VarLongWritable, RecommendedItemsWritable> {

    private int recommendationsPerUser = RECOMMENDATIONSPERUSER;
    private String path = JOB1OUTPATH;
    private static FastMap<Integer, String> map = new FastMap<Integer, String>();

    // read MR1's output (userID -> vector of the items that user has rated)
    // into a map, so rated items can be excluded from the recommendations
    public void setup(Context context) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(path), conf);
        Path tempPath = new Path(path);
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, tempPath, conf);
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            while (reader.next(key, value)) {
                map.put(Integer.parseInt(key.toString()), value.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void reduce(VarLongWritable key, Iterable<VectorWritable> values, Context context)
            throws IOException, InterruptedException {
        int userID = (int) key.get();
        // sum the (already partially combined) vectors into the final
        // recommendation vector for this user
        Vector rev = null;
        for (VectorWritable vec : values) {
            rev = rev == null ? vec.get() : rev.plus(vec.get());
        }
        // bounded min-heap of the top N items; the head is the weakest kept item
        Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(
                recommendationsPerUser + 1,
                Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance()));
        Iterator<Vector.Element> recommendationVectorIterator = rev.iterateNonZero();
        while (recommendationVectorIterator.hasNext()) {
            Vector.Element e = recommendationVectorIterator.next();
            int index = e.index(); // tested: the vector index is the item ID
            if (!hasItem(userID, String.valueOf(index))) {
                float value = (float) e.get();
                if (topItems.size() < recommendationsPerUser) {
                    topItems.add(new GenericRecommendedItem(index, value));
                } else if (value > topItems.peek().getValue()) {
                    topItems.add(new GenericRecommendedItem(index, value));
                    topItems.poll();
                }
            }
        }
        List<RecommendedItem> recom = new ArrayList<RecommendedItem>(topItems.size());
        recom.addAll(topItems);
        Collections.sort(recom, ByValueRecommendedItemComparator.getInstance());
        context.write(key, new RecommendedItemsWritable(recom));
    }

    // check whether the user has already rated the item; the stored string
    // looks like "{101:5.0,102:3.0,...}", so match the item ID followed by ':'
    // (and preceded by '{' or ',') to avoid accidental substring hits
    public static boolean hasItem(int user, String item) {
        String items = map.get(user);
        return items != null
                && (items.contains("{" + item + ":") || items.contains("," + item + ":"));
    }
}

The last reducer took some effort to write. The basic idea: in the Reducer's setup() method, read the SequenceFile data output by MR1 (each user's rated items), and use it to exclude items the user has already rated.

While writing this code I had to look up a lot of the Mahout API, since many of these classes come from Mahout and you have to understand how they are used. In the last Reducer I also used a FastMap, another Mahout class; using the classes Mahout provides should presumably make things run faster.

Finally, a few words about the algorithm:

The algorithm as originally presented in Mahout in Action is as follows (the book's figure is not reproduced here):


Multiply the co-occurrence (similarity) matrix by the user's preference vector to get the user's predicted score for each item (U3's rating for item 101 there should be 2.5; I suspect a misprint in the book).
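As a concrete check of this formulation (using the co-occurrence matrix from the book's example, where row 103 over items 101-107 is [4 3 4 3 1 2 0], and user 3's preference vector [2.5 0 0 4.0 4.5 0 5.0]): U3's predicted score for item 103 is 4*2.5 + 3*4.0 + 1*4.5 = 26.5, which matches the final output for user 3 above.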

But when actually implementing it in code, the book recommends doing it this way (figure again omitted):

Doing it this way improves efficiency.
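The reason (my reading of the book's suggestion): a straight matrix-times-vector product would require assembling the whole co-occurrence matrix for every user. The column-wise formulation used by MR5 relies instead on the identity S * u = sum over rated items i of u_i * (column i of S). Each map() call only needs one item's column plus the preferences for that item, which is exactly what MR4 packed into a VectorAndPrefsWritable, and the combiner/reducer merely sum the partial vectors. That is why map multiplies (coo.times(prefValue)) while combine and reduce add (partial.plus(...)), a decomposition that fits MapReduce naturally.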





Share, be happy, grow

