Mahout Item-Based Collaborative Filtering: Source Code Analysis


1. Overview

Mahout ships two M/R job drivers that implement item-based collaborative filtering:

(1) ItemSimilarityJob

(2) RecommenderJob

Source location: org.apache.mahout.cf.taste.hadoop.item.RecommenderJob

The first stages of RecommenderJob are identical to ItemSimilarityJob. ItemSimilarityJob stops once it has computed the item similarity matrix, whereas RecommenderJob goes on to use that matrix to compute the top-N items to recommend to every user. RecommenderJob likewise takes input in [userID, itemID, preferenceValue] format. RecommenderJob is composed of the jobs described below.
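For concreteness, a hypothetical input file (fields split on tab or comma, matching the DELIMITER pattern shown in section 2.1.3):

1,101,5.0
1,102,3.0
2,101,2.0
2,103,4.5
3,101,2.5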

2. Phase 1:

This phase consists of three jobs:

1) Input is the raw data in <userid,itemid,preference> format. Computes an index for each itemid and keeps, per index, the smallest itemid that maps to it; output is <index,min(itemid)>.

2) Input is the raw data in <userid,itemid,preference> format. Builds the user matrix; output is <userid,vector<indexOfItemid,pref>>.

3) Input is the result of (2). Builds the item matrix; output is <indexOfItemid,vector<indexOfUserid,pref>>.

/* shouldRunNextPhase: if this phase has already run and its results were saved, the phase can be
   skipped by setting --startPhase; --endPhase likewise makes the program stop after a given phase. */

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
      "--input", getInputPath().toString(),
      "--output", prepPath.toString(),
      "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
      "--booleanData", String.valueOf(booleanData),
      "--tempDir", getTempPath().toString(),
    });

    numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
  }
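As a usage sketch (paths and the Mahout version are illustrative; the option names come from the AbstractJob/RecommenderJob drivers quoted here), the whole pipeline, including the phase controls just mentioned, is typically launched as:

hadoop jar mahout-core-0.9-job.jar org.apache.mahout.cf.taste.hadoop.item.RecommenderJob \
  --input /data/ratings.csv \
  --output /data/recommendations \
  --similarityClassname SIMILARITY_COSINE \
  --numRecommendations 10 \
  --startPhase 0 --endPhase 2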

2.1. PreparePreferenceMatrixJob

1) Input is the raw data in <userid,itemid,preference> format. Computes an index for each itemid and keeps, per index, the smallest itemid; output is <index,min(itemid)>.

 

Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX),
    TextInputFormat.class,
    ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class,
    ItemIDIndexReducer.class,
    VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
boolean succeeded = itemIDIndex.waitForCompletion(true);
if (!succeeded) {
  return -1;
}

2) Input is the raw data in <userid,itemid,preference> format. Builds the user matrix; output is <userid,vector<indexOfItemid,pref>>.

 

Job toUserVectors = prepareJob(getInputPath(),
                               getOutputPath(USER_VECTORS),
                               TextInputFormat.class,
                               ToItemPrefsMapper.class,
                               VarLongWritable.class,
                               booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                               ToUserVectorsReducer.class,
                               VarLongWritable.class,
                               VectorWritable.class,
                               SequenceFileOutputFormat.class);
toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
succeeded = toUserVectors.waitForCompletion(true);
if (!succeeded) {
  return -1;
}
// we need the number of users later
int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

 

3) Input is the result of (2). Builds the item matrix; output is <indexOfItemid,vector<indexOfUserid,pref>>.

 

Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
    ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
    IntWritable.class, VectorWritable.class);
toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

succeeded = toItemVectors.waitForCompletion(true);
if (!succeeded) {
  return -1;
}

2.1.1. itemIDIndex: ItemIDIndexMapper

// Extracts the item field from the input file and computes its index; output is <indexOfItemid, itemid>.

public final class ItemIDIndexMapper extends
    Mapper<LongWritable, Text, VarIntWritable, VarLongWritable> {

  // Whether userid and itemid are swapped in the input file: if false, lines are <userid,itemid,pref>;
  // if true, lines are <itemid,userid,pref>.
  private boolean transpose;

  private final VarIntWritable indexWritable = new VarIntWritable();
  private final VarLongWritable itemIDWritable = new VarLongWritable();

  @Override
  protected void setup(Context context) {
    Configuration jobConf = context.getConfiguration();
    // Read transpose from the configuration; the default is false.
    transpose = jobConf.getBoolean(ToEntityPrefsMapper.TRANSPOSE_USER_ITEM, false);
  }

  @Override
  protected void map(LongWritable key,
                     Text value,
                     Context context) throws IOException, InterruptedException {
    String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
    // If transpose is false, itemid is the second token; otherwise it is the first.
    long itemID = Long.parseLong(tokens[transpose ? 0 : 1]);
    // Compute the index for this itemid.
    int index = TasteHadoopUtils.idToIndex(itemID);
    indexWritable.set(index);
    itemIDWritable.set(itemID);
    // Emit <indexOfItemid, itemid>.
    context.write(indexWritable, itemIDWritable);
  }
}

2.1.2. itemIDIndex: ItemIDIndexReducer

// For each index, keeps the smallest itemid that maps to it.

public final class ItemIDIndexReducer extends
    Reducer<VarIntWritable, VarLongWritable, VarIntWritable, VarLongWritable> {

  private final VarLongWritable minimumItemIDWritable = new VarLongWritable();

  @Override
  protected void reduce(VarIntWritable index,
                        Iterable<VarLongWritable> possibleItemIDs,
                        Context context) throws IOException, InterruptedException {
    // Initialize to the largest possible value.
    long minimumItemID = Long.MAX_VALUE;
    for (VarLongWritable varLongWritable : possibleItemIDs) {
      long itemID = varLongWritable.get();
      if (itemID < minimumItemID) {
        // Keep only the minimum.
        minimumItemID = itemID;
      }
    }
    if (minimumItemID != Long.MAX_VALUE) {
      minimumItemIDWritable.set(minimumItemID);
      // Emit <indexOfItem, min(itemidsOfIndex)>.
      context.write(index, minimumItemIDWritable);
    }
  }
}
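Why keep a minimum at all? idToIndex folds a 64-bit ID into a non-negative int, so distinct itemids can collide on one index, and the reducer must pick a deterministic representative. A minimal sketch of such a fold (hypothetical re-implementation for illustration; the real TasteHadoopUtils.idToIndex may differ in detail):

// Hypothetical stand-in for TasteHadoopUtils.idToIndex, for illustration only.
static int idToIndex(long itemID) {
  // XOR the two halves of the long into an int, then clear the sign bit
  // so the result is a valid non-negative vector index.
  return 0x7FFFFFFF & ((int) itemID ^ (int) (itemID >>> 32));
}

Keeping min(itemid) per index gives later stages (for example MostSimilarItemPairsMapper, which maps indexes back to real item IDs) a stable inverse mapping.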

2.1.3. toUserVectors: ToItemPrefsMapper

// Reads the input file and emits records in <userid,<itemid,pref>> format.

public final class ToItemPrefsMapper extends ToEntityPrefsMapper {

  public ToItemPrefsMapper() {
    super(false);
  }
}

 

public abstract class ToEntityPrefsMapper extends
    Mapper<LongWritable, Text, VarLongWritable, VarLongWritable> {

  public static final String TRANSPOSE_USER_ITEM = ToEntityPrefsMapper.class + "transposeUserItem";
  public static final String RATING_SHIFT = ToEntityPrefsMapper.class + "shiftRatings";

  // Regex pattern used to split the input.
  private static final Pattern DELIMITER = Pattern.compile("[\t,]");

  // Whether the pref rating is boolean-valued: true if booleanData is set, false otherwise.
  private boolean booleanData;

  // Whether userid and itemid are swapped in the input file.
  private boolean transpose;
  private final boolean itemKey;
  private float ratingShift;

  ToEntityPrefsMapper(boolean itemKey) {
    this.itemKey = itemKey;
  }

  @Override
  protected void setup(Context context) {
    Configuration jobConf = context.getConfiguration();
    booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false);
    ratingShift = Float.parseFloat(jobConf.get(RATING_SHIFT, "0.0"));
  }

  @Override
  public void map(LongWritable key,
                  Text value,
                  Context context) throws IOException, InterruptedException {
    String[] tokens = DELIMITER.split(value.toString());
    long userID = Long.parseLong(tokens[0]);
    long itemID = Long.parseLong(tokens[1]);

    if (itemKey ^ transpose) {
      // If userid and itemid are swapped in the input, swap them back.
      long temp = userID;
      userID = itemID;
      itemID = temp;
    }

    if (booleanData) {
      // With boolean-valued ratings, emit only <userid,itemid>.
      context.write(new VarLongWritable(userID), new VarLongWritable(itemID));
    } else {
      // If the input carries a rating, add ratingShift to it; otherwise use 1.0.
      float prefValue = tokens.length > 2 ? Float.parseFloat(tokens[2]) + ratingShift : 1.0f;
      // Emit <userid,<itemid,pref>>.
      context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue));
    }
  }
}

 

2.1.4. toUserVectors: ToUserVectorsReducer

// Collects, per userid, all of that user's itemids and prefs into one vector.

public final class ToUserVectorsReducer extends
    Reducer<VarLongWritable, VarLongWritable, VarLongWritable, VectorWritable> {

  public static final String MIN_PREFERENCES_PER_USER = ToUserVectorsReducer.class.getName()
      + ".minPreferencesPerUser";

  // Minimum number of ratings per user.
  private int minPreferences;

  // Counter for the number of users.
  public enum Counters { USERS }

  private final VectorWritable userVectorWritable = new VectorWritable();

  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    super.setup(ctx);
    // Read minPreferences from the configuration; the default is 1.
    minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 1);
  }

  @Override
  protected void reduce(VarLongWritable userID,
                        Iterable<VarLongWritable> itemPrefs,
                        Context context) throws IOException, InterruptedException {
    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (VarLongWritable itemPref : itemPrefs) {
      // Compute the index for each itemid.
      int index = TasteHadoopUtils.idToIndex(itemPref.get());
      // If the map output used boolean ratings, set the rating to 1.
      float value = itemPref instanceof EntityPrefWritable ? ((EntityPrefWritable) itemPref).getPrefValue() : 1.0f;
      // The itemid's index is the position in the user vector; the rating is the value at that position.
      userVector.set(index, value);
    }

    // Emit only users with at least minPreferences ratings; drop the rest.
    if (userVector.getNumNondefaultElements() >= minPreferences) {
      userVectorWritable.set(userVector);
      userVectorWritable.setWritesLaxPrecision(true);
      context.getCounter(Counters.USERS).increment(1);
      context.write(userID, userVectorWritable);
    }
  }
}
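For example, with minPreferences = 2, a user who rated item 1000 with 4.0 and item 2000 with 2.5 is emitted as <userID, {idToIndex(1000):4.0, idToIndex(2000):2.5}>, while a user with a single rating is dropped and never counted in Counters.USERS.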

 

2.1.5. toItemVectors: ToItemVectorsMapper

// Builds the item matrix.
public class ToItemVectorsMapper
    extends Mapper<VarLongWritable, VectorWritable, IntWritable, VectorWritable> {

  private final IntWritable itemID = new IntWritable();
  private final VectorWritable itemVectorWritable = new VectorWritable();

  @Override
  protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {
    Vector userRatings = vectorWritable.get();

    // Compute the index for this userid.
    int column = TasteHadoopUtils.idToIndex(rowIndex.get());

    // Serialize values with float (lax) rather than double precision to save space.
    itemVectorWritable.setWritesLaxPrecision(true);

    // Initialize the item vector: maximum size Integer.MAX_VALUE, initial capacity 1.
    Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
    for (Vector.Element elem : userRatings.nonZeroes()) {
      // Swap the roles of userid and itemid (transpose).
      itemID.set(elem.index());
      itemVector.setQuick(column, elem.get());
      itemVectorWritable.set(itemVector);

      // Emit <indexOfItemid, vector<indexOfUserid, pref>>.
      ctx.write(itemID, itemVectorWritable);

      // Reuse itemVector.
      itemVector.setQuick(elem.index(), 0.0);
    }
  }
}
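For example, the user vector <u1, {idx(i3):4.0, idx(i7):2.5}> is split into the partial item vectors <idx(i3), {idx(u1):4.0}> and <idx(i7), {idx(u1):2.5}>; the combiner and reducer below then merge all partials that share an item index into one complete item row.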

 

2.1.6. toItemVectors: ToItemVectorsReducer

public class ToItemVectorsReducer extends
    Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private final VectorWritable merged = new VectorWritable();

  @Override
  protected void reduce(IntWritable row, Iterable<VectorWritable> vectors, Context ctx)
    throws IOException, InterruptedException {

    merged.setWritesLaxPrecision(true);

    // Merge all partial vectors that share the same indexOfItemid.
    merged.set(VectorWritable.mergeToVector(vectors.iterator()));

    // Emit the complete <indexOfItemid, vector<indexOfUserid, pref>>.
    ctx.write(row, merged);
  }
}

 

3. Phase 2:

This phase consists of 5 jobs: the first 4 make up the RowSimilarityJob step and produce the similarity matrix; the 5th writes the similarity matrix out as a TextOutputFormat file.

4) Input is the result of (3). Counts, for each user, the number of observations in the data; output is vector<indexOfUserid, countOfUser>.

5) Input is the result of (3). Downsamples the input and emits the transpose of the sampled matrix, <indexOfUserid, vector<indexOfItemid, pref>>, together with a norm value per sampled item, vector<indexOfItemid, norm(itemid)>. If the user set a non-default threshold, it also emits each sampled item's user count and maximum pref: vector<indexOfItemid, numNonZeroEntries> and vector<indexOfItemid, maxValue>.

6) Input is the output of (5). The transposed matrix is the job's input proper; the other vectors are passed as file paths through the configuration and read directly from those paths. Produces the co-occurrence matrix (the upper triangle of the similarity matrix); output is <indexOfItemA, vector<indexOfItemB, similarityValue>>. If the user set a non-default threshold, entries whose similarityValue falls below it are dropped, and the threshold together with the side files decides whether the similarity of itemA and itemB is computed at all.

7) Input is the output of (6). Produces the complete similarity matrix, keeping per item only the top-k most similar items; output is <indexOfItemA, vector<indexOfItemB, similarityValue>>.

8) Input is the output of (7). Writes the similarity matrix as a TextOutputFormat file; output is <<itemA,itemB>, similarityValue>.

 

if (shouldRunNextPhase(parsedArgs, currentPhase)) {

  /* If phase 1 already ran and the program stopped afterwards, the phase is skipped on re-execution,
     but numberOfUsers is not carried over, so it has to be recomputed. */
  if (numberOfUsers == -1) {
    numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
            PathType.LIST, null, getConf());
  }

  // Compute the co-occurrence matrix.
  ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
    "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
    "--output", similarityMatrixPath.toString(),
    "--numberOfColumns", String.valueOf(numberOfUsers),
    "--similarityClassname", similarityClassname,
    "--maxObservationsPerRow", String.valueOf(maxPrefsInItemSimilarity),
    "--maxObservationsPerColumn", String.valueOf(maxPrefsInItemSimilarity),
    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
    "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
    "--threshold", String.valueOf(threshold),
    "--randomSeed", String.valueOf(randomSeed),
    "--tempDir", getTempPath().toString(),
  });

  // If the user specified outputPathForSimilarityMatrix, also write the similarity matrix there.
  if (hasOption("outputPathForSimilarityMatrix")) {
    Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

    Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
        SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
        EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
        EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);

    Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
    mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
        new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
    mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
    outputSimilarityMatrix.waitForCompletion(true);
  }
}

3.1. RowSimilarityJob

RowSimilarityJob consists of 4 jobs.

4) Input is the result of (3). Counts, for each user, the number of observations; output is vector<indexOfUserid, countOfUser>.

 

Job countObservations = prepareJob(getInputPath(), getTempPath("notUsed"), CountObservationsMapper.class,
    NullWritable.class, VectorWritable.class, SumObservationsReducer.class, NullWritable.class,
    VectorWritable.class);
countObservations.setCombinerClass(VectorSumCombiner.class);
countObservations.getConfiguration().set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
countObservations.setNumReduceTasks(1);
countObservations.waitForCompletion(true);

 

5) Input is the result of (3). Downsamples the input and emits the transpose of the sampled matrix, <indexOfUserid, vector<indexOfItemid, pref>>, together with a norm value per sampled item, vector<indexOfItemid, norm(itemid)>. If the user set a non-default threshold, it also emits each sampled item's user count and maximum pref: vector<indexOfItemid, numNonZeroEntries> and vector<indexOfItemid, maxValue>.

 

if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class, IntWritable.class,
      VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
  normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
  Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
  normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
  normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
  normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
  normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
  normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
  normsAndTransposeConf.set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
  normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_ROW, String.valueOf(maxObservationsPerRow));
  normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_COLUMN, String.valueOf(maxObservationsPerColumn));
  normsAndTransposeConf.set(RANDOM_SEED, String.valueOf(randomSeed));

  boolean succeeded = normsAndTranspose.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

6) Input is the output of (5). The transposed matrix is the job's input proper; the other vectors are passed as file paths through the configuration and read directly from those paths. Produces the co-occurrence matrix (the upper triangle of the similarity matrix); output is <indexOfItemA, vector<indexOfItemB, similarityValue>>. If the user set a non-default threshold, entries whose similarityValue falls below it are dropped, and the threshold together with the side files decides whether the similarity of itemA and itemB is computed at all.

 

if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
      IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class, VectorWritable.class);
  pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
  Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
  pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
  pairwiseConf.set(NORMS_PATH, normsPath.toString());
  pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
  pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
  pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
  pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
  pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
  boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

 

7) Input is the output of (6). Produces the complete similarity matrix, keeping per item only the top-k most similar items; output is <indexOfItemA, vector<indexOfItemB, similarityValue>>.

 

if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,
      IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class,
      VectorWritable.class);
  asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
  asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
  boolean succeeded = asMatrix.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

3.1.1. countObservations: CountObservationsMapper

public static class CountObservationsMapper extends
    Mapper<IntWritable, VectorWritable, NullWritable, VectorWritable> {

  private Vector columnCounts = new RandomAccessSparseVector(Integer.MAX_VALUE);

  @Override
  protected void map(IntWritable rowIndex, VectorWritable rowVectorWritable, Context ctx)
    throws IOException, InterruptedException {

    Vector row = rowVectorWritable.get();
    for (Vector.Element elem : row.nonZeroes()) {
      // The vector position is indexOfUserid; the value at that position is that user's observation count.
      columnCounts.setQuick(elem.index(), columnCounts.getQuick(elem.index()) + 1);
    }
  }

  @Override
  protected void cleanup(Context ctx) throws IOException, InterruptedException {
    // Emit this map task's partial counts.
    ctx.write(NullWritable.get(), new VectorWritable(columnCounts));
  }
}

3.1.2. countObservations: SumObservationsReducer

public static class SumObservationsReducer extends
    Reducer<NullWritable, VectorWritable, NullWritable, VectorWritable> {
  @Override
  protected void reduce(NullWritable nullWritable, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {

    // Sum the partial vectors element-wise.
    Vector counts = Vectors.sum(partialVectors.iterator());

    // There is a single result vector and the number of reducers is 1, so the job's reduce output path
    // is not used; instead the result is written to a file directly through the HDFS API.
    Vectors.write(counts, new Path(ctx.getConfiguration().get(OBSERVATIONS_PER_COLUMN_PATH)), ctx.getConfiguration());
  }
}

 

3.1.3. normsAndTranspose: VectorNormMapper

public static class VectorNormMapper extends
    Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private VectorSimilarityMeasure similarity;
  private Vector norms;
  private Vector nonZeroEntries;
  private Vector maxValues;
  private double threshold;

  private OpenIntIntHashMap observationsPerColumn;
  private int maxObservationsPerRow;
  private int maxObservationsPerColumn;

  private Random random;

  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {

    Configuration conf = ctx.getConfiguration();

    // The distance measure used to compute similarities.
    similarity = ClassUtils.instantiateAs(conf.get(SIMILARITY_CLASSNAME), VectorSimilarityMeasure.class);

    // The norms vector stores the norm of every indexOfItemid.
    norms = new RandomAccessSparseVector(Integer.MAX_VALUE);

    // Number of users per item.
    nonZeroEntries = new RandomAccessSparseVector(Integer.MAX_VALUE);

    // Maximum rating per item.
    maxValues = new RandomAccessSparseVector(Integer.MAX_VALUE);
    threshold = Double.parseDouble(conf.get(THRESHOLD));

    // Read the per-user observation counts, i.e. how many distinct items each user touched.
    observationsPerColumn = Vectors.readAsIntMap(new Path(conf.get(OBSERVATIONS_PER_COLUMN_PATH)), conf);
    maxObservationsPerRow = conf.getInt(MAX_OBSERVATIONS_PER_ROW, DEFAULT_MAX_OBSERVATIONS_PER_ROW);
    maxObservationsPerColumn = conf.getInt(MAX_OBSERVATIONS_PER_COLUMN, DEFAULT_MAX_OBSERVATIONS_PER_COLUMN);

    // Read the random seed from the configuration.
    long seed = Long.parseLong(conf.get(RANDOM_SEED));
    if (seed == NO_FIXED_RANDOM_SEED) {
      random = RandomUtils.getRandom();
    } else {
      random = RandomUtils.getRandom(seed);
    }
  }

  // The sampling function, i.e. the sampling strategy.
  private Vector sampleDown(Vector rowVector, Context ctx) {

    // Number of non-default (non-zero) entries in the vector, i.e. the number of users for this item.
    int observationsPerRow = rowVector.getNumNondefaultElements();

    /* If the number of users for this indexOfItemid is at most maxObservationsPerRow, the row rate
       is 1; otherwise it is maxObservationsPerRow / observationsPerRow. */
    double rowSampleRate = (double) Math.min(maxObservationsPerRow, observationsPerRow) / (double) observationsPerRow;

    Vector downsampledRow = rowVector.like();
    long usedObservations = 0;
    long neglectedObservations = 0;

    for (Vector.Element elem : rowVector.nonZeroes()) {

      // Observation count of the user indexOfUserid.
      int columnCount = observationsPerColumn.get(elem.index());

      /* If columnCount is at most maxObservationsPerColumn, the column rate is 1;
         otherwise it is maxObservationsPerColumn / columnCount. */
      double columnSampleRate = (double) Math.min(maxObservationsPerColumn, columnCount) / (double) columnCount;

      // The row and column rates jointly decide whether the value at (indexOfItem, indexOfUserid) is kept.
      if (random.nextDouble() <= Math.min(rowSampleRate, columnSampleRate)) {
        downsampledRow.setQuick(elem.index(), elem.get());
        usedObservations++;
      } else {
        neglectedObservations++;
      }
    }

    ctx.getCounter(Counters.USED_OBSERVATIONS).increment(usedObservations);
    ctx.getCounter(Counters.NEGLECTED_OBSERVATIONS).increment(neglectedObservations);

    return downsampledRow;
  }

  @Override
  protected void map(IntWritable row, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {

    // Downsample the input vector and work with the sampled copy.
    Vector sampledRowVector = sampleDown(vectorWritable.get(), ctx);
    // With Euclidean distance, rowVector and sampledRowVector are identical.
    Vector rowVector = similarity.normalize(sampledRowVector);

    int numNonZeroEntries = 0;
    double maxValue = Double.MIN_VALUE;

    for (Vector.Element element : rowVector.nonZeroes()) {
      RandomAccessSparseVector partialColumnVector = new RandomAccessSparseVector(Integer.MAX_VALUE);
      partialColumnVector.setQuick(row.get(), element.get());

      // Emit <indexOfUserid, <indexOfItemid, pref>>, i.e. the transpose of the sampled vector.
      ctx.write(new IntWritable(element.index()), new VectorWritable(partialColumnVector));

      numNonZeroEntries++;
      if (maxValue < element.get()) {
        maxValue = element.get();
      }
    }

    if (threshold != NO_THRESHOLD) {

      // Number of non-default entries in the sampled vector.
      nonZeroEntries.setQuick(row.get(), numNonZeroEntries);

      // Maximum value in the sampled vector.
      maxValues.setQuick(row.get(), maxValue);
    }
    // Compute the norm of this indexOfItemid.
    norms.setQuick(row.get(), similarity.norm(rowVector));

    ctx.getCounter(Counters.ROWS).increment(1);
  }

  @Override
  protected void cleanup(Context ctx) throws IOException, InterruptedException {
    ctx.write(new IntWritable(NORM_VECTOR_MARKER), new VectorWritable(norms));
    ctx.write(new IntWritable(NUM_NON_ZERO_ENTRIES_VECTOR_MARKER), new VectorWritable(nonZeroEntries));
    ctx.write(new IntWritable(MAXVALUE_VECTOR_MARKER), new VectorWritable(maxValues));
  }
}
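A worked example of the sampling rates: with maxObservationsPerRow = 500, an item rated by 800 users gets rowSampleRate = 500/800 = 0.625; if one of those raters has touched 2000 items while maxObservationsPerColumn = 500, that cell's columnSampleRate is 500/2000 = 0.25, so the observation survives with probability min(0.625, 0.25) = 0.25.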

 

3.1.4. normsAndTranspose: MergeVectorsCombiner

private static class MergeVectorsCombiner extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
  @Override
  protected void reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {

    // Merge all values belonging to the same key.
    ctx.write(row, new VectorWritable(Vectors.merge(partialVectors)));
  }
}

 

3.1.5. normsAndTranspose: MergeVectorsReducer

public static class MergeVectorsReducer extends
    Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private Path normsPath;
  private Path numNonZeroEntriesPath;
  private Path maxValuesPath;

  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    normsPath = new Path(ctx.getConfiguration().get(NORMS_PATH));
    numNonZeroEntriesPath = new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH));
    maxValuesPath = new Path(ctx.getConfiguration().get(MAXVALUES_PATH));
  }

  @Override
  protected void reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {

    // Merge all vectors belonging to the same key.
    Vector partialVector = Vectors.merge(partialVectors);

    if (row.get() == NORM_VECTOR_MARKER) {
      // Write the norms vector to its designated file.
      Vectors.write(partialVector, normsPath, ctx.getConfiguration());
    } else if (row.get() == MAXVALUE_VECTOR_MARKER) {
      // Write the max-values vector to its designated file.
      Vectors.write(partialVector, maxValuesPath, ctx.getConfiguration());
    } else if (row.get() == NUM_NON_ZERO_ENTRIES_VECTOR_MARKER) {
      // Write the non-zero-counts vector to its designated file.
      Vectors.write(partialVector, numNonZeroEntriesPath, ctx.getConfiguration(), true);
    } else {
      // Emit the transposed vector.
      ctx.write(row, new VectorWritable(partialVector));
    }
  }
}

 

3.1.6. pairwiseSimilarity: CooccurrencesMapper

public static class CooccurrencesMapper extends
    Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private VectorSimilarityMeasure similarity;

  private OpenIntIntHashMap numNonZeroEntries;
  private Vector maxValues;
  private double threshold;

  private static final Comparator<Vector.Element> BY_INDEX = new Comparator<Vector.Element>() {
    @Override
    public int compare(Vector.Element one, Vector.Element two) {
      return Ints.compare(one.index(), two.index());
    }
  };

  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    similarity = ClassUtils.instantiateAs(ctx.getConfiguration().get(SIMILARITY_CLASSNAME),
        VectorSimilarityMeasure.class);
    numNonZeroEntries = Vectors.readAsIntMap(new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH)),
        ctx.getConfiguration());
    maxValues = Vectors.read(new Path(ctx.getConfiguration().get(MAXVALUES_PATH)), ctx.getConfiguration());
    threshold = Double.parseDouble(ctx.getConfiguration().get(THRESHOLD));
  }

  private boolean consider(Vector.Element occurrenceA, Vector.Element occurrenceB) {
    int numNonZeroEntriesA = numNonZeroEntries.get(occurrenceA.index());
    int numNonZeroEntriesB = numNonZeroEntries.get(occurrenceB.index());

    double maxValueA = maxValues.get(occurrenceA.index());
    double maxValueB = maxValues.get(occurrenceB.index());

    return similarity.consider(numNonZeroEntriesA, numNonZeroEntriesB, maxValueA, maxValueB, threshold);
  }

  @Override
  protected void map(IntWritable column, VectorWritable occurrenceVector, Context ctx)
    throws IOException, InterruptedException {

    // Turn the vector into an array.
    Vector.Element[] occurrences = Vectors.toArray(occurrenceVector);

    // Sort the array by position in the vector.
    Arrays.sort(occurrences, BY_INDEX);

    int cooccurrences = 0;
    int prunedCooccurrences = 0;
    for (int n = 0; n < occurrences.length; n++) {
      Vector.Element occurrenceA = occurrences[n];
      Vector dots = new RandomAccessSparseVector(Integer.MAX_VALUE);
      for (int m = n; m < occurrences.length; m++) {
        Vector.Element occurrenceB = occurrences[m];
        if (threshold == NO_THRESHOLD || consider(occurrenceA, occurrenceB)) {

          // Store the product of occurrenceA and occurrenceB at position occurrenceB.index().
          dots.setQuick(occurrenceB.index(),
              similarity.aggregate(occurrenceA.get(), occurrenceB.get()));
          cooccurrences++;
        } else {
          prunedCooccurrences++;
        }
      }

      // Emit <occurrenceA.index(), vector<occurrenceB.index(), prefOfA*prefOfB>>.
      ctx.write(new IntWritable(occurrenceA.index()), new VectorWritable(dots));
    }
    ctx.getCounter(Counters.COOCCURRENCES).increment(cooccurrences);
    ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(prunedCooccurrences);
  }
}

 

3.1.7. pairwiseSimilarity: SimilarityReducer

public static class SimilarityReducer extends
    Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private VectorSimilarityMeasure similarity;
  private int numberOfColumns;
  private boolean excludeSelfSimilarity;
  private Vector norms;
  private double treshold;

  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    similarity = ClassUtils.instantiateAs(ctx.getConfiguration().get(SIMILARITY_CLASSNAME),
        VectorSimilarityMeasure.class);
    numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
    Preconditions.checkArgument(numberOfColumns > 0, "Number of columns must be greater then 0! But numberOfColumns = " + numberOfColumns);
    excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);
    norms = Vectors.read(new Path(ctx.getConfiguration().get(NORMS_PATH)), ctx.getConfiguration());
    treshold = Double.parseDouble(ctx.getConfiguration().get(THRESHOLD));
  }

  @Override
  protected void reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx)
    throws IOException, InterruptedException {
    Iterator<VectorWritable> partialDotsIterator = partialDots.iterator();

    // Accumulate all vectors that share the same key.
    Vector dots = partialDotsIterator.next().get();
    while (partialDotsIterator.hasNext()) {
      Vector toAdd = partialDotsIterator.next().get();
      for (Element nonZeroElement : toAdd.nonZeroes()) {
        dots.setQuick(nonZeroElement.index(), dots.getQuick(nonZeroElement.index()) + nonZeroElement.get());
      }
    }

    Vector similarities = dots.like();
    double normA = norms.getQuick(row.get());
    for (Element b : dots.nonZeroes()) {

      // Compute the similarity of itemA and b.
      double similarityValue = similarity.similarity(b.get(), normA, norms.getQuick(b.index()), numberOfColumns);

      // Keep the similarity only if it is at least the threshold; drop it otherwise.
      if (similarityValue >= treshold) {
        similarities.set(b.index(), similarityValue);
      }
    }

    if (excludeSelfSimilarity) {
      // Set the similarity with itself to 0.
      similarities.setQuick(row.get(), 0);
    }
    // Emit, for itemA, the similarities of all items whose index is greater than indexOfItemA,
    // i.e. the overall output is an upper-triangular matrix.
    ctx.write(row, new VectorWritable(similarities));
  }
}
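To make aggregate/norm/similarity concrete, here is a toy measure in the shape of the VectorSimilarityMeasure contract, computing plain cosine similarity (illustration only; Mahout's real CosineSimilarity may distribute the work between normalize, norm, and similarity differently):

// Toy cosine measure; an illustration of the three-hook contract used above, not Mahout source.
final class ToyCosineMeasure {

  // aggregate: contribution of one co-occurring entry pair to the dot product
  // (called once per user shared by items A and B in CooccurrencesMapper).
  double aggregate(double valueA, double valueB) {
    return valueA * valueB;
  }

  // norm: precomputed once per item row in VectorNormMapper.
  double norm(double[] row) {
    double sumOfSquares = 0.0;
    for (double v : row) {
      sumOfSquares += v * v;
    }
    return Math.sqrt(sumOfSquares);
  }

  // similarity: combines the summed dot product with the two precomputed norms.
  double similarity(double dots, double normA, double normB, int numberOfColumns) {
    return dots / (normA * normB);
  }
}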

3.1.8. asMatrix: UnsymmetrifyMapper

// Builds the complete similarity matrix.
public static class UnsymmetrifyMapper extends
    Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private int maxSimilaritiesPerRow;

  @Override
  protected void setup(Mapper.Context ctx) throws IOException, InterruptedException {
    maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
    Preconditions.checkArgument(maxSimilaritiesPerRow > 0, "Maximum number of similarities per row must be greater then 0!");
  }

  @Override
  protected void map(IntWritable row, VectorWritable similaritiesWritable, Context ctx)
    throws IOException, InterruptedException {
    Vector similarities = similaritiesWritable.get();
    Vector transposedPartial = new RandomAccessSparseVector(similarities.size(), 1);
    /* Use a priority queue to find the top-k largest values in the vector, keeping value and position;
       dropping everything else saves space. */
    TopElementsQueue topKQueue = new TopElementsQueue(maxSimilaritiesPerRow);
    for (Element nonZeroElement : similarities.nonZeroes()) {
      MutableElement top = topKQueue.top();
      double candidateValue = nonZeroElement.get();
      if (candidateValue > top.get()) {
        top.setIndex(nonZeroElement.index());
        top.set(candidateValue);
        topKQueue.updateTop();
      }

      // Emit the transposed entry.
      transposedPartial.setQuick(row.get(), candidateValue);
      ctx.write(new IntWritable(nonZeroElement.index()), new VectorWritable(transposedPartial));
      // Reuse transposedPartial.
      transposedPartial.setQuick(row.get(), 0.0);
    }
    Vector topKSimilarities = new RandomAccessSparseVector(similarities.size(), maxSimilaritiesPerRow);
    for (Vector.Element topKSimilarity : topKQueue.getTopElements()) {
      topKSimilarities.setQuick(topKSimilarity.index(), topKSimilarity.get());
    }
    ctx.write(row, new VectorWritable(topKSimilarities));
  }
}
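For example, if the upper-triangular row of item 5 contains sim(5,9) = 0.8, this mapper emits the value twice: once as a top-k candidate for row 5 and once as the transposed partial <9, {5:0.8}>, so the reducer assembling row 9 sees it too; merging both directions is what turns the triangle back into the full symmetric matrix.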

3.1.9. asMatrix: MergeToTopKSimilaritiesReducer

public static class MergeToTopKSimilaritiesReducer
    extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private int maxSimilaritiesPerRow;

  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);
    Preconditions.checkArgument(maxSimilaritiesPerRow > 0, "Maximum number of similarities per row must be greater then 0!");
  }

  @Override
  protected void reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx)
    throws IOException, InterruptedException {

    // Merge the partial vectors.
    Vector allSimilarities = Vectors.merge(partials);
    // Keep only the top-k largest similarities.
    Vector topKSimilarities = Vectors.topKElements(maxSimilaritiesPerRow, allSimilarities);
    // Emit the result; together these rows form the complete similarity matrix.
    ctx.write(row, new VectorWritable(topKSimilarities));
  }
}

3.2. ItemSimilarityJob.MostSimilarItemPairsMapper

// Prints the similarity matrix in <<itemA,itemB>, similarity> format.
public static class MostSimilarItemPairsMapper
    extends Mapper<IntWritable, VectorWritable, EntityEntityWritable, DoubleWritable> {

  private OpenIntLongHashMap indexItemIDMap;
  private int maxSimilarItemsPerItem;

  @Override
  protected void setup(Context ctx) {
    Configuration conf = ctx.getConfiguration();
    maxSimilarItemsPerItem = conf.getInt(MAX_SIMILARITIES_PER_ITEM, -1);
    indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEM_ID_INDEX_PATH_STR), conf);

    Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem must be greater then 0!");
  }

  @Override
  protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
    throws IOException, InterruptedException {

    int itemIDIndex = itemIDIndexWritable.get();

    TopSimilarItemsQueue topKMostSimilarItems = new TopSimilarItemsQueue(maxSimilarItemsPerItem);

    for (Vector.Element element : similarityVector.get().nonZeroes()) {
      SimilarItem top = topKMostSimilarItems.top();
      double candidateSimilarity = element.get();
      if (candidateSimilarity > top.getSimilarity()) {
        top.set(indexItemIDMap.get(element.index()), candidateSimilarity);
        topKMostSimilarItems.updateTop();
      }
    }

    long itemID = indexItemIDMap.get(itemIDIndex);
    for (SimilarItem similarItem : topKMostSimilarItems.getTopItems()) {
      long otherItemID = similarItem.getItemID();
      if (itemID < otherItemID) {
        ctx.write(new EntityEntityWritable(itemID, otherItemID), new DoubleWritable(similarItem.getSimilarity()));
      } else {
        ctx.write(new EntityEntityWritable(otherItemID, itemID), new DoubleWritable(similarItem.getSimilarity()));
      }
    }
  }
}

 

3.3. ItemSimilarityJob.MostSimilarItemPairsReducer

public static class MostSimilarItemPairsReducer
    extends Reducer<EntityEntityWritable, DoubleWritable, EntityEntityWritable, DoubleWritable> {
  @Override
  protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)
    throws IOException, InterruptedException {
    ctx.write(pair, values.iterator().next());
  }
}

 

4. Phase 3:

This phase is a single job with two input paths, two different mappers, and one reducer.

1. The first input is the output of (7); its mapper sets each item's similarity with itself to NaN in the incoming similarity vector and emits <indexOfItemA, vector<indexOfItemB, similarityValue>>. The second input is the output of (2); its mapper emits the transpose <indexOfItemid, vector<userId, pref>>, restricted to the users in the recommend-users file if one was given, and covering all users otherwise. Note: this second mapper keeps only the N entries with the highest prefs; the pref of every other entry is set to NaN.

The reducer joins the two map outputs and produces, per item, its similar items plus the users and their ratings:
<indexOfItemid, <similarityMatrixColumnOfIndexOfItemid, userListOfIndexOfItemid, prefListOfIndexOfItemid>>

 

if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job partialMultiply = new Job(getConf(), "partialMultiply");
  Configuration partialMultiplyConf = partialMultiply.getConfiguration();

  MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
                              SimilarityMatrixRowWrapperMapper.class);
  MultipleInputs.addInputPath(partialMultiply, new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
      SequenceFileInputFormat.class, UserVectorSplitterMapper.class);
  partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
  partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
  partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
  partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
  partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
  partialMultiply.setOutputKeyClass(VarIntWritable.class);
  partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
  partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
  partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

  if (usersFile != null) {
    partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
  }
  partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

  boolean succeeded = partialMultiply.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

4.1. SimilarityMatrixRowWrapperMapper

public final class SimilarityMatrixRowWrapperMapper extends
    Mapper<IntWritable, VectorWritable, VarIntWritable, VectorOrPrefWritable> {

  private final VarIntWritable index = new VarIntWritable();
  private final VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();

  @Override
  protected void map(IntWritable key,
                     VectorWritable value,
                     Context context) throws IOException, InterruptedException {
    Vector similarityMatrixRow = value.get();

    /* Set the item's similarity with itself to NaN; entries with this value are excluded from the
       recommendation computation. */
    similarityMatrixRow.set(key.get(), Double.NaN);

    index.set(key.get());

    // In this mapper, vectorOrPref carries only the similarity vector.
    vectorOrPref.set(similarityMatrixRow);

    context.write(index, vectorOrPref);
  }
}

4.2. UserVectorSplitterMapper

public final class UserVectorSplitterMapper extends
    Mapper<VarLongWritable, VectorWritable, VarIntWritable, VectorOrPrefWritable> {

  private static final Logger log = LoggerFactory.getLogger(UserVectorSplitterMapper.class);

  static final String USERS_FILE = "usersFile";
  static final String MAX_PREFS_PER_USER_CONSIDERED = "maxPrefsPerUserConsidered";
  static final int DEFAULT_MAX_PREFS_PER_USER_CONSIDERED = 10;

  private int maxPrefsPerUserConsidered;
  private FastIDSet usersToRecommendFor;

  private final VarIntWritable itemIndexWritable = new VarIntWritable();
  private final VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();

  @Override
  protected void setup(Context context) throws IOException {
    Configuration jobConf = context.getConfiguration();
    maxPrefsPerUserConsidered = jobConf.getInt(MAX_PREFS_PER_USER_CONSIDERED, DEFAULT_MAX_PREFS_PER_USER_CONSIDERED);
    String usersFilePathString = jobConf.get(USERS_FILE);
    if (usersFilePathString != null) {
      FSDataInputStream in = null;
      try {
        Path unqualifiedUsersFilePath = new Path(usersFilePathString);
        FileSystem fs = FileSystem.get(unqualifiedUsersFilePath.toUri(), jobConf);
        usersToRecommendFor = new FastIDSet();
        Path usersFilePath = unqualifiedUsersFilePath.makeQualified(fs);
        in = fs.open(usersFilePath);
        for (String line : new FileLineIterable(in)) {
          try {
            usersToRecommendFor.add(Long.parseLong(line));
          } catch (NumberFormatException nfe) {
            log.warn("usersFile line ignored: {}", line);
          }
        }
      } finally {
        Closeables.close(in, true);
      }
    }
  }

  @Override
  protected void map(VarLongWritable key,
                     VectorWritable value,
                     Context context) throws IOException, InterruptedException {
    long userID = key.get();
    if (usersToRecommendFor != null && !usersToRecommendFor.contains(userID)) {
      return;
    }

    // Prune this userid's vector, keeping only the K highest-rated entries.
    Vector userVector = maybePruneUserVector(value.get());

    for (Element e : userVector.nonZeroes()) {
      itemIndexWritable.set(e.index());

      // In this mapper, vectorOrPref carries only the userid and the pref.
      vectorOrPref.set(userID, (float) e.get());
      context.write(itemIndexWritable, vectorOrPref);
    }
  }

  private Vector maybePruneUserVector(Vector userVector) {
    if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {
      return userVector;
    }

    float smallestLargeValue = findSmallestLargeValue(userVector);

    /* Set the values of the entries that are not kept to NaN; the similarity vectors matched to such
       entries take no part in the recommendation computation. */
    for (Element e : userVector.nonZeroes()) {
      float absValue = Math.abs((float) e.get());
      if (absValue < smallestLargeValue) {
        e.set(Float.NaN);
      }
    }

    return userVector;
  }

  private float findSmallestLargeValue(Vector userVector) {

    PriorityQueue<Float> topPrefValues = new PriorityQueue<Float>(maxPrefsPerUserConsidered) {
      @Override
      protected boolean lessThan(Float f1, Float f2) {
        return f1 < f2;
      }
    };

    for (Element e : userVector.nonZeroes()) {
      float absValue = Math.abs((float) e.get());
      topPrefValues.insertWithOverflow(absValue);
    }
    return topPrefValues.top();
  }
}

4.3. ToVectorAndPrefReducer

public final class ToVectorAndPrefReducer extends
    Reducer<VarIntWritable, VectorOrPrefWritable, VarIntWritable, VectorAndPrefsWritable> {

  private final VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();

  @Override
  protected void reduce(VarIntWritable key,
                        Iterable<VectorOrPrefWritable> values,
                        Context context) throws IOException, InterruptedException {

    List<Long> userIDs = Lists.newArrayList();
    List<Float> prefValues = Lists.newArrayList();
    Vector similarityMatrixColumn = null;
    for (VectorOrPrefWritable value : values) {
      if (value.getVector() == null) {
        // Then this is a user-pref value
        userIDs.add(value.getUserID());
        prefValues.add(value.getValue());
      } else {
        // Then this is the column vector
        if (similarityMatrixColumn != null) {
          throw new IllegalStateException("Found two similarity-matrix columns for item index " + key.get());
        }
        similarityMatrixColumn = value.getVector();
      }
    }

    if (similarityMatrixColumn == null) {
      return;
    }

    // vectorAndPrefs holds the itemid's similarity vector plus all of its users and their ratings.
    vectorAndPrefs.set(similarityMatrixColumn, userIDs, prefValues);
    context.write(key, vectorAndPrefs);
  }
}

 

5. Phase 4:

This phase consists of two jobs:

1. If the user specified a filter file, this job runs with that file as input. For each user-item pair in the filter file, the corresponding entry of the item's similarity vector is set to NaN (used for filtering later); output is <indexOfItemid, <similarityMatrixColumnOfIndexOfItemid, userListOfIndexOfItemid, prefListOfIndexOfItemid>>. It serves to filter recommended items.

2. Computes each user's recommended items. After computing the results and before saving them, it filters the recommendations for all users against the items file, and additionally applies the filter-file results as a second filter. Produces <userid, List<itemid,value>>; the prediction formula is sketched right after this list.
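Written out, the estimate that reduceNonBooleanData in AggregateAndRecommendReducer (section 5.4) accumulates per candidate item i for user u is the standard item-based prediction:

\[ \mathrm{pred}(u,i) = \frac{\sum_{j} \mathrm{sim}(i,j)\,\mathrm{pref}(u,j)}{\sum_{j} \lvert \mathrm{sim}(i,j) \rvert} \]

where j ranges over the items u has rated that survived the earlier pruning; the numerators and denominators vectors in that reducer hold exactly these two sums.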

 

if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  //filter out any users we don't care about
  /* convert the user/item pairs to filter if a filterfile has been specified */
  if (filterFile != null) {
    Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
            ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
            ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
            SequenceFileOutputFormat.class);
    boolean succeeded = itemFiltering.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  String aggregateAndRecommendInput = partialMultiplyPath.toString();
  if (filterFile != null) {
    aggregateAndRecommendInput += "," + explicitFilterPath;
  }

  Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
      ? SequenceFileOutputFormat.class : TextOutputFormat.class;

  //extract out the recommendations
  Job aggregateAndRecommend = prepareJob(
          new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
          PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
          AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
          outputFormat);
  Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
  if (itemsFile != null) {
    aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
  }

  if (filterFile != null) {
    setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
  }
  setIOSort(aggregateAndRecommend);
  aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
  aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
  aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
  boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

5.1. itemFiltering: ItemFilterMapper

// Reads the filter file and emits <itemid, userid>.
public class ItemFilterMapper extends Mapper<LongWritable, Text, VarLongWritable, VarLongWritable> {

  private static final Pattern SEPARATOR = Pattern.compile("[\t,]");

  private final VarLongWritable itemIDWritable = new VarLongWritable();
  private final VarLongWritable userIDWritable = new VarLongWritable();

  @Override
  protected void map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException {
    String[] tokens = SEPARATOR.split(line.toString());
    long userID = Long.parseLong(tokens[0]);
    long itemID = Long.parseLong(tokens[1]);
    itemIDWritable.set(itemID);
    userIDWritable.set(userID);
    ctx.write(itemIDWritable, userIDWritable);
  }
}

 

5.2. itemFiltering: ItemFilterAsVectorAndPrefsReducer

/* Builds a vectorAndPrefs whose similarity vector carries NaN at the itemid's own position, marking
   that this itemid must not appear in the recommendation list. */

public class ItemFilterAsVectorAndPrefsReducer
    extends Reducer<VarLongWritable, VarLongWritable, VarIntWritable, VectorAndPrefsWritable> {

  private final VarIntWritable itemIDIndexWritable = new VarIntWritable();
  private final VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();

  @Override
  protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)
    throws IOException, InterruptedException {

    int itemIDIndex = TasteHadoopUtils.idToIndex(itemID.get());
    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);

    /* For the users in userIDs, the entry of itemIDIndex's similarity vector at its own position is
       set to NaN, so the item is skipped when the recommendation vector is computed. */
    vector.set(itemIDIndex, Double.NaN);

    List<Long> userIDs = Lists.newArrayList();
    List<Float> prefValues = Lists.newArrayList();
    for (VarLongWritable userID : values) {
      userIDs.add(userID.get());
      prefValues.add(1.0f);
    }

    itemIDIndexWritable.set(itemIDIndex);
    vectorAndPrefs.set(vector, userIDs, prefValues);
    ctx.write(itemIDIndexWritable, vectorAndPrefs);
  }
}
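A hypothetical filter file feeding this mapper/reducer pair (the same [\t,] delimiters as the ratings input; tokens[0] is the userID, tokens[1] the itemID):

3,102
5,101

Each line means "never recommend this item to this user", e.g. item 102 to user 3; the NaN planted above guarantees writeRecommendedItems later skips those entries.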

5.3. PartialMultiplyMapper

/* Gathers, for every userid, all of the prefs and similarity vectors that participate in computing
   that user's recommendation vector, producing <userid, List<pref, vector<indexOfItemid, similarityValue>>>. */

public final class PartialMultiplyMapper extends
    Mapper<VarIntWritable, VectorAndPrefsWritable, VarLongWritable, PrefAndSimilarityColumnWritable> {

  private final VarLongWritable userIDWritable = new VarLongWritable();
  private final PrefAndSimilarityColumnWritable prefAndSimilarityColumn = new PrefAndSimilarityColumnWritable();

  @Override
  protected void map(VarIntWritable key,
                     VectorAndPrefsWritable vectorAndPrefsWritable,
                     Context context) throws IOException, InterruptedException {

    Vector similarityMatrixColumn = vectorAndPrefsWritable.getVector();
    List<Long> userIDs = vectorAndPrefsWritable.getUserIDs();
    List<Float> prefValues = vectorAndPrefsWritable.getValues();

    for (int i = 0; i < userIDs.size(); i++) {
      long userID = userIDs.get(i);
      float prefValue = prefValues.get(i);

      // A NaN pref takes no part in computing the recommendation value.
      if (!Float.isNaN(prefValue)) {
        prefAndSimilarityColumn.set(prefValue, similarityMatrixColumn);
        userIDWritable.set(userID);
        context.write(userIDWritable, prefAndSimilarityColumn);
      }
    }
  }
}

5.4. AggregateAndRecommendReducer

public final class AggregateAndRecommendReducer extends
    Reducer<VarLongWritable, PrefAndSimilarityColumnWritable, VarLongWritable, RecommendedItemsWritable> {

  private static final Logger log = LoggerFactory.getLogger(AggregateAndRecommendReducer.class);

  static final String ITEMID_INDEX_PATH = "itemIDIndexPath";
  static final String NUM_RECOMMENDATIONS = "numRecommendations";
  static final int DEFAULT_NUM_RECOMMENDATIONS = 10;
  static final String ITEMS_FILE = "itemsFile";

  private boolean booleanData;
  private int recommendationsPerUser;
  private FastIDSet itemsToRecommendFor;
  private OpenIntLongHashMap indexItemIDMap;

  private final RecommendedItemsWritable recommendedItems = new RecommendedItemsWritable();

  private static final float BOOLEAN_PREF_VALUE = 1.0f;

  @Override
  protected void setup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);
    booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf);

    String itemFilePathString = conf.get(ITEMS_FILE);
    if (itemFilePathString != null) {
      itemsToRecommendFor = new FastIDSet();
      for (String line : new FileLineIterable(HadoopUtil.openStream(new Path(itemFilePathString), conf))) {
        try {
          itemsToRecommendFor.add(Long.parseLong(line));
        } catch (NumberFormatException nfe) {
          log.warn("itemsFile line ignored: {}", line);
        }
      }
    }
  }

  @Override
  protected void reduce(VarLongWritable userID,
                        Iterable<PrefAndSimilarityColumnWritable> values,
                        Context context) throws IOException, InterruptedException {
    if (booleanData) {
      /* With boolean ratings the similarities are merely summed; otherwise the prediction formula applies. */
      reduceBooleanData(userID, values, context);
    } else {
      reduceNonBooleanData(userID, values, context);
    }
  }

  private void reduceBooleanData(VarLongWritable userID,
                                 Iterable<PrefAndSimilarityColumnWritable> values,
                                 Context context) throws IOException, InterruptedException {
    /* having boolean data, each estimated preference can only be 1,
     * however we can't use this to rank the recommended items,
     * so we use the sum of similarities for that. */

    Iterator<PrefAndSimilarityColumnWritable> columns = values.iterator();
    Vector predictions = columns.next().getSimilarityColumn();
    while (columns.hasNext()) {
      predictions.assign(columns.next().getSimilarityColumn(), Functions.PLUS);
    }
    writeRecommendedItems(userID, predictions, context);
  }

  private void reduceNonBooleanData(VarLongWritable userID,
                        Iterable<PrefAndSimilarityColumnWritable> values,
                        Context context) throws IOException, InterruptedException {
    /* each entry here is the sum in the numerator of the prediction formula */
    Vector numerators = null;
    /* each entry here is the sum in the denominator of the prediction formula */
    Vector denominators = null;
    /* each entry here is the number of similar items used in the prediction formula */
    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
      Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
      float prefValue = prefAndSimilarityColumn.getPrefValue();
      /* count the number of items used for each prediction */
      for (Element e : simColumn.nonZeroes()) {
        int itemIDIndex = e.index();
        numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
      }

      if (denominators == null) {
        denominators = simColumn.clone();
      } else {
        denominators.assign(simColumn, Functions.PLUS_ABS);
      }

      if (numerators == null) {
        numerators = simColumn.clone();
        if (prefValue != BOOLEAN_PREF_VALUE) {
          numerators.assign(Functions.MULT, prefValue);
        }
      } else {
        if (prefValue != BOOLEAN_PREF_VALUE) {
          simColumn.assign(Functions.MULT, prefValue);
        }
        numerators.assign(simColumn, Functions.PLUS);
      }
    }

    if (numerators == null) {
      return;
    }

    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (Element element : numerators.nonZeroes()) {
      int itemIDIndex = element.index();
      /* preference estimations must be based on at least 2 datapoints */
      if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
        /* compute normalized prediction */
        double prediction = element.get() / denominators.getQuick(itemIDIndex);
        recommendationVector.setQuick(itemIDIndex, prediction);
      }
    }
    /* Filter the recommendation vector and write out the K entries with the largest predicted values. */
    writeRecommendedItems(userID, recommendationVector, context);
  }

  /**
   * find the top entries in recommendationVector, map them to the real itemIDs and write back the result
   */
  private void writeRecommendedItems(VarLongWritable userID, Vector recommendationVector, Context context)
    throws IOException, InterruptedException {

    TopItemsQueue topKItems = new TopItemsQueue(recommendationsPerUser);

    for (Element element : recommendationVector.nonZeroes()) {
      int index = element.index();
      long itemID;
      if (indexItemIDMap != null && !indexItemIDMap.isEmpty()) {
        itemID = indexItemIDMap.get(index);
      } else { //we don't have any mappings, so just use the original
        itemID = index;
      }
      if (itemsToRecommendFor == null || itemsToRecommendFor.contains(itemID)) {
        float value = (float) element.get();

        /* A NaN value is never recommended; that is, items whose similarity-vector entry was set to
           NaN by the filter-file job do not get recommended. */
        if (!Float.isNaN(value)) {

          MutableRecommendedItem topItem = topKItems.top();
          if (value > topItem.getValue()) {
            topItem.set(itemID, value);
            topKItems.updateTop();
          }
        }
      }
    }

    List<RecommendedItem> topItems = topKItems.getTopItems();
    if (!topItems.isEmpty()) {
      recommendedItems.set(topItems);

      // Emit the userid and its recommendations.
      context.write(userID, recommendedItems);
    }
  }
}
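With the default TextOutputFormat, each output line pairs a userID with its top-N list; a hypothetical line (the exact rendering comes from RecommendedItemsWritable's text form) looks roughly like:

3	[104:4.2,102:3.9,107:3.1]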
