mahout基于项目的协同过滤源码分析

来源：互联网发布：淘宝上面卖什么好编辑：程序博客网时间：2024/05/16 00:48

1. 综述

Mahout提供2种 M/R 的job来实现基于项目（item-based）的协同过滤：
(1) ItemSimilarityJob
(2) RecommenderJob

源码包位置：org.apache.mahout.cf.taste.hadoop.item.RecommenderJob

RecommenderJob前几个阶段和ItemSimilarityJob是一样的，不过ItemSimilarityJob 计算出item的相似度矩阵就结束了，而RecommenderJob 会继续使用相似度矩阵，对每个user计算出应该推荐给他的top N 个items。RecommenderJob 的输入也是[userID, itemID, preferencevalue]格式的。RecommenderJob主要由以下一系列Job组成：

2. 第一阶段：

该阶段包含三个job：

（1）输入是原始数据，格式为：<userid,itemid,preference> ，计算每个itemid对应的index，并保存index及其对应的itemid中的最小值，输出为<index,min(itemid)>。

（2）输入为原始数据，格式为：<userid,itemid,preference> ，生成用户矩阵，输出为：<userid,vector<IndexOfItemid,pref>>。

（3）输入为（2）的结果，生成项目矩阵，输出为：<IndexOfItemid,vector<IndexOfUserId,pref>>

/*shouldRunNextPhase 这里的作用是：当这个阶段已经运行过，且结果已保存，则可以通过设置参数--startphase 跳过该阶段，也可以设置--endphase ，让程序在运行完某个阶段，停下来。*/

// Phase 1: build the preference matrix, unless the phase arguments say to skip it.
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
    "--input", getInputPath().toString(),
    "--output", prepPath.toString(),
    "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
    "--booleanData", String.valueOf(booleanData),
    "--tempDir", getTempPath().toString(),
  });

  // Read back the user count stored under NUM_USERS in the preparation output directory.
  numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
}

2.1.PreparePreferenceMatrixJob

（1）输入是原始数据，格式为：<userid,itemid,preference> ，计算每个itemid对应的index，并保存index及其对应的itemid中的最小值，输出为<index,min(itemid)>。

// Job 1: map every item ID to an int index and keep the smallest item ID per index.
Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX),
    TextInputFormat.class,
    ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class,
    ItemIDIndexReducer.class,
    VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
// Taking a minimum is associative, so the reducer can also serve as the combiner.
itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
boolean succeeded = itemIDIndex.waitForCompletion(true);
if (!succeeded) {
  return -1;
}

（2）输入为原始数据，格式为：<userid,itemid,preference> ，生成用户矩阵，输出为：<userid,vector<IndexOfItemid,pref>>。

// Job 2: group the raw preferences by user and build one sparse vector per user.
Job toUserVectors = prepareJob(getInputPath(),
    getOutputPath(USER_VECTORS),
    TextInputFormat.class,
    ToItemPrefsMapper.class,
    VarLongWritable.class,
    // with boolean data the map output value is a plain item ID instead of an (item, pref) pair
    booleanData ? VarLongWritable.class : EntityPrefWritable.class,
    ToUserVectorsReducer.class,
    VarLongWritable.class,
    VectorWritable.class,
    SequenceFileOutputFormat.class);
toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
succeeded = toUserVectors.waitForCompletion(true);
if (!succeeded) {
  return -1;
}

// we need the number of users later: take it from the reducer's USERS counter and persist it
int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

（3）输入为（2）的结果，生成项目矩阵，输出为：<IndexOfItemid,vector<IndexOfUserId,pref>>

// Job 3: transpose the user vectors into item vectors (the rating matrix).
Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
    ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
    IntWritable.class, VectorWritable.class);
toItemVectors.setCombinerClass(ToItemVectorsReducer.class);
succeeded = toItemVectors.waitForCompletion(true);
if (!succeeded) {
  return -1;
}

2.1.1.itemIDIndex：ItemIDIndexMapper

//提取输入文件中的item项，并计算其对应的index，输出为 <indexOfitemid，itemid>

publicfinalclassItemIDIndexMapperextends

Mapper<LongWritable,Text, VarIntWritable, VarLongWritable> {

//输入文件中的userid与itemid是否相反，如果是false，则输入文件为<userid,itemid,pref>

// 如果是true，则输入文件为<itemid,userid,pref>

privatebooleantranspose;

privatefinal VarIntWritable indexWritable = new VarIntWritable();

privatefinal VarLongWritable itemIDWritable = new VarLongWritable();

@Override

protectedvoid setup(Context context) {

Configuration jobConf = context.getConfiguration();

//从配置文件中提取transpose的值，默认是false

transpose = jobConf.getBoolean(ToEntityPrefsMapper.TRANSPOSE_USER_ITEM, false);

}

@Override

protectedvoid map(LongWritable key,

Text value,

Context context) throws IOException, InterruptedException {

String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());

//如果transpose 是false，则itemid为第二项，否则，为第一项。

longitemID = Long.parseLong(tokens[transpose ? 0 : 1]);

//计算itemid对应的index

intindex = TasteHadoopUtils.idToIndex(itemID);

indexWritable.set(index);

itemIDWritable.set(itemID);

//输出<indexofitemid,itemid>

context.write(indexWritable, itemIDWritable);

}

2.1.2. itemIDIndex：ItemIDIndexReducer

//生成同一个index对应的itemid的最小值

publicfinalclassItemIDIndexReducerextends

Reducer<VarIntWritable, VarLongWritable, VarIntWritable,VarLongWritable> {

privatefinal VarLongWritable minimumItemIDWritable = new VarLongWritable();

@Override

protectedvoid reduce(VarIntWritable index,

Iterable<VarLongWritable> possibleItemIDs,

Context context) throws IOException, InterruptedException {

//初始化一个最大值

longminimumItemID = Long.MAX_VALUE;

for (VarLongWritable varLongWritable : possibleItemIDs) {

longitemID = varLongWritable.get();

if (itemID < minimumItemID) {

//只保存最小最

minimumItemID = itemID;

}

if (minimumItemID != Long.MAX_VALUE) {

minimumItemIDWritable.set(minimumItemID);

//输出<indexofitem,,min<itemidofindex>>

context.write(index, minimumItemIDWritable);

}

2.1.3.toUserVectors :ToItemPrefsMapper

//读取输入文件，并以<userid,<itemid,pref>>格式输出

publicfinalclassToItemPrefsMapperextends ToEntityPrefsMapper {

public ToItemPrefsMapper() {

super(false);

}

publicabstractclassToEntityPrefsMapperextends

Mapper<LongWritable,Text, VarLongWritable,VarLongWritable> {

publicstaticfinal String TRANSPOSE_USER_ITEM = ToEntityPrefsMapper.class + "transposeUserItem";

publicstaticfinal String RATING_SHIFT = ToEntityPrefsMapper.class + "shiftRatings";

//用于分割输入的正则模式

privatestaticfinal Pattern DELIMITER = Pattern.compile("[\t,]");

//输入文件中是否用布尔类型值定义pref评分，如果booleanData为true，是，否则，不是。

privatebooleanbooleanData;

// 输入文件中userid与itemid是否相反

privatebooleantranspose;

privatefinalbooleanitemKey;

privatefloatratingShift;

ToEntityPrefsMapper(booleanitemKey) {

this.itemKey = itemKey;

}

@Override

protectedvoid setup(Context context) {

Configuration jobConf = context.getConfiguration();

booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);

transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false);

ratingShift = Float.parseFloat(jobConf.get(RATING_SHIFT, "0.0"));

}

@Override

publicvoid map(LongWritable key,

Text value,

Context context) throws IOException, InterruptedException {

String[] tokens = DELIMITER.split(value.toString());

longuserID = Long.parseLong(tokens[0]);

longitemID = Long.parseLong(tokens[1]);

if (itemKey ^ transpose) {

//如果输入文件userid与itemid相反，则互换

longtemp = userID;

userID = itemID;

itemID = temp;

}

if (booleanData) {

//如果采用布尔类型值定义评分pref，则仅输出<userid,itemid>

context.write(new VarLongWritable(userID), new VarLongWritable(itemID));

} else {

//如果输入文件中有评分值，将输入文件中的评分项都加ratingshift，否则，加1。

floatprefValue = tokens.length > 2 ? Float.parseFloat(tokens[2]) + ratingShift : 1.0f;

//输出为<userid,<itemid,pref>>

context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue));

}

2.1.4.toUserVectors :ToUserVectorsReducer

//生成userid 对应的所有itemid即pref。

publicfinalclassToUserVectorsReducerextends

Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {

publicstaticfinal String MIN_PREFERENCES_PER_USER = ToUserVectorsReducer.class.getName()

+ ".minPreferencesPerUser";

//评分个数阈值

privateintminPreferences;

//user个数计数器

publicenum Counters { USERS }

privatefinal VectorWritable userVectorWritable = new VectorWritable();

@Override

protectedvoid setup(Context ctx) throws IOException, InterruptedException {

super.setup(ctx);

//从配置文件中读取minPreferences 的值，默认是1

minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 1);

}

@Override

protectedvoid reduce(VarLongWritable userID,

Iterable<VarLongWritable> itemPrefs,

Context context) throws IOException, InterruptedException {

Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

for (VarLongWritable itemPref : itemPrefs) {

//计算每个itemid对应的index

intindex = TasteHadoopUtils.idToIndex(itemPref.get());

//如果评分采用布尔值使得map输出，则将相应的评分设为1。

floatvalue = itemPrefinstanceof EntityPrefWritable ? ((EntityPrefWritable) itemPref).getPrefValue() : 1.0f;

//itemid对应的index为用户向量的位置，评分为用户向量在该位置上的值

userVector.set(index, value);

}

//将评分个数大于minPreferences的项输出，小于的丢弃

if (userVector.getNumNondefaultElements() >= minPreferences) {

userVectorWritable.set(userVector);

userVectorWritable.setWritesLaxPrecision(true);

context.getCounter(Counters.USERS).increment(1);

context.write(userID, userVectorWritable);

}

2.1.5. toItemVectors :ToItemVectorsMapper

//生成项目矩阵

publicclassToItemVectorsMapper

extends Mapper<VarLongWritable,VectorWritable,IntWritable,VectorWritable> {

privatefinal IntWritable itemID = new IntWritable();

privatefinal VectorWritable itemVectorWritable = new VectorWritable();

@Override

protectedvoid map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)

throws IOException, InterruptedException {

Vector userRatings = vectorWritable.get();

//计算userid对应的index

intcolumn = TasteHadoopUtils.idToIndex(rowIndex.get());

itemVectorWritable.setWritesLaxPrecision(true);//暂时不明白

//初始化itemvector，最大容量是Integer.MAX_VALUE，初始容量为1

Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);

for (Vector.Element elem : userRatings.nonZeroes()) {

//将userid与itemid进行对调

itemID.set(elem.index());

itemVector.setQuick(column, elem.get());

itemVectorWritable.set(itemVector);

//输出<indexofitemid,vector<indexofuserid,pref>>

ctx.write(itemID, itemVectorWritable);

// 重用itemvector

itemVector.setQuick(elem.index(), 0.0);

}

2.1.6. toItemVectors :ToItemVectorsReducer

publicclassToItemVectorsReducerextends

Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {

privatefinal VectorWritable merged = new VectorWritable();

@Override

protectedvoid reduce(IntWritable row, Iterable<VectorWritable> vectors, Context ctx)

throws IOException, InterruptedException {

merged.setWritesLaxPrecision(true);

//将相同indexofitemid下的对应项合并

merged.set(VectorWritable.mergeToVector(vectors.iterator()));

//输出完整的<indexofitemid,vector<indexofuserid,pref>>

ctx.write(row, merged);

}

3. 第二阶段：

该阶段包含5个job：其中前4个属于RowSimilarityJob步骤，生成相似度矩阵，第五个为输出TextOutputFormat文件格式的相似度矩阵。

（4）输入为（3）的结果，统计数据中不同user的个数，输出为：vector<IndexOfUserid，CountOfUser>

（5）输入为（3）的结果，对输入进行选样，输出（3）的样本对应的转置，

<IndexOfUserid,vector<IndexOfItemid,pref>>，输出每个样本项目对应的norm值，

vector<IndexOfItemid,norm(itemid)>，如果用户设定threshold参数，而不是默认值，则还输出每个样本项目对应的用户数及对应的pref最大值，

vector<IndexOfItemid,numNonZeroEntries> ,vector<IndexOfItemid,maxValue>。

（6）输入为（5）的输出，其中转置矩阵作为job的输入，其它利用configure的参数传递文件地址，进而通过地址操作文件，生成共生矩阵（相似度矩阵上三角），输出为：<IndexOfItemA,vector<IndexOfItemB,similarityValue>>。如果用户设定threshold参数，而不是默认值，则similarityValue小于threshold的对应项被丢弃。由threshold参数和文件地址中的内容共同决定是否计算ItemA与 ItemB的相似度。

（7）输入为（6）的输出，生成完整的相似度矩阵，并保留每个项目对应的相似度最大的topk项，输出为：<IndexOfItemA,vector<IndexOfItemB,similarityValue>>。

（8）输入为（7）的输出，生成相似度矩阵的TextOutputFormat文件，输出为：<<ItemA,ItemB>,similarityValue>。

// Phase 2: compute the item-item similarity matrix and optionally export it as text.
if (shouldRunNextPhase(parsedArgs, currentPhase)) {

  // -1 means the user count was not loaded in this run; recount it from the stored user vectors
  if (numberOfUsers == -1) {
    numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
        PathType.LIST, null, getConf());
  }

  // compute the pairwise similarities of the rows of the rating matrix
  ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
    "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
    "--output", similarityMatrixPath.toString(),
    "--numberOfColumns", String.valueOf(numberOfUsers),
    "--similarityClassname", similarityClassname,
    "--maxObservationsPerRow", String.valueOf(maxPrefsInItemSimilarity),
    "--maxObservationsPerColumn", String.valueOf(maxPrefsInItemSimilarity),
    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
    "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
    "--threshold", String.valueOf(threshold),
    "--randomSeed", String.valueOf(randomSeed),
    "--tempDir", getTempPath().toString(),
  });

  // write out the similarity matrix as text only if the user asked for it
  if (hasOption("outputPathForSimilarityMatrix")) {
    Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

    Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
        SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
        EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
        EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);

    Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
    // the mapper needs the index -> item ID mapping produced in phase 1
    mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
        new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
    mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
    outputSimilarityMatrix.waitForCompletion(true);
  }
}

3.1. RowSimilarityJob

RowSimilarityJob由4个job组成

（4）输入为（3）的结果，统计数据中不同user的个数，输出为：vector<IndexOfUserid，CountOfUser>

// Job 4: count, for every column index, in how many rows it has a non-zero entry.
Job countObservations = prepareJob(getInputPath(), getTempPath("notUsed"), CountObservationsMapper.class,
    NullWritable.class, VectorWritable.class, SumObservationsReducer.class, NullWritable.class,
    VectorWritable.class);
countObservations.setCombinerClass(VectorSumCombiner.class);
// the reducer writes its result to this path itself; the regular job output ("notUsed") stays unused
countObservations.getConfiguration().set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
countObservations.setNumReduceTasks(1);
countObservations.waitForCompletion(true);

（5）输入为（3）的结果，对输入进行选样，输出（3）的样本对应的转置，

<IndexOfUserid,vector<IndexOfItemid,pref>>，输出每个样本项目对应的norm值，

vector<IndexOfItemid,norm(itemid)>，如果用户设定threshold参数，而不是默认值，则还输出每个样本项目对应的用户数及对应的pref最大值，

vector<IndexOfItemid,numNonZeroEntries> ,vector<IndexOfItemid,maxValue>。

// Job 5: sample the rows, compute their norms and emit the transposed matrix.
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class, IntWritable.class,
      VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
  normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
  Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
  normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
  // side outputs (norms, non-zero counts, max values) are written to these paths by the reducer
  normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
  normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
  normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
  normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
  normsAndTransposeConf.set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
  normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_ROW, String.valueOf(maxObservationsPerRow));
  normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_COLUMN, String.valueOf(maxObservationsPerColumn));
  normsAndTransposeConf.set(RANDOM_SEED, String.valueOf(randomSeed));

  boolean succeeded = normsAndTranspose.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

// Job 6: compute the pairwise similarities from the transposed matrix.
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
      IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class, VectorWritable.class);
  pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
  Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
  pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
  // hand over the locations of the side files written by the previous job
  pairwiseConf.set(NORMS_PATH, normsPath.toString());
  pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
  pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
  pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
  pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
  pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);

  boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

（7）输入为（6）的输出，生成完整的相似度矩阵，并保留每个项目对应的相似度最大的topk项，输出为：<IndexOfItemA,vector<IndexOfItemB,similarityValue>>。

if (shouldRunNextPhase(parsedArgs, currentPhase)) {

Job asMatrix= prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,

IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class,

VectorWritable.class);

asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);

asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);

booleansucceeded = asMatrix.waitForCompletion(true);

if (!succeeded) {

return -1;

}

3.1.1.countObservations :CountObservationsMapper

publicstaticclassCountObservationsMapperextends

Mapper<IntWritable,VectorWritable,NullWritable,VectorWritable> {

private Vector columnCounts = new RandomAccessSparseVector(Integer.MAX_VALUE);

@Override

protectedvoid map(IntWritable rowIndex, VectorWritable rowVectorWritable, Context ctx)

throws IOException, InterruptedException {

Vector row = rowVectorWritable.get();

for (Vector.Element elem : row.nonZeroes()) {

//向量的位置是indexofuserid，位置上的内容是index是indexofuserid的user的个数

columnCounts.setQuick(elem.index(), columnCounts.getQuick(elem.index()) + 1);

}

@Override

protectedvoid cleanup(Context ctx) throws IOException, InterruptedException {

//输出map段的统计结果

ctx.write(NullWritable.get(), new VectorWritable(columnCounts));

}

3.1.2.countObservations :SumObservationsReducer

publicstaticclass SumObservationsReducer extends

Reducer<NullWritable,VectorWritable,NullWritable,VectorWritable> {

@Override

protectedvoid reduce(NullWritable nullWritable, Iterable<VectorWritable> partialVectors, Context ctx)

throws IOException, InterruptedException {

//vector对应内容累加

Vector counts = Vectors.sum(partialVectors.iterator());

//由于最后的向量只有一个，reduce的num设为1，因此，这里没有用job的reduce输出路径，而是通过hdfs api 直接将结果写入文件中。

Vectors.write(counts, new

Path(ctx.getConfiguration().get(OBSERVATIONS_PER_COLUMN_PATH)), ctx.getConfiguration());

}

3.1.3. normsAndTranspose :VectorNormMapper

publicstaticclass VectorNormMapper extends

Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {

private VectorSimilarityMeasure similarity;

private Vector norms;

private Vector nonZeroEntries;

private Vector maxValues;

privatedoublethreshold;

private OpenIntIntHashMap observationsPerColumn;

privateintmaxObservationsPerRow;

privateintmaxObservationsPerColumn;

private Random random;

@Override

protectedvoid setup(Context ctx) throws IOException, InterruptedException {

Configuration conf = ctx.getConfiguration();

//计算相似度时使用的距离计算方式

similarity = ClassUtils.instantiateAs(conf.get(SIMILARITY_CLASSNAME), VectorSimilarityMeasure.class);

//norm向量用于存放每个indexofitemid对应的norm值

norms = new RandomAccessSparseVector(Integer.MAX_VALUE);

//每个item对应的用户数向量

nonZeroEntries = new RandomAccessSparseVector(Integer.MAX_VALUE);

//每个item对应的评分最大值

maxValues = new RandomAccessSparseVector(Integer.MAX_VALUE);

threshold = Double.parseDouble(conf.get(THRESHOLD));

//读取每个用户出现次数的向量，即每个用户购买不同产品数

observationsPerColumn = Vectors.readAsIntMap(new

Path(conf.get(OBSERVATIONS_PER_COLUMN_PATH)), conf);

maxObservationsPerRow = conf.getInt(MAX_OBSERVATIONS_PER_ROW, DEFAULT_MAX_OBSERVATIONS_PER_ROW);

maxObservationsPerColumn = conf.getInt(MAX_OBSERVATIONS_PER_COLUMN, DEFAULT_MAX_OBSERVATIONS_PER_COLUMN);

//从配置文件中读取随机种子的值

longseed = Long.parseLong(conf.get(RANDOM_SEED));

if (seed == NO_FIXED_RANDOM_SEED) {

random = RandomUtils.getRandom();

} else {

random = RandomUtils.getRandom(seed);

}

//采样函数，即采样的策略

private Vector sampleDown(Vector rowVector, Context ctx) {

//计算向量的非默认值的个数，也就是非0个数，即项目对应的用户数

intobservationsPerRow = rowVector.getNumNondefaultElements();

/* 当indexofitemid对应的用户个数小于maxObservationsPerRow，行概率为1，否则，行概率为maxObservationsPerRow/observationsPerRow。*/

doublerowSampleRate = (double) Math.min(maxObservationsPerRow, observationsPerRow) / (double) observationsPerRow;

Vector downsampledRow = rowVector.like();

longusedObservations = 0;

longneglectedObservations = 0;

for (Vector.Element elem : rowVector.nonZeroes()) {

//用户indexofuserid对应的出现次数

intcolumnCount = observationsPerColumn.get(elem.index());

/*如果columnCount小于maxObservationsPerColumn时，列概率为1，否则，列概率为maxObservationsPerColumn / columnCount。*/

doublecolumnSampleRate = (double) Math.min(maxObservationsPerColumn, columnCount) / (double) columnCount;

//由行、列概率共同决定对应indexofitem，indexofuserid上的值是否保留

if (random.nextDouble() <= Math.min(rowSampleRate, columnSampleRate)) {

downsampledRow.setQuick(elem.index(), elem.get());

usedObservations++;

} else {

neglectedObservations++;

}

ctx.getCounter(Counters.USED_OBSERVATIONS).increment(usedObservations);

ctx.getCounter(Counters.NEGLECTED_OBSERVATIONS).increment(neglectedObservations);

returndownsampledRow;

}

@Override

protectedvoid map(IntWritable row, VectorWritable vectorWritable, Context ctx)

throws IOException, InterruptedException {

//对输入的向量进行采样处理，并返回采样后的向量

Vector sampledRowVector = sampleDown(vectorWritable.get(), ctx);

//如果是欧几里得距离，则rowVector 与sampledRowVector相同

Vector rowVector = similarity.normalize(sampledRowVector);

intnumNonZeroEntries = 0;

doublemaxValue = Double.MIN_VALUE;

for (Vector.Element element : rowVector.nonZeroes()) {

RandomAccessSparseVector partialColumnVector = new

RandomAccessSparseVector(Integer.MAX_VALUE);

partialColumnVector.setQuick(row.get(), element.get());

//输出<indexofuserid,<indexofitemid,pref>>,即采样向量进行转置

ctx.write(new IntWritable(element.index()), new

VectorWritable(partialColumnVector));

numNonZeroEntries++;

if (maxValue < element.get()) {

maxValue = element.get();

}

if (threshold != NO_THRESHOLD) {

//采样向量的非默认值个数

nonZeroEntries.setQuick(row.get(), numNonZeroEntries);

//采样向量中最大值

maxValues.setQuick(row.get(), maxValue);

}

//计算indexofitemid对应的norm值

norms.setQuick(row.get(), similarity.norm(rowVector));

ctx.getCounter(Counters.ROWS).increment(1);

}

@Override

protectedvoid cleanup(Context ctx) throws IOException, InterruptedException {

ctx.write(new IntWritable(NORM_VECTOR_MARKER), new VectorWritable(norms));

ctx.write(new IntWritable(NUM_NON_ZERO_ENTRIES_VECTOR_MARKER), new

VectorWritable(nonZeroEntries));

ctx.write(new IntWritable(MAXVALUE_VECTOR_MARKER), new

VectorWritable(maxValues));

}

3.1.4. normsAndTranspose :MergeVectorsCombiner

privatestaticclass MergeVectorsCombiner extends Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {

@Override

protectedvoid reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)

throws IOException, InterruptedException {

//相同key值对应的value合并

ctx.write(row, new VectorWritable(Vectors.merge(partialVectors)));

}

3.1.5.normsAndTranspose :MergeVectorsReducer

publicstaticclass MergeVectorsReducer extends

Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {

private Path normsPath;

private Path numNonZeroEntriesPath;

private Path maxValuesPath;

@Override

protectedvoid setup(Context ctx) throws IOException, InterruptedException {

normsPath = new Path(ctx.getConfiguration().get(NORMS_PATH));

numNonZeroEntriesPath = new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH));

maxValuesPath = new Path(ctx.getConfiguration().get(MAXVALUES_PATH));

}

@Override

protectedvoid reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)

throws IOException, InterruptedException {

//相同的key值，进行合并

Vector partialVector = Vectors.merge(partialVectors);

if (row.get() == NORM_VECTOR_MARKER) {

//将norm向量结果写入指定的路径文件中

Vectors.write(partialVector, normsPath, ctx.getConfiguration());

} elseif (row.get() == MAXVALUE_VECTOR_MARKER) {

//将最大值向量写入指定的路径文件中

Vectors.write(partialVector, maxValuesPath, ctx.getConfiguration());

} elseif (row.get() == NUM_NON_ZERO_ENTRIES_VECTOR_MARKER) {

//将非0个数向量写入指定的文件中

Vectors.write(partialVector, numNonZeroEntriesPath, ctx.getConfiguration(), true);

} else {

//输出装置向量

ctx.write(row, new VectorWritable(partialVector));

}

3.1.6. pairwiseSimilarity :CooccurrencesMapper

publicstaticclass CooccurrencesMapper extends

Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {

private VectorSimilarityMeasure similarity;

private OpenIntIntHashMap numNonZeroEntries;

private Vector maxValues;

privatedoublethreshold;

privatestaticfinal Comparator<Vector.Element> BY_INDEX = new Comparator<Vector.Element>() {

@Override

publicint compare(Vector.Element one, Vector.Element two) {

return Ints.compare(one.index(), two.index());

}

};

@Override

protectedvoid setup(Context ctx) throws IOException, InterruptedException {

similarity = ClassUtils.instantiateAs(ctx.getConfiguration().get(SIMILARITY_CLASSNAME),

VectorSimilarityMeasure.class);

numNonZeroEntries = Vectors.readAsIntMap(new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH)),

ctx.getConfiguration());

maxValues = Vectors.read(new Path(ctx.getConfiguration().get(MAXVALUES_PATH)), ctx.getConfiguration());

threshold = Double.parseDouble(ctx.getConfiguration().get(THRESHOLD));

}

privateboolean consider(Vector.Element occurrenceA, Vector.Element occurrenceB) {

intnumNonZeroEntriesA = numNonZeroEntries.get(occurrenceA.index());

intnumNonZeroEntriesB = numNonZeroEntries.get(occurrenceB.index());

doublemaxValueA = maxValues.get(occurrenceA.index());

doublemaxValueB = maxValues.get(occurrenceB.index());

returnsimilarity.consider(numNonZeroEntriesA, numNonZeroEntriesB, maxValueA, maxValueB, threshold);

}

@Override

protectedvoid map(IntWritable column, VectorWritable occurrenceVector, Context ctx)

throws IOException, InterruptedException {

//将向量变成数组

Vector.Element[] occurrences = Vectors.toArray(occurrenceVector);

//将数组按照其在向量中的位置进行排序

Arrays.sort(occurrences, BY_INDEX);

intcooccurrences = 0;

intprunedCooccurrences = 0;

for (intn = 0; n < occurrences.length; n++) {

Vector.Element occurrenceA = occurrences[n];

Vector dots = new RandomAccessSparseVector(Integer.MAX_VALUE);

for (intm = n; m < occurrences.length; m++) {

Vector.Element occurrenceB = occurrences[m];

if (threshold == NO_THRESHOLD || consider(occurrenceA, occurrenceB)) {

//向量在occurrenceB.index()的位置上放occurrenceA与occurrenceB的乘积

dots.setQuick(occurrenceB.index(),

similarity.aggregate(occurrenceA.get(), occurrenceB.get()));

cooccurrences++;

} else {

prunedCooccurrences++;

}

//输出<occurrenceA.index(),vector<occurrenceB.index(),prefofA*prefofB>>

ctx.write(new IntWritable(occurrenceA.index()), new VectorWritable(dots));

}

ctx.getCounter(Counters.COOCCURRENCES).increment(cooccurrences);

ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(prunedCooccurrences);

}

3.1.7.pairwiseSimilarity :SimilarityReducer

publicstaticclass SimilarityReducer extends

Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {

private VectorSimilarityMeasure similarity;

privateintnumberOfColumns;

privatebooleanexcludeSelfSimilarity;

private Vector norms;

privatedoubletreshold;

@Override

protectedvoid setup(Context ctx) throws IOException, InterruptedException {

similarity =

ClassUtils.instantiateAs(ctx.getConfiguration().get(SIMILARITY_CLASSNAME),

VectorSimilarityMeasure.class);

numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);

Preconditions.checkArgument(numberOfColumns > 0, "Number of columns must be greater then 0! But numberOfColumns = " + numberOfColumns);

excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);

norms = Vectors.read(new Path(ctx.getConfiguration().get(NORMS_PATH)), ctx.getConfiguration());

treshold = Double.parseDouble(ctx.getConfiguration().get(THRESHOLD));

}

@Override

protectedvoid reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx)

throws IOException, InterruptedException {

Iterator<VectorWritable> partialDotsIterator = partialDots.iterator();

//相同key值的vector做累加

Vector dots = partialDotsIterator.next().get();

while (partialDotsIterator.hasNext()) {

Vector toAdd = partialDotsIterator.next().get();

for (Element nonZeroElement : toAdd.nonZeroes()) {

dots.setQuick(nonZeroElement.index(), dots.getQuick(nonZeroElement.index()) + nonZeroElement.get());

}

Vector similarities = dots.like();

doublenormA = norms.getQuick(row.get());

for (Element b : dots.nonZeroes()) {

//计算itemA与b的相似度

doublesimilarityValue = similarity.similarity(b.get(), normA, norms.getQuick(b.index()), numberOfColumns);

//如果相似度大于等于阈值treshold，则保存，否则，丢弃

if (similarityValue >= treshold) {

similarities.set(b.index(), similarityValue);

}

if (excludeSelfSimilarity) {

//将与自己的相似度设为0

similarities.setQuick(row.get(), 0);

}

//输出itemA 与index大于indexofitemA的项目的相似度，即整体输出一个上三角矩阵

ctx.write(row, new VectorWritable(similarities));

}

3.1.8. asMatrix :UnsymmetrifyMapper

//生成完整的相似度矩阵

publicstaticclass UnsymmetrifyMapper extends

Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {

privateintmaxSimilaritiesPerRow;

@Override

protectedvoid setup(Mapper.Contextctx) throws IOException, InterruptedException {

maxSimilaritiesPerRow =

ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);

Preconditions.checkArgument(maxSimilaritiesPerRow > 0, "Maximum number of similarities per row must be greater then 0!");

}

@Override

protectedvoid map(IntWritable row, VectorWritable similaritiesWritable, Context ctx)

throws IOException, InterruptedException {

Vector similarities = similaritiesWritable.get();

Vector transposedPartial = new RandomAccessSparseVector(similarities.size(), 1);

/*利用优先队列找到向量中值最大的前topK 项，并保存值与位置信息，这里，去掉无用项，为了节约空间*/

TopElementsQueue topKQueue = new TopElementsQueue(maxSimilaritiesPerRow);

for (Element nonZeroElement : similarities.nonZeroes()) {

MutableElement top = topKQueue.top();

doublecandidateValue = nonZeroElement.get();

if (candidateValue > top.get()) {

top.setIndex(nonZeroElement.index());

top.set(candidateValue);

topKQueue.updateTop();

}

//将向量进行转置并输出

transposedPartial.setQuick(row.get(), candidateValue);

ctx.write(new IntWritable(nonZeroElement.index()), new

VectorWritable(transposedPartial));

//重用transposedPartial

transposedPartial.setQuick(row.get(), 0.0);

}

Vector topKSimilarities = new RandomAccessSparseVector(similarities.size(), maxSimilaritiesPerRow);

for (Vector.Element topKSimilarity : topKQueue.getTopElements()) {

topKSimilarities.setQuick(topKSimilarity.index(), topKSimilarity.get());

}

ctx.write(row, new VectorWritable(topKSimilarities));

}

3.1.9.asMatrix :MergeToTopKSimilaritiesReducer

publicstaticclass MergeToTopKSimilaritiesReducer

extends Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {

privateintmaxSimilaritiesPerRow;

@Override

protectedvoid setup(Context ctx) throws IOException, InterruptedException {

maxSimilaritiesPerRow = ctx.getConfiguration().getInt(MAX_SIMILARITIES_PER_ROW, 0);

Preconditions.checkArgument(maxSimilaritiesPerRow > 0, "Maximum number of similarities per row must be greater then 0!");

}

@Override

protectedvoid reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx)

throws IOException, InterruptedException {

//合并向量

Vector allSimilarities = Vectors.merge(partials);

//保存相似度最大的topk项

Vector topKSimilarities = Vectors.topKElements(maxSimilaritiesPerRow, allSimilarities);

//输出结果，最终生成整体的相似度矩阵

ctx.write(row, new VectorWritable(topKSimilarities));

}

3.2. ItemSimilarityJob.MostSimilarItemPairsMapper

//相似度矩阵打印出来，格式为<<itemA,itemB>,similarity>

publicstaticclassMostSimilarItemPairsMapper

extends Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {

private OpenIntLongHashMap indexItemIDMap;

privateintmaxSimilarItemsPerItem;

@Override

protectedvoid setup(Context ctx) {

Configuration conf = ctx.getConfiguration();

maxSimilarItemsPerItem = conf.getInt(MAX_SIMILARITIES_PER_ITEM, -1);

indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEM_ID_INDEX_PATH_STR), conf);

Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem must be greater then 0!");

}

@Override

protectedvoid map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)

throws IOException, InterruptedException {

intitemIDIndex = itemIDIndexWritable.get();

TopSimilarItemsQueue topKMostSimilarItems = new

TopSimilarItemsQueue(maxSimilarItemsPerItem);

for (Vector.Element element : similarityVector.get().nonZeroes()) {

SimilarItem top = topKMostSimilarItems.top();

doublecandidateSimilarity = element.get();

if (candidateSimilarity > top.getSimilarity()) {

top.set(indexItemIDMap.get(element.index()), candidateSimilarity);

topKMostSimilarItems.updateTop();

}

longitemID = indexItemIDMap.get(itemIDIndex);

for (SimilarItem similarItem : topKMostSimilarItems.getTopItems()) {

longotherItemID = similarItem.getItemID();

if (itemID < otherItemID) {

ctx.write(new EntityEntityWritable(itemID, otherItemID), new DoubleWritable(similarItem.getSimilarity()));

} else {

ctx.write(new EntityEntityWritable(otherItemID, itemID), new DoubleWritable(similarItem.getSimilarity()));

}

3.3.ItemSimilarityJob.MostSimilarItemPairsReducer

publicstaticclass MostSimilarItemPairsReducer

extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {

@Override

protectedvoid reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)

throws IOException, InterruptedException {

ctx.write(pair, values.iterator().next());

}

4. 第三阶段：

该阶段由一个Job完成，该Job由两个输入路径，两种不同的map，一种reduce组成。

第一个输入为（7）的输出，对应第一个map，将输入的相似度向量中与自己的相似度设为NaN，输出为：<IndexOfItemA,vector<IndexOfItemB,similarityValue>>；第二个输入为（2）的输出，对应第二个map，如果存在推荐用户表，则只针对需要推荐的用户生成对应的转置<IndexOfItemid,vector<userId,pref>>，如果不存在推荐用户表，则对所有用户生成对应的转置<IndexOfItemid,vector<userId,pref>>。注意：第二个map中只保留pref最大的前N项对应的IndexOfItemid，其他的IndexOfItemid对应的pref设为NaN。

Reduce将上面两种map的输出作为输入，生成项目对应的相似项，用户及评分，输出为：

<IndexOfItemid,<similarityMatrixColumnOfIndexOfItemid,UserListOfIndexOfItemid,prefListOfIndexOfItemid>>。

// Phase 3 driver: one job with two inputs (similarity matrix rows and user vectors), each with
// its own mapper, joined by ToVectorAndPrefReducer.
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  Job partialMultiply = new Job(getConf(), "partialMultiply");
  Configuration partialMultiplyConf = partialMultiply.getConfiguration();

  // input 1: the similarity matrix, wrapped row by row
  MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
                              SimilarityMatrixRowWrapperMapper.class);
  // input 2: the user vectors produced by the preparation phase
  MultipleInputs.addInputPath(partialMultiply, new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
                              SequenceFileInputFormat.class, UserVectorSplitterMapper.class);
  partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
  partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
  partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
  partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
  partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
  partialMultiply.setOutputKeyClass(VarIntWritable.class);
  partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
  partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
  partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

  // restrict the users to recommend for only when a users file was supplied
  if (usersFile != null) {
    partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
  }
  partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

  boolean succeeded = partialMultiply.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

4.1. SimilarityMatrixRowWrapperMapper

publicfinalclassSimilarityMatrixRowWrapperMapperextends

Mapper<IntWritable,VectorWritable,VarIntWritable,VectorOrPrefWritable> {

privatefinal VarIntWritable index = new VarIntWritable();

privatefinal VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();

@Override

protectedvoid map(IntWritable key,

VectorWritable value,

Context context) throws IOException, InterruptedException {

Vector similarityMatrixRow = value.get();

/* 将自身的相似度设为NAN，这样的值对应的项目的相似度向量，将不参加推荐运算*/

similarityMatrixRow.set(key.get(), Double.NaN);

index.set(key.get());

//这种map的vectorOrPref只保存相似度向量

vectorOrPref.set(similarityMatrixRow);

context.write(index, vectorOrPref);

}

4.2.UserVectorSplitterMapper

publicfinalclassUserVectorSplitterMapperextends

Mapper<VarLongWritable,VectorWritable, VarIntWritable,VectorOrPrefWritable> {

privatestaticfinal Logger log = LoggerFactory.getLogger(UserVectorSplitterMapper.class);

staticfinal String USERS_FILE = "usersFile";

staticfinal String MAX_PREFS_PER_USER_CONSIDERED = "maxPrefsPerUserConsidered";

staticfinalintDEFAULT_MAX_PREFS_PER_USER_CONSIDERED = 10;

privateintmaxPrefsPerUserConsidered;

private FastIDSet usersToRecommendFor;

privatefinal VarIntWritable itemIndexWritable = new VarIntWritable();

privatefinal VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();

@Override

protectedvoid setup(Context context) throws IOException {

Configuration jobConf = context.getConfiguration();

maxPrefsPerUserConsidered = jobConf.getInt(MAX_PREFS_PER_USER_CONSIDERED, DEFAULT_MAX_PREFS_PER_USER_CONSIDERED);

String usersFilePathString = jobConf.get(USERS_FILE);

if (usersFilePathString != null) {

FSDataInputStream in = null;

try {

Path unqualifiedUsersFilePath = new Path(usersFilePathString);

FileSystem fs = FileSystem.get(unqualifiedUsersFilePath.toUri(), jobConf);

usersToRecommendFor = new FastIDSet();

Path usersFilePath = unqualifiedUsersFilePath.makeQualified(fs);

in = fs.open(usersFilePath);

for (String line : new FileLineIterable(in)) {

try {

usersToRecommendFor.add(Long.parseLong(line));

} catch (NumberFormatException nfe) {

log.warn("usersFile line ignored: {}", line);

}

} finally {

Closeables.close(in, true);

}

@Override

protectedvoid map(VarLongWritable key,

VectorWritable value,

Context context) throws IOException, InterruptedException {

longuserID = key.get();

if (usersToRecommendFor != null && !usersToRecommendFor.contains(userID)) {

return;

}

//对userid对应的用户向量进行处理，保留评分最大的前K项

Vector userVector = maybePruneUserVector(value.get());

for (Element e : userVector.nonZeroes()) {

itemIndexWritable.set(e.index());

//这种map的vectorOrPref只保存userid与pref

vectorOrPref.set(userID, (float) e.get());

context.write(itemIndexWritable, vectorOrPref);

}

private Vector maybePruneUserVector(Vector userVector) {

if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {

returnuserVector;

}

floatsmallestLargeValue = findSmallestLargeValue(userVector);

/*将用户向量中不保留的项的值设为NAN，这样的值对应的相似度向量，将不做推荐运算*/

for (Element e : userVector.nonZeroes()) {

floatabsValue = Math.abs((float) e.get());

if (absValue < smallestLargeValue) {

e.set(Float.NaN);

}

returnuserVector;

}

privatefloat findSmallestLargeValue(Vector userVector) {

PriorityQueue<Float> topPrefValues = new PriorityQueue<Float>(maxPrefsPerUserConsidered) {

@Override

protectedboolean lessThan(Float f1, Float f2) {

returnf1 < f2;

}

};

for (Element e : userVector.nonZeroes()) {

floatabsValue = Math.abs((float) e.get());

topPrefValues.insertWithOverflow(absValue);

}

returntopPrefValues.top();

}

4.3.ToVectorAndPrefReducer

publicfinalclassToVectorAndPrefReducerextends

Reducer<VarIntWritable,VectorOrPrefWritable,VarIntWritable,VectorAndPrefsWritable> {

privatefinal VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();

@Override

protectedvoid reduce(VarIntWritable key,

Iterable<VectorOrPrefWritable> values,

Context context) throws IOException, InterruptedException {

List<Long> userIDs = Lists.newArrayList();

List<Float> prefValues = Lists.newArrayList();

Vector similarityMatrixColumn = null;

for (VectorOrPrefWritable value : values) {

if (value.getVector() == null) {

// Then this is a user-pref value

userIDs.add(value.getUserID());

prefValues.add(value.getValue());

} else {

// Then this is the column vector

if (similarityMatrixColumn != null) {

thrownew IllegalStateException("Found two similarity-matrix columns for item index " + key.get());

}

similarityMatrixColumn = value.getVector();

}

if (similarityMatrixColumn == null) {

return;

}

//vectorAndPrefs中保存itemid对应的相似度向量，所有的用户及用户的评分

vectorAndPrefs.set(similarityMatrixColumn, userIDs, prefValues);

context.write(key, vectorAndPrefs);

}

5. 第四阶段：

该阶段包含两个job：

如果用户设定了过滤文件，则进行该job，输入为该过滤文件，将过滤文件中用户-项目对应的相似项目中的对应值设为NaN（用于后面的过滤），输出为<IndexOfItemid,<similarityMatrixColumnOfIndexOfItemid,UserListOfIndexOfItemid,prefListOfIndexOfItemid>>。用于过滤推荐项目。

计算用户的推荐项目，在计算结果后，保存结果之前进行过滤，对所有用户，根据推荐项目表，对推荐项目进行过滤，同时利用过滤文件产生的结果，进行二次过滤。生成：<userid,List<itemid,value>>。

// Phase 4 driver: optionally convert the filter file into VectorAndPrefsWritable records, then
// run the job that aggregates the partial products and writes the recommendations.
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
  //filter out any users we don't care about
  /* convert the user/item pairs to filter if a filterfile has been specified */
  if (filterFile != null) {
    Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
            ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
            ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
            SequenceFileOutputFormat.class);
    boolean succeeded = itemFiltering.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  // the filter output, when present, is read alongside the partial-multiply output
  String aggregateAndRecommendInput = partialMultiplyPath.toString();
  if (filterFile != null) {
    aggregateAndRecommendInput += "," + explicitFilterPath;
  }

  Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
      ? SequenceFileOutputFormat.class : TextOutputFormat.class;

  //extract out the recommendations
  Job aggregateAndRecommend = prepareJob(
          new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
          PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
          AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
          outputFormat);
  Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
  if (itemsFile != null) {
    aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
  }

  if (filterFile != null) {
    setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
  }
  setIOSort(aggregateAndRecommend);
  aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
  aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
  aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
  boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
  if (!succeeded) {
    return -1;
  }
}

5.1. itemFiltering :ItemFilterMapper

//读取过滤文件，输出为<itemid,userid>

publicclassItemFilterMapperextends Mapper<LongWritable,Text,VarLongWritable,VarLongWritable> {

privatestaticfinal Pattern SEPARATOR = Pattern.compile("[\t,]");

privatefinal VarLongWritable itemIDWritable = new VarLongWritable();

privatefinal VarLongWritable userIDWritable = new VarLongWritable();

@Override

protectedvoid map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException {

String[] tokens = SEPARATOR.split(line.toString());

longuserID = Long.parseLong(tokens[0]);

longitemID = Long.parseLong(tokens[1]);

itemIDWritable.set(itemID);

userIDWritable.set(userID);

ctx.write(itemIDWritable, userIDWritable);

}

5.2.itemFiltering :ItemFilterAsVectorAndPrefsReducer

/*构造vectorAndPrefs，将其中相似度矩阵中，itemid 位置上的值设为NAN，表明此itemid将不会出现在推荐列表中*/

publicclassItemFilterAsVectorAndPrefsReducer

extends Reducer<VarLongWritable,VarLongWritable,VarIntWritable,VectorAndPrefsWritable> {

privatefinal VarIntWritable itemIDIndexWritable = new VarIntWritable();

privatefinal VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();

@Override

protectedvoid reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)

throws IOException, InterruptedException {

intitemIDIndex = TasteHadoopUtils.idToIndex(itemID.get());

Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);

/*将userIDs中的用户对应的itemIDIndex对应的相似度矩阵中与自己的相似度设为NAN，

即在计算推荐向量时，该项不被推荐*/

vector.set(itemIDIndex, Double.NaN);

List<Long> userIDs = Lists.newArrayList();

List<Float> prefValues = Lists.newArrayList();

for (VarLongWritable userID : values) {

userIDs.add(userID.get());

prefValues.add(1.0f);

}

itemIDIndexWritable.set(itemIDIndex);

vectorAndPrefs.set(vector, userIDs, prefValues);

ctx.write(itemIDIndexWritable, vectorAndPrefs);

}

5.3.PartialMultiplyMapper

/*汇总参与计算userid的推荐项量的所有pref与相似向量，生成<userid,List<pref,vevtor<indexofitemid,SimilarityValue>>>*/

publicfinalclassPartialMultiplyMapperextends

Mapper<VarIntWritable,VectorAndPrefsWritable,VarLongWritable,PrefAndSimilarityColumnWritable> {

privatefinal VarLongWritable userIDWritable = new VarLongWritable();

privatefinal PrefAndSimilarityColumnWritable prefAndSimilarityColumn = new PrefAndSimilarityColumnWritable();

@Override

protectedvoid map(VarIntWritable key,

VectorAndPrefsWritable vectorAndPrefsWritable,

Context context) throws IOException, InterruptedException {

Vector similarityMatrixColumn = vectorAndPrefsWritable.getVector();

List<Long> userIDs = vectorAndPrefsWritable.getUserIDs();

List<Float> prefValues = vectorAndPrefsWritable.getValues();

for (inti = 0; i < userIDs.size(); i++) {

longuserID = userIDs.get(i);

floatprefValue = prefValues.get(i);

//pref值为NAN时，将不参与计算推荐值

if (!Float.isNaN(prefValue)) {

prefAndSimilarityColumn.set(prefValue, similarityMatrixColumn);

userIDWritable.set(userID);

context.write(userIDWritable, prefAndSimilarityColumn);

}

5.4.AggregateAndRecommendReducer

publicfinalclassAggregateAndRecommendReducerextends

Reducer<VarLongWritable,PrefAndSimilarityColumnWritable,VarLongWritable,RecommendedItemsWritable> {

privatestaticfinal Logger log = LoggerFactory.getLogger(AggregateAndRecommendReducer.class);

staticfinal String ITEMID_INDEX_PATH = "itemIDIndexPath";

staticfinal String NUM_RECOMMENDATIONS = "numRecommendations";

staticfinalintDEFAULT_NUM_RECOMMENDATIONS = 10;

staticfinal String ITEMS_FILE = "itemsFile";

privatebooleanbooleanData;

privateintrecommendationsPerUser;

private FastIDSet itemsToRecommendFor;

private OpenIntLongHashMap indexItemIDMap;

privatefinal RecommendedItemsWritable recommendedItems = new RecommendedItemsWritable();

privatestaticfinalfloatBOOLEAN_PREF_VALUE = 1.0f;

@Override

protectedvoid setup(Context context) throws IOException {

Configuration conf = context.getConfiguration();

recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);

booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);

indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf);

String itemFilePathString = conf.get(ITEMS_FILE);

if (itemFilePathString != null) {

itemsToRecommendFor = new FastIDSet();

for (String line : new FileLineIterable(HadoopUtil.openStream(new Path(itemFilePathString), conf))) {

try {

itemsToRecommendFor.add(Long.parseLong(line));

} catch (NumberFormatException nfe) {

log.warn("itemsFile line ignored: {}", line);

}

@Override

protectedvoid reduce(VarLongWritable userID,

Iterable<PrefAndSimilarityColumnWritable> values,

Context context) throws IOException, InterruptedException {

if (booleanData) {

/*如果评分采用布尔值，则只进行相加，否则，用推荐公式计算*/

reduceBooleanData(userID, values, context);

} else {

reduceNonBooleanData(userID, values, context);

}

privatevoid reduceBooleanData(VarLongWritable userID,

Iterable<PrefAndSimilarityColumnWritable> values,

Context context) throws IOException, InterruptedException {

/* having boolean data, each estimated preference can only be 1,

* however we can't use this to rank the recommended items,

* so we use the sum of similarities for that. */

Iterator<PrefAndSimilarityColumnWritable> columns = values.iterator();

Vector predictions = columns.next().getSimilarityColumn();

while (columns.hasNext()) {

predictions.assign(columns.next().getSimilarityColumn(), Functions.PLUS);

}

writeRecommendedItems(userID, predictions, context);

}

privatevoid reduceNonBooleanData(VarLongWritable userID,

Iterable<PrefAndSimilarityColumnWritable> values,

Context context) throws IOException, InterruptedException {

/* each entry here is the sum in the numerator of the prediction formula */

Vector numerators = null;

/* each entry here is the sum in the denominator of the prediction formula */

Vector denominators = null;

/* each entry here is the number of similar items used in the prediction formula */

Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {

Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();

floatprefValue = prefAndSimilarityColumn.getPrefValue();

/* count the number of items used for each prediction */

for (Element e : simColumn.nonZeroes()) {

intitemIDIndex = e.index();

numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);

}

if (denominators == null) {

denominators = simColumn.clone();

} else {

denominators.assign(simColumn, Functions.PLUS_ABS);

}

if (numerators == null) {

numerators = simColumn.clone();

if (prefValue != BOOLEAN_PREF_VALUE) {

numerators.assign(Functions.MULT, prefValue);

}

} else {

if (prefValue != BOOLEAN_PREF_VALUE) {

simColumn.assign(Functions.MULT, prefValue);

}

numerators.assign(simColumn, Functions.PLUS);

}

if (numerators == null) {

return;

}

Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

for (Element element : numerators.nonZeroes()) {

intitemIDIndex = element.index();

/* preference estimations must be based on at least 2 datapoints */

if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {

/* compute normalized prediction */

doubleprediction = element.get() / denominators.getQuick(itemIDIndex);

recommendationVector.setQuick(itemIDIndex, prediction);

}

/*对推荐向量进行过滤，并提取推荐值最大的前K项，输出*/

writeRecommendedItems(userID, recommendationVector, context);

}

/**

* find the top entries in recommendationVector, map them to the real itemIDs and write back the result

privatevoid writeRecommendedItems(VarLongWritable userID, Vector recommendationVector, Context context)

throws IOException, InterruptedException {

TopItemsQueue topKItems = new TopItemsQueue(recommendationsPerUser);

for (Element element : recommendationVector.nonZeroes()) {

intindex = element.index();

longitemID;

if (indexItemIDMap != null && !indexItemIDMap.isEmpty()) {

itemID = indexItemIDMap.get(index);

} else { //we don't have any mappings, so just use the original

itemID = index;

}

if (itemsToRecommendFor == null || itemsToRecommendFor.contains(itemID)) {

floatvalue = (float) element.get();

/* 推荐值是NAN时，将不被推荐，也就是过滤文件job中，相似度向量中设为NAN对应的item，不会被推荐*/

if (!Float.isNaN(value)) {

MutableRecommendedItem topItem = topKItems.top();

if (value > topItem.getValue()) {

topItem.set(itemID, value);

topKItems.updateTop();

}

List<RecommendedItem> topItems = topKItems.getTopItems();

if (!topItems.isEmpty()) {

recommendedItems.set(topItems);

//输出，userid及其推荐向量

context.write(userID, recommendedItems);

}

0 0