基于mapreduce 的 minHash 矩阵压缩

来源：互联网发布：sem高级优化师认证编辑：程序博客网时间：2024/06/15 15:21

Minhash作用：

对大矩阵进行降维处理，再计算两个用户之间的相似度。

比如：

计算两个用户手机上下载的APP的相似度：一个矩阵中会有非常多的用户，要比较每两个用户之间的相似度，是一个很大的计算任务。

如果首先对这个矩阵做降维处理，会减少很大的计算量；再将矩阵放到MapReduce中进行分布式计算，会大大缩短计算时间。

使用minhash降维处理原理：

将矩阵的行随机打乱,假设下面已经是第一次打乱的结果，从上往下取第一个为1的行,比如 u1 H(switch1)=0/a1,u2第一行是0不取，u2的第二行是1取了，H(switch1)=1/a2,

u3 H(switch1)=0/a1, u4 H(switch1)=1/a2.

得到一个新矩阵：

u1 u2 u3 u4

H(switch1) 0/a1 1/a2 0/a1 1/a2

如果随机打乱N次会生成一个N行的矩阵，缩小了原矩阵的行数，然后计算新矩阵的JACCARD相似度。

这样看上去简单，但是打乱一个大的矩阵开销很大，所以我们用一组随机的minhash函数来代替一组随机的打乱过程。

minhash 是取最小的哈希值，计算过程：

比如h1根据行号产生的哈希值是1230，u1的第一行是1所以可以取到1，u2是0就不行，u2第二行是1，只能取到2, u3的第一行是1，可以取到1，同样u4是0，只能找第一个为1对应的行的哈希值。@为无穷大，一步步的更新。取最小，小的不更新大的更新。

u1 u2 u3 u4

h1 1 @ @ 1

h2 1 @ @ 1

u1 u2 u3 u4

h1 1 2 2 1

h2 1 3 3 1

u1 u2 u3 u4

h1 1 2 2 1

h2 1 1 1 1

u1 u2 u3 u4

h1 0 0 2 1

h2 1 1 1 1

这就是最后的结果，然后再计算JACCARD相似度。

例如 u2、u3 的相似度 = 相同的行数（1 行）/ 哈希函数的个数（即矩阵的行数）。

jaccard = 相同的行数 / 矩阵的行数 = 相同的行数 / hash函数的个数。

在Mapreduce中的实现：

数据类似：

第一个mapreduce:

压缩矩阵

生成用户对

package code;import hashfunctions.HashFactory;import hashfunctions.HashFactory.HashType;import hashfunctions.HashFunction;import java.io.IOException;import java.util.ArrayList;import java.util.Collections;import java.util.Iterator;import java.util.List;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import utils.ConfigUtils;import utils.ErrorLevel;import utils.Loger;public class redstone1Step1{  public static class maphadoop1 extends Mapper<Object, Text, Text, Text>  {  private HashFunction[] hashFunction; //hash函数数组

  private int numHashFunctions;        //hash函数个数  private HashType hashType;  private long[] minHashValues;        //最小hash值  private byte[] bytesToHash;  protected void setup(Context context)throws IOException, InterruptedException  {  Configuration conf = context.getConfiguration();  this.numHashFunctions = (int)ConfigUtils.getValue(conf, "numHashFuctions", 15);  this.minHashValues = new long[this.numHashFunctions];  this.bytesToHash = new byte[4];  String htype = ConfigUtils.getValue(conf, "HashType", "MURMUR");  System.out.println("numHashFunctions1111:" + numHashFunctions);  System.out.println("htype:" + htype);  try {  hashType = HashType.valueOf(htype);  } catch (IllegalArgumentException uae) {  Loger.logOnScreen(ErrorLevel.ERROR, "No valid hash type found in configuration!");  hashType = HashFactory.HashType.valueOf("LINEAR");  }  hashFunction = HashFactory.createHashFunctions(hashType, numHashFunctions);  super.setup(context);  }  public void map(Object key, Text value, Context context) throws IOException, InterruptedException  {  List<Long> ls = new ArrayList<Long>();  String[] inputTokens = value.toString().split(" ");  String nodeId = inputTokens[0];     //用户ID  Text outputValue = new Text(nodeId);  for (int i = 1; i < inputTokens.length; ++i) {  ls.add(Long.parseLong(inputTokens[i].trim()));  } //app的ID列表  int length = ls.size();  for (int j = 0; j < numHashFunctions; ++j) {  this.minHashValues[j] = Integer.MAX_VALUE;  } //初始化hash函数值  for (int i = 0; i < this.numHashFunctions; ++i) {  for (int j = 0; j < length; ++j) {  long val = ls.get(j);  bytesToHash[0] = (byte)(val >> 24);  bytesToHash[1] = (byte)(val >> 16);  bytesToHash[2] = (byte)(val >> 8);  bytesToHash[3] = (byte)val;//转换成字节，每次移出去8位，剩下位组成的数的结果，如果是负数（最高位是1）则取补码加一的方式计算  int hashIndex = hashFunction[i].hash(bytesToHash);  if (minHashValues[i] > hashIndex) {  minHashValues[i] = hashIndex;  }  }  }  for (int i = 0; i < numHashFunctions; ++i) {  String iMinHashValue = String.valueOf(i) + "_" + 
this.minHashValues[i];  Text outputKey = new Text(iMinHashValue);  context.write(outputKey, outputValue); //（0_最小hash值 ，user）  }  }  }  public static class reducehadoop1 extends Reducer<Text, Text, Text, Text>  {  public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException  {  ArrayList<String>list= new ArrayList<String>();  Iterator<Text> itr = values.iterator();  while (itr.hasNext()) {  list.add(itr.next().toString());  }  Collections.sort(list);//为了避免一条边表示俩次<u1,u2>,<u2,u1>  if (list.size() >= 2){  for (int i = 0; i < list.size() - 1; ++i){  for (int j = i + 1; j < list.size(); ++j){  context.write(new Text(list.get(i)), new Text(list.get(j))); //生成相同 key=哈希函数_最小哈希值 的用户组成的用户对  }  }  }  }  }    public static void main(String[] args)throws Exception  {  Configuration conf = new Configuration();  Job job = Job.getInstance(conf, "redstone1Step1");  job.setJarByClass(redstone1Step1.class);  job.setNumReduceTasks(25);    FileInputFormat.addInputPath(job, new Path(args[0]));  FileOutputFormat.setOutputPath(job, new Path(args[1]));  job.setMapperClass(maphadoop1.class);  job.setReducerClass(reducehadoop1.class);  job.setOutputFormatClass(TextOutputFormat.class);  job.setOutputKeyClass(Text.class);  job.setOutputValueClass(Text.class);  job.waitForCompletion(true);  }}

第二个mapreduce:

将相同的用户对累加计数

计算jaccard相似度 = 相同用户对的个数 / 哈希函数个数；相似度不低于阈值时，输出jaccard距离 = 1 − jaccard相似度

package code;import org.apache.hadoop.io.FloatWritable;import org.apache.hadoop.io.IntWritable;import java.io.IOException;import java.util.Iterator;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import utils.ConfigUtils;public class redstone1Step2{  public static class maphadoop2 extends Mapper<Object, Text, Text, IntWritable>{    public void map(Object key, Text value,Context context)throws IOException, InterruptedException    {      IntWritable one = new IntWritable(1);      String[] str = value.toString().split("\t");      context.write(new Text(str[0] + ":" + str[1]), one);    }}public static class combinerhadoop2 extends Reducer<Text, IntWritable, Text, IntWritable>{protected void reduce(Text key, Iterable<IntWritable> values,Context context)throws IOException, InterruptedException{Iterator<IntWritable> itr = values.iterator();int partCount = 0;while (itr.hasNext()) {String str = itr.next().toString();partCount += Integer.parseInt(str);}context.write(key, new IntWritable(partCount));}}  public static class reducehadoop2 extends Reducer<Text, IntWritable, FloatWritable, Text>{private int numHashFunctions;private float edgeThreshold;protected void setup(Context context)throws IOException, InterruptedException{Configuration conf = context.getConfiguration();this.numHashFunctions = (int)ConfigUtils.getValue(conf, "numHashFuctions", 15);this.edgeThreshold = (float)ConfigUtils.getValue(conf, "gp.threshold", 0.7D);super.setup(context);}public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {String[] tokens = key.toString().split(":");String 
value = tokens[0] + "\t" + tokens[1];Text outputValue = new Text(value);Iterator<IntWritable> itr = values.iterator();int count = 0;while (itr.hasNext()) {String str = itr.next().toString();count += Integer.parseInt(str);}float jaccard = (float)count / numHashFunctions;if (jaccard >= edgeThreshold)context.write(new FloatWritable(1.0F - jaccard), outputValue);}}    public static void main(String[] args)throws Exception{Configuration conf = new Configuration();Job job = Job.getInstance(conf, "redstone1Step2");job.setJarByClass(redstone1Step2.class);job.setNumReduceTasks(25);FileInputFormat.addInputPath(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));job.setMapperClass(maphadoop2.class);job.setReducerClass(reducehadoop2.class);job.setCombinerClass(combinerhadoop2.class);job.setOutputFormatClass(TextOutputFormat.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);job.waitForCompletion(true);}}

以上实现参考了mahout的minhash。

第一个mapreduce中生成用户对可以进行改进。

1 0