【Hadoop版】K-Shingle+最小Hash签名+LSH算法+LSH族

来源：互联网发布：微擎人人商城源码下载编辑：程序博客网时间：2024/06/05 23:07

将单机版的代码转化为可以在Hadoop上运行的MapReduce版本需要适应两个方面。

1.MR模型，即将单机版的一个程序切分成两个步骤。

2.Hadoop本身的IO特性。

由于在单机版的时候，文件读取采用了BufferedReader类进行，但是Hadoop中必须使用Hadoop自己的读写方式，即文件默认以键值对方式输入：key是行在文本内的偏移量，value是文件中的一行。

这个特性破坏了单机版中，直接读取整个文档然后进行处理的情况。每个map方法执行时只能读一行，无法使用ShingleSet类。如果map中收集所有行，在cleanup中对整个汇集起来的文档进行处理并不可行。因为map用于收集文档的所有行则无法产生key-value输出，而reduce必须接收来自map的输出。

于是采用了预处理文本的办法，将其所有换行去掉，变为一行。（据说Hadoop一行最多只能读1024KB，经测试这是错误的）。

1.下边是对文本进行预处理的代码。

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Flattens a text file into a single line so that a line-oriented reader sees the
 * whole document as one record.
 *
 * <p>Every run of whitespace inside a line is collapsed to one blank, and consecutive
 * input lines are joined with exactly one blank so that the last word of a line is not
 * glued to the first word of the next one.
 */
public class RemoveLineBreak {

    /** Placeholder input path used by the no-argument {@link #replaceBlank()}. */
    private static final String DEFAULT_SOURCE = "源文件路径";

    /** Placeholder output path used by the no-argument {@link #replaceBlank()}. */
    private static final String DEFAULT_TARGET = "输出文件路径";

    /**
     * Matches any run of whitespace. Equivalent to the former {@code \s+|\t|\r|\n}:
     * {@code \s} already covers tab, CR and LF, so the extra alternatives never fired.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    BufferedReader inputStream;
    PrintWriter outputStream;

    /**
     * Flattens the file at the built-in placeholder source path into the placeholder
     * output path.
     *
     * @throws IOException if the source cannot be read or the output cannot be written
     */
    public void replaceBlank() throws IOException {
        replaceBlank(DEFAULT_SOURCE, DEFAULT_TARGET);
    }

    /**
     * Flattens {@code sourcePath} into a single line appended to {@code targetPath}.
     *
     * <p>The output file is opened in append mode (as before), so running twice against
     * the same target concatenates both results. No trailing line terminator is written.
     *
     * @param sourcePath file to read, decoded with the platform default charset
     * @param targetPath file to append the flattened text to
     * @throws IOException if reading fails or the writer reports an error
     */
    public void replaceBlank(String sourcePath, String targetPath) throws IOException {
        inputStream = new BufferedReader(new FileReader(sourcePath));
        try {
            outputStream = new PrintWriter(new FileOutputStream(targetPath, true));
            try {
                boolean wroteAny = false;
                boolean endsWithSpace = false;
                String line;
                while ((line = inputStream.readLine()) != null) {
                    String normalized = WHITESPACE.matcher(line).replaceAll(" ");
                    // readLine() drops the line terminator, so the blank that replaces it
                    // has to be written explicitly; skip it when a blank is already there.
                    if (wroteAny && !endsWithSpace && !normalized.startsWith(" ")) {
                        outputStream.print(' ');
                        endsWithSpace = true;
                    }
                    if (!normalized.isEmpty()) {
                        outputStream.print(normalized);
                        wroteAny = true;
                        endsWithSpace = normalized.endsWith(" ");
                    }
                }
                // PrintWriter never throws on write failure; surface it here instead.
                if (outputStream.checkError()) {
                    throw new IOException("Failed to write flattened text to " + targetPath);
                }
            } finally {
                outputStream.close();
            }
        } finally {
            inputStream.close();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param agrs optional {@code <source> <target>} paths; when fewer than two are
     *             given the built-in placeholder paths are used
     * @throws IOException if the conversion fails
     */
    public static void main(String[] agrs) throws IOException {
        RemoveLineBreak test = new RemoveLineBreak();
        if (agrs.length >= 2) {
            test.replaceBlank(agrs[0], agrs[1]);
        } else {
            test.replaceBlank();
        }
    }
}

2.在Hadoop程序中进行重要处理的一个类

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.io.*;

/**
 * Per-document state for the shingling / min-hash / LSH pipeline.
 *
 * <p>Usage: call {@link #createShingleSet(String)} once per line of the document, then
 * {@link #run()}; afterwards {@link #bucketNumber} holds one bucket id per band.
 *
 * <p>Author's note carried over from the original: the class is meant for text made of
 * {@code a-z} plus the blank, which is where the {@code 27^k} bucket count comes from.
 *
 * <p>NOTE(review): the shingles actually produced are {@code k - 1} characters long, not
 * {@code k}; all methods are consistent with that, so it is documented rather than
 * changed. Confirm whether {@code k}-length shingles were intended.
 */
public class HadoopShingleSet {

    /** Width of the character window folded into one base-128 number. */
    private static final int WINDOW = 4;

    /** Number of buckets the shingle hash maps into (2^32). */
    private static final long SHINGLE_HASH_SPACE = 1L << 32;

    /** Slots reserved per band in {@link #bucketNumberANDOR}. */
    private static final int AND_OR_SLOTS = 4 * 4;

    int k;
    int signatureNumber;
    int times;
    int[] randomArray;
    int[] randomArrayForLSH;

    /** Distinct shingles in first-seen order. */
    ArrayList<String> array = new ArrayList<String>();

    /** Membership index for {@link #array}; makes de-duplication O(1) per shingle. */
    private final Set<String> seenShingles = new HashSet<String>();

    /**
     * Hash bucket of each shingle, in the same order as {@link #array}. Documents hashed
     * with the same function share the same row numbering, but the entries are not sorted.
     */
    long[] resultOfHashToShingle;

    /** Min-hash signature: one value per hash function. */
    long[] signature;

    int bandNumber;

    /** Bucket id of each band of the signature. */
    int[] bucketNumber;

    /** Bucket ids from the extra AND/OR hash family; only even slots are populated. */
    int[] bucketNumberANDOR;

    /** Unused; retained so the class layout stays compatible with existing callers. */
    BufferedReader inputStream;

    /**
     * @param k                 shingle parameter (effective shingle length is {@code k - 1})
     * @param signatureNumber   number of min-hash functions, i.e. signature length
     * @param bandNumber        number of bands the signature is split into
     * @param times             LSH bucket count as a multiple of {@code bandNumber}
     * @param randomArray       coefficient pairs {@code (a, b)} for the min-hash functions;
     *                          also indexed by the per-band LSH hash
     * @param randomArrayForLSH coefficients for the AND/OR hash family,
     *                          {@code 16 * bandNumber} entries
     */
    public HadoopShingleSet(int k, int signatureNumber, int bandNumber, int times,
                            int[] randomArray, int[] randomArrayForLSH) {
        this.k = k;
        this.signatureNumber = signatureNumber;
        this.bandNumber = bandNumber;
        this.times = times;
        this.randomArray = randomArray;
        this.randomArrayForLSH = randomArrayForLSH;
    }

    /**
     * Adds every not-yet-seen shingle of {@code line} to {@link #array}.
     *
     * <p>Lines that are {@code null} or shorter than {@code k} are ignored. Shingles
     * accumulate across calls; nothing is cleared.
     *
     * @param line one line of the document, already stripped of line breaks
     */
    public void createShingleSet(String line) {
        if (line == null || line.length() < k) {
            return;
        }
        // 'array' is package-visible; resynchronise if someone modified it directly.
        if (seenShingles.size() != array.size()) {
            seenShingles.clear();
            seenShingles.addAll(array);
        }
        int shingleLength = k - 1;
        int lastStart = line.length() - shingleLength;
        for (int start = 0; start <= lastStart; start++) {
            String shingle = line.substring(start, start + shingleLength);
            if (seenShingles.add(shingle)) {
                array.add(shingle);
            }
        }
    }

    /**
     * Hashes every shingle into one of 2^32 buckets and stores the result in
     * {@link #resultOfHashToShingle}.
     *
     * <p>Each 4-character window of the shingle is read as a base-128 number; the window
     * values are summed and reduced modulo 2^32. For {@code k < 5} there is no window and
     * every shingle hashes to 0.
     */
    public void hashToShingle() {
        int count = array.size();
        resultOfHashToShingle = new long[count];
        int windows = k - WINDOW;
        for (int i = 0; i < count; i++) {
            String shingle = array.get(i);
            long sum = 0;
            for (int t = 0; t < windows; t++) {
                // Exact integer form of c0*128^3 + c1*128^2 + c2*128 + c3.
                sum += ((long) shingle.charAt(t) << 21)
                        + ((long) shingle.charAt(t + 1) << 14)
                        + ((long) shingle.charAt(t + 2) << 7)
                        + shingle.charAt(t + 3);
            }
            resultOfHashToShingle[i] = sum % SHINGLE_HASH_SPACE;
        }
    }

    /**
     * Builds the min-hash signature: for each hash function
     * {@code h(x) = (a*x + b) mod 27^k} the smallest bucket over all shingle hashes is kept.
     *
     * <p>With no shingles every signature entry stays at {@code 27^k}.
     */
    public void produceSignature() {
        signature = new long[signatureNumber];
        long bucketCount = (long) Math.pow(27, k);
        for (int i = 0; i < signatureNumber; i++) {
            long a = randomArray[2 * i];
            long b = randomArray[2 * i + 1];
            long min = bucketCount;
            for (int t = 0; t < resultOfHashToShingle.length; t++) {
                long bucket = (resultOfHashToShingle[t] * a + b) % bucketCount;
                if (bucket < min) {
                    min = bucket;
                }
            }
            signature[i] = min;
        }
    }

    /**
     * Splits the signature into {@code bandNumber} bands and hashes each band.
     *
     * <p>Band {@code i} is reduced to the sum of its rows and hashed with
     * {@code (sum * randomArray[rows*i] + randomArray[rows*i + 1]) mod (bandNumber * times)}
     * into {@link #bucketNumber}; two documents with equal {@code bucketNumber[i]} are
     * candidates for band {@code i}.
     *
     * <p>The same band sums are hashed again with coefficient pairs from
     * {@code randomArrayForLSH} into {@link #bucketNumberANDOR}. Each pair occupies two
     * coefficient slots, so only the even result slots of each band are written.
     */
    public void localitySensitiveHahing() {
        int rows = signatureNumber / bandNumber;
        int bucketCount = bandNumber * times;

        long[] bandSums = new long[bandNumber];
        for (int band = 0; band < bandNumber; band++) {
            long sum = 0;
            int end = (band + 1) * rows;
            for (int t = band * rows; t < end; t++) {
                sum += signature[t];
            }
            bandSums[band] = sum;
        }

        bucketNumber = new int[bandNumber];
        for (int band = 0; band < bandNumber; band++) {
            int coeff = rows * band;
            bucketNumber[band] = (int) ((bandSums[band] * randomArray[coeff]
                    + randomArray[coeff + 1]) % bucketCount);
        }

        bucketNumberANDOR = new int[AND_OR_SLOTS * bandNumber];
        for (int band = 0; band < bandNumber; band++) {
            for (int slot = 0; slot < AND_OR_SLOTS; slot += 2) {
                int idx = AND_OR_SLOTS * band + slot;
                bucketNumberANDOR[idx] = (int) ((bandSums[band] * randomArrayForLSH[idx]
                        + randomArrayForLSH[idx + 1]) % bucketCount);
            }
        }
    }

    /** Runs shingle hashing, signature generation and LSH banding in order. */
    public void run() {
        this.hashToShingle();
        this.produceSignature();
        this.localitySensitiveHahing();
    }
}

3.Hadoop代码主类

import java.io.IOException;import java.util.Random;import java.util.Scanner;import java.util.StringTokenizer;import java.util.regex.Matcher;import java.util.regex.Pattern;import java.util.ArrayList;import java.lang.Iterable; import java.util.Iterator;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.lib.input.FileSplit;public class FindSimilar {  public static class TokenizerMapper        extends Mapper<Object, Text, Text, Text>{      private Text outPutKey = new Text();  private Text outPutValue = new Text();    int signatureNumber=100;   int bandNumber=1;  int times=100;  int[] randomArray;  int[] randomArrayForLSH;  HadoopShingleSet test;        protected void setup(Context context) throws IOException, InterruptedException {         this.bandNumber=Integer.parseInt(context.getConfiguration().get("bandNumber"));    this.times=Integer.parseInt(context.getConfiguration().get("times"));    this.signatureNumber=Integer.parseInt(context.getConfiguration().get("signatureNumber"));            String tmprandomArray=context.getConfiguration().get("randomArray");String testLength=new String(tmprandomArray);int length_randomArray=0;    int begin=0;    int end;do{    end=testLength.indexOf('d');    testLength=testLength.substring(end+1, testLength.length());        length_randomArray++;    }while(!testLength.equals(""));int i=0;begin=0;    this.randomArray=new int  [length_randomArray];    do{    end=tmprandomArray.indexOf('d');    String tmp=tmprandomArray.substring(begin, end);    
tmprandomArray=tmprandomArray.substring(end+1, tmprandomArray.length());    int number=Integer.parseInt(tmp);    this.randomArray[i]=number;    i++;    }while(!tmprandomArray.equals(""));            String tmprandomArrayForLSH=context.getConfiguration().get("randomArrayForLSH");testLength=new String(tmprandomArrayForLSH);int length_randomArrayForLSH=0;    begin=0;do{    end=testLength.indexOf('d');    testLength=testLength.substring(end+1, testLength.length());        length_randomArrayForLSH++;    }while(!testLength.equals(""));i=0;begin=0;    this.randomArrayForLSH=new int  [length_randomArrayForLSH];    do{    end=tmprandomArrayForLSH.indexOf('d');    String tmp=tmprandomArrayForLSH.substring(begin, end);    tmprandomArrayForLSH=tmprandomArrayForLSH.substring(end+1, tmprandomArrayForLSH.length());    int number=Integer.parseInt(tmp);    this.randomArrayForLSH[i]=number;    i++;    }while(!tmprandomArrayForLSH.equals(""));            test=new HadoopShingleSet (5,signatureNumber,bandNumber,times,this.randomArray,this.randomArrayForLSH);        }        public void map(Object key, Text value, Context context ) throws IOException, InterruptedException {    System.out.println(value.toString());System.out.println("输出一行了******************************************");//如果整篇文档都不存在换行，那么就可以做到一行相当于一整片文档    test.createShingleSet(value.toString());    test.run();    InputSplit inputSplit = context.getInputSplit();String fileName = ((FileSplit) inputSplit).getPath().toString();    for(int i=0;i<test.bucketNumber.length;i++){String tmp=i+"d"+test.bucketNumber[i];    outPutKey.set(i+"d"+test.bucketNumber[i]);    outPutValue.set(fileName);    context.write(outPutKey,outPutValue);}    }    }     public static class TokenizerReducer  extends Reducer<Text,Text,Text,Text> {    private Text outPutKey = new Text();  private Text outPutValue = new Text();    public void reduce(Text key,Iterable<Text>  values, Context context ) throws IOException, InterruptedException {     
//必须声明在reduce方法，否则可能多个key使用同一个arrayArrayList <Text> array=new ArrayList <Text>();//System.out.println("当key等于"+key.toString()+"时：");Iterator <Text> iterator=values.iterator();    int k=0;while(iterator.hasNext()) {Text tmp=new Text(iterator.next().toString());//System.out.println("输出从迭代器获取的值"+tmp);array.add(k,tmp);//System.out.println("输出array中这次存储的值"+array.get(k));k++;}//System.out.println("接下来遍历array数组：");//for(int i=0;i<array.size();i++){//System.out.println("i等于"+i+"时");//System.out.println(array.get(i).toString());                //        }        for(int i=0;i<array.size();i++){    outPutKey.set("键值为：  "+key.toString()+"  "+array.get(i).toString());for(int t=i+1;t<array.size();t++){    outPutValue.set("               "+array.get(t).toString());    context.write(outPutKey,outPutValue);        }    }    }  }  public static void main(String[] args) throws Exception {      int signatureNumber=100;    int bandNumber=1;int times=100;double  Jaccard;int [] randomArray;int [] randomArrayForLSH;Scanner keyboard=new Scanner(System.in);//产生最小哈希签名的哈希函数数目强制设初始化为100个(100对随机数)，即每个文本有100个签名，下边进行重新赋值。System.out.println("请问您希望将使用多少个Hash函数用于为文档产生签名？");signatureNumber=keyboard.nextInt();randomArray=new int [signatureNumber*2];Random random = new Random();for(int i=0;i<signatureNumber;i++){int tmp=(int)Math.pow(signatureNumber,0.5);randomArray[2*i]=(Math.abs(random.nextInt())%tmp)+1;//随机数在0-(tmp-1),改为1-tmp randomArray[2*i+1]=(Math.abs(random.nextInt())%tmp)+1;}//根据签名向量的长度以及预期的相似度来确定行条的数目，对double进行了运算，可能产生误差System.out.println("请问您希望将相似度为多少的文档在LSH过程中尽可能成为后选对？");Jaccard=keyboard.nextDouble();System.out.println("请问您希望在LSH过程中哈希桶的数目是行条数的几倍？");times=keyboard.nextInt();keyboard.close();double difference=Math.abs(Math.pow(1.0/1.0,1.0/100.0)-Jaccard);for(int i=2;i<=signatureNumber;i++){if(signatureNumber%i==0){double tmp=Math.abs(Math.pow((double)1/(double)i,(double)i/(double)signatureNumber)-Jaccard);System.out.printf("行条=%4d时  ",i);System.out.printf("差值为%8f",tmp);if(tmp<difference) 
{difference=tmp;bandNumber=i;System.out.println("   行条被改变");}else{System.out.println("   行条未改变");}}}System.out.println("签名矩阵被分为了"+bandNumber+"个行条");randomArrayForLSH=new int [4*4*bandNumber];for(int i=0;i<(4*4*bandNumber);i++){int tmp=(int)Math.pow(4*4*bandNumber,0.5 );randomArrayForLSH[i]=(Math.abs(random.nextInt())%tmp)+1;//随机数在0-(tmp-1),改为1-tmp}      Configuration conf = new Configuration();        String bandNumber_String=bandNumber+"";    String signatureNumber_String=signatureNumber+"";    String times_String= times+"";    String randomArray_String="";    for(int i=0;i<randomArray.length;i++){    randomArray_String+=randomArray[i];    randomArray_String+="d";    }        String randomArrayForLSH_String="";    for(int i=0;i<randomArrayForLSH.length;i++){    randomArrayForLSH_String+=randomArrayForLSH[i];    randomArrayForLSH_String+="d";    }            conf.set("bandNumber",bandNumber_String);    conf.set("signatureNumber",signatureNumber_String);    conf.set("times",times_String);    conf.set("randomArray",randomArray_String);    conf.set("randomArrayForLSH",randomArrayForLSH_String);            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();    if (otherArgs.length < 2) {      System.err.println("Usage: wordcount <in> [<in>...] <out>");      System.exit(2);    }    Job job = new Job(conf, " FindSimilar");    job.setJarByClass( FindSimilar.class);    job.setMapperClass(TokenizerMapper.class);    job.setReducerClass(TokenizerReducer.class);    job.setOutputKeyClass(Text.class);    job.setOutputValueClass(Text.class);    for (int i = 0; i < otherArgs.length - 1; ++i) {      FileInputFormat.addInputPath(job, new Path(otherArgs[i]));    }    FileOutputFormat.setOutputPath(job,new Path(otherArgs[otherArgs.length - 1]));    System.exit(job.waitForCompletion(true) ? 0 : 1);  }}

处理所有文档的hash函数需要一样。这里使用conf传递在main中生成的hash函数的系数。将其转换为字符串再传递给每个Map task。

至此程序将给出所有可能的文档对。（重复的未被剔除）

在Mapper类的setup()方法为每个task声明一个HadoopShingleSet对象。当输入的每篇文档小于一个block大小时，每篇文档将有一个HadoopShingleSet对象。但当某篇文档大于一个block块大小时，这篇文档将有可能被划分到多个InputSplit，这样每个InputSplit对应一个Mapper，也就可能产生多个HadoopShingleSet对象，从而出错。

.................................................................

至于具体的划分策略，FileInputFormat默认为文件在HDFS上的每一个Block生成一个对应的FileSplit。那么自然，FileSplit.start就是对应Block在文件中的Offset、FileSplit.length就是对应Block的Length、FileSplit.hosts就是对应Block的Location。
但是可以设置“mapred.min.split.size”参数，使得Split的大小大于一个Block，这时候FileInputFormat会将连续的若干个Block分在一个Split中、也可能会将一个Block分别划在不同的Split中（但是前提是一个Split必须在一个文件中）。

0 0