Hadoop 2.6 使用Map Reduce实现矩阵相乘1 矩阵转置

来源:互联网 发布:陕西软件测试培训 编辑:程序博客网 时间:2024/05/17 08:56

项目地址:https://github.com/tudoupaisimalingshu/hadoop_matrix

矩阵相乘

一、理论基础



二、如何用程序实现?A[M][N]*B[N][P]

import java.util.Arrays;public class Matrix {public static void main(String[] args) {int[][] matrix1 = {{1,2,-2,0},{3,3,4,-3},{-2,0,2,3},{5,3,-1,2},{-4,2,0,2}};//左矩阵,5*4int[][] matrix2 = {{0,3,-1,2,-3},{1,3,5,-2,-1},{0,1,4,-1,2},{-2,2,-1,1,2}};//右矩阵,4*5int [][] matrix3 = new int[5][5];//结果矩阵,5*5for(int i=0;i<5;i++)//计算结果矩阵的每一行{int[] row = matrix1[i];//左边矩阵第i行System.out.println("row=" + Arrays.toString(row));for(int j=0;j<5;j++)//计算结果矩阵的每一列{int[] line = new int[4];//右边矩阵第j列for(int k=0;k<4;k++){line[k] = matrix2[k][j];}//因为列向量是竖着的,用循环获得该列的各个元素System.out.println("line=" + Arrays.toString(line));int result_i_j = 0;//定义相乘结果for(int m=0;m<4;m++){result_i_j += row[m] * line[m];//累加乘积}System.out.println("result_i_j=" + result_i_j);System.out.println("--------------------");matrix3[i][j] = result_i_j;//设置结果矩阵对应位置的值}}//输出结果矩阵for(int i=0;i<5;i++){for(int j=0;j<5;j++){System.out.print(matrix3[i][j] + "\t");}System.out.println();}}}


输出:

row=[1, 2, -2, 0]
line=[0, 1, 0, -2]
result_i_j=2
--------------------
line=[3, 3, 1, 2]
result_i_j=7
--------------------
line=[-1, 5, 4, -1]
result_i_j=1
--------------------
line=[2, -2, -1, 1]
result_i_j=0
--------------------
line=[-3, -1, 2, 2]
result_i_j=-9
--------------------
row=[3, 3, 4, -3]
line=[0, 1, 0, -2]
result_i_j=9
--------------------
line=[3, 3, 1, 2]
result_i_j=16
--------------------
line=[-1, 5, 4, -1]
result_i_j=31
--------------------
line=[2, -2, -1, 1]
result_i_j=-7
--------------------
line=[-3, -1, 2, 2]
result_i_j=-10
--------------------
row=[-2, 0, 2, 3]
line=[0, 1, 0, -2]
result_i_j=-6
--------------------
line=[3, 3, 1, 2]
result_i_j=2
--------------------
line=[-1, 5, 4, -1]
result_i_j=7
--------------------
line=[2, -2, -1, 1]
result_i_j=-3
--------------------
line=[-3, -1, 2, 2]
result_i_j=16
--------------------
row=[5, 3, -1, 2]
line=[0, 1, 0, -2]
result_i_j=-1
--------------------
line=[3, 3, 1, 2]
result_i_j=27
--------------------
line=[-1, 5, 4, -1]
result_i_j=4
--------------------
line=[2, -2, -1, 1]
result_i_j=7
--------------------
line=[-3, -1, 2, 2]
result_i_j=-16
--------------------
row=[-4, 2, 0, 2]
line=[0, 1, 0, -2]
result_i_j=-2
--------------------
line=[3, 3, 1, 2]
result_i_j=-2
--------------------
line=[-1, 5, 4, -1]
result_i_j=12
--------------------
line=[2, -2, -1, 1]
result_i_j=-10
--------------------
line=[-3, -1, 2, 2]
result_i_j=14
--------------------
2	7	1	0	-9
9	16	31	-7	-10
-6	2	7	-3	16
-1	27	4	7	-16
-2	-2	12	-10	14


三、传统程序的问题:

1、不能并发执行,总是按照循环的条件一次一次执行。

2、如果矩阵的规模很大,以至于放不到内存中,则可能要放入文件中,那么对于左侧矩阵还好说,每次只需要读取一行放入内存,下次循环再读取下一行即可;从程序中可以看到,对于右侧矩阵,我们需要得到列向量,也就是遍历所有的行,每行取一个元素,然后组成列向量,当文件很大时,速度太慢。

 

四、解决方案

1、针对问题1引入并发执行框架Hadoop,其中的Map和Reduce操作可以并发执行。

2、针对问题2,将右边矩阵转置,从而实现列向量转为行向量



五、使用Hadoop Map Reduce 进行矩阵相乘

1、矩阵的存储结构



为什么要把每一行的所有列写在一行?

矩阵文件可能很大,此时Hadoop的HDFS就会将文件分片,如果没有将同一行的所有列写在一起,则属于同一行的元素可能会被分到不同的分片,导致后面还会消耗时间和空间去查找拼接,也就是还需要写reduce来合并行。

 

为什么要对一行的每一个元素标出列的序号?

由于Hadoop是并行的,在进行map拆分的之后进行reduce合并的过程中,并不能保证一行的各个元素是有序的,因此要标出元素对应的下标,在hadoop中,由于行号是唯一的,再加上标明的列号,就能保证在并行处理过程中的正确性。

 

 

 

2、矩阵转置的Map Reduce实现


package hadoop;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;public class Step1 {public static class Mapper1 extends Mapper<LongWritable,Text,Text,Text>{private Text outKey = new Text();private Text outValue = new Text();/*待转置矩阵03-12-3135-2-1014-12-22-112*//*目标矩阵011-23312-154-12-2-11-3-122*///对于每一行,以第一行为例//key : 1//value : "11_0,2_3,3_-1,4_2,5_-3"@Overrideprotected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)throws IOException, InterruptedException {String[] rowAndline = value.toString().split("\t");//rowAndline : {"1","1_0,2_3,3_-1,4_2,5_-3"}String row = rowAndline[0];//row "1"String[] lines = rowAndline[1].split(",");//rowAndline[1] : "1_0,2_3,3_-1,4_2,5_-3"//lines : {"1_0","2_3","3_-1","4_2","5_-3"}for(String line : lines)//对于每一列,以第一列为例,line "1_0"{String colunm = line.split("_")[0];//colunm : 1String valueStr = line.split("_")[1];//valueStr : 0 outKey.set(colunm);//将列作为行outValue.set(row + "_" + valueStr);//将行作为列context.write(outKey, outValue);// 产生(1,"1_0")}//循环结束,对于{"1_0","2_3","3_-1","4_2","5_-3"}//产生(1,"1_0") 第一行,第一列_0    (2,"1_3")  第二行,第一列_3(3,"1_-1") (4,"1_2")(5,"1_-3")/*目标转置矩阵011-23312-154-12-2-11-3-122*///正好对应于转置矩阵的第一列}/*所有map操作产生 ("1","1_0")("2","1_3") ("3","1_-1")("4","1_2")("5","1_-3")("1","2_1")("2","2_3") ("3","2_5")    ("4","2_-2")("5","2_-1")("1","3_0")("2","3_1")    ("3","3_4")("4","3_-1")("5","3_2")("1","4_-2")  ("2","4_2")    
("3","4_-1")("4","4_1")("5","4_2")*/}/*Reduce任务,将map操作产生的所有键值对集合进行合并,生成转置矩阵的存储表示key值相同的值会组成值的集合如:key:"1"时values:{"3_0","1_0","4_-2","2_1"} 注意:这里就是为什么要进行列标号的原因,values的顺序不一定就是原来矩阵列的顺序*/public static class Reducer1 extends Reducer<Text,Text,Text,Text>{private Text outKey = new Text();private Text outValue = new Text();@Overrideprotected void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {StringBuilder sb = new StringBuilder();for(Text text : values){sb.append(text + ",");}//sb : "3_0,1_0,4_-2,2_1,"//注意这里末尾有个逗号String line = "";if(sb.toString().endsWith(",")){line = sb.substring(0,sb.length()-1);}//去掉逗号//line : "3_0,1_0,4_-2,2_1"outKey.set(key);outValue.set(line);//("1","3_0,1_0,4_-2,2_1")context.write(outKey, outValue);}}private static final String INPATH = "input/matrix.txt";//输入文件路径private static final String OUTPATH = "output/step1";//输出文件路径private static final String HDFS = "hdfs://pc1:9000";//HDFS路径public void run() throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration();    //String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();    //String[] otherArgs = {"hdfs://pc1:9000/input/chenjie.txt","hdfs://pc1:9000/output/out4"};    String[] otherArgs = {"input/matrix.txt","hdfs://pc1:9000/output/step1"};    //这里需要配置参数即输入和输出的HDFS的文件路径    if (otherArgs.length != 2) {      System.err.println("Usage: wordcount <in> <out>");      System.exit(2);    }    //conf.set("fs.defaultFS",HDFS);   // JobConf conf1 = new JobConf(WordCount.class);    Job job = new Job(conf, "step1");//Job(Configuration conf, String jobName) 设置job名称和    job.setJarByClass(Step1.class);    job.setMapperClass(Mapper1.class); //为job设置Mapper类     //job.setCombinerClass(IntSumReducer.class); //为job设置Combiner类      job.setReducerClass(Reducer1.class); //为job设置Reduce类     job.setMapOutputKeyClass(Text.class);      job.setMapOutputValueClass(Text.class);     
job.setOutputKeyClass(Text.class);        //设置输出key的类型    job.setOutputValueClass(Text.class);//  设置输出value的类型    job.setOutputFormatClass(SequenceFileOutputFormat.class);    FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //为map-reduce任务设置InputFormat实现类   设置输入路径    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//为map-reduce任务设置OutputFormat实现类  设置输出路径    System.exit(job.waitForCompletion(true) ? 0 : 1);/*Configuration conf = new Configuration();conf.set("fs.defaultFS",HDFS);Job job = Job.getInstance(conf,"step1");job.setJarByClass(Step1.class);job.setMapperClass(Mapper1.class);job.setReducerClass(Reducer1.class);job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(Text.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);FileSystem fs = FileSystem.get(conf);Path inPath = new Path(INPATH);if(fs.exists(inPath)){//FileInputFormat.addInputPath(conf, inPath);}Path outPath = new Path(OUTPATH);if(fs.exists(outPath)){fs.delete(outPath, true);}*/}public static void main(String[] args){try {new Step1().run();} catch (ClassNotFoundException | IOException | InterruptedException e) {e.printStackTrace();}}}


运行结果:




使用hadoop fs -text  文件路径查看转置结果:



然后进行矩阵相乘(点击打开)