Hadoop Matrix Multiplication


The method is as follows. It is copied straight from the course slides, and I didn't bother translating it. If you'd rather not read the text, you can look directly at the picture I uploaded, which should make it clear at a glance (the handwriting is a bit ugly).

• We can think of a matrix as a relation with three attributes: the row number, the column number, and the value in that row and column. Thus, we could view matrix M as a relation M(I, J, V), with tuples (i, j, m_ij), and we could view matrix N as a relation N(J, K, W), with tuples (j, k, n_jk). As large matrices are often sparse (mostly 0's), and since we can omit the tuples for matrix elements that are 0, this relational representation is often a very good one for a large matrix.
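As a tiny made-up example used throughout this post, let M = [[1, 2], [0, 3]] and N = [[4, 0], [5, 6]]. Omitting the zero entries, their relational representations are:

M(I, J, V): (1, 1, 1), (1, 2, 2), (2, 2, 3)
N(J, K, W): (1, 1, 4), (2, 1, 5), (2, 2, 6)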

• The product MN is almost a natural join followed by grouping and aggregation. That is, the natural join of M(I, J, V) and N(J, K, W), having only attribute J in common, would produce tuples (i, j, k, v, w) from each tuple (i, j, v) in M and tuple (j, k, w) in N. This five-component tuple represents the pair of matrix elements (m_ij, n_jk). What we want instead is the product of these elements, that is, the four-component tuple (i, j, k, v × w), because that represents the product m_ij n_jk. Once we have this relation as the result of one map-reduce operation, we can perform grouping and aggregation, with I and K as the grouping attributes and the sum of V × W as the aggregation.
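On the example matrices above, the natural join on J gives the five-component tuples (1, 1, 1, 1, 4), (1, 2, 1, 2, 5), (1, 2, 2, 2, 6), (2, 2, 1, 3, 5), (2, 2, 2, 3, 6). Replacing (v, w) by v × w gives (1, 1, 1, 4), (1, 2, 1, 10), (1, 2, 2, 12), (2, 2, 1, 15), (2, 2, 2, 18), and grouping on (I, K) with a sum yields p_11 = 4 + 10 = 14, p_12 = 12, p_21 = 15, p_22 = 18, i.e. MN = [[14, 12], [15, 18]].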

• Stage 1:
• The Map Function: Send each matrix element m_ij to the key-value pair (j, (M, i, m_ij)). Send each matrix element n_jk to the key-value pair (j, (N, k, n_jk)).
• The Reduce Function: For each value that comes from M, say (M, i, m_ij), and each value that comes from N, say (N, k, n_jk), produce the tuple (i, k, m_ij n_jk). Note that the output of the Reduce function is a key j paired with the list of all the tuples of this form that we get from j.
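Tracing Stage 1 on the example matrices: the Map emits (1, (M, 1, 1)), (2, (M, 1, 2)), (2, (M, 2, 3)) from M and (1, (N, 1, 4)), (2, (N, 1, 5)), (2, (N, 2, 6)) from N. The Reduce for key j = 1 produces only (1, 1, 4); the Reduce for key j = 2 pairs every M value with every N value and produces (1, 1, 10), (1, 2, 12), (2, 1, 15), (2, 2, 18).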

• Stage 2:
• The Map Function: The elements to which this Map function is applied are the pairs that are output from the previous Reduce function. These pairs are of the form (j, [(i_1, k_1, v_1), (i_2, k_2, v_2), . . . , (i_p, k_p, v_p)]), where each v_q is the product of elements m_(i_q j) and n_(j k_q). From this element we produce p key-value pairs: ((i_1, k_1), v_1), ((i_2, k_2), v_2), . . . , ((i_p, k_p), v_p).
• The Reduce Function: For each key (i, k), produce the sum of the list of values associated with this key. The result is a pair ((i, k), v), where v is the value of the element in row i and column k of the matrix P = MN.
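Continuing the example, the Stage 2 Map flattens the Stage 1 output into ((1, 1), 4), ((1, 1), 10), ((1, 2), 12), ((2, 1), 15), ((2, 2), 18), and the Reduce sums per key: ((1, 1), 14), ((1, 2), 12), ((2, 1), 15), ((2, 2), 18), matching MN computed earlier.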

An alternative is to compute P = MN in a single MapReduce step (a code sketch follows this description):
• The Map Function: For each element m_ij of M, produce a key-value pair ((i, k), (M, j, m_ij)) for k = 1, 2, . . . , up to the number of columns of N. Also, for each element n_jk of N, produce a key-value pair ((i, k), (N, j, n_jk)) for i = 1, 2, . . . , up to the number of rows of M.
• The Reduce Function: Each key (i, k) will have an associated list with all the values (M, j, m_ij) and (N, j, n_jk), for all possible values of j. The Reduce function needs to connect the two values on the list that have the same value of j, for each j. An easy way to do this step is to sort by j the values that begin with M and sort by j the values that begin with N, in separate lists. The j-th values on each list must have their third components, m_ij and n_jk, extracted and multiplied. Then, these products are summed and the result is paired with (i, k) in the output of the Reduce function.
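The full program further down implements the two-stage method, not this one-pass method, so here is a minimal sketch of the one-pass version. Everything in it is my own framing rather than part of the original code: it assumes the matrix dimensions are passed in through the job Configuration under the made-up keys "m.rows" and "n.cols", that each input line arrives as "M,i,j,v" or "N,j,k,v", and it pairs values that share the same j with hash maps instead of the sorted lists described above.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class OnePassMatrixMultiply {

    // Replicate each m_ij to every key (i, k), and each n_jk to every key (i, k).
    public static class OnePassMap extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Hypothetical configuration keys; the driver must set these dimensions.
            int mRows = context.getConfiguration().getInt("m.rows", 0);
            int nCols = context.getConfiguration().getInt("n.cols", 0);
            String[] t = value.toString().split(",");
            if (t[0].equals("M")) {
                // m_ij is needed by every output element in row i: ((i, k), (M, j, v))
                int i = Integer.parseInt(t[1]);
                for (int k = 1; k <= nCols; k++) {
                    context.write(new Text(i + "," + k), new Text("M," + t[2] + "," + t[3]));
                }
            } else {
                // n_jk is needed by every output element in column k: ((i, k), (N, j, w))
                int k = Integer.parseInt(t[2]);
                for (int i = 1; i <= mRows; i++) {
                    context.write(new Text(i + "," + k), new Text("N," + t[1] + "," + t[3]));
                }
            }
        }
    }

    // For each key (i, k), match the M and N values with the same j and sum the products.
    public static class OnePassReduce extends Reducer<Text, Text, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            Map<String, Integer> mByJ = new HashMap<>();
            Map<String, Integer> nByJ = new HashMap<>();
            for (Text value : values) {
                String[] t = value.toString().split(",");
                (t[0].equals("M") ? mByJ : nByJ).put(t[1], Integer.parseInt(t[2]));
            }
            int sum = 0;
            for (Map.Entry<String, Integer> e : mByJ.entrySet()) {
                Integer w = nByJ.get(e.getKey());
                if (w != null) {
                    sum += e.getValue() * w; // m_ij * n_jk
                }
            }
            context.write(key, new IntWritable(sum));
        }
    }
}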

The detailed process is shown in the hand-drawn figure mentioned above. The complete two-job implementation follows:

package com.song.mr;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * My High Performance Parallel Program Design Homework 2 -- Matrix
 * Multiplication. The result is computed by two chained MapReduce jobs,
 * Step1* and Step2*. Each matrix is stored in its own file: the left
 * matrix's file must be named "left" and the right matrix's file must be
 * named "right". Each input file consists of lines of the form
 * "linenum n1 n2 n3 n4 ...", where the first number on each line is the
 * row number.
 *
 * @author
 * @version 1.0
 * @date 2016.4.8
 */
public class MRMatrixMultiplication2 {

    /**
     * Matrix type enum: a matrix is either the left or the right operand.
     */
    private static enum MatrixType {
        LEFT("left"), RIGHT("right");

        String val;

        private MatrixType(String val) {
            this.val = val;
        }

        public String value() {
            return val;
        }

        public static MatrixType getType(String val) {
            if (val.equals("left")) {
                return MatrixType.LEFT;
            } else {
                return MatrixType.RIGHT;
            }
        }
    }

    /**
     * The first step's map task.
     */
    private static class Step1Map extends Mapper<Object, Text, IntWritable, Text> {
        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, IntWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // Get the current split's file name to tell which matrix is being
            // processed; this is why the files must be named "left" and "right".
            InputSplit inputSplit = context.getInputSplit();
            String fileName = ((FileSplit) inputSplit).getPath().toString();
            int i = fileName.length();
            while (fileName.charAt(--i) != '/') {
            }
            MatrixType matrixType = MatrixType.getType(fileName.substring(i + 1));
            String line = value.toString();
            if (line == null || line.length() == 0) {
                return;
            }
            int column = 0;
            i = 0;
            while (line.charAt(i) != ' ') {
                i++;
            }
            // Parse the row number at the start of the line.
            int linenum = Integer.parseInt(line.substring(0, i));
            StringBuilder item = new StringBuilder();
            while (++i < line.length()) {
                if (line.charAt(i) != ' ') {
                    item.append(line.charAt(i));
                }
                if (line.charAt(i) == ' ' || i + 1 == line.length()) {
                    column++;
                    // Key on the shared dimension j: the left matrix is keyed by
                    // its column index, the right matrix by its row index.
                    if (matrixType == MatrixType.LEFT) {
                        System.out.println("** step1 map output ** key:" + column + ",text:"
                                + matrixType.value() + "," + linenum + "," + item);
                        context.write(new IntWritable(column),
                                new Text(matrixType.value() + "," + linenum + "," + item));
                    } else {
                        System.out.println("** step1 map output ** key:" + linenum + ",text:"
                                + matrixType.value() + "," + column + "," + item);
                        context.write(new IntWritable(linenum),
                                new Text(matrixType.value() + "," + column + "," + item));
                    }
                    item = new StringBuilder();
                }
            }
        }
    }

    /**
     * The first step's reduce task.
     */
    private static class Step1Reduce extends Reducer<IntWritable, Text, Text, IntWritable> {

        /** Parses the "index,value" part of a Step1Map output value. */
        private static class IndexAndVal {
            int index;
            int val;

            public IndexAndVal(String input) {
                String[] inputs = input.split(",");
                this.index = Integer.parseInt(inputs[0]);
                this.val = Integer.parseInt(inputs[1]);
            }
        }

        @Override
        protected void reduce(IntWritable key, Iterable<Text> iterable,
                Reducer<IntWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            System.out.println("** step1 reduce processing key ** " + key.get());
            LinkedList<IndexAndVal> leftList = new LinkedList<>();
            LinkedList<IndexAndVal> rightList = new LinkedList<>();
            Iterator<Text> iterator = iterable.iterator();
            while (iterator.hasNext()) {
                int i = 0;
                String value = iterator.next().toString();
                System.out.println("** step 1 reduce processing value ** " + value);
                StringBuilder stringBuilder = new StringBuilder();
                while (value.charAt(i) != ',') {
                    stringBuilder.append(value.charAt(i++));
                }
                MatrixType matrixType = MatrixType.getType(stringBuilder.toString());
                switch (matrixType) {
                case LEFT:
                    leftList.add(new IndexAndVal(value.substring(i + 1, value.length())));
                    break;
                case RIGHT:
                    rightList.add(new IndexAndVal(value.substring(i + 1, value.length())));
                    break;
                }
            }
            // Cartesian product: every m_ij meets every n_jk that shares this j.
            for (IndexAndVal left : leftList) {
                for (IndexAndVal right : rightList) {
                    Text position = new Text(left.index + "," + right.index);
                    IntWritable product = new IntWritable(left.val * right.val);
                    System.out.println("step1 reduce output key:" + position + " value:" + product);
                    context.write(position, product);
                }
            }
        }
    }

    /**
     * The second step's map task.
     */
    private static class Step2Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            System.out.println("** step 2 map ** input line:" + line);
            // At first I split the line directly instead of using StringTokenizer
            // and did not get correct results; it kept throwing an
            // out-of-array-index exception.
            StringTokenizer itr = new StringTokenizer(line);
            int i = 0;
            Text position = null;
            IntWritable intWritable;
            while (itr.hasMoreTokens()) {
                String token = itr.nextToken();
                System.out.println("token:" + token);
                i++;
                switch (i) {
                case 1:
                    position = new Text(token);
                    break;
                case 2:
                    intWritable = new IntWritable(Integer.parseInt(token.trim()));
                    context.write(position, intWritable);
                    break;
                }
            }
        }
    }

    /**
     * The second step's reduce task: sum all partial products of a key (i, k).
     */
    private static class Step2Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> iterable,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : iterable) {
                sum += val.get();
                System.out.println("** step 2 reduce ** input key:" + key + ",val:" + val.get());
            }
            System.out.println("** step 2 reduce ** output " + key + "," + sum);
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        String WORKING_DIRECTORY = "hdfs://localhost:9000";
        String inputFileName = "/input";
        String step1OutputFileName = "/step1output";
        String step2InputFileName = "/step1output/part*";
        String outputFileName = "/output";
        Configuration conf = new Configuration();
        try {
            // Set up the file system and clear any stale output directories.
            FileSystem fs = FileSystem.get(URI.create(WORKING_DIRECTORY), conf);
            Path inputFilePath = new Path(WORKING_DIRECTORY + inputFileName);
            Path step1OutputFilePath = new Path(WORKING_DIRECTORY + step1OutputFileName);
            if (fs.exists(step1OutputFilePath)) {
                fs.delete(step1OutputFilePath, true);
            }
            Path step2InputFilePath = new Path(WORKING_DIRECTORY + step2InputFileName);
            Path outputFilePath = new Path(WORKING_DIRECTORY + outputFileName);
            if (fs.exists(outputFilePath)) {
                fs.delete(outputFilePath, true);
            }

            // Configure the first job.
            Job job1 = new Job(conf, "Matrix Multiplication Step 1");
            job1.setJarByClass(MRMatrixMultiplication2.class);
            job1.setMapperClass(Step1Map.class);
            job1.setMapOutputKeyClass(IntWritable.class);
            job1.setMapOutputValueClass(Text.class);
            job1.setReducerClass(Step1Reduce.class);
            job1.setOutputKeyClass(Text.class);
            job1.setOutputValueClass(IntWritable.class);
            job1.setInputFormatClass(TextInputFormat.class);
            job1.setOutputFormatClass(TextOutputFormat.class);
            ControlledJob controlledJob1 = new ControlledJob(conf);
            controlledJob1.setJob(job1);
            FileInputFormat.addInputPath(job1, inputFilePath);
            FileOutputFormat.setOutputPath(job1, step1OutputFilePath);

            // Configure the second job.
            Job job2 = new Job(conf, "Matrix Multiplication Step 2");
            job2.setJarByClass(MRMatrixMultiplication2.class);
            job2.setMapperClass(Step2Map.class);
            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(IntWritable.class);
            job2.setReducerClass(Step2Reduce.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(IntWritable.class);
            job2.setInputFormatClass(TextInputFormat.class);
            job2.setOutputFormatClass(TextOutputFormat.class);
            ControlledJob controlledJob2 = new ControlledJob(conf);
            controlledJob2.setJob(job2);

            // Wire up the dependency: job 2 runs only after job 1 succeeds.
            controlledJob2.addDependingJob(controlledJob1);
            FileInputFormat.addInputPath(job2, step2InputFilePath);
            FileOutputFormat.setOutputPath(job2, outputFilePath);

            JobControl jobControl = new JobControl("Matrix Multiplication");
            jobControl.addJob(controlledJob1);
            jobControl.addJob(controlledJob2);
            Thread thread = new Thread(jobControl);
            thread.start();
            while (true) {
                if (jobControl.allFinished()) {
                    // All jobs finished: print the successful job list.
                    System.out.println(jobControl.getSuccessfulJobList());
                    jobControl.stop();
                    break;
                }
                if (jobControl.getFailedJobList().size() > 0) {
                    // A job failed: print the failed job list.
                    System.out.println(jobControl.getFailedJobList());
                    jobControl.stop();
                    break;
                }
                // Avoid a hot busy-wait loop while polling JobControl.
                Thread.sleep(500);
            }
        } catch (Exception exception) {
            exception.printStackTrace();
        }
    }
}
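To try it on the small example from earlier, put the two matrix files under /input on HDFS (the file names must be exactly left and right). The file left would contain

1 1 2
2 0 3

and the file right would contain

1 4 0
2 5 6

Then, assuming the compiled jar is named matrixmul.jar (a name I am making up here), running

hadoop jar matrixmul.jar com.song.mr.MRMatrixMultiplication2

should leave the product in /output, one tab-separated "i,k value" line per element: 1,1 14 / 1,2 12 / 2,1 15 / 2,2 18, which matches MN = [[14, 12], [15, 18]].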

