MPI群通信与矩阵乘法的Fox算法实现

来源：互联网发布：长春网络电视台编辑：程序博客网时间：2024/04/29 07:23

原本以为，MPI天生只能在Linux上运行。但这次却发现了Intel MPI Library 这个好用的东西。基本不需要设置，安上之后，用自己能登录windows的帐号和密码注册就行了。虽然不是局域网上的机器，但也可以让我的双核CPU达到100%（平时开个Matlab什么的都才是50%，软件优化真是关键啊）。

FOX算法有一些恶心的要求：输入的矩阵必须是方阵，而且进程必须为平方数，方阵必须能均匀划分给每个进程。其实方阵的条件并不一定必须，只是会增加编程复杂性。

MPI中，虽然最精华的就6条语句，但仅用这6条，还是比较麻烦的。使用一些高级语句，能提高程序性能或是简化代码。

MPI_Bcast，这个是必须要提的。能把这样的语句 if (task_id == root) { send to child_id } else { recieve from root} 仅用个bcast就替换了。

MPI_Isend/MPI_Irecv是异步发送和接收的语句，显然比MPI_Send/MPI_Recv这样的阻塞语句更可能提高性能。另外，还可以避免循环死锁。当然，避免循环死锁的方法还有使用奇偶法、MPI_Sendrecv等。

虽然，理解MPI_Cart_create等一系列的笛卡尔结构操作还是有些难度的，但却是十分有用的进程逻辑功能划分方法。特别在FOX算法中，每行是个通信子域，每列也是。这样就会有n+n个子域（尽管有划分重叠），每个域的进程都仅与域中的其它进程有信息交换，这样仅用bcast一语就能传完数据。

以下是代码。比较简单。就不说明了。

// MatrixMulFox.cpp// Jarod 2007.12.3#include "mpi.h"#include <algorithm>#include <fstream>#include <cmath>const int root_id = 0;const int max_procs_size = 16;int main(int argc,char *argv[])...{    double start_time, end_time, time;    int procs_id, procs_size;    MPI_Status status;    MPI_Request reqSend, reqRecv;    MPI_Init(&argc,&argv);    start_time = MPI_Wtime();    MPI_Comm_size(MPI_COMM_WORLD,&procs_size);    MPI_Comm_rank(MPI_COMM_WORLD,&procs_id);    // 参数检查    int N=0;    ...{        for (int i=1; i<argc; ++i ) ...{            char * pos =strstr(argv[i], "-N=");            if ( pos!=NULL) ...{                sscanf(pos, "-N=%d", &N);                break;            }        }    }    const int procs_size_sqrt = floor(sqrt(static_cast<double>(procs_size)));    const int n = N / procs_size_sqrt;    const int n_sqr = n*n;    if (procs_size<4 || procs_size> max_procs_size) ...{        printf("The fox algorithm requires at least 4 processors and at most %d processors. ",               max_procs_size);        MPI_Finalize();        return 0;    }    if (procs_size_sqrt*procs_size_sqrt != procs_size ) ...{        printf("The number of process must be a square. ");        MPI_Finalize();        return 0;    }    if (N % procs_size_sqrt !=0) ...{        printf("N mod procs_size_sqrt !=0  ");        MPI_Finalize();        return 0;    }    //初始化矩阵    int * A = new int[n_sqr];    int * B = new int[n_sqr];    int * C = new int[n_sqr];    int * T = new int[n_sqr];    for (int i=0; i<n; ++i)        for (int j=0; j<n; ++j) ...{            A[i*n+j] = (i+j)*procs_id;            B[i*n+j] = (i-j)*procs_id;            C[i*n+j] = 0;        }    //输出矩阵    printf("A on procs %d :  ", procs_id);    for (int i=0; i<n; ++i) ...{        for (int j=0; j<n; ++j) ...{            printf("%5d",A[i*n+j]);        }        printf(" ");    }    printf("B on procs %d :  ", procs_id);    for (int i=0; i<n; ++i) ...{        for (int j=0; j<n; ++j) ...{            printf("%5d",B[i*n+j]);        }        printf(" ");    }    // 划分组, 建立子通信空间    MPI_Comm cart_all, cart_row, cart_col;    int dims[2], periods[2];    int procs_cart_rank, procs_coords[2];    dims[0] = dims[1] = procs_size_sqrt;    periods[0] = periods[1] = true;    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, false, &cart_all);    MPI_Comm_rank(cart_all, &procs_cart_rank);    MPI_Cart_coords(cart_all, procs_cart_rank,  2, procs_coords);    MPI_Comm_split(cart_all, procs_coords[0], procs_coords[1], &cart_row);    MPI_Comm_split(cart_all, procs_coords[1], procs_coords[0], &cart_col);    int rank_cart_row, rank_cart_col;    MPI_Comm_rank(cart_row, & rank_cart_row);    MPI_Comm_rank(cart_col, & rank_cart_col);    // 计算并传递        for (int round = 0; round < procs_size_sqrt; ++ round) ...{        MPI_Isend(B, n_sqr, MPI_INT, (procs_coords[0] - 1 + procs_size_sqrt) % procs_size_sqrt,                   1, cart_col, &reqSend);        int broader = (round + procs_coords[0]) % procs_size_sqrt;        if (broader == procs_coords[1]) std::copy(A,A+n_sqr,T);        MPI_Bcast(T, n_sqr, MPI_INT, broader , cart_row);        for (int row=0; row<n; ++row)            for (int col=0; col<n; ++col)                for (int k=0; k<n; ++k) ...{                    C[row*n+col] += T[row*n+k]*B[k*n+col];                }                MPI_Wait(&reqSend, &status);        MPI_Recv(T, n_sqr,  MPI_INT, (procs_coords[0] + 1) % procs_size_sqrt                          , 1, cart_col, &status);        std::copy(T,T+n_sqr,B);    }    //输出结果    printf("C on procs %d :  ", procs_id);    for (int i=0; i<n; ++i) ...{        for (int j=0; j<n; ++j) ...{            printf("%5d",C[i*n+j]);        }        printf(" ");    }            // 释放    MPI_Comm_free(&cart_col);    MPI_Comm_free(&cart_row);    MPI_Comm_free(&cart_all);    delete []A;    delete []B;    delete []C;    delete []T;    end_time = MPI_Wtime();    MPI_Finalize();    printf("task %d consumed %lf seconds ", procs_id, end_time-start_time);    return 0;}