Distributed matrix multiplication using PBLAS and BLACS.
PBLAS (Parallel BLAS) is a parallel version of BLAS that uses BLACS internally for communication. It expects the matrices to already be distributed among the processes before any computation starts: you first create the local data in each process and then provide PBLAS with the information it needs to work out exactly how the matrix is distributed. Each process can access only its local data.
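Before any PBLAS call, each process therefore joins a BLACS process grid, which is what later tells PBLAS how the matrix is laid out across processes. A minimal sketch of that setup (assuming a 2 x 2 grid and the Cblacs_* C interface declared in the full program below; variable names are illustrative):
// Sketch: join a 2 x 2 BLACS process grid with row-major process ordering
int context, proc_id, num_procs, myrow, mycol;
int proc_nrows = 2, proc_ncols = 2;
Cblacs_pinfo(&proc_id, &num_procs); // my process id and the total number of processes
Cblacs_get(-1, 0, &context); // obtain a default system context
Cblacs_gridinit(&context, "Row", proc_nrows, proc_ncols); // create the process grid
Cblacs_pcoord(context, proc_id, &myrow, &mycol); // my (row, col) coordinates in the grid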
Array descriptor
You also need to define an ‘array descriptor’ for the matrix that you are working on. The array descriptor is an integer array of length 9 that contains the following data:
int array_desc[9] = {
dtype, // descriptor type (=1 for dense matrix)
context, // BLACS context handle for process grid
m, // num of rows in the global array
n, // num of cols in the global array
mb, // num of rows in a block
nb, // num of cols in a block
rsrc, // process row over which first row of the global array is distributed
csrc, // process col over which first col of the global array is distributed
lld // leading dimension of the local array
};
Although you can fill this array in yourself, using the descinit function to initialize the array descriptor is a good way to keep the code clean. Its prototype looks as follows:
void descinit_ (
int *desc,
const int *m,
const int *n,
const int *mb,
const int *nb,
const int *irsrc,
const int *icsrc,
const int *ictxt,
const int *lld,
int *info
);
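As a rough sketch (reusing the BLACS context from the grid setup above; the sizes match the full example below, and the local leading dimension lld is simply the number of local rows), a descriptor for an 8 x 8 matrix split into 4 x 4 blocks could be initialized like this:
// Sketch: descriptor for an 8 x 8 global matrix distributed in 4 x 4 blocks
int desc[9];
int N = 8, nb = 4, rsrc = 0, csrc = 0, lld = 4, info;
descinit_(desc, &N, &N, &nb, &nb, &rsrc, &csrc, &context, &lld, &info);
if (info != 0) { /* info = -i means the i-th argument had an illegal value */ }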
Multiplication function description
According to PBLAS conventions, the global matrix is denoted by A and the block of the matrix owned by a particular process by sub(A). The number of rows and columns of a global dense matrix that a particular process in the grid receives after data distribution is denoted by LOCr() and LOCc(), respectively. To compute these numbers, you can use the ScaLAPACK tool routine numroc. As an example, consider the prototype of the pdgemm routine (from the Intel documentation):
void pdgemm_(
const char *transa , // (g) form of sub(A)
const char *transb , // (g) form of sub(B)
const int *m , // (g) number of rows of sub(A) and sub(C)
const int *n , // (g) number of cols of sub(B) and sub(C)
const int *k , // (g) number of cols of sub(A) and rows of sub(B)
const double *alpha , // (g) scalar alpha
// array that contains local pieces of the distributed matrix sub(A). size lld_a by kla,
// where kla is LOCq(ja+k-1) when transa = 'N', or LOCq(ja+m-1) when transposed (as in the C code below).
const double *a , // (l)
const int *ia , // (g) row index in the distributed matrix A indicating first row of sub(A)
const int *ja , // (g) col index in the distributed matrix A indicating first col of sub(A)
const int *desca , // (g & l) array of dim 9. Array descriptor of A.
// array that contains local pieces of the distributed matrix sub(B). size lld_b by klb,
// where klb is LOCq(jb+n-1) when transb = 'N', or LOCq(jb+k-1) when transposed (as in the C code below).
const double *b , // (l)
const int *ib , // (g) row index of dist matrix B indicating first row of sub(B)
const int *jb , // (g) col index of dist matrix B indicating first col of sub(B)
const int *descb , // (g & l) array desc of matrix B (dim 9).
const double *beta , // (g) scalar beta
double *c , // (l) array of size (lld_c, LOCq(jc+n-1)). contains local pieces of sub(C).
const int *ic , // (g) row index of dist matrix C indicating first row of sub(C)
const int *jc , // (g) col index of dist matrix C indicating first col of sub(C)
const int *descc // (g & l) array of dim 9. Array desc of C.
);
The above function looks very similar to the non-parallel dgemm from BLAS, with additions that make it possible to locate elements in a parallel setting. Keep in mind that some arguments refer to properties of the global array and some to properties of the local array; in the comments above, (g) marks global arguments and (l) marks local ones.
A function called numroc
from ScaLAPACK is useful for determining how many
rows or cols of the global matrix are present in a particular process. The
prototype looks as follows:
int numroc_(
const int *n, // (g) number of rows/cols in dist matrix (global matrix).
const int *nb, // (g input) block size along the row/col dimension being counted (square nb x nb blocks in this example).
const int *iproc, // (l input) coordinate of the process whose local row/col count is to be determined.
const int *srcproc, // (g input) coordinate of the process that contains the first row or col of the dist matrix.
const int *nprocs // (g input) number of process rows/cols over which the matrix dimension is distributed.
);
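For example, here is a sketch (reusing myrow/mycol from Cblacs_pcoord and the 2 x 2 grid above) that computes the local row and column counts separately for each dimension; the row count can then be used as the local leading dimension passed to descinit:
// Sketch: local rows/cols of an N x N matrix distributed in nb x nb blocks
int N = 8, nb = 4, rsrc = 0, csrc = 0, nprow = 2, npcol = 2;
int local_rows = numroc_(&N, &nb, &myrow, &rsrc, &nprow);
int local_cols = numroc_(&N, &nb, &mycol, &csrc, &npcol);
// local_rows is a suitable lld for descinit_ (column-major local storage)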
When compiling code that calls these routines, don't forget to link against the ScaLAPACK and BLACS libraries, and add the -lgfortran flag for the GNU Fortran runtime (ScaLAPACK itself is written in Fortran).
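For example, with Open MPI and the Debian/Ubuntu ScaLAPACK package installed, a compile line along these lines should work (the source file name is illustrative, and the library names differ for MKL or other vendor builds):
mpic++ pdgemm_example.cpp -o pdgemm_example -lscalapack-openmpi -lgfortran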
Full code
A simple implementation of matrix multiplication using BLACS and PBLAS:
#include "mpi.h"
#include <iostream>
#include <cstdlib>
#include <cmath>
using namespace std;
extern "C" {
/* Cblacs declarations */
void Cblacs_pinfo(int*, int*);
void Cblacs_get(int, int, int*);
void Cblacs_gridinit(int*, const char*, int, int);
void Cblacs_pcoord(int, int, int*, int*);
void Cblacs_gridexit(int);
void Cblacs_barrier(int, const char*);
int numroc_(int*, int*, int*, int*, int*);
void descinit_(int *desc, const int *m, const int *n, const int *mb,
const int *nb, const int *irsrc, const int *icsrc, const int *ictxt,
const int *lld, int *info);
void pdgemm_( const char* TRANSA, const char* TRANSB,
const int * M, const int * N, const int * K,
const double * ALPHA,
const double * A, const int * IA, const int * JA, const int * DESCA,
const double * B, const int * IB, const int * JB, const int * DESCB,
const double * BETA,
double * C, const int * IC, const int * JC, const int * DESCC );
}
int main(int argc, char ** argv)
{
// MPI init
MPI_Init(&argc, &argv);
int mpi_rank, mpi_size;
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
// end MPI init
// BLACS init
int BLACS_CONTEXT, proc_nrows, proc_ncols, myrow, mycol;
int proc_id, num_procs;
proc_nrows = 2; proc_ncols = 2;
//int proc_dims[2] = {proc_nrows, proc_ncols};
Cblacs_pinfo(&proc_id, &num_procs);
Cblacs_get( -1, 0, &BLACS_CONTEXT );
Cblacs_gridinit( &BLACS_CONTEXT, "Row", proc_nrows, proc_ncols );
Cblacs_pcoord(BLACS_CONTEXT, mpi_rank, &myrow, &mycol);
cout << "myrow " << myrow << " mycol " << mycol << endl;
cout << "procid " << proc_id << " num_procs " << num_procs << endl;
// end BLACS init
// matrix properties
int N = 8, nb = 4; // global matrix size, block size
// each process owns a single nb x nb local block of each matrix
double* a = (double*)malloc(sizeof(double)*nb*nb);
double* b = (double*)malloc(sizeof(double)*nb*nb);
double* c = (double*)malloc(sizeof(double)*nb*nb);
// generate matrix data
for (int i = 0; i < nb*nb; ++i) {
a[i] = 1;
b[i] = 2;
c[i] = 0;
}
// end matrix properties
// create array descriptor
int desca[9];
int descb[9];
int descc[9];
int rsrc = 0, csrc = 0, info;
descinit_(desca, &N, &N, &nb, &nb, &rsrc, &csrc, &BLACS_CONTEXT, &nb, &info);
descinit_(descb, &N, &N, &nb, &nb, &rsrc, &csrc, &BLACS_CONTEXT, &nb, &info);
descinit_(descc, &N, &N, &nb, &nb, &rsrc, &csrc, &BLACS_CONTEXT, &nb, &info);
cout << proc_id << " info: " << info << endl;
// end create array descriptor
Cblacs_barrier(BLACS_CONTEXT, "All");
int ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1; // 1-based global indices of the first row/col of each submatrix
double alpha = 1, beta = 1;
pdgemm_("T", "T", &N, &N, &N, &alpha, a, &ia, &ja, desca, b, &ib, &jb, descb,
&beta, c, &ic, &jc, descc);
// print results on a per-process basis (every process in the 2 x 2 grid prints its local block)
if (proc_id < proc_nrows * proc_ncols) {
cout << "proc : " << proc_id << endl;
for (int i = 0; i < nb; ++i) {
for (int j = 0; j < nb; ++j) {
cout << "(" << nb*myrow + i << "," <<
nb*mycol + j << ") " << c[i*nb + j] << " ";
}
cout << endl;
}
cout << endl;
}
// clean up
free(a); free(b); free(c);
Cblacs_gridexit(BLACS_CONTEXT);
MPI_Finalize();
return 0;
}
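Running the resulting program with four MPI processes (for example mpirun -np 4 ./pdgemm_example) should print 16 for every element of C: each entry is the dot product of a row of ones from A with a column of twos from B over N = 8 terms, and beta = 1 only adds the zero-initialized C.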