Scatter array to only worker tasks

Question

Scatter and gather are described in some detail in this answer. Scatter splits up data and scatters the pieces to other tasks, but the data has to be stored in contiguous memory - MPI_Scatter has no way of knowing it needs to follow pointers and if so, how many - and the way you're allocating sendbuff:

// Every row comes from its own new[] call, so nothing places row i+1
// directly after row i in memory.
sendbuff = new int*[ntasks];
for(int i = 0; i < ntasks; i++){
    sendbuff[i] = new int[buffsize];
}

the different rows of sendbuff could be scattered all over system memory. You'll be almost there if you allocate the data contiguously:

// A single allocation holds all rows back to back; sendbuff[i] only points
// at the start of row i inside that block, so &sendbuff[0][0] addresses the
// whole data set as one contiguous buffer.
sendbuff = new int*[ntasks];
sendbuff[0] = new int[ntasks * 6];
for(int i = 1; i < ntasks; i++){
    sendbuff[i] = &(sendbuff[0][i*6]);
}

Now you should be able to scatter, but be aware that row 0 will go to rank 0; that is, scatter goes to all the processes in the communicator. If you are only trying to send to your non-rank-zero tasks, the simplest thing to do is to just keep a row of dummy data in sendbuff for rank 0 so that a normal scatter works correctly:

#include <iostream>
#include <mpi.h>

// Scatters one row of nelem ints from rank 0 to every rank of
// MPI_COMM_WORLD, lets the non-zero ranks square their row, and gathers the
// rows back on rank 0.  Row 0 is dummy data: rank 0 takes part in the
// collectives but its row is neither printed nor squared.
int main(int argc, char **argv)
{
    int rank, size;
    const int nelem = 6;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Value-initialise the row pointers: only rank 0 allocates the data, but
    // every rank evaluates sendbuff[0] for the collective calls below, so it
    // must hold a well-defined (null) value on the other ranks.
    int **sendbuff = new int*[size]();
    int *recvbuff  = new int[nelem];

    if (rank == 0) {
        // One contiguous block for all rows; sendbuff[i] points at row i.
        sendbuff[0] = new int[nelem * size];
        for (int i=1; i<size; i++)
            sendbuff[i] = &(sendbuff[0][nelem*i]);

        // Row i is filled with i-1, so worker ranks 1, 2, ... see 0, 1, ...
        for (int i=0; i<size; i++)
            for (int j=0; j<nelem; j++)
                sendbuff[i][j] = i-1;
    }

    // The send buffer is significant only at the root (rank 0).
    MPI_Scatter(sendbuff[0], nelem, MPI_INT, recvbuff, nelem, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank != 0) {
        std::cout << "Scatter: [ " << rank << "]: ";
        for (int i=0; i<nelem; i++)
            std::cout << recvbuff[i] << " ";
        std::cout << std::endl;

        // The "work": square every received element in place.
        for (int i=0; i<nelem; i++)
            recvbuff[i] *= recvbuff[i];
    }

    // The receive buffer is significant only at the root (rank 0).
    MPI_Gather(recvbuff, nelem, MPI_INT, sendbuff[0], nelem, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        // Skip row 0, which is rank 0's own dummy row.
        for (int j=1; j<size; j++) {
            std::cout << "Gather: [ " << j << "]: ";
            for (int i=0; i<nelem; i++)
                    std::cout << sendbuff[j][i] << " ";
            std::cout << std::endl;
        }
    }

    delete [] recvbuff;
    delete [] sendbuff[0];   // null on non-root ranks; delete[] on null is a no-op
    delete [] sendbuff;

    MPI_Finalize();
    return 0;
}

Note that we're scattering the data, the workers are squaring the numbers, and the master gathers it back. Compiling and running gives:

$ mpic++ -o intercomm intercomm.cxx
$ mpirun -np 4 ./intercomm
Scatter: [ 2]: 1 1 1 1 1 1
Scatter: [ 1]: 0 0 0 0 0 0
Scatter: [ 3]: 2 2 2 2 2 2
Gather: [ 1]: 0 0 0 0 0 0
Gather: [ 2]: 1 1 1 1 1 1
Gather: [ 3]: 4 4 4 4 4 4

If you'd rather avoid having dummy data for rank 0 -- perhaps it is large -- you can break up the tasks into two groups, the master task and the worker tasks, and set up an intercommunicator that allows collective communications between them. Here's a simple program which does just that:

#include <iostream>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm   localComm;    /* intra-communicator of local sub-group */
    MPI_Comm   interComm;    /* inter-communicator */
    int masterworker;
    int rank, size;
    const int nelem = 6;
    int rootrank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    masterworker = (rank == 0 ? 0 : 1);
    MPI_Comm_split(MPI_COMM_WORLD, masterworker, rank, &localComm);

    if (masterworker == 0)
    {
        MPI_Intercomm_create( localComm, 0, MPI_COMM_WORLD, 1, 1, &interComm);
        rootrank = ( rank == 0 ? MPI_ROOT : MPI_PROC_NULL );
    }
    else {
        MPI_Intercomm_create( localComm, 0, MPI_COMM_WORLD, 0, 1, &interComm);
        rootrank = 0;
    }

    int **sendbuff = new int*[size-1];
    int *recvbuff  = new int[nelem];

    if (rank == 0) {

        sendbuff[0] = new int[nelem * (size-1)];
        for (int i=1; i<size-1; i++)
            sendbuff[i] = &(sendbuff[0][nelem*i]);

        for (int i=0; i<size-1; i++)
            for (int j=0; j<nelem; j++)
                sendbuff[i][j] = i;
    }

    MPI_Scatter(sendbuff[0], nelem, MPI_INT, recvbuff, nelem, MPI_INT, rootrank, interComm);

    if (masterworker == 1) {
        std::cout << "Scatter: [ " << rank << "]: ";
        for (int i=0; i<nelem; i++)
            std::cout << recvbuff[i] << " ";
        std::cout << std::endl;

        for (int i=0; i<nelem; i++)
            recvbuff[i] *= recvbuff[i];
    }

    MPI_Gather(recvbuff, nelem, MPI_INT, sendbuff[0], nelem, MPI_INT, rootrank, interComm);
    if (masterworker == 0) {
        for (int j=0; j<size-1; j++) {
            std::cout << "Gather: [ " << j << "]: ";
            for (int i=0; i<nelem; i++)
                    std::cout << sendbuff[j][i] << " ";
            std::cout << std::endl;
        }
    }




    MPI_Comm_free(&interComm);
    MPI_Comm_free(&localComm);
    delete [] recvbuff;
    if (rank == 0)
        delete [] sendbuff[0];
    delete [] sendbuff;

    MPI_Finalize();
}

Again, compiling and running gives:

$ mpic++ -o intercomm intercomm.cxx
$ mpirun -np 4 ./intercomm
Scatter: [ 1]: 0 0 0 0 0 0
Scatter: [ 2]: 1 1 1 1 1 1
Scatter: [ 3]: 2 2 2 2 2 2
Gather: [ 0]: 0 0 0 0 0 0
Gather: [ 1]: 1 1 1 1 1 1
Gather: [ 2]: 4 4 4 4 4 4

Alternately, if you don't want to mess around with intercommunicators, just keep a row of dummy data in sendbuff for rank 0 so that a normal scatter works correctly:

#include <iostream>
#include <mpi.h>

// Scatters one row of nelem ints from rank 0 to every rank of
// MPI_COMM_WORLD, lets the non-zero ranks square their row, and gathers the
// rows back on rank 0.  Row 0 is dummy data: rank 0 takes part in the
// collectives but its row is neither printed nor squared.
int main(int argc, char **argv)
{
    int rank, size;
    const int nelem = 6;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Value-initialise the row pointers: only rank 0 allocates the data, but
    // every rank evaluates sendbuff[0] for the collective calls below, so it
    // must hold a well-defined (null) value on the other ranks.
    int **sendbuff = new int*[size]();
    int *recvbuff  = new int[nelem];

    if (rank == 0) {
        // One contiguous block for all rows; sendbuff[i] points at row i.
        sendbuff[0] = new int[nelem * size];
        for (int i=1; i<size; i++)
            sendbuff[i] = &(sendbuff[0][nelem*i]);

        // Row i is filled with i-1, so worker ranks 1, 2, ... see 0, 1, ...
        for (int i=0; i<size; i++)
            for (int j=0; j<nelem; j++)
                sendbuff[i][j] = i-1;
    }

    // The send buffer is significant only at the root (rank 0).
    MPI_Scatter(sendbuff[0], nelem, MPI_INT, recvbuff, nelem, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank != 0) {
        std::cout << "Scatter: [ " << rank << "]: ";
        for (int i=0; i<nelem; i++)
            std::cout << recvbuff[i] << " ";
        std::cout << std::endl;

        // The "work": square every received element in place.
        for (int i=0; i<nelem; i++)
            recvbuff[i] *= recvbuff[i];
    }

    // The receive buffer is significant only at the root (rank 0).
    MPI_Gather(recvbuff, nelem, MPI_INT, sendbuff[0], nelem, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        // Skip row 0, which is rank 0's own dummy row.
        for (int j=1; j<size; j++) {
            std::cout << "Gather: [ " << j << "]: ";
            for (int i=0; i<nelem; i++)
                    std::cout << sendbuff[j][i] << " ";
            std::cout << std::endl;
        }
    }

    delete [] recvbuff;
    delete [] sendbuff[0];   // null on non-root ranks; delete[] on null is a no-op
    delete [] sendbuff;

    MPI_Finalize();
    return 0;
}

And again compiling and running gives:

$ mpic++ -o intercomm intercomm.cxx
$ mpirun -np 4 ./intercomm
Scatter: [ 2]: 1 1 1 1 1 1
Scatter: [ 1]: 0 0 0 0 0 0
Scatter: [ 3]: 2 2 2 2 2 2
Gather: [ 1]: 0 0 0 0 0 0
Gather: [ 2]: 1 1 1 1 1 1
Gather: [ 3]: 4 4 4 4 4 4