Question

I've been trying to run a code that uses MPI I/O on a large number of cores. The time each core needs to read from and write to a single file (the same file for all cores) increases with the number of cores used. I'm currently using 512 cores, and this problem is making my project unfeasible. The problem appears even when running on 8 cores, where it takes about 0.2 seconds to read the first real number in the file; on 32 cores it takes more than 30 seconds to write one real number. I'm running on this system: https://www.msi.umn.edu/hpc/itasca. The following simple code reproduces the problem (counting the number of elements in the file may seem unnecessary here, but it is needed in my actual code):

PROGRAM MAIN

USE MPI
IMPLICIT NONE

! INITIALIZING VARIABLES   

REAL(8) :: A, B
INTEGER :: COUNT_IO, i, j, ST, GO, tag, t, nb_bytes, N, d_each, d_start, d_end, NN
REAL(8) :: time_start, time_end

! VARIABLES RELATED TO MPI

INTEGER :: ierror  ! returns error messages from the mpi subroutines 
INTEGER :: rank    ! identification number of each processor
INTEGER :: nproc   ! number of processors
INTEGER, DIMENSION(mpi_status_size):: status
INTEGER(kind= MPI_OFFSET_KIND ) :: offset
INTEGER :: fh  ! file handle

! EXECUTABLE

    ! INITIALIZE THE MPI ENVIRONMENT

    CALL MPI_INIT(ierror)                           ! initialize MPI 
    CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierror)  ! obtain the rank of each process
    CALL MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierror) ! obtain the number of processes
    CALL MPI_TYPE_SIZE(MPI_REAL8,nb_bytes,ierror)

    CALL MPI_FILE_OPEN (MPI_COMM_WORLD,"file.dat",MPI_MODE_RDWR+MPI_MODE_UNIQUE_OPEN,MPI_INFO_NULL,fh,ierror)    

    NN = 2048

    DO d_each=1,NN
        IF (d_each*nproc>=NN) EXIT
    END DO
    d_start = rank*d_each+1 
    d_end   = MIN((rank+1)*d_each,NN)

    DO t = d_start,d_end

        ! READING ONE THREAD AT A TIME

        tag = 1

        GO = 0

        IF (rank .gt. 0) THEN
            CALL MPI_RECV (GO,1,MPI_INTEGER,rank-1,tag, MPI_COMM_WORLD ,status,ierror)
        ENDIF

        time_start = MPI_WTIME()

        i  = 0
        ST = 0
        COUNT_IO = 0

        DO WHILE ((i .lt. 100000) .AND. (ST .eq. 0))
            i = i+1
            offset = nb_bytes*(i-1)
            CALL MPI_FILE_READ_AT (fh,offset,A,1,MPI_REAL8,status,ierror)
            IF (status(1) .eq. 0) THEN
                COUNT_IO = i
                ST = 1
            ELSE
                COUNT_IO = 0
            END IF        
        ENDDO

        N = (COUNT_IO - 1)

        IF (N .gt. 0) THEN

            offset = 0                      
            CALL MPI_FILE_READ_AT (fh,offset,B,1,MPI_REAL8,status,ierror)

        ENDIF

        time_end = MPI_WTIME()

        PRINT *, 'My rank is', rank, 'Time for read  =',time_end-time_start 

        GO = 1    
        IF (rank .lt. nproc-1) THEN
            CALL MPI_SEND (GO,1, MPI_INTEGER ,rank+1,tag, MPI_COMM_WORLD ,ierror)
        ENDIF

        CALL MPI_BARRIER(MPI_COMM_WORLD,ierror)

        ! WRITING ONE THREAD AT A TIME

        tag = 2

        GO = 0

        IF (rank .gt. 0) THEN
            CALL MPI_RECV (GO,1,MPI_INTEGER,rank-1,tag, MPI_COMM_WORLD ,status,ierror)
        ENDIF

        time_start = MPI_WTIME()

        i  = 0
        ST = 0
        COUNT_IO = 0

        DO WHILE ((i .lt. 100000) .AND. (ST .eq. 0))
            i = i+1
            offset = nb_bytes*(i-1)
            CALL MPI_FILE_READ_AT (fh,offset,A,1,MPI_REAL8,status,ierror)
            IF (status(1) .eq. 0) THEN
                COUNT_IO = i
                ST = 1
            ELSE
                COUNT_IO = 0
            END IF        
        ENDDO

        N = (COUNT_IO - 1)

        offset = nb_bytes*N
        CALL MPI_FILE_WRITE_AT (fh,offset,0.0D0,1,MPI_REAL8,status,ierror) 

        time_end = MPI_WTIME()  

        PRINT *, 'My rank is', rank, 'Time for write =',time_end-time_start

        GO = 1    
        IF (rank .lt. nproc-1) THEN
            CALL MPI_SEND (GO,1, MPI_INTEGER ,rank+1,tag, MPI_COMM_WORLD ,ierror)
        ENDIF

        CALL MPI_BARRIER(MPI_COMM_WORLD,ierror)

    ENDDO

    CALL MPI_FILE_CLOSE (fh,ierror)

    CALL MPI_FINALIZE(ierror)

END PROGRAM MAIN

Solution

The main thing to realize here is that you can read in the data in one fell swoop (or, if memory is a problem, in chunks, but the chunks can be much larger than individual doubles!) and that you don't need to skip to the end of the file one double at a time.
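For instance, rather than probing the file one double at a time to find where it ends, you can ask MPI for the file size directly and convert it to a count of doubles. A minimal sketch of that idea (the same approach used by filesizedoubles in the full example below; error checking omitted):

    integer :: fh, dblsize, ierr
    integer(MPI_OFFSET_KIND) :: filesize, ndoubles

    ! total size of the already-opened file, in bytes
    call MPI_File_get_size(fh, filesize, ierr)
    ! size of one double precision value, in bytes
    call MPI_Type_size(MPI_DOUBLE_PRECISION, dblsize, ierr)
    ! number of doubles currently in the file; new data can be appended
    ! at byte offset ndoubles*dblsize without scanning the file first
    ndoubles = filesize / dblsize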

Here's an example that reads in the data in arbitrarily sized chunks, lets you process it as you wish, and then appends some data (in this case, every rank just adds 4 copies of its rank to the end of the file). For simplicity, two small Python scripts help with writing and displaying the test data.
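The session below assumes usepario.f90 has been built with your system's MPI Fortran compiler wrapper (the wrapper name and flags vary by installation), e.g.:

$ mpif90 -o usepario usepario.f90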

$ ./writedata.py 
$ ./readdata.py 
[  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.]

$ mpirun -np 3 ./usepario
 rank:   0 got data: 0.000...   24.000 
 rank:   1 got data: 0.000...   24.000
 rank:   2 got data: 0.000...   24.000

$ ./readdata.py 
[  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.   0.   0.   0.   0.   1.
   1.   1.   1.   2.   2.   2.   2.]

usepario.f90:

module pario

contains
    function openFile(filename)
        use mpi
        implicit none
        integer :: openFile, ierr
        character(len=*) :: filename
        integer(MPI_OFFSET_KIND) :: off = 0

        call MPI_File_open(MPI_COMM_WORLD, filename,  &
                           ior(MPI_MODE_RDWR, MPI_MODE_UNIQUE_OPEN),  &
                           MPI_INFO_NULL, openFile, ierr)
        call MPI_File_set_view(openFile, off,  &
                               MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION, &
                               "native", MPI_INFO_NULL, ierr)
    end function  openFile

    subroutine closeFile(fh)
        use mpi
        implicit none
        integer :: fh, ierr
        call MPI_File_close(fh, ierr)
    end subroutine closeFile

    function filesizedoubles(fh)
        use mpi
        implicit none
        integer :: fh, ierr
        integer(MPI_OFFSET_KIND) :: filesize, filesizedoubles
        integer :: dblsize

        call MPI_File_get_size(fh, filesize, ierr)
        call MPI_type_size(MPI_DOUBLE_PRECISION, dblsize, ierr)
        filesizedoubles = filesize / dblsize
    end function filesizedoubles

    subroutine getdatablock(fh, blocksize, datablock, datasize)
        use mpi
        implicit none
        integer :: fh, ierr
        integer :: blocksize, datasize
        double precision, dimension(:) :: datablock
        integer(MPI_OFFSET_KIND) :: fileloc
        integer, dimension(MPI_STATUS_SIZE) :: rstatus

        ! you could also experiment with MPI_File_read_all, the collective
        ! variant of this call (see the note after the program listing)

        call MPI_File_read(fh, datablock, blocksize, MPI_DOUBLE_PRECISION, &
                           rstatus, ierr)
        call MPI_Get_count(rstatus, MPI_DOUBLE_PRECISION, datasize, ierr)
    end subroutine getdatablock

    subroutine eachappend(fh, filesize, numitems, newdata)
        use mpi
        implicit none
        integer :: fh, numitems
        integer(MPI_OFFSET_KIND) :: filesize
        double precision, dimension(:) :: newdata
        integer :: rank, ierr
        integer(MPI_OFFSET_KIND) :: offset

        call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
        offset = filesize + rank*numitems
        call MPI_File_write_at_all(fh, offset, newdata, numitems, &
                                    MPI_DOUBLE_PRECISION,         &
                                    MPI_STATUS_IGNORE, ierr)

    end subroutine eachappend
end module pario


program usepario
    use mpi
    use pario
    implicit none

    integer :: fileh
    integer, parameter :: bufsize=1000, newsize=4
    integer(MPI_OFFSET_KIND) :: filesize
    double precision, allocatable, dimension(:) :: curdata, newdata
    integer :: datasize
    integer :: rank, ierr

    call MPI_Init(ierr)
    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)

    allocate(curdata(bufsize))

    fileh = openFile("data.dat")
    filesize = filesizedoubles(fileh)

    do
        call getdatablock(fileh, bufsize, curdata, datasize)
        !! 
        !! process data here
        !!
        !! do i=1,datasize
        !!  ...dostuff...
        !! end do
        !! 
        print '(1X,A,I3,A,F8.3,A,F8.3)', 'rank: ', rank, ' got data: ', curdata(1), '...', curdata(datasize)
        if (datasize /= bufsize) exit
    end do

    deallocate(curdata)

    allocate(newdata(newsize))
    newdata = rank

    call eachappend(fileh, filesize, newsize, newdata)
    call closeFile(fileh)

    call MPI_Finalize(ierr)
end program usepario
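
As the comment in getdatablock notes, the independent MPI_File_read call can be swapped for its collective counterpart, MPI_File_read_all, which often performs better on parallel file systems (not benchmarked here; treat it as something to experiment with). In this example the swap is safe because every rank reads the same data and so leaves the read loop after the same number of iterations, meaning all ranks make matching collective calls:

        ! collective variant: all ranks in the communicator must call this together
        call MPI_File_read_all(fh, datablock, blocksize, MPI_DOUBLE_PRECISION, &
                               rstatus, ierr)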

writedata.py:

#!/usr/bin/env python

import numpy

numdoubles = 25

data = numpy.arange(numdoubles,dtype=numpy.float64)
data.tofile("data.dat")

readdata.py:

#!/usr/bin/env python

import numpy

data = numpy.fromfile("data.dat",dtype=numpy.float64)
print(data)
Licensed under: CC-BY-SA with attribution