From f0ebea0a734178673f4d2d9fafe43a037a2dbb6f Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Wed, 12 May 2021 00:12:22 -0400
Subject: [PATCH] PG-PuReMD: ensure MPI pack/unpack CUDA kernels are run in the
 correct stream. Also bring the CUDA-aware MPI functions for the charge solver
 in line with their MPI counterparts.

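The device pack/unpack helpers previously launched their kernels in the
default stream and called cudaDeviceSynchronize( ) before the CUDA-aware
MPI calls touched the egress buffers. They now take a cudaStream_t,
launch in that stream, and call cudaStreamSynchronize( ) so only the
relevant stream is drained. Cuda_Dist additionally aborts if an incoming
message would overflow the receive buffer, and the element size is now
taken from the MPI type extent alone. A minimal sketch of the intended
pack-then-send ordering follows (pack_and_send is a hypothetical helper
used only to illustrate the pattern, not code from this patch):

    static void pack_and_send( real const * const src,
            mpi_out_data * const out_buf, MPI_Datatype type, int dest,
            int tag, MPI_Comm comm, MPI_Request * const req, cudaStream_t s )
    {
        int blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
            + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);

        /* launch the packer in the caller-supplied stream,
         * not the default stream */
        k_real_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
            ( src, out_buf->index, (real *) out_buf->out_atoms, out_buf->cnt );
        cudaCheckError( );

        /* block on this stream only, so the egress buffer is fully
         * packed before the CUDA-aware MPI_Isend reads it */
        cudaStreamSynchronize( s );

        MPI_Isend( out_buf->out_atoms, out_buf->cnt, type, dest, tag,
                comm, req );
    }
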
---
 PG-PuReMD/src/cuda/cuda_basic_comm.cu   | 125 ++++++++++++++----------
 PG-PuReMD/src/cuda/cuda_basic_comm.h    |   6 +-
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu |   8 +-
 3 files changed, 78 insertions(+), 61 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_basic_comm.cu b/PG-PuReMD/src/cuda/cuda_basic_comm.cu
index a65741d6..d3e2fa8a 100644
--- a/PG-PuReMD/src/cuda/cuda_basic_comm.cu
+++ b/PG-PuReMD/src/cuda/cuda_basic_comm.cu
@@ -8,9 +8,10 @@
 #include "../vector.h"
 
 
-typedef void (*dist_packer)( void const * const, mpi_out_data * const );
-typedef void (*coll_unpacker)( void const * const, void * const,
-        mpi_out_data * const );
+typedef void (*cuda_dist_packer)( void const * const, mpi_out_data * const,
+        cudaStream_t );
+typedef void (*cuda_coll_unpacker)( void const * const, void * const,
+        mpi_out_data * const, cudaStream_t );
 
 
 /* copy integer entries from buffer to MPI egress buffer
@@ -158,7 +159,7 @@ CUDA_GLOBAL void k_real_unpacker( real const * const in, int const * const index
         return;
     }
 
-    buf[ index[i] ] = in[i];
+    buf[ index[i] ] += in[i];
 }
 
 
@@ -211,127 +212,131 @@ CUDA_GLOBAL void k_rvec2_unpacker( rvec2 const * const in, int const * const ind
 }
 
 
-static void int_packer( void const * const dummy, mpi_out_data * const out_buf )
+static void int_packer( void const * const dummy, mpi_out_data * const out_buf,
+        cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_int_packer <<< blocks, DEF_BLOCK_SIZE >>>
+    k_int_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (int *) dummy, out_buf->index, (int *) out_buf->out_atoms, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
-static void real_packer( void const * const dummy, mpi_out_data * const out_buf )
+static void real_packer( void const * const dummy, mpi_out_data * const out_buf,
+        cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_real_packer <<< blocks, DEF_BLOCK_SIZE >>>
+    k_real_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (real *) dummy, out_buf->index, (real *) out_buf->out_atoms, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
-static void rvec_packer( void const * const dummy, mpi_out_data * const out_buf )
+static void rvec_packer( void const * const dummy, mpi_out_data * const out_buf,
+        cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_rvec_packer <<< blocks, DEF_BLOCK_SIZE >>>
+    k_rvec_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (rvec *) dummy, out_buf->index, (rvec *) out_buf->out_atoms, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
-static void rvec2_packer( void const * const dummy, mpi_out_data * const out_buf )
+static void rvec2_packer( void const * const dummy, mpi_out_data * const out_buf,
+        cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_rvec2_packer <<< blocks, DEF_BLOCK_SIZE >>>
+    k_rvec2_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (rvec2 *) dummy, out_buf->index, (rvec2 *) out_buf->out_atoms, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
 static void int_unpacker( void const * const dummy_in, void * const dummy_buf,
-        mpi_out_data * const out_buf )
+        mpi_out_data * const out_buf, cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_int_unpacker <<< blocks, DEF_BLOCK_SIZE >>>
+    k_int_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (int *) dummy_in, out_buf->index, (int *) dummy_buf, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
 static void real_unpacker( void const * const dummy_in, void * const dummy_buf,
-        mpi_out_data * const out_buf )
+        mpi_out_data * const out_buf, cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_real_unpacker <<< blocks, DEF_BLOCK_SIZE >>>
+    k_real_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (real *) dummy_in, out_buf->index, (real *) dummy_buf, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
 static void rvec_unpacker( void const * const dummy_in, void * const dummy_buf,
-        mpi_out_data * const out_buf )
+        mpi_out_data * const out_buf, cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_rvec_unpacker <<< blocks, DEF_BLOCK_SIZE >>>
+    k_rvec_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (rvec *) dummy_in, out_buf->index, (rvec *) dummy_buf, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
 static void rvec2_unpacker( void const * const dummy_in, void * const dummy_buf,
-        mpi_out_data * const out_buf )
+        mpi_out_data * const out_buf, cudaStream_t s )
 {
     int blocks;
 
     blocks = (out_buf->cnt / DEF_BLOCK_SIZE)
         + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_rvec2_unpacker <<< blocks, DEF_BLOCK_SIZE >>>
+    k_rvec2_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( (rvec2 *) dummy_in, out_buf->index, (rvec2 *) dummy_buf, out_buf->cnt );
-    /* explicitly wait for kernel compilation as MPI calls need buffer prepared */
-    cudaDeviceSynchronize( );
     cudaCheckError( );
+
+    cudaStreamSynchronize( s );
 }
 
 
@@ -368,9 +373,9 @@ static void * Get_Buffer_Offset( void const * const buffer,
 }
 
 
-static dist_packer Get_Packer( int type )
+static cuda_dist_packer Get_Packer( int type )
 {
-    dist_packer func_ptr;
+    cuda_dist_packer func_ptr;
 
     switch ( type )
     {
@@ -400,9 +405,9 @@ static dist_packer Get_Packer( int type )
 }
 
 
-static coll_unpacker Get_Unpacker( int type )
+static cuda_coll_unpacker Get_Unpacker( int type )
 {
-    coll_unpacker func_ptr;
+    cuda_coll_unpacker func_ptr;
 
     switch ( type )
     {
@@ -433,7 +438,7 @@ static coll_unpacker Get_Unpacker( int type )
 
 
 void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data,
-        void const * const buf, int buf_type, MPI_Datatype type )
+        void const * const buf, int buf_type, MPI_Datatype type, cudaStream_t s )
 {
     int d, cnt1, cnt2, ret;
     mpi_out_data *out_bufs;
@@ -441,13 +446,14 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
     const neighbor_proc *nbr1, *nbr2;
-    dist_packer pack;
+    cuda_dist_packer pack;
     MPI_Aint extent, lower_bound;
     size_t type_size;
 
     ret = MPI_Type_get_extent( type, &lower_bound, &extent );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
-    type_size = MPI_Aint_add( lower_bound, extent );
+//    type_size = MPI_Aint_add( lower_bound, extent );
+    type_size = extent;
 
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->d_out_buffers;
@@ -466,7 +472,7 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data
                 &out_bufs[2 * d].index_size,
                 sizeof(int) * out_bufs[2 * d].cnt, __FILE__, __LINE__ );
 
-        pack( buf, &out_bufs[2 * d] );
+        pack( buf, &out_bufs[2 * d], s );
 
         ret = MPI_Isend( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt,
                 type, nbr1->rank, 2 * d, comm, &req1 );
@@ -479,7 +485,7 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data
                 &out_bufs[2 * d + 1].index_size,
                 sizeof(int) * out_bufs[2 * d + 1].cnt, __FILE__, __LINE__ );
 
-        pack( buf, &out_bufs[2 * d + 1] );
+        pack( buf, &out_bufs[2 * d + 1], s );
 
         ret = MPI_Isend( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt,
                 type, nbr2->rank, 2 * d + 1, comm, &req2 );
@@ -496,6 +502,11 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data
             fprintf( stderr, "[ERROR] MPI_Get_count returned MPI_UNDEFINED\n" );
             MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
         }
+        else if ( cnt1 + nbr1->atoms_str > system->total_cap )
+        {
+            fprintf( stderr, "[ERROR] Dist: not enough space in recv buffer for nbr1 (dim = %d)\n", d );
+            MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
+        }
 
         ret = MPI_Recv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ),
                 cnt1, type, nbr1->rank, 2 * d + 1, comm, MPI_STATUS_IGNORE );
@@ -511,6 +522,11 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data
             fprintf( stderr, "[ERROR] MPI_Get_count returned MPI_UNDEFINED\n" );
             MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
         }
+        else if ( cnt2 + nbr2->atoms_str > system->total_cap )
+        {
+            fprintf( stderr, "[ERROR] Dist: not enough space in recv buffer for nbr2 (dim = %d)\n", d );
+            MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
+        }
 
         ret = MPI_Recv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ),
                 cnt2, type, nbr2->rank, 2 * d, comm, MPI_STATUS_IGNORE );
@@ -525,7 +541,7 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data
 
 
 void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_data,
-        void const * const buf, int buf_type, MPI_Datatype type )
+        void const * const buf, int buf_type, MPI_Datatype type, cudaStream_t s )
 {
     int d, cnt1, cnt2, ret;
     mpi_out_data *out_bufs;
@@ -533,7 +549,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
     const neighbor_proc *nbr1, *nbr2;
-    dist_packer pack;
+    cuda_dist_packer pack;
     MPI_Aint extent, lower_bound;
     size_t type_size;
 
@@ -558,7 +574,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d
                 &out_bufs[2 * d].index_size,
                 sizeof(int) * out_bufs[2 * d].cnt, __FILE__, __LINE__ );
 
-        pack( buf, &out_bufs[2 * d] );
+        pack( buf, &out_bufs[2 * d], s );
 
         ret = MPI_Isend( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt,
                 type, nbr1->rank, 2 * d, comm, &req1 );
@@ -571,7 +587,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d
                 &out_bufs[2 * d + 1].index_size,
                 sizeof(int) * out_bufs[2 * d + 1].cnt, __FILE__, __LINE__ );
 
-        pack( buf, &out_bufs[2 * d + 1] );
+        pack( buf, &out_bufs[2 * d + 1], s );
 
         ret = MPI_Isend( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt,
                 type, nbr2->rank, 2 * d + 1, comm, &req2 );
@@ -617,7 +633,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d
 
 
 void Cuda_Coll( reax_system const * const system, mpi_datatypes * const mpi_data,
-        void * const buf, int buf_type, MPI_Datatype type )
+        void * const buf, int buf_type, MPI_Datatype type, cudaStream_t s )
 {   
     int d, cnt1, cnt2, ret;
     mpi_out_data *out_bufs;
@@ -625,13 +641,14 @@ void Cuda_Coll( reax_system const * const system, mpi_datatypes * const mpi_data
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
     const neighbor_proc *nbr1, *nbr2;
-    coll_unpacker unpack;
+    cuda_coll_unpacker unpack;
     MPI_Aint extent, lower_bound;
     size_t type_size;
 
     ret = MPI_Type_get_extent( type, &lower_bound, &extent );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
-    type_size = MPI_Aint_add( lower_bound, extent );
+//    type_size = MPI_Aint_add( lower_bound, extent );
+    type_size = extent;
 
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->d_out_buffers;
@@ -693,7 +710,7 @@ void Cuda_Coll( reax_system const * const system, mpi_datatypes * const mpi_data
         ret = MPI_Wait( &req2, MPI_STATUS_IGNORE );
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-        unpack( mpi_data->d_in1_buffer, buf, &out_bufs[2 * d] );
-        unpack( mpi_data->d_in2_buffer, buf, &out_bufs[2 * d + 1] );
+        unpack( mpi_data->d_in1_buffer, buf, &out_bufs[2 * d], s );
+        unpack( mpi_data->d_in2_buffer, buf, &out_bufs[2 * d + 1], s );
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_basic_comm.h b/PG-PuReMD/src/cuda/cuda_basic_comm.h
index 1428f926..9bac866b 100644
--- a/PG-PuReMD/src/cuda/cuda_basic_comm.h
+++ b/PG-PuReMD/src/cuda/cuda_basic_comm.h
@@ -35,13 +35,13 @@ enum pointer_type
 
 
 void Cuda_Dist( reax_system const * const, mpi_datatypes * const,
-        void const * const, int, MPI_Datatype );
+        void const * const, int, MPI_Datatype, cudaStream_t );
 
 void Cuda_Dist_FS( reax_system const * const, mpi_datatypes * const,
-        void const * const, int, MPI_Datatype );
+        void const * const, int, MPI_Datatype, cudaStream_t );
 
 void Cuda_Coll( reax_system const * const, mpi_datatypes * const,
-        void * const , int, MPI_Datatype );
+        void * const , int, MPI_Datatype, cudaStream_t );
 
 
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index bd2c7f46..4cc442ee 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -594,7 +594,7 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system,
 
 #if defined(CUDA_DEVICE_PACK)
     /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */
-    Cuda_Dist( system, mpi_data, x, buf_type, mpi_type );
+    Cuda_Dist( system, mpi_data, x, buf_type, mpi_type, control->streams[4] );
 #else
     check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
             sizeof(rvec2) * n, TRUE, SAFE_ZONE,
@@ -692,7 +692,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
     if ( mat_format == SYM_HALF_MATRIX )
     {
 #if defined(CUDA_DEVICE_PACK)
-        Cuda_Coll( system, mpi_data, b, buf_type, mpi_type );
+        Cuda_Coll( system, mpi_data, b, buf_type, mpi_type, control->streams[4] );
 #else
         check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
                 sizeof(rvec2) * n1, TRUE, SAFE_ZONE,
@@ -777,7 +777,7 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system,
 
 #if defined(CUDA_DEVICE_PACK)
     /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */
-    Cuda_Dist( system, mpi_data, x, buf_type, mpi_type );
+    Cuda_Dist( system, mpi_data, x, buf_type, mpi_type, control->streams[4] );
 #else
     check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
             sizeof(real) * n, TRUE, SAFE_ZONE,
@@ -873,7 +873,7 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
     if ( mat_format == SYM_HALF_MATRIX )
     {
 #if defined(CUDA_DEVICE_PACK)
-        Cuda_Coll( system, mpi_data, b, buf_type, mpi_type );
+        Cuda_Coll( system, mpi_data, b, buf_type, mpi_type, control->streams[4] );
 #else
         check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
                 sizeof(real) * n1, TRUE, SAFE_ZONE,
-- 
GitLab