From f0ebea0a734178673f4d2d9fafe43a037a2dbb6f Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Wed, 12 May 2021 00:12:22 -0400 Subject: [PATCH] PG-PuReMD: ensure MPI pack/unpack CUDA kernels are run in the correct stream. Other changes to bring the CUDA-aware MPI functions for the charge solver in line with their MPI counterparts. --- PG-PuReMD/src/cuda/cuda_basic_comm.cu | 125 ++++++++++++++---------- PG-PuReMD/src/cuda/cuda_basic_comm.h | 6 +- PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu | 8 +- 3 files changed, 78 insertions(+), 61 deletions(-) diff --git a/PG-PuReMD/src/cuda/cuda_basic_comm.cu b/PG-PuReMD/src/cuda/cuda_basic_comm.cu index a65741d6..d3e2fa8a 100644 --- a/PG-PuReMD/src/cuda/cuda_basic_comm.cu +++ b/PG-PuReMD/src/cuda/cuda_basic_comm.cu @@ -8,9 +8,10 @@ #include "../vector.h" -typedef void (*dist_packer)( void const * const, mpi_out_data * const ); -typedef void (*coll_unpacker)( void const * const, void * const, - mpi_out_data * const ); +typedef void (*cuda_dist_packer)( void const * const, mpi_out_data * const, + cudaStream_t ); +typedef void (*cuda_coll_unpacker)( void const * const, void * const, + mpi_out_data * const, cudaStream_t ); /* copy integer entries from buffer to MPI egress buffer @@ -158,7 +159,7 @@ CUDA_GLOBAL void k_real_unpacker( real const * const in, int const * const index return; } - buf[ index[i] ] = in[i]; + buf[ index[i] ] += in[i]; } @@ -211,127 +212,131 @@ CUDA_GLOBAL void k_rvec2_unpacker( rvec2 const * const in, int const * const ind } -static void int_packer( void const * const dummy, mpi_out_data * const out_buf ) +static void int_packer( void const * const dummy, mpi_out_data * const out_buf, + cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_int_packer <<< blocks, DEF_BLOCK_SIZE >>> + k_int_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (int *) dummy, out_buf->index, (int *) out_buf->out_atoms, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } -static void real_packer( void const * const dummy, mpi_out_data * const out_buf ) +static void real_packer( void const * const dummy, mpi_out_data * const out_buf, + cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_real_packer <<< blocks, DEF_BLOCK_SIZE >>> + k_real_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (real *) dummy, out_buf->index, (real *) out_buf->out_atoms, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } -static void rvec_packer( void const * const dummy, mpi_out_data * const out_buf ) +static void rvec_packer( void const * const dummy, mpi_out_data * const out_buf, + cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_rvec_packer <<< blocks, DEF_BLOCK_SIZE >>> + k_rvec_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (rvec *) dummy, out_buf->index, (rvec *) out_buf->out_atoms, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } -static void rvec2_packer( void const * const dummy, mpi_out_data * const out_buf ) +static void rvec2_packer( void const * const dummy, mpi_out_data * const out_buf, + cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_rvec2_packer <<< blocks, DEF_BLOCK_SIZE >>> + k_rvec2_packer <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (rvec2 *) dummy, out_buf->index, (rvec2 *) out_buf->out_atoms, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } static void int_unpacker( void const * const dummy_in, void * const dummy_buf, - mpi_out_data * const out_buf ) + mpi_out_data * const out_buf, cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_int_unpacker <<< blocks, DEF_BLOCK_SIZE >>> + k_int_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (int *) dummy_in, out_buf->index, (int *) dummy_buf, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } static void real_unpacker( void const * const dummy_in, void * const dummy_buf, - mpi_out_data * const out_buf ) + mpi_out_data * const out_buf, cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_real_unpacker <<< blocks, DEF_BLOCK_SIZE >>> + k_real_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (real *) dummy_in, out_buf->index, (real *) dummy_buf, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } static void rvec_unpacker( void const * const dummy_in, void * const dummy_buf, - mpi_out_data * const out_buf ) + mpi_out_data * const out_buf, cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_rvec_unpacker <<< blocks, DEF_BLOCK_SIZE >>> + k_rvec_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (rvec *) dummy_in, out_buf->index, (rvec *) dummy_buf, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } static void rvec2_unpacker( void const * const dummy_in, void * const dummy_buf, - mpi_out_data * const out_buf ) + mpi_out_data * const out_buf, cudaStream_t s ) { int blocks; blocks = (out_buf->cnt / DEF_BLOCK_SIZE) + ((out_buf->cnt % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_rvec2_unpacker <<< blocks, DEF_BLOCK_SIZE >>> + k_rvec2_unpacker <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( (rvec2 *) dummy_in, out_buf->index, (rvec2 *) dummy_buf, out_buf->cnt ); - /* explicitly wait for kernel compilation as MPI calls need buffer prepared */ - cudaDeviceSynchronize( ); cudaCheckError( ); + + cudaStreamSynchronize( s ); } @@ -368,9 +373,9 @@ static void * Get_Buffer_Offset( void const * const buffer, } -static dist_packer Get_Packer( int type ) +static cuda_dist_packer Get_Packer( int type ) { - dist_packer func_ptr; + cuda_dist_packer func_ptr; switch ( type ) { @@ -400,9 +405,9 @@ static dist_packer Get_Packer( int type ) } -static coll_unpacker Get_Unpacker( int type ) +static cuda_coll_unpacker Get_Unpacker( int type ) { - coll_unpacker func_ptr; + cuda_coll_unpacker func_ptr; switch ( type ) { @@ -433,7 +438,7 @@ static coll_unpacker Get_Unpacker( int type ) void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data, - void const * const buf, int buf_type, MPI_Datatype type ) + void const * const buf, int buf_type, MPI_Datatype type, cudaStream_t s ) { int d, cnt1, cnt2, ret; mpi_out_data *out_bufs; @@ -441,13 +446,14 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data MPI_Request req1, req2; MPI_Status stat1, stat2; const neighbor_proc *nbr1, *nbr2; - dist_packer pack; + cuda_dist_packer pack; MPI_Aint extent, lower_bound; size_t type_size; ret = MPI_Type_get_extent( type, &lower_bound, &extent ); Check_MPI_Error( ret, __FILE__, __LINE__ ); - type_size = MPI_Aint_add( lower_bound, extent ); +// type_size = MPI_Aint_add( lower_bound, extent ); + type_size = extent; comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->d_out_buffers; @@ -466,7 +472,7 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data &out_bufs[2 * d].index_size, sizeof(int) * out_bufs[2 * d].cnt, __FILE__, __LINE__ ); - pack( buf, &out_bufs[2 * d] ); + pack( buf, &out_bufs[2 * d], s ); ret = MPI_Isend( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type, nbr1->rank, 2 * d, comm, &req1 ); @@ -479,7 +485,7 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data &out_bufs[2 * d + 1].index_size, sizeof(int) * out_bufs[2 * d + 1].cnt, __FILE__, __LINE__ ); - pack( buf, &out_bufs[2 * d + 1] ); + pack( buf, &out_bufs[2 * d + 1], s ); ret = MPI_Isend( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type, nbr2->rank, 2 * d + 1, comm, &req2 ); @@ -496,6 +502,11 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data fprintf( stderr, "[ERROR] MPI_Get_count returned MPI_UNDEFINED\n" ); MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); } + else if ( cnt1 + nbr1->atoms_str > system->total_cap ) + { + fprintf( stderr, "[ERROR] Dist: not enough space in recv buffer for nbr1 (dim = %d)\n", d ); + MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); + } ret = MPI_Recv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ), cnt1, type, nbr1->rank, 2 * d + 1, comm, MPI_STATUS_IGNORE ); @@ -511,6 +522,11 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data fprintf( stderr, "[ERROR] MPI_Get_count returned MPI_UNDEFINED\n" ); MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); } + else if ( cnt2 + nbr2->atoms_str > system->total_cap ) + { + fprintf( stderr, "[ERROR] Dist: not enough space in recv buffer for nbr2 (dim = %d)\n", d ); + MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); + } ret = MPI_Recv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ), cnt2, type, 
nbr2->rank, 2 * d, comm, MPI_STATUS_IGNORE ); @@ -525,7 +541,7 @@ void Cuda_Dist( reax_system const * const system, mpi_datatypes * const mpi_data void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_data, - void const * const buf, int buf_type, MPI_Datatype type ) + void const * const buf, int buf_type, MPI_Datatype type, cudaStream_t s ) { int d, cnt1, cnt2, ret; mpi_out_data *out_bufs; @@ -533,7 +549,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d MPI_Request req1, req2; MPI_Status stat1, stat2; const neighbor_proc *nbr1, *nbr2; - dist_packer pack; + cuda_dist_packer pack; MPI_Aint extent, lower_bound; size_t type_size; @@ -558,7 +574,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d &out_bufs[2 * d].index_size, sizeof(int) * out_bufs[2 * d].cnt, __FILE__, __LINE__ ); - pack( buf, &out_bufs[2 * d] ); + pack( buf, &out_bufs[2 * d], s ); ret = MPI_Isend( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type, nbr1->rank, 2 * d, comm, &req1 ); @@ -571,7 +587,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d &out_bufs[2 * d + 1].index_size, sizeof(int) * out_bufs[2 * d + 1].cnt, __FILE__, __LINE__ ); - pack( buf, &out_bufs[2 * d + 1] ); + pack( buf, &out_bufs[2 * d + 1], s ); ret = MPI_Isend( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type, nbr2->rank, 2 * d + 1, comm, &req2 ); @@ -617,7 +633,7 @@ void Cuda_Dist_FS( reax_system const * const system, mpi_datatypes * const mpi_d void Cuda_Coll( reax_system const * const system, mpi_datatypes * const mpi_data, - void * const buf, int buf_type, MPI_Datatype type ) + void * const buf, int buf_type, MPI_Datatype type, cudaStream_t s ) { int d, cnt1, cnt2, ret; mpi_out_data *out_bufs; @@ -625,13 +641,14 @@ void Cuda_Coll( reax_system const * const system, mpi_datatypes * const mpi_data MPI_Request req1, req2; MPI_Status stat1, stat2; const neighbor_proc *nbr1, *nbr2; - coll_unpacker unpack; + cuda_coll_unpacker unpack; MPI_Aint extent, lower_bound; size_t type_size; ret = MPI_Type_get_extent( type, &lower_bound, &extent ); Check_MPI_Error( ret, __FILE__, __LINE__ ); - type_size = MPI_Aint_add( lower_bound, extent ); +// type_size = MPI_Aint_add( lower_bound, extent ); + type_size = extent; comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->d_out_buffers; @@ -693,7 +710,7 @@ void Cuda_Coll( reax_system const * const system, mpi_datatypes * const mpi_data ret = MPI_Wait( &req2, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); - unpack( mpi_data->d_in1_buffer, buf, &out_bufs[2 * d] ); - unpack( mpi_data->d_in2_buffer, buf, &out_bufs[2 * d + 1] ); + unpack( mpi_data->d_in1_buffer, buf, &out_bufs[2 * d], s ); + unpack( mpi_data->d_in2_buffer, buf, &out_bufs[2 * d + 1], s ); } } diff --git a/PG-PuReMD/src/cuda/cuda_basic_comm.h b/PG-PuReMD/src/cuda/cuda_basic_comm.h index 1428f926..9bac866b 100644 --- a/PG-PuReMD/src/cuda/cuda_basic_comm.h +++ b/PG-PuReMD/src/cuda/cuda_basic_comm.h @@ -35,13 +35,13 @@ enum pointer_type void Cuda_Dist( reax_system const * const, mpi_datatypes * const, - void const * const, int, MPI_Datatype ); + void const * const, int, MPI_Datatype, cudaStream_t ); void Cuda_Dist_FS( reax_system const * const, mpi_datatypes * const, - void const * const, int, MPI_Datatype ); + void const * const, int, MPI_Datatype, cudaStream_t ); void Cuda_Coll( reax_system const * const, mpi_datatypes * const, - void * const , int, MPI_Datatype ); + void * const , int, MPI_Datatype, 
cudaStream_t ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu index bd2c7f46..4cc442ee 100644 --- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu @@ -594,7 +594,7 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system, #if defined(CUDA_DEVICE_PACK) /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ - Cuda_Dist( system, mpi_data, x, buf_type, mpi_type ); + Cuda_Dist( system, mpi_data, x, buf_type, mpi_type, control->streams[4] ); #else check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size, sizeof(rvec2) * n, TRUE, SAFE_ZONE, @@ -692,7 +692,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system, if ( mat_format == SYM_HALF_MATRIX ) { #if defined(CUDA_DEVICE_PACK) - Cuda_Coll( system, mpi_data, b, buf_type, mpi_type ); + Cuda_Coll( system, mpi_data, b, buf_type, mpi_type, control->streams[4] ); #else check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size, sizeof(rvec2) * n1, TRUE, SAFE_ZONE, @@ -777,7 +777,7 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system, #if defined(CUDA_DEVICE_PACK) /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ - Cuda_Dist( system, mpi_data, x, buf_type, mpi_type ); + Cuda_Dist( system, mpi_data, x, buf_type, mpi_type, control->streams[4] ); #else check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size, sizeof(real) * n, TRUE, SAFE_ZONE, @@ -873,7 +873,7 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system, if ( mat_format == SYM_HALF_MATRIX ) { #if defined(CUDA_DEVICE_PACK) - Cuda_Coll( system, mpi_data, b, buf_type, mpi_type ); + Cuda_Coll( system, mpi_data, b, buf_type, mpi_type, control->streams[4] ); #else check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size, sizeof(real) * n1, TRUE, SAFE_ZONE, -- GitLab
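
Notes on the changes above:

1. Stream-scoped synchronization. The pack/unpack helpers now take the
caller's cudaStream_t, launch their kernels into that stream, and block on
cudaStreamSynchronize( s ) rather than cudaDeviceSynchronize( ). The wait
itself cannot be removed -- MPI may begin reading the egress buffer as soon
as MPI_Isend returns, so the pack kernel must have finished by then -- but
scoping the wait to one stream leaves work queued in the other streams
running. A reduced sketch of the pattern (the kernel and helper names here
are illustrative, not taken from the PG-PuReMD sources):

#include <cuda_runtime.h>
#include <mpi.h>

/* illustrative stand-in for the k_*_packer kernels: gather scattered
 * entries into a contiguous egress buffer */
__global__ void k_pack( double const * const in, int const * const index,
        double * const out, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= n )
    {
        return;
    }

    out[i] = in[ index[i] ];
}

/* pack on stream s, then wait on s alone before the CUDA-aware MPI send */
static void pack_and_isend( double const * const d_in, int const * const d_index,
        double * const d_out, int cnt, int dest, int tag, MPI_Comm comm,
        cudaStream_t s, MPI_Request * const req )
{
    int blocks;

    blocks = (cnt / 256) + ((cnt % 256 == 0) ? 0 : 1);

    k_pack <<< blocks, 256, 0, s >>> ( d_in, d_index, d_out, cnt );

    /* MPI_Isend may read d_out immediately, so the pack must be complete;
     * kernels queued in other streams continue during this wait */
    cudaStreamSynchronize( s );

    MPI_Isend( d_out, cnt, MPI_DOUBLE, dest, tag, comm, req );
}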
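
2. Accumulating unpacker. The '=' to '+=' change in k_real_unpacker is a
correctness fix, not a signature change: Coll gathers values computed on
neighbor processes for boundary atoms, so the incoming entries are
evidently partial results that must be summed onto the local ones, and
plain assignment discarded the local contribution. In reduced form (again
with an illustrative name):

/* accumulate received partial results onto local entries; assignment
 * here -- the pre-patch behavior -- would overwrite the locally computed
 * value instead of summing with it */
__global__ void k_unpack_accum( double const * const in,
        int const * const index, double * const buf, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= n )
    {
        return;
    }

    buf[ index[i] ] += in[i];
}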
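
3. Receive-side bounds checks. The new guards in Cuda_Dist verify, before
each MPI_Recv, that the counted message fits between the ghost-region
offset (nbr->atoms_str) and the buffer capacity (system->total_cap),
aborting with a diagnostic instead of letting MPI_Recv write past the end
of device memory. A reduced sketch of that guard, assuming the count comes
from a preceding probe (which the use of MPI_Get_count on a status object
suggests) and with illustrative names throughout:

#include <mpi.h>
#include <stdio.h>

/* receive cnt elements at offset str into a device buffer with room for
 * cap elements, refusing oversized messages (requires CUDA-aware MPI) */
static void recv_with_guard( double * const d_buf, int str, int cap,
        int src, int tag, MPI_Comm comm )
{
    MPI_Status stat;
    int cnt;

    MPI_Probe( src, tag, comm, &stat );
    MPI_Get_count( &stat, MPI_DOUBLE, &cnt );

    if ( cnt == MPI_UNDEFINED || cnt + str > cap )
    {
        fprintf( stderr, "[ERROR] recv buffer too small (cnt = %d, offset = %d, cap = %d)\n",
                cnt, str, cap );
        MPI_Abort( MPI_COMM_WORLD, -1 );
    }

    MPI_Recv( &d_buf[str], cnt, MPI_DOUBLE, src, tag, comm,
            MPI_STATUS_IGNORE );
}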