From e9e6455b8a3a21df87ce96105ab3bf31f31b282d Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Wed, 16 Sep 2020 15:13:21 -0400 Subject: [PATCH] PG-PuReMD: switch timers to use CUDA events and add additional timers for kernels to match MPI code. Other code cleanup. --- PG-PuReMD/src/box.c | 10 +- PG-PuReMD/src/cuda/cuda_environment.cu | 12 +-- PG-PuReMD/src/cuda/cuda_forces.cu | 135 ++++++++++++++++++++++--- PG-PuReMD/src/cuda/cuda_init_md.cu | 7 +- PG-PuReMD/src/cuda/cuda_neighbors.cu | 29 +++++- PG-PuReMD/src/io_tools.c | 75 ++++++++------ PG-PuReMD/src/puremd.c | 2 +- PG-PuReMD/src/restart.c | 4 +- 8 files changed, 208 insertions(+), 66 deletions(-) diff --git a/PG-PuReMD/src/box.c b/PG-PuReMD/src/box.c index effc6a90..a223eebe 100644 --- a/PG-PuReMD/src/box.c +++ b/PG-PuReMD/src/box.c @@ -287,17 +287,21 @@ void Setup_Boundary_Cutoffs( reax_system * const system, control_params * const void Setup_Environment( reax_system * const system, control_params * const control, mpi_datatypes * const mpi_data ) { + int ret; ivec periodic = {1, 1, 1}; #if defined(DEBUG_FOCUS) char temp[100]; #endif /* initialize communicator - 3D mesh with wrap-arounds = 3D torus */ - MPI_Cart_create( MPI_COMM_WORLD, 3, control->procs_by_dim, periodic, 1, + ret = MPI_Cart_create( MPI_COMM_WORLD, 3, control->procs_by_dim, periodic, 1, &mpi_data->comm_mesh3D ); - MPI_Comm_rank( mpi_data->comm_mesh3D, &system->my_rank ); - MPI_Cart_coords( mpi_data->comm_mesh3D, system->my_rank, 3, + Check_MPI_Error( ret, __FILE__, __LINE__ ); + ret = MPI_Comm_rank( mpi_data->comm_mesh3D, &system->my_rank ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + ret = MPI_Cart_coords( mpi_data->comm_mesh3D, system->my_rank, 3, system->my_coords ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); Setup_Boundary_Cutoffs( system, control ); Setup_My_Box( system, control ); diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu index 0445189f..1e11bc32 100644 --- a/PG-PuReMD/src/cuda/cuda_environment.cu +++ b/PG-PuReMD/src/cuda/cuda_environment.cu @@ -20,11 +20,11 @@ extern "C" void Cuda_Setup_Environment( int rank, int nprocs, int gpus_per_node { int deviceCount; - cudaError_t flag; + cudaError_t ret; - flag = cudaGetDeviceCount( &deviceCount ); + ret = cudaGetDeviceCount( &deviceCount ); - if ( flag != cudaSuccess || deviceCount < 1 ) + if ( ret != cudaSuccess || deviceCount < 1 ) { fprintf( stderr, "[ERROR] no CUDA capable device(s) found. Terminating...\n" ); exit( CANNOT_INITIALIZE ); @@ -38,15 +38,15 @@ extern "C" void Cuda_Setup_Environment( int rank, int nprocs, int gpus_per_node /* assign the GPU for each process */ //TODO: handle condition where # CPU procs > # GPUs - flag = cudaSetDevice( rank % gpus_per_node ); + ret = cudaSetDevice( rank % gpus_per_node ); - if ( flag == cudaErrorInvalidDevice ) + if ( ret == cudaErrorInvalidDevice ) { fprintf( stderr, "[ERROR] invalid CUDA device ID set (%d). Terminating...\n", rank % gpus_per_node ); exit( CANNOT_INITIALIZE ); } - else if ( flag == cudaErrorDeviceAlreadyInUse ) + else if ( ret == cudaErrorDeviceAlreadyInUse ) { fprintf( stderr, "[ERROR] CUDA device with specified ID already in use (%d). Terminating...\n", rank % gpus_per_node ); diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 6e18ff81..3bdb4d3e 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -1474,9 +1474,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, int renbr, blocks, ret, realloc_bonds, realloc_hbonds, realloc_cm; static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE; #if defined(LOG_PERFORMANCE) - double time; + float time_elapsed; + cudaEvent_t time_event[4]; - time = Get_Time( ); + for ( int i = 0; i < 4; ++i ) + { + cudaEventCreate( &time_event[i] ); + } #endif renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE; @@ -1498,6 +1502,10 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), "Cuda_Init_Forces::d_realloc_cm_entries" ); +#if defined(LOG_PERFORMANCE) + cudaEventRecord( time_event[0] ); +#endif + if ( renbr == FALSE && dist_done == FALSE ) { k_init_distance <<< control->blocks_n, control->block_size_n >>> @@ -1508,7 +1516,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.init_dist ); + cudaEventRecord( time_event[1] ); #endif blocks = workspace->d_workspace->H.n_max / DEF_BLOCK_SIZE @@ -1561,7 +1569,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.init_cm ); + cudaEventRecord( time_event[2] ); #endif if ( bonds_done == FALSE ) @@ -1578,7 +1586,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.init_bond ); + cudaEventRecord( time_event[3] ); #endif /* check reallocation flags on device */ @@ -1589,6 +1597,42 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" ); +#if defined(LOG_PERFORMANCE) + if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[0] ); + } + + if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[1] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); + data->timing.init_dist += (real) (time_elapsed / 1000.0); + + if ( cudaEventQuery( time_event[2] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[2] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[1], time_event[2] ); + data->timing.init_cm += (real) (time_elapsed / 1000.0); + + if ( cudaEventQuery( time_event[3] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[3] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[2], time_event[3] ); + data->timing.init_bond += (real) (time_elapsed / 1000.0); + + for ( int i = 0; i < 4; ++i ) + { + cudaEventDestroy( time_event[i] ); + } +#endif + ret = (realloc_cm == FALSE && realloc_bonds == FALSE && realloc_hbonds == FALSE) ? SUCCESS : FAILURE; @@ -1996,9 +2040,13 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control int charge_flag, ret; static int init_forces_done = FALSE; #if defined(LOG_PERFORMANCE) - real time; - - time = Get_Time( ); + float time_elapsed; + cudaEvent_t time_event[6]; + + for ( int i = 0; i < 6; ++i ) + { + cudaEventCreate( &time_event[i] ); + } #endif ret = SUCCESS; @@ -2013,6 +2061,10 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control charge_flag = FALSE; } +#if defined(LOG_PERFORMANCE) + cudaEventRecord( time_event[0] ); +#endif + if ( init_forces_done == FALSE ) { if ( charge_flag == TRUE ) @@ -2033,18 +2085,18 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control } #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.init_forces ); + cudaEventRecord( time_event[1] ); #endif if ( ret == SUCCESS ) { ret = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); + } #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.bonded ); + cudaEventRecord( time_event[2] ); #endif - } if ( ret == SUCCESS ) { @@ -2055,24 +2107,79 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control } #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.cm ); + cudaEventRecord( time_event[3] ); #endif Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control, mpi_data ); #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.nonb ); + cudaEventRecord( time_event[4] ); #endif Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data ); #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.bonded ); + cudaEventRecord( time_event[5] ); #endif init_forces_done = FALSE; } +#if defined(LOG_PERFORMANCE) + if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[0] ); + } + + if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[1] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); + data->timing.init_forces += (real) (time_elapsed / 1000.0); + + if ( cudaEventQuery( time_event[2] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[2] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[1], time_event[2] ); + data->timing.bonded += (real) (time_elapsed / 1000.0); + + if ( ret == SUCCESS ) + { + if ( cudaEventQuery( time_event[3] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[3] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[2], time_event[3] ); + data->timing.cm += (real) (time_elapsed / 1000.0); + + if ( cudaEventQuery( time_event[4] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[4] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[3], time_event[4] ); + data->timing.nonb += (real) (time_elapsed / 1000.0); + + if ( cudaEventQuery( time_event[5] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[5] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[4], time_event[5] ); + data->timing.bonded += (real) (time_elapsed / 1000.0); + } + + for ( int i = 0; i < 6; ++i ) + { + cudaEventDestroy( time_event[i] ); + } +#endif + return ret; } diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index 1ba48522..12262073 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -24,8 +24,7 @@ static void Cuda_Init_System( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - mpi_datatypes *mpi_data ) + simulation_data *data, storage *workspace, mpi_datatypes *mpi_data ) { Setup_New_Grid( system, control, MPI_COMM_WORLD ); @@ -110,8 +109,8 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, control->virial = 0; if ( !control->restart || (control->restart && control->random_vel) ) { - data->therm.G_xi = control->Tau_T * - (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T ); + data->therm.G_xi = control->Tau_T + * (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T ); data->therm.v_xi = data->therm.G_xi * control->dt; data->therm.v_xi_old = 0; data->therm.xi = 0; diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu index d937b351..e94cd26b 100644 --- a/PG-PuReMD/src/cuda/cuda_neighbors.cu +++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu @@ -567,9 +567,13 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system, { int blocks, ret, ret_far_nbr; #if defined(LOG_PERFORMANCE) - double time; + float time_elapsed; + cudaEvent_t time_event[2]; - time = Get_Time( ); + for ( int i = 0; i < 2; ++i ) + { + cudaEventCreate( &time_event[i] ); + } #endif /* reset reallocation flag on device */ @@ -582,6 +586,10 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system, blocks = (system->N / NBRS_BLOCK_SIZE) + ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); +#if defined(LOG_PERFORMANCE) + cudaEventRecord( time_event[0] ); +#endif + k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>> ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, *(lists[FAR_NBRS]), @@ -599,6 +607,10 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system, // *(lists[FAR_NBRS]), system->n, system->N ); // cudaCheckError( ); +#if defined(LOG_PERFORMANCE) + cudaEventRecord( time_event[1] ); +#endif + /* check reallocation flag on device */ copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" ); @@ -607,7 +619,18 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system, workspace->d_workspace->realloc.far_nbrs = ret_far_nbr; #if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.nbrs ); + if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[0] ); + } + + if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[1] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); + data->timing.nbrs += (real) (time_elapsed / 1000.0); #endif return ret; diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c index d97655b7..488581ed 100644 --- a/PG-PuReMD/src/io_tools.c +++ b/PG-PuReMD/src/io_tools.c @@ -110,9 +110,17 @@ void Init_Output_Files( reax_system *system, control_params *control, out_control->log = sfopen( temp, "w", "Init_Output_Controls::output_control->log" ); - fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n", - "step", "total", "comm", "nbrs", "init", "bonded", "nonb", - "charges", "siters", "retries" ); +// fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n", +// "step", "total", "comm", "nbrs", "init", "bonded", "nonb", +// "charges", "siters", "retries" ); + + fprintf( out_control->log, "%6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", + "step", "total", "comm", "nbrs", "init", + "init_dist", "init_cm", "init_bond", + "bonded", "nonb", + "cm", "cm_sort", "cm_iters", "cm_p_comp", "cm_p_app", + "cm_comm", "cm_allr", "cm_spmv", "cm_vec_ops", "cm_orthog", "cm_t_solve", + "retries" ); #if defined(DEBUG) fflush( out_control->log ); @@ -1321,38 +1329,39 @@ void Output_Results( reax_system *system, control_params *control, denom = 1.0; } - fprintf( out_control->log, - "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n", - data->step, t_elapsed * denom, data->timing.comm * denom, - data->timing.nbrs * denom, data->timing.init_forces * denom, - data->timing.bonded * denom, data->timing.nonb * denom, - data->timing.cm * denom, - (int) (data->timing.cm_solver_iters * denom), - data->timing.num_retries ); - // fprintf( out_control->log, -// "%6d %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n", -// data->step, -// t_elapsed * denom, -// data->timing.comm * denom, -// data->timing.nbrs * denom, -// data->timing.init_forces * denom, -// data->timing.init_dist * denom, -// data->timing.init_cm * denom, -// data->timing.init_bond * denom, -// data->timing.bonded * denom, -// (data->timing.nonb + data->timing.cm) * denom, +// "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n", +// data->step, t_elapsed * denom, data->timing.comm * denom, +// data->timing.nbrs * denom, data->timing.init_forces * denom, +// data->timing.bonded * denom, data->timing.nonb * denom, // data->timing.cm * denom, -// data->timing.cm_sort * denom, -// (double) (data->timing.cm_solver_iters * denom), -// data->timing.cm_solver_pre_comp * denom, -// data->timing.cm_solver_pre_app * denom, -// data->timing.cm_solver_comm * denom, -// data->timing.cm_solver_allreduce * denom, -// data->timing.cm_solver_spmv * denom, -// data->timing.cm_solver_vector_ops * denom, -// data->timing.cm_solver_orthog * denom, -// data->timing.cm_solver_tri_solve * denom ); +// (int) (data->timing.cm_solver_iters * denom), +// data->timing.num_retries ); + + fprintf( out_control->log, + "%6d %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f\n", + data->step, + t_elapsed * denom, + data->timing.comm * denom, + data->timing.nbrs * denom, + data->timing.init_forces * denom, + data->timing.init_dist * denom, + data->timing.init_cm * denom, + data->timing.init_bond * denom, + data->timing.bonded * denom, + (data->timing.nonb + data->timing.cm) * denom, + data->timing.cm * denom, + data->timing.cm_sort * denom, + (double) (data->timing.cm_solver_iters * denom), + data->timing.cm_solver_pre_comp * denom, + data->timing.cm_solver_pre_app * denom, + data->timing.cm_solver_comm * denom, + data->timing.cm_solver_allreduce * denom, + data->timing.cm_solver_spmv * denom, + data->timing.cm_solver_vector_ops * denom, + data->timing.cm_solver_orthog * denom, + data->timing.cm_solver_tri_solve * denom, + data->timing.num_retries * denom ); #if defined(DEBUG) fflush( out_control->log ); diff --git a/PG-PuReMD/src/puremd.c b/PG-PuReMD/src/puremd.c index 7b253903..5b660346 100644 --- a/PG-PuReMD/src/puremd.c +++ b/PG-PuReMD/src/puremd.c @@ -303,7 +303,7 @@ int simulate( const void * const handle ) { ret = SUCCESS; - if ( control->T_mode && retries == 0 ) + if ( control->T_mode > 0 && retries == 0 ) { Temperature_Control( control, data ); } diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c index ef932e36..67b06b04 100644 --- a/PG-PuReMD/src/restart.c +++ b/PG-PuReMD/src/restart.c @@ -74,7 +74,7 @@ void Write_Binary_Restart_File( reax_system *system, control_params *control, /* fill in the buffers */ for ( i = 0 ; i < system->n; ++i ) { - p_atom = &(system->my_atoms[i]); + p_atom = &system->my_atoms[i]; buffer[i].orig_id = p_atom->orig_id; buffer[i].type = p_atom->type; strncpy( buffer[i].name, p_atom->name, sizeof(buffer[i].name) - 1 ); @@ -97,7 +97,7 @@ void Write_Binary_Restart_File( reax_system *system, control_params *control, { if ( i != MASTER_NODE ) { - ret = MPI_Recv( buffer + top, system->bigN - top, mpi_data->restart_atom_type, + ret = MPI_Recv( &buffer[top], system->bigN - top, mpi_data->restart_atom_type, i, np * RESTART_ATOMS + i, MPI_COMM_WORLD, &status ); Check_MPI_Error( ret, __FILE__, __LINE__ ); MPI_Get_count( &status, mpi_data->restart_atom_type, &cnt ); -- GitLab