From 0f8f5bf2772591bf247c2d7e6d9d97e4e02493aa Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Fri, 7 May 2021 19:17:42 -0400 Subject: [PATCH] PG-PuReMD: utlize multiple CUDA streams to run independent kernels in the ReaxFF+QEq task dependency graph. --- PG-PuReMD/src/cuda/cuda_allocate.cu | 12 +- PG-PuReMD/src/cuda/cuda_bond_orders.cu | 8 +- PG-PuReMD/src/cuda/cuda_bonds.cu | 15 +- PG-PuReMD/src/cuda/cuda_charges.cu | 70 ++++---- PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu | 15 +- PG-PuReMD/src/cuda/cuda_environment.cu | 40 +++-- PG-PuReMD/src/cuda/cuda_forces.cu | 159 +++++++++++------ PG-PuReMD/src/cuda/cuda_forces.h | 10 +- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu | 5 +- PG-PuReMD/src/cuda/cuda_init_md.cu | 19 +- PG-PuReMD/src/cuda/cuda_multi_body.cu | 5 +- PG-PuReMD/src/cuda/cuda_nonbonded.cu | 36 ++-- PG-PuReMD/src/cuda/cuda_reset_tools.cu | 5 +- PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu | 205 +++++++++++----------- PG-PuReMD/src/cuda/cuda_system_props.cu | 20 +-- PG-PuReMD/src/cuda/cuda_torsion_angles.cu | 5 +- PG-PuReMD/src/cuda/cuda_valence_angles.cu | 8 +- PG-PuReMD/src/reax_types.h | 13 +- 18 files changed, 372 insertions(+), 278 deletions(-) diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index 5c346e7c..87fd72fa 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -43,10 +43,10 @@ static void Cuda_Reallocate_System_Part1( reax_system *system, { int *temp; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(int) * local_cap_old, __FILE__, __LINE__ ); - temp = (int *) workspace->scratch; + temp = (int *) workspace->scratch[0]; sCudaMemcpyAsync( temp, system->d_cm_entries, sizeof(int) * local_cap_old, cudaMemcpyDeviceToDevice, control->streams[0], __FILE__, __LINE__ ); @@ -76,11 +76,11 @@ static void Cuda_Reallocate_System_Part2( reax_system *system, control_params *c int *temp; reax_atom *temp_atom; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], MAX( sizeof(reax_atom), sizeof(int) ) * total_cap_old, __FILE__, __LINE__ ); - temp = (int *) workspace->scratch; - temp_atom = (reax_atom *) workspace->scratch; + temp = (int *) workspace->scratch[0]; + temp_atom = (reax_atom *) workspace->scratch[0]; /* free the existing storage for atoms, leave other info allocated */ sCudaMemcpyAsync( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old, @@ -173,7 +173,7 @@ void Cuda_Allocate_Grid( reax_system *system, control_params *control ) // grid_cell local_cell; grid *host = &system->my_grid; grid *device = &system->d_my_grid; -// ivec *nbrs_x = (ivec *) workspace->scratch; +// ivec *nbrs_x = (ivec *) workspace->scratch[0]; total = host->ncells[0] * host->ncells[1] * host->ncells[2]; ivec_Copy( device->ncells, host->ncells ); diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu index b8be616a..c8ff70e9 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu @@ -999,6 +999,8 @@ void Cuda_Compute_Bond_Orders( reax_system * const system, control_params * cons { int blocks; + cudaStreamWaitEvent( control->streams[0], control->stream_events[2] ); + k_bond_order_part1 <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, @@ -1032,6 +1034,8 @@ void 
Cuda_Compute_Bond_Orders( reax_system * const system, control_params * cons ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, *(workspace->d_workspace), system->N ); cudaCheckError( ); + + cudaEventRecord( control->stream_events[4], control->streams[0] ); } @@ -1062,9 +1066,9 @@ void Cuda_Total_Forces_Part1( reax_system * const system, control_params * const } else { - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(rvec) * 2 * system->N, __FILE__, __LINE__ ); - spad_rvec = (rvec *) workspace->scratch; + spad_rvec = (rvec *) workspace->scratch[0]; sCudaMemsetAsync( spad_rvec, 0, sizeof(rvec) * 2 * system->N, control->streams[0], __FILE__, __LINE__ ); cudaStreamSynchronize( control->streams[0] ); diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu index 6f8e9ff9..96f45704 100644 --- a/PG-PuReMD/src/cuda/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_bonds.cu @@ -279,19 +279,20 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control, int update_energy; real *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(real) * system->n, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; #else sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_bond, - 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + 0, sizeof(real), control->streams[1], __FILE__, __LINE__ ); #endif -// k_bonds <<< control->blocks, control->block_size, 0, control->streams[0] >>> + cudaStreamWaitEvent( control->streams[1], control->stream_events[4] ); + +// k_bonds <<< control->blocks, control->block_size, 0, control->streams[1] >>> // ( system->d_my_atoms, system->reax_param.d_gp, // system->reax_param.d_sbp, system->reax_param.d_tbp, // *(workspace->d_workspace), *(lists[BONDS]), @@ -309,7 +310,7 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control, k_bonds_opt <<< blocks, DEF_BLOCK_SIZE, sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[1] >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), *(lists[BONDS]), @@ -329,4 +330,6 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control, system->n ); } #endif + + cudaEventRecord( control->stream_events[5], control->streams[1] ); } diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index deaca1fd..b85a1a88 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -61,7 +61,7 @@ static void jacobi( reax_system const * const system, blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_jacobi <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_jacobi <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), system->n ); cudaCheckError( ); @@ -89,11 +89,9 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system start = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::start" ); end = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::end" ); sCudaMemcpyAsync( start, A->start, sizeof(int) * system->total_cap, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); sCudaMemcpyAsync( end, A->end, sizeof(int) * system->total_cap, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); /* make copies of column indices and non-zero values */ sCudaMalloc( (void **) &d_j_temp, sizeof(int) * system->total_cm_entries, @@ -101,11 +99,11 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system sCudaMalloc( (void **) &d_val_temp, sizeof(real) * system->total_cm_entries, __FILE__, __LINE__ ); sCudaMemcpyAsync( d_j_temp, A->j, sizeof(int) * system->total_cm_entries, - cudaMemcpyDeviceToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToDevice, control->streams[4], __FILE__, __LINE__ ); sCudaMemcpyAsync( d_val_temp, A->val, sizeof(real) * system->total_cm_entries, - cudaMemcpyDeviceToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToDevice, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); for ( i = 0; i < system->n; ++i ) { @@ -246,7 +244,7 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system, + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); k_spline_extrapolate_charges_qeq <<< blocks, DEF_BLOCK_SIZE, 0, - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, (control_params *)control->d_control_params, *(workspace->d_workspace), system->n ); @@ -418,21 +416,21 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system, blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(real) * system->n, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[4]; sCudaMemsetAsync( spad, 0, sizeof(real) * system->n, - control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + control->streams[4], __FILE__, __LINE__ ); k_extrapolate_charges_qeq_part2 <<< blocks, DEF_BLOCK_SIZE, 0, - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n ); cudaCheckError( ); sCudaMemcpyAsync( q, spad, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); } @@ -462,15 +460,17 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system, blocks = (system->N - system->n) / DEF_BLOCK_SIZE + (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1); - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(real) * (system->N - system->n), __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[4]; + sCudaMemcpyAsync( spad, &q[system->n], sizeof(real) * (system->N - system->n), - cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE, 0, - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, spad, system->n, system->N ); cudaCheckError( ); } @@ -494,41 +494,43 @@ static void Calculate_Charges_QEq( reax_system const * const system, blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(rvec2) * (blocks + 1), __FILE__, __LINE__ ); - spad = (rvec2 *) workspace->scratch; + spad = (rvec2 *) workspace->scratch[4]; + sCudaMemsetAsync( spad, 0, sizeof(rvec2) * (blocks + 1), - control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + control->streams[4], __FILE__, __LINE__ ); /* compute local sums of pseudo-charges in s and t on device */ k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[4] >>> ( workspace->d_workspace->x, spad, system->n ); cudaCheckError( ); k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec2) * ((blocks + 31) / 32), - control->streams[0] >>> + control->streams[4] >>> ( spad, &spad[blocks], blocks ); cudaCheckError( ); sCudaMemcpyAsync( &my_sum, &spad[blocks], sizeof(rvec2), - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); #else - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(real) * 2, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[4]; /* local reductions (sums) on device */ Cuda_Reduction_Sum( workspace->d_workspace->s, &spad[0], system->n ); Cuda_Reduction_Sum( workspace->d_workspace->t, &spad[1], system->n ); sCudaMemcpyAsync( my_sum, spad, sizeof(real) * 2, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); #endif /* global reduction on pseudo-charges for s and t */ diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu index b4c62dc2..57652907 100644 --- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu @@ -562,9 +562,9 @@ real Dot( storage * const workspace, real temp; //#endif - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(real) * (k + 1), __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[4]; Vector_Mult( spad, v1, v2, k ); @@ -578,6 +578,7 @@ real Dot( storage * const workspace, //#else sCudaMemcpyAsync( &temp, &spad[k], sizeof(real), cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + cudaStreamSynchronize( s ); ret = MPI_Allreduce( &temp, &sum, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -603,9 +604,9 @@ real Dot_local( storage * const workspace, { real sum, *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(real) * (k + 1), __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[4]; Vector_Mult( spad, v1, v2, k ); @@ -615,6 +616,7 @@ real Dot_local( storage * const workspace, //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI sCudaMemcpyAsync( &sum, &spad[k], sizeof(real), cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + cudaStreamSynchronize( s ); return sum; @@ -640,9 
+642,9 @@ void Dot_local_rvec2( storage * const workspace, blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4], sizeof(rvec2) * (k + blocks + 1), __FILE__, __LINE__ ); - spad = (rvec2 *) workspace->scratch; + spad = (rvec2 *) workspace->scratch[4]; Vector_Mult_rvec2( spad, v1, v2, k ); @@ -662,6 +664,7 @@ void Dot_local_rvec2( storage * const workspace, //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI sCudaMemcpyAsync( &sum, &spad[k + blocks], sizeof(rvec2), cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + cudaStreamSynchronize( s ); *sum1 = sum[0]; diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu index 6cfecd92..4a1b26ea 100644 --- a/PG-PuReMD/src/cuda/cuda_environment.cu +++ b/PG-PuReMD/src/cuda/cuda_environment.cu @@ -62,17 +62,17 @@ extern "C" void Cuda_Setup_Environment( reax_system const * const system, exit( CANNOT_INITIALIZE ); } - /* stream assignment: - * 0: init bonds, bond order (uncorrected/corrected), lone pair/over coord/under coord - * 1: (after bond order) bonds, valence angels, torsions - * 2: init hbonds, (after bonds) hbonds - * 3: van der Waals - * 4: init CM, CM, Coulomb + /* stream assignment (default to 0 for any kernel not listed): + * 0: init dist, init CM, bond order (uncorrected/corrected), lone pair/over coord/under coord + * 1: (after init dist) init bonds, (after bond order) bonds, valence angles, torsions + * 2: (after init dist) init hbonds, (after bonds) hbonds + * 3: (after init dist) van der Waals + * 4: (after init CM) CM, Coulomb */ - for ( i = 0; i < CUDA_MAX_STREAMS; ++i ) + for ( i = 0; i < MAX_CUDA_STREAMS; ++i ) { /* all non-CM streams of equal priority */ - if ( i < CUDA_MAX_STREAMS - 1 ) + if ( i < MAX_CUDA_STREAMS - 1 ) { ret = cudaStreamCreateWithPriority( &control->streams[i], cudaStreamNonBlocking, least_priority ); } @@ -84,7 +84,27 @@ extern "C" void Cuda_Setup_Environment( reax_system const * const system, if ( ret != cudaSuccess ) { - fprintf( stderr, "[ERROR] CUDA strema creation failed (%d). Terminating...\n", + fprintf( stderr, "[ERROR] CUDA stream creation failed (%d). Terminating...\n", + i ); + exit( CANNOT_INITIALIZE ); + } + } + + /* stream event assignment: + * 0: init dist done (stream 0) + * 1: init CM done (stream 4) + * 2: init bonds done (stream 1) + * 3: init hbonds done (stream 2) + * 4: bond orders done (stream 0) + * 5: bonds done (stream 1) + */ + for ( i = 0; i < MAX_CUDA_STREAM_EVENTS; ++i ) + { + ret = cudaEventCreateWithFlags( &control->stream_events[i], cudaEventDisableTiming ); + + if ( ret != cudaSuccess ) + { + fprintf( stderr, "[ERROR] CUDA event creation failed (%d). Terminating...\n", i ); exit( CANNOT_INITIALIZE ); } @@ -112,7 +132,7 @@ extern "C" void Cuda_Cleanup_Environment( control_params const * const control ) int i; cudaError_t ret; - for ( i = 0; i < CUDA_MAX_STREAMS; ++i ) + for ( i = 0; i < MAX_CUDA_STREAMS; ++i ) { ret = cudaStreamDestroy( control->streams[i] ); diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 120f0a43..bc3bb518 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -1671,23 +1671,22 @@ void Cuda_Init_Neighbor_Indices( reax_system *system, control_params *control, /* Initialize indices for far hydrogen bonds list post reallocation * * system: atomic system info. 
*/ -void Cuda_Init_HBond_Indices( reax_system *system, control_params *control, - storage *workspace, reax_list *hbond_list ) +void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace, + reax_list *hbond_list, cudaStream_t s ) { int blocks, *temp; blocks = system->total_cap / DEF_BLOCK_SIZE + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1); - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[2], &workspace->scratch_size[2], sizeof(int) * system->total_cap, __FILE__, __LINE__ ); - temp = (int *) workspace->scratch; + temp = (int *) workspace->scratch[2]; /* init indices and end_indices */ - Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap, - control->streams[0] ); + Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap, s ); - k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, hbond_list->index, hbond_list->end_index, system->total_cap ); cudaCheckError( ); @@ -1697,8 +1696,8 @@ void Cuda_Init_HBond_Indices( reax_system *system, control_params *control, /* Initialize indices for far bonds list post reallocation * * system: atomic system info. */ -void Cuda_Init_Bond_Indices( reax_system *system, control_params * control, - reax_list * bond_list ) +void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list, + cudaStream_t s ) { int blocks; @@ -1706,11 +1705,11 @@ void Cuda_Init_Bond_Indices( reax_system *system, control_params * control, (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1); /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, system->total_cap, - control->streams[0] ); + Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, + system->total_cap, s ); /* init end_indices */ - k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( system->d_bonds, bond_list->index, bond_list->end_index, system->total_cap ); cudaCheckError( ); } @@ -1720,8 +1719,8 @@ void Cuda_Init_Bond_Indices( reax_system *system, control_params * control, * * system: atomic system info. * H: charge matrix */ -void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, control_params *control, - sparse_matrix *H ) +void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H, + cudaStream_t s ) { int blocks; @@ -1729,12 +1728,11 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, control_params *contr + (H->n_max % DEF_BLOCK_SIZE == 0 ? 
0 : 1); /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, H->n_max, - control->streams[0] ); + Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, H->n_max, s ); //TODO: not needed for full format (Init_Forces sets H->end) /* init end_indices */ - k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( system->d_cm_entries, H->start, H->end, H->n_max ); cudaCheckError( ); } @@ -1848,9 +1846,9 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE, hbonds_done = FALSE; #if defined(LOG_PERFORMANCE) float time_elapsed; - cudaEvent_t time_event[4]; + cudaEvent_t time_event[7]; - for ( int i = 0; i < 4; ++i ) + for ( int i = 0; i < 7; ++i ) { cudaEventCreate( &time_event[i] ); } @@ -1859,13 +1857,21 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE; /* reset reallocation flags on device */ - sCudaMemsetAsync( system->d_realloc_cm_entries, FALSE, sizeof(int), - control->streams[0], __FILE__, __LINE__ ); - sCudaMemsetAsync( system->d_realloc_bonds, FALSE, sizeof(int), - control->streams[0], __FILE__, __LINE__ ); - sCudaMemsetAsync( system->d_realloc_hbonds, FALSE, sizeof(int), - control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + if ( cm_done == FALSE ) + { + sCudaMemsetAsync( system->d_realloc_cm_entries, FALSE, sizeof(int), + control->streams[4], __FILE__, __LINE__ ); + } + if ( bonds_done == FALSE ) + { + sCudaMemsetAsync( system->d_realloc_bonds, FALSE, sizeof(int), + control->streams[1], __FILE__, __LINE__ ); + } + if ( hbonds_done == FALSE ) + { + sCudaMemsetAsync( system->d_realloc_hbonds, FALSE, sizeof(int), + control->streams[2], __FILE__, __LINE__ ); + } #if defined(LOG_PERFORMANCE) cudaEventRecord( time_event[0], control->streams[0] ); @@ -1883,11 +1889,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); cudaCheckError( ); + cudaEventRecord( control->stream_events[0], control->streams[0] ); + dist_done = TRUE; } #if defined(LOG_PERFORMANCE) - cudaEventRecord( time_event[1], control->streams[0] ); + cudaEventRecord( time_event[1], control->streams[4] ); #endif if ( cm_done == FALSE ) @@ -1898,13 +1906,16 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, /* update num. 
rows in matrix for this GPU */ workspace->d_workspace->H.n = system->n; - Cuda_Init_Sparse_Matrix_Indices( system, control, &workspace->d_workspace->H ); + Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H, + control->streams[4] ); + + cudaStreamWaitEvent( control->streams[4], control->stream_events[0] ); if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX ) { if ( control->tabulate <= 0 ) { - k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1912,7 +1923,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } else { - k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types, @@ -1923,7 +1934,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, { if ( control->tabulate <= 0 ) { -// k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> +// k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, // *(workspace->d_workspace), (control_params *) control->d_control_params, // *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1934,7 +1945,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, k_init_cm_full_fs_opt <<< blocks, DEF_BLOCK_SIZE, sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1943,7 +1954,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, else { k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types, @@ -1951,10 +1962,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } } cudaCheckError( ); + + cudaEventRecord( control->stream_events[1], control->streams[4] ); } #if defined(LOG_PERFORMANCE) - cudaEventRecord( time_event[2], control->streams[0] ); + cudaEventRecord( time_event[2], control->streams[4] ); + cudaEventRecord( time_event[3], control->streams[1] ); #endif if ( bonds_done == FALSE ) @@ -1962,13 +1976,15 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, blocks = system->total_cap / DEF_BLOCK_SIZE + ((system->total_cap % DEF_BLOCK_SIZE == 0 ) ? 
0 : 1); - k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[1] >>> ( *(workspace->d_workspace), system->total_cap ); cudaCheckError( ); - Cuda_Init_Bond_Indices( system, control, lists[BONDS] ); + Cuda_Init_Bond_Indices( system, lists[BONDS], control->streams[1] ); -// k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> + cudaStreamWaitEvent( control->streams[1], control->stream_events[0] ); + +// k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[1] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // system->reax_param.d_tbp, *(workspace->d_workspace), // (control_params *) control->d_control_params, @@ -1982,7 +1998,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, k_init_bonds_opt <<< blocks, DEF_BLOCK_SIZE, sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[1] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, @@ -1990,13 +2006,23 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, system->n, system->N, system->reax_param.num_atom_types, system->d_max_bonds, system->d_realloc_bonds ); cudaCheckError( ); + + cudaEventRecord( control->stream_events[2], control->streams[1] ); } +#if defined(LOG_PERFORMANCE) + cudaEventRecord( time_event[4], control->streams[1] ); + cudaEventRecord( time_event[5], control->streams[2] ); +#endif + if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE ) { - Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] ); + Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS], + control->streams[2] ); -// k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> + cudaStreamWaitEvent( control->streams[2], control->stream_events[0] ); + +// k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[2] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // (control_params *) control->d_control_params, // *(lists[FAR_NBRS]), *(lists[HBONDS]), @@ -2009,17 +2035,18 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, k_init_hbonds_opt <<< blocks, DEF_BLOCK_SIZE, sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[2] >>> ( system->d_my_atoms, system->reax_param.d_sbp, (control_params *) control->d_control_params, *(lists[FAR_NBRS]), *(lists[HBONDS]), system->n, system->N, system->reax_param.num_atom_types, system->d_max_hbonds, system->d_realloc_hbonds ); cudaCheckError( ); - } + cudaEventRecord( control->stream_events[3], control->streams[2] ); + } #if defined(LOG_PERFORMANCE) - cudaEventRecord( time_event[3], control->streams[0] ); + cudaEventRecord( time_event[6], control->streams[2] ); #endif /* check reallocation flags on device */ @@ -2035,7 +2062,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, if ( bonds_done == FALSE ) { sCudaMemcpyAsync( &realloc_bonds, system->d_realloc_bonds, sizeof(int), - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); + cudaMemcpyDeviceToHost, control->streams[1], __FILE__, __LINE__ ); } else { @@ -2044,14 +2071,16 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, if ( hbonds_done == FALSE ) { sCudaMemcpyAsync( &realloc_hbonds, system->d_realloc_hbonds, 
sizeof(int), - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); + cudaMemcpyDeviceToHost, control->streams[2], __FILE__, __LINE__ ); } else { realloc_hbonds = FALSE; } - cudaStreamSynchronize( control->streams[0] ); + cudaStreamSynchronize( control->streams[4] ); + cudaStreamSynchronize( control->streams[1] ); + cudaStreamSynchronize( control->streams[2] ); #if defined(LOG_PERFORMANCE) if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) @@ -2080,10 +2109,28 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, cudaEventSynchronize( time_event[3] ); } - cudaEventElapsedTime( &time_elapsed, time_event[2], time_event[3] ); + if ( cudaEventQuery( time_event[4] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[4] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[3], time_event[4] ); + data->timing.init_bond += (real) (time_elapsed / 1000.0); + + if ( cudaEventQuery( time_event[5] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[5] ); + } + + if ( cudaEventQuery( time_event[6] ) != cudaSuccess ) + { + cudaEventSynchronize( time_event[6] ); + } + + cudaEventElapsedTime( &time_elapsed, time_event[5], time_event[6] ); data->timing.init_bond += (real) (time_elapsed / 1000.0); - for ( int i = 0; i < 4; ++i ) + for ( int i = 0; i < 7; ++i ) { cudaEventDestroy( time_event[i] ); } @@ -2204,7 +2251,7 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, ( *(workspace->d_workspace), system->total_cap ); cudaCheckError( ); - Cuda_Init_Bond_Indices( system, control, lists[BONDS] ); + Cuda_Init_Bond_Indices( system, lists[BONDS], control->streams[0] ); // k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, @@ -2231,7 +2278,8 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE ) { - Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] ); + Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS], + control->streams[0] ); // k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, @@ -2441,13 +2489,13 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control, mpi_datatypes *mpi_data ) { - int charge_flag, ret; + int i, charge_flag, ret; static int init_forces_done = FALSE; #if defined(LOG_PERFORMANCE) float time_elapsed; cudaEvent_t time_event[6]; - for ( int i = 0; i < 6; ++i ) + for ( i = 0; i < 6; ++i ) { cudaEventCreate( &time_event[i] ); } @@ -2521,6 +2569,11 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control cudaEventRecord( time_event[4], control->streams[0] ); #endif + for ( i = 0; i < MAX_CUDA_STREAMS; ++i ) + { + cudaStreamSynchronize( control->streams[i] ); + } + Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data ); #if defined(LOG_PERFORMANCE) @@ -2579,7 +2632,7 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control data->timing.bonded += (real) (time_elapsed / 1000.0); } - for ( int i = 0; i < 6; ++i ) + for ( i = 0; i < 6; ++i ) { cudaEventDestroy( time_event[i] ); } diff --git a/PG-PuReMD/src/cuda/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h index 3347db7a..fde3edf9 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.h +++ 
b/PG-PuReMD/src/cuda/cuda_forces.h @@ -7,13 +7,13 @@ void Cuda_Init_Neighbor_Indices( reax_system *, control_params *, reax_list * ); -void Cuda_Init_HBond_Indices( reax_system *, control_params *, storage *, - reax_list * ); +void Cuda_Init_HBond_Indices( reax_system *, storage *, reax_list *, + cudaStream_t ); -void Cuda_Init_Bond_Indices( reax_system *, control_params *, reax_list * ); +void Cuda_Init_Bond_Indices( reax_system *, reax_list *, cudaStream_t ); -void Cuda_Init_Sparse_Matrix_Indices( reax_system *, control_params *, - sparse_matrix * ); +void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix *, + cudaStream_t ); void Cuda_Init_Three_Body_Indices( int *, int, reax_list ** ); diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index c9db3090..ecb88b49 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -749,10 +749,10 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, real *spad; rvec *rvec_spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; #else @@ -763,7 +763,6 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press, 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ ); } - cudaStreamSynchronize( control->streams[0] ); #endif if ( control->virial == 1 ) diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index ad777753..43b33bd0 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -210,17 +210,19 @@ void Cuda_Init_Lists( reax_system *system, control_params *control, Cuda_Allocate_Matrix( &workspace->d_workspace->H, system->n, system->local_cap, system->total_cm_entries, SYM_FULL_MATRIX ); - Cuda_Init_Sparse_Matrix_Indices( system, control, &workspace->d_workspace->H ); + Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H, + control->streams[0] ); Cuda_Make_List( system->total_cap, system->total_bonds, TYP_BOND, lists[BONDS] ); - Cuda_Init_Bond_Indices( system, control, lists[BONDS] ); + Cuda_Init_Bond_Indices( system, lists[BONDS], control->streams[0] ); if ( control->hbond_cut > 0.0 && system->numH > 0 ) { Cuda_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, lists[HBONDS] ); - Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] ); + Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS], + control->streams[0] ); } /* 3bodies list: since a more accurate estimate of the num. 
@@ -234,6 +236,8 @@ extern "C" void Cuda_Initialize( reax_system *system, control_params *control, reax_list **lists, output_controls *out_control, mpi_datatypes *mpi_data ) { + int i; + Init_MPI_Datatypes( system, workspace, mpi_data ); #if defined(CUDA_DEVICE_PACK) @@ -249,7 +253,7 @@ extern "C" void Cuda_Initialize( reax_system *system, control_params *control, mpi_data->d_in2_buffer = NULL; mpi_data->d_in2_buffer_size = 0; - for ( int i = 0; i < MAX_NBRS; ++i ) + for ( i = 0; i < MAX_NBRS; ++i ) { mpi_data->d_out_buffers[i].cnt = 0; mpi_data->d_out_buffers[i].index = NULL; @@ -263,8 +267,11 @@ extern "C" void Cuda_Initialize( reax_system *system, control_params *control, /* scratch space - set before Cuda_Init_Workspace * as Cuda_Init_System utilizes these variables */ - workspace->scratch = NULL; - workspace->scratch_size = 0; + for ( i = 0; i < MAX_CUDA_STREAMS; ++i ) + { + workspace->scratch[i] = NULL; + workspace->scratch_size[i] = 0; + } workspace->host_scratch = NULL; workspace->host_scratch_size = 0; diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu index d7d81cc7..e932dfec 100644 --- a/PG-PuReMD/src/cuda/cuda_multi_body.cu +++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu @@ -548,10 +548,10 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control, int update_energy; real *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(real) * 3 * system->n, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; #else @@ -561,7 +561,6 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control, 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_un, 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); #endif // k_atom_energy_part1 <<< control->blocks, control->block_size, 0, control->streams[0] >>> diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu index 6f7a425a..8f2936a6 100644 --- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu @@ -835,20 +835,20 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system, #if !defined(CUDA_ACCUM_ATOMIC) real *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(real) * system->n, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; #else sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_pol, - 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + 0, sizeof(real), control->streams[4], __FILE__, __LINE__ ); + cudaStreamSynchronize( control->streams[4] ); #endif blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); k_compute_polarization_energy <<< blocks, DEF_BLOCK_SIZE, 0, - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->n, #if !defined(CUDA_ACCUM_ATOMIC) @@ -890,22 +890,22 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control { s = sizeof(real) * 2 * system->n; } - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], s, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; #endif #if defined(CUDA_ACCUM_ATOMIC) sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, - 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); + 0, sizeof(real), control->streams[4], __FILE__, __LINE__ ); sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_ele, - 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); + 0, sizeof(real), control->streams[4], __FILE__, __LINE__ ); if ( control->virial == 1 ) { sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press, - 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ ); + 0, sizeof(rvec), control->streams[4], __FILE__, __LINE__ ); } - cudaStreamSynchronize( control->streams[0] ); + cudaStreamSynchronize( control->streams[4] ); #endif blocks = system->n * 32 / DEF_BLOCK_SIZE @@ -916,7 +916,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control if ( control->virial == 1 ) { // k_vdW_coulomb_energy_virial_full <<< control->blocks, control->block_size, -// 0, control->streams[0] >>> +// 0, control->streams[4] >>> // ( system->d_my_atoms, system->reax_param.d_tbp, // system->reax_param.d_gp, (control_params *) control->d_control_params, // *(workspace->d_workspace), *(lists[FAR_NBRS]), @@ -932,7 +932,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control k_vdW_coulomb_energy_virial_full_opt <<< blocks, DEF_BLOCK_SIZE, sizeof(real) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_tbp, system->reax_param.d_gp, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[FAR_NBRS]), @@ -949,7 +949,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control else { // k_vdW_coulomb_energy_full <<< control->blocks, control->block_size, -// 0, control->streams[0] >>> +// 0, control->streams[4] >>> // ( system->d_my_atoms, system->reax_param.d_tbp, // system->reax_param.d_gp, (control_params *) control->d_control_params, // *(workspace->d_workspace), *(lists[FAR_NBRS]), @@ -964,7 +964,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control k_vdW_coulomb_energy_full_opt <<< blocks, DEF_BLOCK_SIZE, sizeof(real) * (DEF_BLOCK_SIZE / 32), - control->streams[0] >>> + control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_tbp, system->reax_param.d_gp, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[FAR_NBRS]), @@ -982,7 +982,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control else { k_vdW_coulomb_energy_tab_full <<< control->blocks, control->block_size, - 0, control->streams[0] >>> + 0, control->streams[4] >>> ( system->d_my_atoms, system->reax_param.d_gp, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[FAR_NBRS]), @@ -1022,13 +1022,13 @@ void Cuda_Compute_NonBonded_Forces( 
reax_system *system, control_params *control /* reduction for ext_press */ k_reduction_rvec <<< control->blocks, control->block_size, sizeof(rvec) * (control->block_size / 32), - control->streams[0] >>> + control->streams[4] >>> ( spad_rvec, &spad_rvec[system->n], system->n ); cudaCheckError( ); k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * (control->blocks_pow_2 / 32), - control->streams[0] >>> + control->streams[4] >>> ( &spad_rvec[system->n], &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks ); diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu index e957d352..73ebf11a 100644 --- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu +++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu @@ -75,9 +75,9 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro #if !defined(CUDA_ACCUM_ATOMIC) int *hindex; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(int) * system->total_cap, __FILE__, __LINE__ ); - hindex = (int *) workspace->scratch; + hindex = (int *) workspace->scratch[0]; #endif k_reset_hindex <<< control->blocks_n, control->block_size_n, 0, @@ -97,6 +97,7 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro sCudaMemcpyAsync( &system->numH, system->d_numH, sizeof(int), cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); + cudaStreamSynchronize( control->streams[0] ); } diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu index 15210720..0f31b8c3 100644 --- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu @@ -602,15 +602,15 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system, spad = (rvec2 *) workspace->host_scratch; sCudaMemcpyAsync( spad, (void *) x, sizeof(rvec2) * n, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ Dist( system, mpi_data, spad, buf_type, mpi_type ); sCudaMemcpyAsync( (void *) x, spad, sizeof(rvec2) * n, - cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ ); #endif } @@ -633,7 +633,6 @@ static void Dual_Sparse_MatVec_local( control_params const * const control, { /* half-format requires entries of b be initialized to zero */ sCudaMemsetAsync( b, 0, sizeof(rvec2) * n, s, __FILE__, __LINE__ ); - cudaStreamSynchronize( s ); /* 1 thread per row implementation */ // k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size, 0, s >>> @@ -699,15 +698,16 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system, sizeof(rvec2) * n1, TRUE, SAFE_ZONE, "Dual_Sparse_MatVec_Comm_Part2::workspace->host_scratch" ); spad = (rvec2 *) workspace->host_scratch; + sCudaMemcpyAsync( spad, b, sizeof(rvec2) * n1, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); Coll( system, mpi_data, spad, buf_type, mpi_type ); sCudaMemcpyAsync( b, 
spad, sizeof(rvec2) * n2, - cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ ); #endif } } @@ -743,7 +743,7 @@ static void Dual_Sparse_MatVec( const reax_system * const system, Update_Timing_Info( &time, &data->timing.cm_solver_comm ); #endif - Dual_Sparse_MatVec_local( control, A, x, b, n, control->streams[0] ); + Dual_Sparse_MatVec_local( control, A, x, b, n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_spmv ); @@ -783,16 +783,17 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system, sizeof(real) * n, TRUE, SAFE_ZONE, "Sparse_MatVec_Comm_Part1::workspace->host_scratch" ); spad = (real *) workspace->host_scratch; + sCudaMemcpyAsync( spad, (void *) x, sizeof(real) * n, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ Dist( system, mpi_data, spad, buf_type, mpi_type ); sCudaMemcpyAsync( (void *) x, spad, sizeof(real) * n, - cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ ); #endif } @@ -815,7 +816,6 @@ static void Sparse_MatVec_local( control_params const * const control, { /* half-format requires entries of b be initialized to zero */ sCudaMemsetAsync( b, 0, sizeof(real) * n, s, __FILE__, __LINE__ ); - cudaStreamSynchronize( s ); /* 1 thread per row implementation */ // k_sparse_matvec_half_csr <<< control->blocks, control->block_size, 0, s >>> @@ -879,15 +879,16 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system, sizeof(real) * n1, TRUE, SAFE_ZONE, "Sparse_MatVec_Comm_Part2::workspace->host_scratch" ); spad = (real *) workspace->host_scratch; + sCudaMemcpyAsync( spad, b, sizeof(real) * n1, - cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ ); + + cudaStreamSynchronize( control->streams[4] ); Coll( system, mpi_data, spad, buf_type, mpi_type ); sCudaMemcpyAsync( b, spad, sizeof(real) * n2, - cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); + cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ ); #endif } } @@ -923,7 +924,7 @@ static void Sparse_MatVec( reax_system const * const system, Update_Timing_Info( &time, &data->timing.cm_solver_comm ); #endif - Sparse_MatVec_local( control, A, x, b, n, control->streams[0] ); + Sparse_MatVec_local( control, A, x, b, n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_spmv ); @@ -967,15 +968,15 @@ int Cuda_dual_SDM( reax_system const * const system, #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, control->streams[0] ); + workspace->d_workspace->d2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif - Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[0] ); + 
Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1008,9 +1009,9 @@ int Cuda_dual_SDM( reax_system const * const system, #endif Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->d2, - workspace->d_workspace->q2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->q2, system->n, &redux[2], &redux[3], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); @@ -1039,7 +1040,7 @@ int Cuda_dual_SDM( reax_system const * const system, #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, control->streams[0] ); + workspace->d_workspace->d2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1118,15 +1119,15 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif - redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] ); + redux[0] = Dot_local( workspace, b, b, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1152,9 +1153,9 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con #endif redux[0] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->d, - workspace->d_workspace->q, system->n, control->streams[0] ); + workspace->d_workspace->q, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); @@ -1180,7 +1181,7 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1229,17 +1230,17 @@ int Cuda_dual_CG( reax_system const * const system, #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, control->streams[0] ); + workspace->d_workspace->d2, system->n, control->streams[4] ); #if 
defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->d2, - workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[0] ); - Dot_local_rvec2( workspace, b, b, system->n, &redux[4], &redux[5], control->streams[0] ); + workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[4] ); + Dot_local_rvec2( workspace, b, b, system->n, &redux[4], &redux[5], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1273,7 +1274,7 @@ int Cuda_dual_CG( reax_system const * const system, #endif Dot_local_rvec2( workspace, workspace->d_workspace->d2, - workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1300,16 +1301,16 @@ int Cuda_dual_CG( reax_system const * const system, #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->p2, system->n, control->streams[0] ); + workspace->d_workspace->p2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->p2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->p2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->p2, - workspace->d_workspace->p2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->p2, system->n, &redux[2], &redux[3], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1413,17 +1414,17 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif redux[0] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->d, - workspace->d_workspace->d, system->n, control->streams[0] ); - redux[2] = Dot_local( workspace, b, b, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); + redux[2] = Dot_local( workspace, b, b, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1450,7 +1451,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont #endif tmp = Dot( workspace, workspace->d_workspace->d, workspace->d_workspace->q, - system->n, MPI_COMM_WORLD, control->streams[0] ); + system->n, MPI_COMM_WORLD, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, 
&data->timing.cm_solver_allreduce ); @@ -1466,16 +1467,16 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->p, system->n, control->streams[0] ); + workspace->d_workspace->p, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif redux[0] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->p, system->n, control->streams[0] ); + workspace->d_workspace->p, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->p, - workspace->d_workspace->p, system->n, control->streams[0] ); + workspace->d_workspace->p, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1552,9 +1553,9 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->d_workspace->d2, system->n ); Dot_local_rvec2( workspace, b, - b, system->n, &redux[0], &redux[1], control->streams[0] ); + b, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->r2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->r2, system->n, &redux[2], &redux[3], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1599,7 +1600,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * } Dot_local_rvec2( workspace, workspace->d_workspace->r_hat2, - workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1641,7 +1642,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p2, - workspace->d_workspace->d2, system->n, control->streams[0] ); + workspace->d_workspace->d2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1655,7 +1656,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * #endif Dot_local_rvec2( workspace, workspace->d_workspace->r_hat2, - workspace->d_workspace->z2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->z2, system->n, &redux[0], &redux[1], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1677,7 +1678,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * 1.0, 1.0, workspace->d_workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->z2, system->n ); Dot_local_rvec2( workspace, workspace->d_workspace->q2, - workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1705,7 +1706,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * #endif dual_jacobi_apply( 
workspace->d_workspace->Hdia_inv, workspace->d_workspace->q2, - workspace->d_workspace->q_hat2, system->n, control->streams[0] ); + workspace->d_workspace->q_hat2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1719,9 +1720,9 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * #endif Dot_local_rvec2( workspace, workspace->d_workspace->y2, - workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->y2, - workspace->d_workspace->y2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->y2, system->n, &redux[2], &redux[3], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1749,7 +1750,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * 1.0, 1.0, workspace->d_workspace->q2, -1.0 * omega[0], -1.0 * omega[1], workspace->d_workspace->y2, system->n ); Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1858,9 +1859,9 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons Vector_Sum( workspace->d_workspace->r, 1.0, b, -1.0, workspace->d_workspace->d, system->n ); - redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] ); + redux[0] = Dot_local( workspace, b, b, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->r, system->n, control->streams[0] ); + workspace->d_workspace->r, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1892,7 +1893,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { redux[0] = Dot_local( workspace, workspace->d_workspace->r_hat, - workspace->d_workspace->r, system->n, control->streams[0] ); + workspace->d_workspace->r, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1932,7 +1933,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p, - workspace->d_workspace->d, system->n, control->streams[0] ); + workspace->d_workspace->d, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1946,7 +1947,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons #endif redux[0] = Dot_local( workspace, workspace->d_workspace->r_hat, - workspace->d_workspace->z, system->n, control->streams[0] ); + workspace->d_workspace->z, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1966,7 +1967,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons 1.0, workspace->d_workspace->r, -1.0 * alpha, workspace->d_workspace->z, 
system->n ); redux[0] = Dot_local( workspace, workspace->d_workspace->q, - workspace->d_workspace->q, system->n, control->streams[0] ); + workspace->d_workspace->q, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -1993,7 +1994,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q, - workspace->d_workspace->q_hat, system->n, control->streams[0] ); + workspace->d_workspace->q_hat, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2007,9 +2008,9 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons #endif redux[0] = Dot_local( workspace, workspace->d_workspace->y, - workspace->d_workspace->q, system->n, control->streams[0] ); + workspace->d_workspace->q, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->y, - workspace->d_workspace->y, system->n, control->streams[0] ); + workspace->d_workspace->y, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2034,7 +2035,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons 1.0, workspace->d_workspace->q, -1.0 * omega, workspace->d_workspace->y, system->n ); redux[0] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->r, system->n, control->streams[0] ); + workspace->d_workspace->r, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2110,7 +2111,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->u2, system->n, control->streams[0] ); + workspace->d_workspace->u2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2124,12 +2125,12 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c #endif Dot_local_rvec2( workspace, workspace->d_workspace->w2, - workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->u2, - workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] ); - Dot_local_rvec2( workspace, b, b, system->n, &redux[6], &redux[7], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[4] ); + Dot_local_rvec2( workspace, b, b, system->n, &redux[6], &redux[7], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2140,7 +2141,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c Check_MPI_Error( ret, __FILE__, __LINE__ ); dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2, - workspace->d_workspace->m2, system->n, control->streams[0] ); + workspace->d_workspace->m2, 
system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2207,11 +2208,11 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, workspace->d_workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->d2, system->n ); Dot_local_rvec2( workspace, workspace->d_workspace->w2, - workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->r2, - workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->u2, - workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2222,7 +2223,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c Check_MPI_Error( ret, __FILE__, __LINE__ ); dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2, - workspace->d_workspace->m2, system->n, control->streams[0] ); + workspace->d_workspace->m2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2333,7 +2334,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2347,12 +2348,12 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const #endif redux[0] = Dot_local( workspace, workspace->d_workspace->w, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); redux[2] = Dot_local( workspace, workspace->d_workspace->u, - workspace->d_workspace->u, system->n, control->streams[0] ); - redux[3] = Dot_local( workspace, b, b, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); + redux[3] = Dot_local( workspace, b, b, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2363,7 +2364,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const Check_MPI_Error( ret, __FILE__, __LINE__ ); jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w, - workspace->d_workspace->m, system->n, control->streams[0] ); + workspace->d_workspace->m, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2417,11 +2418,11 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const Vector_Sum( workspace->d_workspace->r, 1.0, workspace->d_workspace->r, -1.0 * alpha, 
workspace->d_workspace->d, system->n ); redux[0] = Dot_local( workspace, workspace->d_workspace->w, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->r, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); redux[2] = Dot_local( workspace, workspace->d_workspace->u, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2432,7 +2433,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const Check_MPI_Error( ret, __FILE__, __LINE__ ); jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w, - workspace->d_workspace->m, system->n, control->streams[0] ); + workspace->d_workspace->m, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2505,15 +2506,15 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c #endif dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->u2, system->n, control->streams[0] ); + workspace->d_workspace->u2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif - Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[0] ); + Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->u2, - workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[4] ); ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); @@ -2549,18 +2550,18 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c } dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2, - workspace->d_workspace->m2, system->n, control->streams[0] ); + workspace->d_workspace->m2, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif Dot_local_rvec2( workspace, workspace->d_workspace->w2, - workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->m2, - workspace->d_workspace->w2, system->n, &redux[2], &redux[3], control->streams[0] ); + workspace->d_workspace->w2, system->n, &redux[2], &redux[3], control->streams[4] ); Dot_local_rvec2( workspace, workspace->d_workspace->u2, - workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] ); + workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2708,15 +2709,15 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const #endif jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] 
); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif - redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] ); + redux[0] = Dot_local( workspace, b, b, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->u, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); @@ -2745,18 +2746,18 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w, - workspace->d_workspace->m, system->n, control->streams[0] ); + workspace->d_workspace->m, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); #endif redux[0] = Dot_local( workspace, workspace->d_workspace->w, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); redux[1] = Dot_local( workspace, workspace->d_workspace->m, - workspace->d_workspace->w, system->n, control->streams[0] ); + workspace->d_workspace->w, system->n, control->streams[4] ); redux[2] = Dot_local( workspace, workspace->d_workspace->u, - workspace->d_workspace->u, system->n, control->streams[0] ); + workspace->d_workspace->u, system->n, control->streams[4] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu index 6df030a6..fb623e44 100644 --- a/PG-PuReMD/src/cuda/cuda_system_props.cu +++ b/PG-PuReMD/src/cuda/cuda_system_props.cu @@ -520,9 +520,9 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control, { rvec *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(rvec) * (control->blocks + 1), __FILE__, __LINE__ ); - spad = (rvec *) workspace->scratch; + spad = (rvec *) workspace->scratch[0]; // xcm sCudaMemsetAsync( spad, 0, sizeof(rvec) * (control->blocks + 1), @@ -594,9 +594,9 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c { real *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(real) * 6 * (control->blocks + 1), __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; sCudaMemsetAsync( spad, 0, sizeof(real) * 6 * (control->blocks + 1), control->streams[0], __FILE__, __LINE__ ); cudaStreamSynchronize( control->streams[0] ); @@ -690,9 +690,9 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system, int ret; real *kinetic_energy; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(real) * (system->n + 1), __FILE__, __LINE__ ); - kinetic_energy = (real *) workspace->scratch; + kinetic_energy = (real *) workspace->scratch[0]; k_compute_kinetic_energy <<< control->blocks, control->block_size, 0, control->streams[0] >>> ( system->reax_param.d_sbp, system->d_my_atoms, kinetic_energy, system->n ); @@ -727,9 +727,9 @@ void Cuda_Compute_Total_Mass( reax_system 
*system, control_params *control, int ret; real my_M, *spad; - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(real) * (system->n + 1), __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; k_compute_total_mass <<< control->blocks, control->block_size, 0, control->streams[0] >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n ); @@ -873,10 +873,10 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control, /* 0: both int and ext, 1: ext only, 2: int only */ if ( control->press_mode == 0 || control->press_mode == 2 ) { - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], sizeof(rvec) * (system->n + control->blocks + 1), __FILE__, __LINE__ ); - rvec_spad = (rvec *) workspace->scratch; + rvec_spad = (rvec *) workspace->scratch[0]; k_compute_pressure <<< control->blocks, control->block_size, 0, control->streams[0] >>> diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu index 42f3be11..afd5c708 100644 --- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu @@ -1299,10 +1299,10 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control, { s = (sizeof(real) * 2 * system->n; } - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], s, __FILE__, __LINE__ ); - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; #else @@ -1315,7 +1315,6 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control, sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press, 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ ); } - cudaStreamSynchronize( control->streams[0] ); #endif if ( control->virial == 1 ) diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu index d88dbb74..fa309d91 100644 --- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu @@ -1348,7 +1348,6 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params sCudaMemsetAsync( thbody, 0, system->total_bonds * sizeof(int), control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); // k_estimate_valence_angles <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, (control_params *)control->d_control_params, @@ -1437,12 +1436,12 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control, s = sizeof(int) * system->total_bonds; #endif - sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size, + sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0], s, __FILE__, __LINE__ ); - thbody = (int *) workspace->scratch; + thbody = (int *) workspace->scratch[0]; #if !defined(CUDA_ACCUM_ATOMIC) - spad = (real *) workspace->scratch; + spad = (real *) workspace->scratch[0]; update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? 
TRUE : FALSE; #endif @@ -1463,7 +1462,6 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control, 0, sizeof(real), control->streams[0], __FILE__, __LINE__ ); sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press, 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ ); - cudaStreamSynchronize( control->streams[0] ); #endif if ( control->virial == 1 ) diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h index cf3a055a..109e1476 100644 --- a/PG-PuReMD/src/reax_types.h +++ b/PG-PuReMD/src/reax_types.h @@ -358,7 +358,10 @@ #endif /* max. num. of active CUDA streams */ - #define CUDA_MAX_STREAMS (5) + #define MAX_CUDA_STREAMS (5) + + /* max. num. of CUDA events used for synchronizing streams */ + #define MAX_CUDA_STREAM_EVENTS (6) #endif @@ -1659,7 +1662,9 @@ struct control_params * for kernels with 1 thread per atom (local AND ghost) */ int blocks_pow_2_n; /* CUDA stream */ - cudaStream_t streams[CUDA_MAX_STREAMS]; + cudaStream_t streams[MAX_CUDA_STREAMS]; + /* CUDA events for synchronizing streams */ + cudaEvent_t stream_events[MAX_CUDA_STREAM_EVENTS]; #endif }; @@ -2322,9 +2327,9 @@ struct storage /* size of temporary host workspace, in bytes */ size_t host_scratch_size; /* temporary workspace (GPU) */ - void *scratch; + void *scratch[MAX_CUDA_STREAMS]; /* size of temporary workspace (GPU), in bytes */ - size_t scratch_size; + size_t scratch_size[MAX_CUDA_STREAMS]; /* lookup table for force tabulation (GPU) */ LR_lookup_table *d_LR; /* storage (GPU) */ -- GitLab
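
The reax_types.h hunk above replaces the single-stream setup with an array of streams plus an array of events that exist only to order work across those streams. Below is a minimal sketch of how such streams and events could be created and chained; the standalone names and init function are assumptions for illustration, not the patch's own initialization code.

#include <cuda_runtime.h>

#define MAX_CUDA_STREAMS (5)
#define MAX_CUDA_STREAM_EVENTS (6)

static cudaStream_t streams[MAX_CUDA_STREAMS];
static cudaEvent_t stream_events[MAX_CUDA_STREAM_EVENTS];

/* create the streams and the (timing-disabled) events used purely for
 * expressing cross-stream dependencies */
static void init_streams_and_events( void )
{
    int i;

    for ( i = 0; i < MAX_CUDA_STREAMS; ++i )
    {
        cudaStreamCreate( &streams[i] );
    }

    for ( i = 0; i < MAX_CUDA_STREAM_EVENTS; ++i )
    {
        cudaEventCreateWithFlags( &stream_events[i], cudaEventDisableTiming );
    }
}

/* producer/consumer ordering without blocking the host: anything queued on
 * streams[1] after the wait call runs only once streams[0] reaches the point
 * marked by the recorded event */
static void make_stream1_wait_on_stream0( int event_idx )
{
    cudaEventRecord( stream_events[event_idx], streams[0] );
    cudaStreamWaitEvent( streams[1], stream_events[event_idx], 0 );
}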
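
The scratch and scratch_size members likewise become per-stream arrays, so kernels running concurrently on different streams never reuse the same temporary device buffer. A sketch of that idea follows; ensure_scratch is a hypothetical stand-in for the sCudaCheckMalloc calls in the patch, not the project's helper.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_CUDA_STREAMS (5)

/* one resizable scratch buffer per stream */
typedef struct
{
    void *scratch[MAX_CUDA_STREAMS];
    size_t scratch_size[MAX_CUDA_STREAMS];
} gpu_workspace;

/* grow (never shrink) the scratch buffer owned by stream index s */
static void ensure_scratch( gpu_workspace * const ws, int s, size_t bytes )
{
    if ( ws->scratch_size[s] < bytes )
    {
        if ( ws->scratch[s] != NULL )
        {
            cudaFree( ws->scratch[s] );
        }

        if ( cudaMalloc( &ws->scratch[s], bytes ) != cudaSuccess )
        {
            fprintf( stderr, "[ERROR] scratch allocation failed\n" );
            exit( EXIT_FAILURE );
        }

        ws->scratch_size[s] = bytes;
    }
}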
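
Throughout the solver hunks, every preconditioner application and local dot product now targets control->streams[4], which reads as a stream dedicated to the charge solver. A sketch of the resulting pattern with hypothetical kernel and helper names: the reduction is queued and synchronized only on that stream before the rank joins the MPI reduction, so kernels queued on the other streams keep running. It assumes a GPU with native double-precision atomicAdd (sm_60 or newer) and a power-of-two block size.

#include <mpi.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE (256)

/* block-level partial dot product; one atomicAdd per block into *result */
__global__ void k_dot( const double * const x, const double * const y,
        double * const result, int n )
{
    __shared__ double cache[BLOCK_SIZE];
    int i, off;

    i = blockIdx.x * blockDim.x + threadIdx.x;
    cache[threadIdx.x] = (i < n) ? x[i] * y[i] : 0.0;
    __syncthreads( );

    for ( off = blockDim.x / 2; off > 0; off >>= 1 )
    {
        if ( threadIdx.x < off )
        {
            cache[threadIdx.x] += cache[threadIdx.x + off];
        }
        __syncthreads( );
    }

    if ( threadIdx.x == 0 )
    {
        atomicAdd( result, cache[0] );
    }
}

/* local dot product reduced entirely on the solver stream; only that stream
 * is synchronized before the host participates in the global reduction */
static double dot_on_solver_stream( const double * const x, const double * const y,
        double * const d_result, int n, cudaStream_t solver_stream, MPI_Comm comm )
{
    double local, global;
    int blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    cudaMemsetAsync( d_result, 0, sizeof(double), solver_stream );
    k_dot <<< blocks, BLOCK_SIZE, 0, solver_stream >>> ( x, y, d_result, n );
    cudaMemcpyAsync( &local, d_result, sizeof(double), cudaMemcpyDeviceToHost,
            solver_stream );
    cudaStreamSynchronize( solver_stream );

    MPI_Allreduce( &local, &global, 1, MPI_DOUBLE, MPI_SUM, comm );

    return global;
}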
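
The pipelined solvers (Cuda_dual_PIPECG, Cuda_PIPECG, Cuda_dual_PIPECR, Cuda_PIPECR) issue MPI_Iallreduce and then apply the Jacobi preconditioner on the solver stream before waiting, overlapping communication with device work. A sketch of that overlap, again with illustrative names only, not the project's routines:

#include <mpi.h>
#include <cuda_runtime.h>

/* element-wise Jacobi (diagonal) preconditioner application */
__global__ void k_precond( const double * const Hdia_inv, const double * const r,
        double * const d, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i < n )
    {
        d[i] = Hdia_inv[i] * r[i];
    }
}

/* the nonblocking allreduce of the previous dot products proceeds while the
 * next preconditioner application runs on the solver stream; the host waits
 * only once the reduced values are actually needed */
static void overlap_allreduce_and_precond( double * const redux, int count,
        const double * const Hdia_inv, const double * const w, double * const m,
        int n, cudaStream_t solver_stream, MPI_Comm comm )
{
    MPI_Request req;
    int blocks = (n + 255) / 256;

    MPI_Iallreduce( MPI_IN_PLACE, redux, count, MPI_DOUBLE, MPI_SUM, comm, &req );

    k_precond <<< blocks, 256, 0, solver_stream >>> ( Hdia_inv, w, m, n );

    MPI_Wait( &req, MPI_STATUS_IGNORE );
}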
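
The dual_* solver variants handle both QEq linear systems at once, which is why routines such as Dot_local_rvec2 return two partial sums per pass. A rough sketch of a paired reduction, assuming rvec2 is a pair of doubles and a GPU with native double-precision atomicAdd; the project's actual reduction is more elaborate than this naive accumulation.

#include <cuda_runtime.h>

typedef double rvec2[2];

/* two dot products accumulated in a single pass over the paired vectors */
__global__ void k_dot_rvec2( const rvec2 * const x, const rvec2 * const y,
        double * const sum0, double * const sum1, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i < n )
    {
        atomicAdd( sum0, x[i][0] * y[i][0] );
        atomicAdd( sum1, x[i][1] * y[i][1] );
    }
}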