From 6717d968d6b150264f5b525dc993fd8e830dd722 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Tue, 4 May 2021 13:15:12 -0400 Subject: [PATCH] WIP: changes to support multiple CUDA streams. This commit ports all kernels from the default stream to a single non-default stream. Known bug: cudaMemcpy calls are still issued in the default stream (and hence run synchronously); see the notes following the diff for sketches of the intended follow-up work. --- PG-PuReMD/src/cuda/cuda_allocate.cu | 2 +- PG-PuReMD/src/cuda/cuda_bond_orders.cu | 53 +++-- PG-PuReMD/src/cuda/cuda_bond_orders.h | 12 +- PG-PuReMD/src/cuda/cuda_bonds.cu | 5 +- PG-PuReMD/src/cuda/cuda_box.cu | 2 +- PG-PuReMD/src/cuda/cuda_charges.cu | 34 +-- PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu | 97 ++++----- PG-PuReMD/src/cuda/cuda_dense_lin_alg.h | 46 ++-- PG-PuReMD/src/cuda/cuda_environment.cu | 68 +++++- PG-PuReMD/src/cuda/cuda_environment.h | 5 +- PG-PuReMD/src/cuda/cuda_forces.cu | 136 +++++++----- PG-PuReMD/src/cuda/cuda_forces.h | 9 +- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu | 22 +- PG-PuReMD/src/cuda/cuda_init_md.cu | 14 +- PG-PuReMD/src/cuda/cuda_integrate.cu | 76 ++++--- PG-PuReMD/src/cuda/cuda_integrate.h | 2 +- PG-PuReMD/src/cuda/cuda_multi_body.cu | 8 +- PG-PuReMD/src/cuda/cuda_neighbors.cu | 14 +- PG-PuReMD/src/cuda/cuda_neighbors.h | 7 +- PG-PuReMD/src/cuda/cuda_nonbonded.cu | 30 ++- PG-PuReMD/src/cuda/cuda_post_evolve.cu | 3 +- PG-PuReMD/src/cuda/cuda_reduction.cu | 30 +-- PG-PuReMD/src/cuda/cuda_reduction.h | 10 +- PG-PuReMD/src/cuda/cuda_reset_tools.cu | 10 +- PG-PuReMD/src/cuda/cuda_reset_tools.h | 2 +- PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu | 248 +++++++++++----------- PG-PuReMD/src/cuda/cuda_system_props.cu | 52 +++-- PG-PuReMD/src/cuda/cuda_torsion_angles.cu | 18 +- PG-PuReMD/src/cuda/cuda_valence_angles.cu | 41 ++-- PG-PuReMD/src/puremd.c | 7 +- PG-PuReMD/src/reax_types.h | 5 + 31 files changed, 600 insertions(+), 468 deletions(-) diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index 6cecfcb6..50b4356c 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -923,7 +923,7 @@ void Cuda_Reallocate_Part2( reax_system *system, control_params *control, { Cuda_Reallocate_List( lists[FAR_NBRS], system->total_cap, system->total_far_nbrs, TYP_FAR_NEIGHBOR ); - Cuda_Init_Neighbor_Indices( system, lists[FAR_NBRS] ); + Cuda_Init_Neighbor_Indices( system, control, lists[FAR_NBRS] ); realloc->far_nbrs = FALSE; } diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu index 5565baad..9435f75e 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu @@ -993,13 +993,14 @@ CUDA_GLOBAL void k_total_forces_part2( reax_atom *my_atoms, int n, } -void Cuda_Compute_Bond_Orders( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control ) +void Cuda_Compute_Bond_Orders( reax_system * const system, control_params * const control, + simulation_data * const data, storage * const workspace, + reax_list ** const lists, output_controls * const out_control ) { int blocks; - k_bond_order_part1 <<< control->blocks_n, control->block_size_n >>> + k_bond_order_part1 <<< control->blocks_n, control->block_size_n, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), system->N ); cudaCheckError( ); @@ -1014,25 +1015,28 @@ void Cuda_Compute_Bond_Orders( reax_system *system, control_params *control, + 
(system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); k_bond_order_part2 <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), *(lists[BONDS]), system->reax_param.num_atom_types, system->N ); cudaCheckError( ); - k_bond_order_part3 <<< control->blocks_n, control->block_size_n >>> + k_bond_order_part3 <<< control->blocks_n, control->block_size_n, 0, + control->streams[0] >>> ( *(workspace->d_workspace), *(lists[BONDS]), system->N ); cudaCheckError( ); - k_bond_order_part4 <<< control->blocks_n, control->block_size_n >>> + k_bond_order_part4 <<< control->blocks_n, control->block_size_n, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, *(workspace->d_workspace), system->N ); cudaCheckError( ); } -void Cuda_Total_Forces_Part1( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, reax_list **lists ) +void Cuda_Total_Forces_Part1( reax_system * const system, control_params * const control, + simulation_data *data, storage * const workspace, reax_list ** const lists ) { int blocks; rvec *spad_rvec; @@ -1042,7 +1046,8 @@ void Cuda_Total_Forces_Part1( reax_system *system, control_params *control, // blocks = system->N / DEF_BLOCK_SIZE // + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); // -// k_total_forces_part1 <<< blocks, DEF_BLOCK_SIZE >>> +// k_total_forces_part1 <<< blocks, DEF_BLOCK_SIZE, 0, +// control->streams[0] >>> // ( *(workspace->d_workspace), *(lists[BONDS]), system->N ); // cudaCheckError( ); @@ -1050,7 +1055,8 @@ void Cuda_Total_Forces_Part1( reax_system *system, control_params *control, + ((system->N * 32 % DEF_BLOCK_SIZE == 0) ? 0 : 1); k_total_forces_part1_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( *(workspace->d_workspace), *(lists[BONDS]), system->N ); cudaCheckError( ); } @@ -1066,7 +1072,8 @@ void Cuda_Total_Forces_Part1( reax_system *system, control_params *control, blocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_total_forces_virial_part1 <<< blocks, DEF_BLOCK_SIZE >>> + k_total_forces_virial_part1 <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( *(workspace->d_workspace), *(lists[BONDS]), (simulation_data *)data->d_simulation_data, spad_rvec, system->N ); @@ -1076,12 +1083,17 @@ void Cuda_Total_Forces_Part1( reax_system *system, control_params *control, + ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); /* reduction for ext_press */ - k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * (DEF_BLOCK_SIZE / 32) >>> + k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, + sizeof(rvec) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( spad_rvec, &spad_rvec[system->N], system->N ); cudaCheckError( ); - k_reduction_rvec <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec) * ((blocks + 31) / 32) >>> - ( &spad_rvec[system->N], &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks ); + k_reduction_rvec <<< 1, ((blocks + 31) / 32) * 32, + sizeof(rvec) * ((blocks + 31) / 32), + control->streams[0] >>> + ( &spad_rvec[system->N], + &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks ); cudaCheckError( ); } @@ -1090,21 +1102,24 @@ void Cuda_Total_Forces_Part1( reax_system *system, control_params *control, + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); /* post processing for the atomic forces */ - k_total_forces_part1_2 <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, *(lists[BONDS]), *(workspace->d_workspace), system->N ); + k_total_forces_part1_2 <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> + ( system->d_my_atoms, *(lists[BONDS]), + *(workspace->d_workspace), system->N ); cudaCheckError( ); #endif } -void Cuda_Total_Forces_Part2( reax_system *system, storage *workspace ) +void Cuda_Total_Forces_Part2( reax_system * const system, + control_params * const control, storage * const workspace ) { int blocks; blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_total_forces_part2 <<< blocks, DEF_BLOCK_SIZE >>> + k_total_forces_part2 <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->n, *(workspace->d_workspace) ); cudaCheckError( ); } diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h index 50afbb08..a5bc8318 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.h +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h @@ -9,13 +9,15 @@ #include "../vector.h" -void Cuda_Compute_Bond_Orders( reax_system *, control_params *, - simulation_data *, storage *, reax_list **, output_controls * ); +void Cuda_Compute_Bond_Orders( reax_system * const, control_params * const, + simulation_data * const, storage * const, reax_list ** const, + output_controls * const ); -void Cuda_Total_Forces_Part1( reax_system *, control_params *, - simulation_data *, storage *, reax_list ** ); +void Cuda_Total_Forces_Part1( reax_system * const, control_params * const, + simulation_data * const, storage *, reax_list ** const ); -void Cuda_Total_Forces_Part2( reax_system *, storage * ); +void Cuda_Total_Forces_Part2( reax_system * const, control_params * const, + storage * const ); /* Compute the bond order term between atoms i and j, diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu index 45456a6b..0e07832e 100644 --- a/PG-PuReMD/src/cuda/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_bonds.cu @@ -291,7 +291,7 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control, 0, sizeof(real), "Cuda_Compute_Bonds::e_bond" ); #endif -// k_bonds <<< control->blocks, control->block_size >>> +// k_bonds <<< control->blocks, control->block_size, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_gp, // system->reax_param.d_sbp, system->reax_param.d_tbp, // *(workspace->d_workspace), *(lists[BONDS]), @@ -308,7 +308,8 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control, + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 
0 : 1); k_bonds_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), *(lists[BONDS]), diff --git a/PG-PuReMD/src/cuda/cuda_box.cu b/PG-PuReMD/src/cuda/cuda_box.cu index fb8b62ab..044125b5 100644 --- a/PG-PuReMD/src/cuda/cuda_box.cu +++ b/PG-PuReMD/src/cuda/cuda_box.cu @@ -87,7 +87,7 @@ void Cuda_Scale_Box( reax_system *system, control_params *control, lambda = SQRT( lambda ); /* Scale velocities and positions at t+dt */ - Cuda_Scale_Velocities_NPT( system, lambda, mu ); + Cuda_Scale_Velocities_NPT( system, control, lambda, mu ); Cuda_Compute_Kinetic_Energy( system, control, workspace, data, mpi_data->comm_mesh3D ); diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index 3933082c..77d15534 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -54,14 +54,14 @@ CUDA_GLOBAL void k_jacobi( reax_atom const * const my_atoms, static void jacobi( reax_system const * const system, - storage const * const workspace ) + control_params const * const control, storage const * const workspace ) { int blocks; blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_jacobi <<< blocks, DEF_BLOCK_SIZE >>> + k_jacobi <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), system->n ); cudaCheckError( ); @@ -240,7 +240,8 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system, blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_spline_extrapolate_charges_qeq <<< blocks, DEF_BLOCK_SIZE >>> + k_spline_extrapolate_charges_qeq <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, (control_params *)control->d_control_params, *(workspace->d_workspace), system->n ); @@ -339,7 +340,7 @@ static void Compute_Preconditioner_QEq( reax_system const * const system, if ( control->cm_solver_pre_comp_type == JACOBI_PC ) { - jacobi( system, workspace ); + jacobi( system, control, workspace ); } else if ( control->cm_solver_pre_comp_type == SAI_PC ) { @@ -403,7 +404,8 @@ CUDA_GLOBAL void k_extrapolate_charges_qeq_part2( reax_atom *my_atoms, static void Extrapolate_Charges_QEq_Part2( reax_system const * const system, - storage * const workspace, real * const q, real u ) + control_params const * const control, storage * const workspace, + real * const q, real u ) { int blocks; real *spad; @@ -417,7 +419,8 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system, spad = (real *) workspace->scratch; cuda_memset( spad, 0, sizeof(real) * system->n, "Extrapolate_Charges_QEq_Part2::spad" ); - k_extrapolate_charges_qeq_part2 <<< blocks, DEF_BLOCK_SIZE >>> + k_extrapolate_charges_qeq_part2 <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n ); cudaCheckError( ); @@ -443,7 +446,8 @@ CUDA_GLOBAL void k_update_ghost_atom_charges( reax_atom *my_atoms, real *q, static void Update_Ghost_Atom_Charges( reax_system const * const system, - storage * const workspace, real * const q ) + control_params const * const control, storage * const workspace, + real * const q ) { int blocks; real *spad; @@ -458,15 +462,15 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system, sCudaMemcpy( spad, &q[system->n], sizeof(real) * (system->N - system->n), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); - k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE >>> + k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, spad, system->n, system->N ); cudaCheckError( ); } static void Calculate_Charges_QEq( reax_system const * const system, - control_params const * const control, - storage * const workspace, + control_params const * const control, storage * const workspace, mpi_datatypes * const mpi_data ) { int ret; @@ -492,12 +496,14 @@ static void Calculate_Charges_QEq( reax_system const * const system, /* compute local sums of pseudo-charges in s and t on device */ k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, - sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(rvec2) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( workspace->d_workspace->x, spad, system->n ); cudaCheckError( ); k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32, - sizeof(rvec2) * ((blocks + 31) / 32) >>> + sizeof(rvec2) * ((blocks + 31) / 32), + control->streams[0] >>> ( spad, &spad[blocks], blocks ); cudaCheckError( ); @@ -531,12 +537,12 @@ static void Calculate_Charges_QEq( reax_system const * const system, /* derive atomic charges from pseudo-charges * and set up extrapolation for next time step */ - Extrapolate_Charges_QEq_Part2( system, workspace, q, u ); + Extrapolate_Charges_QEq_Part2( system, control, workspace, q, u ); Dist_FS( system, mpi_data, q, REAL_PTR_TYPE, MPI_DOUBLE ); /* copy atomic charges to ghost atoms in case of ownership transfer */ - Update_Ghost_Atom_Charges( system, workspace, q ); + Update_Ghost_Atom_Charges( system, control, workspace, q ); } 
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu index 306ccec6..24b42cc0 100644 --- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu @@ -279,31 +279,6 @@ CUDA_GLOBAL void k_vector_mult_rvec2( rvec2 * const dest, rvec2 const * const v1 } -/* check if all entries of a dense vector are sufficiently close to zero - * - * inputs: - * v: dense vector - * k: number of entries in v - * output: TRUE if all entries are sufficiently close to zero, FALSE otherwise - */ -int Vector_isZero( real const * const v, unsigned int k ) -{ - unsigned int i, ret; - - ret = TRUE; - - for ( i = 0; i < k; ++i ) - { - if ( FABS( v[i] ) > ALMOST_ZERO ) - { - ret = FALSE; - } - } - - return ret; -} - - /* sets all entries of a dense vector to zero * * inputs: @@ -311,14 +286,14 @@ int Vector_isZero( real const * const v, unsigned int k ) * k: number of entries in v * output: v with entries set to zero */ -void Vector_MakeZero( real * const v, unsigned int k ) +void Vector_MakeZero( real * const v, unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_makezero <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_makezero <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( v, k ); cudaCheckError( ); } @@ -333,14 +308,14 @@ void Vector_MakeZero( real * const v, unsigned int k ) * dest: vector copied into */ void Vector_Copy( real * const dest, real const * const v, - unsigned int k ) + unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_copy <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_copy <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, v, k ); cudaCheckError( ); } @@ -355,42 +330,42 @@ void Vector_Copy( real * const dest, real const * const v, * dest: vector copied into */ void Vector_Copy_rvec2( rvec2 * const dest, rvec2 const * const v, - unsigned int k ) + unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_copy_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_copy_rvec2 <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, v, k ); cudaCheckError( ); } void Vector_Copy_From_rvec2( real * const dst, rvec2 const * const src, - int index, int k ) + int index, int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_copy_from_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_copy_from_rvec2 <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dst, src, index, k ); cudaCheckError( ); } void Vector_Copy_To_rvec2( rvec2 * const dst, real const * const src, - int index, int k ) + int index, int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_copy_to_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_copy_to_rvec2 <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dst, src, index, k ); cudaCheckError( ); } @@ -406,14 +381,14 @@ void Vector_Copy_To_rvec2( rvec2 * const dst, real const * const src, * dest: with entries scaled */ void Vector_Scale( real * const dest, real c, real const * const v, - unsigned int k ) + unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_vector_scale <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_scale <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, c, v, k ); cudaCheckError( ); } @@ -430,28 +405,28 @@ void Vector_Scale( real * const dest, real c, real const * const v, * dest: vector containing the scaled sum */ void Vector_Sum( real * const dest, real c, real const * const v, - real d, real const * const y, unsigned int k ) + real d, real const * const y, unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_sum <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, c, v, d, y, k ); cudaCheckError( ); } void Vector_Sum_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const v, - real d0, real d1, rvec2 const * const y, unsigned int k ) + real d0, real d1, rvec2 const * const y, unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_sum_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_sum_rvec2 <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, c0, c1, v, d0, d1, y, k ); cudaCheckError( ); } @@ -468,14 +443,14 @@ void Vector_Sum_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const * dest: vector to accumulate with the scaled sum */ void Vector_Add( real * const dest, real c, real const * const v, - unsigned int k ) + unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_add <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_add <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, c, v, k ); cudaCheckError( ); } @@ -492,14 +467,14 @@ void Vector_Add( real * const dest, real c, real const * const v, * dest: vector to accumulate with the scaled sum */ void Vector_Add_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const v, - unsigned int k ) + unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_add_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_add_rvec2 <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, c0, c1, v, k ); cudaCheckError( ); } @@ -514,14 +489,14 @@ void Vector_Add_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const * dest: vector with the result of the multiplication */ void Vector_Mult( real * const dest, real const * const v1, - real const * const v2, unsigned int k ) + real const * const v2, unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_mult <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_mult <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, v1, v2, k ); cudaCheckError( ); } @@ -536,14 +511,14 @@ void Vector_Mult( real * const dest, real const * const v1, * dest: vector with the result of the multiplication */ void Vector_Mult_rvec2( rvec2 * const dest, rvec2 const * const v1, - rvec2 const * const v2, unsigned int k ) + rvec2 const * const v2, unsigned int k, cudaStream_t s ) { int blocks; blocks = (k / DEF_BLOCK_SIZE) + ((k % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_vector_mult_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> + k_vector_mult_rvec2 <<< blocks, DEF_BLOCK_SIZE, 0, s >>> ( dest, v1, v2, k ); cudaCheckError( ); } @@ -556,13 +531,14 @@ void Vector_Mult_rvec2( rvec2 * const dest, rvec2 const * const v1, * v1: dense vector * k: number of entries in the vector * comm: MPI communicator + * s: CUDA stream * output: * norm: 2-norm */ real Norm( storage * const workspace, - real const * const v1, unsigned int k, MPI_Comm comm ) + real const * const v1, unsigned int k, MPI_Comm comm, cudaStream_t s ) { - return SQRT( Dot( workspace, v1, v1, k, comm ) ); + return SQRT( Dot( workspace, v1, v1, k, comm, s ) ); } @@ -578,7 +554,7 @@ real Norm( storage * const workspace, */ real Dot( storage * const workspace, real const * const v1, real const * const v2, - unsigned int k, MPI_Comm comm ) + unsigned int k, MPI_Comm comm, cudaStream_t s ) { int ret; real sum, *spad; @@ -593,7 +569,7 @@ real Dot( storage * const workspace, Vector_Mult( spad, v1, v2, k ); /* local reduction (sum) on device */ - Cuda_Reduction_Sum( spad, &spad[k], k ); + Cuda_Reduction_Sum( spad, &spad[k], k, s ); /* global reduction (sum) of local device sums and store on host */ //#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT @@ -622,7 +598,7 @@ real Dot( storage * const workspace, */ real Dot_local( storage * const workspace, real const * const v1, real const * const v2, - unsigned int k ) + unsigned int k, cudaStream_t s ) { real sum, *spad; @@ -633,7 +609,7 @@ real Dot_local( storage * const workspace, Vector_Mult( spad, v1, v2, k ); /* local reduction (sum) on device */ - Cuda_Reduction_Sum( spad, &spad[k], k ); + Cuda_Reduction_Sum( spad, &spad[k], k, s ); //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI sCudaMemcpy( &sum, &spad[k], sizeof(real), @@ -652,10 +628,9 @@ real Dot_local( storage * const workspace, * output: * dot: inner product of the two vector */ -void Dot_local_rvec2( control_params const * const control, - storage * const workspace, +void Dot_local_rvec2( storage * const workspace, rvec2 const * const v1, rvec2 const * const v2, - unsigned int k, real * sum1, real * sum2 ) + unsigned int k, real * sum1, real * sum2, cudaStream_t s ) { int blocks; rvec2 sum, *spad; @@ -670,15 +645,15 @@ void Dot_local_rvec2( control_params const * const control, Vector_Mult_rvec2( spad, v1, v2, k ); /* local reduction (sum) on device */ -// Cuda_Reduction_Sum( spad, &spad[k], k ); +// Cuda_Reduction_Sum( spad, &spad[k], k, s ); k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, - sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(rvec2) * (DEF_BLOCK_SIZE / 32), s >>> ( spad, &spad[k], k ); cudaCheckError( ); k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32, - sizeof(rvec2) * ((blocks + 31) / 32) >>> + sizeof(rvec2) * ((blocks + 31) / 32), s >>> ( &spad[k], &spad[k + blocks], blocks ); cudaCheckError( ); diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h index 61846fb6..4d893dae 100644 --- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h +++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h @@ -4,58 +4,52 @@ #include "../reax_types.h" -int Vector_isZero( real const * const, unsigned int ); - -void Vector_MakeZero( real * const, unsigned int ); +void Vector_MakeZero( real * const, unsigned int, + cudaStream_t = cudaStreamDefault ); void Vector_Copy( real * const, real const * const, - unsigned int ); + unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Copy_rvec2( rvec2 * const, rvec2 const * const, - 
unsigned int ); + unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Copy_From_rvec2( real * const, rvec2 const * const, - int, int ); + int, int, cudaStream_t = cudaStreamDefault ); void Vector_Copy_To_rvec2( rvec2 * const, real const * const, - int, int ); + int, int, cudaStream_t = cudaStreamDefault ); void Vector_Scale( real * const, real, real const * const, - unsigned int ); + unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Sum( real * const, real, real const * const, - real, real const * const, unsigned int ); + real, real const * const, unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Sum_rvec2( rvec2 * const, real, real, rvec2 const * const, - real, real, rvec2 const * const, unsigned int ); + real, real, rvec2 const * const, unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Add( real * const, real, real const * const, - unsigned int ); + unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Add_rvec2( rvec2 * const, real, real, rvec2 const * const, - unsigned int ); + unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Mult( real * const, real const * const, - real const * const, unsigned int ); + real const * const, unsigned int, cudaStream_t = cudaStreamDefault ); void Vector_Mult_rvec2( rvec2 * const, rvec2 const * const, - rvec2 const * const, unsigned int ); + rvec2 const * const, unsigned int, cudaStream_t = cudaStreamDefault ); -real Norm( storage * const, - real const * const, unsigned int, MPI_Comm ); +real Norm( storage * const, real const * const, unsigned int, MPI_Comm, cudaStream_t ); -real Dot( storage * const, - real const * const, real const * const, - unsigned int, MPI_Comm ); +real Dot( storage * const, real const * const, real const * const, + unsigned int, MPI_Comm, cudaStream_t ); -real Dot_local( storage * const, - real const * const, real const * const, - unsigned int ); +real Dot_local( storage * const, real const * const, real const * const, + unsigned int, cudaStream_t ); -void Dot_local_rvec2( control_params const * const, - storage * const, - rvec2 const * const, rvec2 const * const, - unsigned int, real *, real * ); +void Dot_local_rvec2( storage * const, rvec2 const * const, rvec2 const * const, + unsigned int, real *, real *, cudaStream_t ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu index 1e11bc32..6cfecd92 100644 --- a/PG-PuReMD/src/cuda/cuda_environment.cu +++ b/PG-PuReMD/src/cuda/cuda_environment.cu @@ -16,10 +16,11 @@ static void compute_nearest_multiple_32( int blocks, int *result ) } -extern "C" void Cuda_Setup_Environment( int rank, int nprocs, int gpus_per_node ) +extern "C" void Cuda_Setup_Environment( reax_system const * const system, + control_params * const control ) { - int deviceCount; + int i, least_priority, greatest_priority, deviceCount; cudaError_t ret; ret = cudaGetDeviceCount( &deviceCount ); @@ -29,30 +30,66 @@ extern "C" void Cuda_Setup_Environment( int rank, int nprocs, int gpus_per_node fprintf( stderr, "[ERROR] no CUDA capable device(s) found. Terminating...\n" ); exit( CANNOT_INITIALIZE ); } - else if ( deviceCount < gpus_per_node || gpus_per_node < 1 ) + else if ( deviceCount < control->gpus_per_node || control->gpus_per_node < 1 ) { fprintf( stderr, "[ERROR] invalid number of CUDA capable devices requested (gpus_per_node = %d). 
Terminating...\n", - gpus_per_node ); + control->gpus_per_node ); exit( INVALID_INPUT ); } /* assign the GPU for each process */ //TODO: handle condition where # CPU procs > # GPUs - ret = cudaSetDevice( rank % gpus_per_node ); + ret = cudaSetDevice( system->my_rank % control->gpus_per_node ); if ( ret == cudaErrorInvalidDevice ) { fprintf( stderr, "[ERROR] invalid CUDA device ID set (%d). Terminating...\n", - rank % gpus_per_node ); + system->my_rank % control->gpus_per_node ); exit( CANNOT_INITIALIZE ); } else if ( ret == cudaErrorDeviceAlreadyInUse ) { fprintf( stderr, "[ERROR] CUDA device with specified ID already in use (%d). Terminating...\n", - rank % gpus_per_node ); + system->my_rank % control->gpus_per_node ); exit( CANNOT_INITIALIZE ); } + ret = cudaDeviceGetStreamPriorityRange( &least_priority, &greatest_priority ); + + if ( ret != cudaSuccess ) + { + fprintf( stderr, "[ERROR] CUDA strema priority query failed. Terminating...\n" ); + exit( CANNOT_INITIALIZE ); + } + + /* stream assignment: + * 0: init bonds, bond order (uncorrected/corrected), lone pair/over coord/under coord + * 1: (after bond order) bonds, valence angels, torsions + * 2: init hbonds, (after bonds) hbonds + * 3: van der Waals + * 4: init CM, CM, Coulomb + */ + for ( i = 0; i < CUDA_MAX_STREAMS; ++i ) + { + /* all non-CM streams of equal priority */ + if ( i < CUDA_MAX_STREAMS - 1 ) + { + ret = cudaStreamCreateWithPriority( &control->streams[i], cudaStreamNonBlocking, least_priority ); + } + /* CM gets highest priority due to MPI comms and cudaMemcpy's */ + else + { + ret = cudaStreamCreateWithPriority( &control->streams[i], cudaStreamNonBlocking, greatest_priority ); + } + + if ( ret != cudaSuccess ) + { + fprintf( stderr, "[ERROR] CUDA strema creation failed (%d). Terminating...\n", + i ); + exit( CANNOT_INITIALIZE ); + } + } + //TODO: revisit additional device configurations // cudaDeviceSetLimit( cudaLimitStackSize, 8192 ); // cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 ); @@ -70,8 +107,23 @@ extern "C" void Cuda_Init_Block_Sizes( reax_system *system, } -extern "C" void Cuda_Cleanup_Environment( ) +extern "C" void Cuda_Cleanup_Environment( control_params const * const control ) { + int i; + cudaError_t ret; + + for ( i = 0; i < CUDA_MAX_STREAMS; ++i ) + { + ret = cudaStreamDestroy( control->streams[i] ); + + if ( ret != cudaSuccess ) + { + fprintf( stderr, "[ERROR] CUDA strema destruction failed (%d). 
Terminating...\n", + i ); + exit( CANNOT_INITIALIZE ); + } + } + cudaDeviceReset( ); cudaDeviceSynchronize( ); } diff --git a/PG-PuReMD/src/cuda/cuda_environment.h b/PG-PuReMD/src/cuda/cuda_environment.h index 7141590d..a39eaf0e 100644 --- a/PG-PuReMD/src/cuda/cuda_environment.h +++ b/PG-PuReMD/src/cuda/cuda_environment.h @@ -9,11 +9,12 @@ extern "C" { #endif -void Cuda_Setup_Environment( int, int, int ); +void Cuda_Setup_Environment( reax_system const * const, + control_params * const ); void Cuda_Init_Block_Sizes( reax_system *, control_params * ); -void Cuda_Cleanup_Environment( ); +void Cuda_Cleanup_Environment( control_params const * const ); #ifdef __cplusplus } diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index ed76e02c..7c828199 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -1617,27 +1617,28 @@ CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list hbond_list, int #if defined(DEBUG_FOCUS) -static void Print_Forces( reax_system *system ) +static void Print_Forces( reax_system *system, control_params *control ) { int blocks; blocks = (system->n) / DEF_BLOCK_SIZE + (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1); - k_print_forces <<< blocks, DEF_BLOCK_SIZE >>> + k_print_forces <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, workspace->d_workspace->f, system->n ); cudaCheckError( ); } -static void Print_HBonds( reax_system *system, int step ) +static void Print_HBonds( reax_system *system, control_params *control, + int step ) { int blocks; blocks = (system->n) / DEF_BLOCK_SIZE + (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1); - k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>> + k_print_hbonds <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, *(lists[HBONDS]), system->n, system->my_rank, step ); cudaCheckError( ); } @@ -1647,7 +1648,8 @@ static void Print_HBonds( reax_system *system, int step ) /* Initialize indices for far neighbors list post reallocation * * system: atomic system info. */ -void Cuda_Init_Neighbor_Indices( reax_system *system, reax_list *far_nbr_list ) +void Cuda_Init_Neighbor_Indices( reax_system *system, control_params *control, + reax_list *far_nbr_list ) { int blocks; @@ -1655,11 +1657,13 @@ void Cuda_Init_Neighbor_Indices( reax_system *system, reax_list *far_nbr_list ) + (far_nbr_list->n % DEF_BLOCK_SIZE == 0 ? 0 : 1); /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbr_list->index, far_nbr_list->n ); + Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbr_list->index, + far_nbr_list->n, control->streams[0] ); /* init end_indices */ - k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_far_nbrs, far_nbr_list->index, far_nbr_list->end_index, far_nbr_list->n ); + k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> + ( system->d_far_nbrs, far_nbr_list->index, far_nbr_list->end_index, + far_nbr_list->n ); cudaCheckError( ); } @@ -1667,8 +1671,8 @@ void Cuda_Init_Neighbor_Indices( reax_system *system, reax_list *far_nbr_list ) /* Initialize indices for far hydrogen bonds list post reallocation * * system: atomic system info. 
*/ -void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace, - reax_list *hbond_list ) +void Cuda_Init_HBond_Indices( reax_system *system, control_params *control, + storage *workspace, reax_list *hbond_list ) { int blocks, *temp; @@ -1681,9 +1685,10 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace, temp = (int *) workspace->scratch; /* init indices and end_indices */ - Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap ); + Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap, + control->streams[0] ); - k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>> + k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, hbond_list->index, hbond_list->end_index, system->total_cap ); cudaCheckError( ); @@ -1693,7 +1698,8 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace, /* Initialize indices for far bonds list post reallocation * * system: atomic system info. */ -void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list ) +void Cuda_Init_Bond_Indices( reax_system *system, control_params * control, + reax_list * bond_list ) { int blocks; @@ -1701,10 +1707,11 @@ void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list ) (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1); /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, system->total_cap ); + Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, system->total_cap, + control->streams[0] ); /* init end_indices */ - k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> + k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_bonds, bond_list->index, bond_list->end_index, system->total_cap ); cudaCheckError( ); } @@ -1714,7 +1721,8 @@ void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list ) * * system: atomic system info. * H: charge matrix */ -void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H ) +void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, control_params *control, + sparse_matrix *H ) { int blocks; @@ -1722,11 +1730,12 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H ) + (H->n_max % DEF_BLOCK_SIZE == 0 ? 
0 : 1); /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, H->n_max ); + Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, H->n_max, + control->streams[0] ); //TODO: not needed for full format (Init_Forces sets H->end) /* init end_indices */ - k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> + k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_cm_entries, H->start, H->end, H->n_max ); cudaCheckError( ); } @@ -1745,7 +1754,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX ) { - k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>> + k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, (control_params *) control->d_control_params, *(lists[FAR_NBRS]), workspace->d_workspace->H.n, workspace->d_workspace->H.n_max, @@ -1753,7 +1763,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, } else { - k_estimate_storages_cm_full <<< blocks, DEF_BLOCK_SIZE >>> + k_estimate_storages_cm_full <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( (control_params *) control->d_control_params, *(lists[FAR_NBRS]), workspace->d_workspace->H.n, workspace->d_workspace->H.n_max, @@ -1761,8 +1772,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, } cudaCheckError( ); - Cuda_Reduction_Sum( system->d_max_cm_entries, - system->d_total_cm_entries, workspace->d_workspace->H.n_max ); + Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, + workspace->d_workspace->H.n_max, control->streams[0] ); sCudaMemcpy( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } @@ -1772,7 +1783,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, blocks = system->total_cap / DEF_BLOCK_SIZE + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1); - k_estimate_storage_bonds <<< blocks, DEF_BLOCK_SIZE >>> + k_estimate_storage_bonds <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, (control_params *) control->d_control_params, *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1781,7 +1793,7 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, cudaCheckError( ); Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds, - system->total_cap ); + system->total_cap, control->streams[0] ); sCudaMemcpy( &system->total_bonds, system->d_total_bonds, sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } @@ -1791,7 +1803,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, blocks = system->total_cap / DEF_BLOCK_SIZE + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 
0 : 1); - k_estimate_storage_hbonds <<< blocks, DEF_BLOCK_SIZE >>> + k_estimate_storage_hbonds <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, (control_params *) control->d_control_params, *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1800,7 +1813,7 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, cudaCheckError( ); Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds, - system->total_cap ); + system->total_cap, control->streams[0] ); sCudaMemcpy( &system->total_hbonds, system->d_total_hbonds, sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } @@ -1819,7 +1832,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, #endif control->hbond_cut = 0.0; - k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *) control->d_control_params ); + k_disable_hydrogen_bonding <<< 1, 1, 0, control->streams[0] >>> + ( (control_params *) control->d_control_params ); } } @@ -1856,13 +1870,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, if ( renbr == FALSE && dist_done == FALSE ) { -// k_init_dist <<< control->blocks_n, control->block_size_n >>> +// k_init_dist <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); blocks = system->N * 32 / DEF_BLOCK_SIZE + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); - k_init_dist_opt <<< blocks, DEF_BLOCK_SIZE >>> + k_init_dist_opt <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); cudaCheckError( ); @@ -1881,13 +1895,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, /* update num. rows in matrix for this GPU */ workspace->d_workspace->H.n = system->n; - Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H ); + Cuda_Init_Sparse_Matrix_Indices( system, control, &workspace->d_workspace->H ); if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX ) { if ( control->tabulate <= 0 ) { - k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE >>> + k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1895,7 +1909,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } else { - k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE >>> + k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types, @@ -1906,7 +1920,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, { if ( control->tabulate <= 0 ) { -// k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE >>> +// k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, // *(workspace->d_workspace), (control_params *) control->d_control_params, // *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1916,7 +1930,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, + (workspace->d_workspace->H.n_max * 32 % DEF_BLOCK_SIZE == 0 ? 
0 : 1); k_init_cm_full_fs_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), system->reax_param.num_atom_types, @@ -1924,7 +1939,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } else { - k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE >>> + k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, *(workspace->d_workspace), (control_params *) control->d_control_params, *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types, @@ -1943,13 +1959,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, blocks = system->total_cap / DEF_BLOCK_SIZE + ((system->total_cap % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE >>> + k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( *(workspace->d_workspace), system->total_cap ); cudaCheckError( ); - Cuda_Init_Bond_Indices( system, lists[BONDS] ); + Cuda_Init_Bond_Indices( system, control, lists[BONDS] ); -// k_init_bonds <<< control->blocks_n, control->block_size_n >>> +// k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // system->reax_param.d_tbp, *(workspace->d_workspace), // (control_params *) control->d_control_params, @@ -1962,7 +1978,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); k_init_bonds_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, @@ -1974,9 +1991,9 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE ) { - Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS] ); + Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] ); -// k_init_hbonds <<< control->blocks_n, control->block_size_n >>> +// k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // (control_params *) control->d_control_params, // *(lists[FAR_NBRS]), *(lists[HBONDS]), @@ -1988,7 +2005,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 
0 : 1); k_init_hbonds_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, (control_params *) control->d_control_params, *(lists[FAR_NBRS]), *(lists[HBONDS]), @@ -2063,7 +2081,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, if ( ret == SUCCESS ) { - k_update_sym_dbond_indices <<< control->blocks_n, control->block_size_n >>> + k_update_sym_dbond_indices <<< control->blocks_n, control->block_size_n, + 0, control->streams[0] >>> ( *(lists[BONDS]), system->N ); cudaCheckError( ); @@ -2074,7 +2093,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); /* make hbond_list symmetric */ - k_update_sym_hbond_indices_opt <<< blocks, DEF_BLOCK_SIZE >>> + k_update_sym_hbond_indices_opt <<< blocks, DEF_BLOCK_SIZE, + 0, control->streams[0] >>> ( system->d_my_atoms, *(lists[HBONDS]), system->N ); cudaCheckError( ); } @@ -2131,13 +2151,13 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, if ( renbr == FALSE && dist_done == FALSE ) { -// k_init_dist <<< control->blocks_n, control->block_size_n >>> +// k_init_dist <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); blocks = system->N * 32 / DEF_BLOCK_SIZE + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); - k_init_dist_opt <<< blocks, DEF_BLOCK_SIZE >>> + k_init_dist_opt <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); cudaCheckError( ); @@ -2153,13 +2173,13 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, blocks = system->total_cap / DEF_BLOCK_SIZE + ((system->total_cap % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE >>> + k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( *(workspace->d_workspace), system->total_cap ); cudaCheckError( ); - Cuda_Init_Bond_Indices( system, lists[BONDS] ); + Cuda_Init_Bond_Indices( system, control, lists[BONDS] ); -// k_init_bonds <<< control->blocks_n, control->block_size_n >>> +// k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // system->reax_param.d_tbp, *(workspace->d_workspace), // (control_params *) control->d_control_params, @@ -2171,7 +2191,8 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, + (control->block_size_n * 32 % DEF_BLOCK_SIZE == 0 ? 
0 : 1); k_init_bonds_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *) control->d_control_params, @@ -2183,9 +2204,9 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE ) { - Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS] ); + Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] ); -// k_init_hbonds <<< control->blocks_n, control->block_size_n >>> +// k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // (control_params *) control->d_control_params, // *(lists[FAR_NBRS]), *(lists[HBONDS]), @@ -2197,7 +2218,8 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); k_init_hbonds_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, (control_params *) control->d_control_params, *(lists[FAR_NBRS]), *(lists[HBONDS]), @@ -2258,7 +2280,8 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, if ( ret == SUCCESS ) { - k_update_sym_dbond_indices <<< control->blocks_n, control->block_size_n >>> + k_update_sym_dbond_indices <<< control->blocks_n, control->block_size_n, + 0, control->streams[0] >>> ( *(lists[BONDS]), system->N ); cudaCheckError( ); @@ -2269,7 +2292,8 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 
0 : 1); /* make hbond_list symmetric */ - k_update_sym_hbond_indices_opt <<< blocks, DEF_BLOCK_SIZE >>> + k_update_sym_hbond_indices_opt <<< blocks, DEF_BLOCK_SIZE, + 0, control->streams[0] >>> ( system->d_my_atoms, *(lists[HBONDS]), system->N ); cudaCheckError( ); } @@ -2364,7 +2388,7 @@ static void Cuda_Compute_Total_Force( reax_system *system, control_params *contr sCudaMemcpy( workspace->d_workspace->f, f, sizeof(rvec) * system->N, cudaMemcpyHostToDevice, __FILE__, __LINE__ ); - Cuda_Total_Forces_Part2( system, workspace ); + Cuda_Total_Forces_Part2( system, control, workspace ); } diff --git a/PG-PuReMD/src/cuda/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h index 6b50adda..3347db7a 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.h +++ b/PG-PuReMD/src/cuda/cuda_forces.h @@ -5,14 +5,15 @@ #include "../reax_types.h" -void Cuda_Init_Neighbor_Indices( reax_system *, reax_list * ); +void Cuda_Init_Neighbor_Indices( reax_system *, control_params *, reax_list * ); -void Cuda_Init_HBond_Indices( reax_system *, storage *, +void Cuda_Init_HBond_Indices( reax_system *, control_params *, storage *, reax_list * ); -void Cuda_Init_Bond_Indices( reax_system *, reax_list * ); +void Cuda_Init_Bond_Indices( reax_system *, control_params *, reax_list * ); -void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * ); +void Cuda_Init_Sparse_Matrix_Indices( reax_system *, control_params *, + sparse_matrix * ); void Cuda_Init_Three_Body_Indices( int *, int, reax_list ** ); diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index 1196c9a0..ea9f190a 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -767,7 +767,8 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, if ( control->virial == 1 ) { - k_hydrogen_bonds_virial_part1 <<< control->blocks, control->block_size >>> + k_hydrogen_bonds_virial_part1 <<< control->blocks, control->block_size, + 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_hbp, system->reax_param.d_gp, (control_params *) control->d_control_params, @@ -785,7 +786,7 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, } else { -// k_hydrogen_bonds_part1 <<< control->blocks, control->block_size >>> +// k_hydrogen_bonds_part1 <<< control->blocks, control->block_size, 0, control->streams[0] >>> // ( system->d_my_atoms, system->reax_param.d_sbp, // system->reax_param.d_hbp, system->reax_param.d_gp, // (control_params *) control->d_control_params, @@ -804,7 +805,8 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 
0 : 1); k_hydrogen_bonds_part1_opt <<< blocks, DEF_BLOCK_SIZE, - sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> + sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32), + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_hbp, system->reax_param.d_gp, (control_params *) control->d_control_params, @@ -833,12 +835,14 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, rvec_spad = (rvec *) (&spad[system->n]); k_reduction_rvec <<< control->blocks, control->block_size, - sizeof(rvec) * (control->block_size / 32) >>> + sizeof(rvec) * (control->block_size / 32), + control->streams[0] >>> ( rvec_spad, &rvec_spad[system->n], system->n ); cudaCheckError( ); k_reduction_rvec <<< 1, control->blocks_pow_2, - sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + sizeof(rvec) * (control->blocks_pow_2 / 32), + control->streams[0] >>> ( &rvec_spad[system->n], &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks ); @@ -850,7 +854,8 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, #endif #if !defined(CUDA_ACCUM_ATOMIC) - k_hydrogen_bonds_part2 <<< control->blocks, control->block_size >>> + k_hydrogen_bonds_part2 <<< control->blocks, control->block_size, 0, + control->streams[0] >>> ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]), system->n ); cudaCheckError( ); @@ -858,10 +863,11 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, // hnbrs_blocks = (system->n * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) + // (((system->n * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1); - k_hydrogen_bonds_part3 <<< control->blocks, control->block_size >>> + k_hydrogen_bonds_part3 <<< control->blocks, control->block_size, 0, + control->streams[0] >>> ( system->d_my_atoms, *(workspace->d_workspace), *(lists[HBONDS]), system->n ); // k_hydrogen_bonds_part3_opt <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, -// HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>> +// sizeof(rvec) * HB_POST_PROC_BLOCK_SIZE, control->streams[0] >>> // ( system->d_my_atoms, *(workspace->d_workspace), *(lists[HBONDS]), system->n ); cudaCheckError( ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index 626bff64..8fefddf2 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -180,7 +180,7 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control, workspace->d_workspace->realloc.thbody = FALSE; workspace->d_workspace->realloc.gcell_atoms = 0; - Cuda_Reset_Workspace( system, workspace ); + Cuda_Reset_Workspace( system, control, workspace ); Init_Taper( control, workspace->d_workspace, mpi_data ); } @@ -190,13 +190,13 @@ void Cuda_Init_Lists( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, mpi_datatypes *mpi_data ) { - Cuda_Estimate_Num_Neighbors( system, data ); + Cuda_Estimate_Num_Neighbors( system, control, data ); Cuda_Make_List( system->total_cap, system->total_far_nbrs, TYP_FAR_NEIGHBOR, lists[FAR_NBRS] ); - Cuda_Init_Neighbor_Indices( system, lists[FAR_NBRS] ); + Cuda_Init_Neighbor_Indices( system, control, lists[FAR_NBRS] ); - Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists ); /* first call to Cuda_Estimate_Storages requires setting these manually before 
allocation */ workspace->d_workspace->H.n = system->n; @@ -209,15 +209,15 @@ void Cuda_Init_Lists( reax_system *system, control_params *control, Cuda_Allocate_Matrix( &workspace->d_workspace->H, system->n, system->local_cap, system->total_cm_entries, SYM_FULL_MATRIX ); - Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H ); + Cuda_Init_Sparse_Matrix_Indices( system, control, &workspace->d_workspace->H ); Cuda_Make_List( system->total_cap, system->total_bonds, TYP_BOND, lists[BONDS] ); - Cuda_Init_Bond_Indices( system, lists[BONDS] ); + Cuda_Init_Bond_Indices( system, control, lists[BONDS] ); if ( control->hbond_cut > 0.0 && system->numH > 0 ) { Cuda_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, lists[HBONDS] ); - Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS] ); + Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] ); } /* 3bodies list: since a more accurate estimate of the num. diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu index acd257f5..c27dd2e9 100644 --- a/PG-PuReMD/src/cuda/cuda_integrate.cu +++ b/PG-PuReMD/src/cuda/cuda_integrate.cu @@ -183,61 +183,65 @@ CUDA_GLOBAL void k_scale_velocities_npt( reax_atom *my_atoms, real lambda, } -void Velocity_Verlet_Part1( reax_system *system, real dt ) +static void Velocity_Verlet_Part1( reax_system *system, control_params *control, real dt ) { int blocks; blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_velocity_verlet_part1 <<< blocks, DEF_BLOCK_SIZE >>> + k_velocity_verlet_part1 <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n ); cudaCheckError( ); } -void Velocity_Verlet_Part2( reax_system *system, real dt ) +static void Velocity_Verlet_Part2( reax_system *system, control_params *control, real dt ) { int blocks; blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_velocity_verlet_part2 <<< blocks, DEF_BLOCK_SIZE >>> + k_velocity_verlet_part2 <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n ); cudaCheckError( ); } -void Velocity_Verlet_Nose_Hoover_NVT_Part1( reax_system *system, real dt ) +static void Velocity_Verlet_Nose_Hoover_NVT_Part1( reax_system *system, + control_params *control, real dt ) { int blocks; blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>> + k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE, 0, + control->streams[0] >>> ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n ); cudaCheckError( ); } -void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, storage *workspace, real dt, real v_xi ) +static void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, control_params *control, + storage *workspace, real dt, real v_xi ) { int blocks; blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index acd257f5..c27dd2e9 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -183,61 +183,65 @@ CUDA_GLOBAL void k_scale_velocities_npt( reax_atom *my_atoms, real lambda,
 }
 
 
-void Velocity_Verlet_Part1( reax_system *system, real dt )
+static void Velocity_Verlet_Part1( reax_system *system, control_params *control, real dt )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_velocity_verlet_part1 <<< blocks, DEF_BLOCK_SIZE >>>
+    k_velocity_verlet_part1 <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
     cudaCheckError( );
 }
 
 
-void Velocity_Verlet_Part2( reax_system *system, real dt )
+static void Velocity_Verlet_Part2( reax_system *system, control_params *control, real dt )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_velocity_verlet_part2 <<< blocks, DEF_BLOCK_SIZE >>>
+    k_velocity_verlet_part2 <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
     cudaCheckError( );
 }
 
 
-void Velocity_Verlet_Nose_Hoover_NVT_Part1( reax_system *system, real dt )
+static void Velocity_Verlet_Nose_Hoover_NVT_Part1( reax_system *system,
+        control_params *control, real dt )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>>
+    k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE, 0,
+            control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
     cudaCheckError( );
 }
 
 
-void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, storage *workspace, real dt, real v_xi )
+static void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, control_params *control,
+        storage *workspace, real dt, real v_xi )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>>
+    k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp,
           dt, v_xi, system->n );
     cudaCheckError( );
 }
 
 
-real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *workspace,
-        real dt, real v_xi_old, real *d_my_ekin, real *d_total_my_ekin )
+static real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system,
+        control_params *control, storage *workspace, real dt, real v_xi_old,
+        real *d_my_ekin, real *d_total_my_ekin )
 {
     int blocks;
     real my_ekin;
@@ -245,12 +249,12 @@ real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *worksp
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_velocity_verlet_nose_hoover_nvt_part3 <<< blocks, DEF_BLOCK_SIZE >>>
+    k_velocity_verlet_nose_hoover_nvt_part3 <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, workspace->v_const,
           system->reax_param.d_sbp, dt, v_xi_old, d_my_ekin, system->n );
     cudaCheckError( );
 
-    Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n );
+    Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n, control->streams[0] );
 
     sCudaMemcpy( &my_ekin, d_total_my_ekin, sizeof(real),
             cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
@@ -259,27 +263,30 @@ real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *worksp
 }
 
 
-static void Cuda_Scale_Velocities_Berendsen_NVT( reax_system *system, real lambda )
+static void Cuda_Scale_Velocities_Berendsen_NVT( reax_system *system,
+        control_params * control, real lambda )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_scale_velocites_berendsen_nvt <<< blocks, DEF_BLOCK_SIZE >>>
+    k_scale_velocites_berendsen_nvt <<< blocks, DEF_BLOCK_SIZE,
+            0, control->streams[0] >>>
         ( system->d_my_atoms, lambda, system->n );
     cudaCheckError( );
 }
 
 
-void Cuda_Scale_Velocities_NPT( reax_system *system, real lambda, rvec mu )
+void Cuda_Scale_Velocities_NPT( reax_system *system, control_params * control,
+        real lambda, rvec mu )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_scale_velocities_npt <<< blocks, DEF_BLOCK_SIZE >>>
+    k_scale_velocities_npt <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, lambda, mu[0], mu[1], mu[2], system->n );
     cudaCheckError( );
 }
@@ -300,7 +307,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system *system, control_params *control,
 
     if ( verlet_part1_done == FALSE )
     {
-        Velocity_Verlet_Part1( system, dt );
+        Velocity_Verlet_Part1( system, control, dt );
 
         Cuda_Reallocate_Part1( system, control, data, workspace, lists, mpi_data );
 
@@ -338,7 +345,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system *system, control_params *control,
 
         if ( renbr == TRUE && gen_nbr_list == FALSE )
         {
-            ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            ret = Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists );
 
             if ( ret == SUCCESS )
             {
@@ -346,7 +353,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system *system, control_params *control,
             }
             else
             {
-                Cuda_Estimate_Num_Neighbors( system, data );
+                Cuda_Estimate_Num_Neighbors( system, control, data );
             }
         }
 
@@ -358,7 +365,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system *system, control_params *control,
 
     if ( ret == SUCCESS )
     {
-        Velocity_Verlet_Part2( system, dt );
+        Velocity_Verlet_Part2( system, control, dt );
 
         verlet_part1_done = FALSE;
         cuda_copy = FALSE;
@@ -390,7 +397,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
 
     if ( verlet_part1_done == FALSE )
     {
-        Velocity_Verlet_Nose_Hoover_NVT_Part1( system, dt );
+        Velocity_Verlet_Nose_Hoover_NVT_Part1( system, control, dt );
 
         /* Compute xi(t + dt) */
         therm->xi += therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi;
@@ -436,7 +443,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
 
         if ( renbr == TRUE && gen_nbr_list == FALSE )
         {
-            ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            ret = Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists );
 
             if ( ret == SUCCESS )
             {
@@ -444,7 +451,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
             }
             else
             {
-                Cuda_Estimate_Num_Neighbors( system, data );
+                Cuda_Estimate_Num_Neighbors( system, control, data );
             }
         }
 
@@ -457,7 +464,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
     if ( ret == SUCCESS )
     {
         /* Compute iteration constants for each atom's velocity */
-        Velocity_Verlet_Nose_Hoover_NVT_Part2( system,
+        Velocity_Verlet_Nose_Hoover_NVT_Part2( system, control,
                 workspace->d_workspace, dt, therm->v_xi );
 
         v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
@@ -476,7 +483,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
             /* new values become old in this iteration */
             v_xi_old = v_xi_new;
 
-            my_ekin = Velocity_Verlet_Nose_Hoover_NVT_Part3( system,
+            my_ekin = Velocity_Verlet_Nose_Hoover_NVT_Part3( system, control,
                     workspace->d_workspace, dt, v_xi_old, d_my_ekin, d_total_my_ekin );
 
@@ -524,7 +531,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
 
     if ( verlet_part1_done == FALSE )
     {
-        Velocity_Verlet_Part1( system, dt );
+        Velocity_Verlet_Part1( system, control, dt );
 
         Cuda_Reallocate_Part1( system, control, data, workspace, lists, mpi_data );
 
@@ -567,7 +574,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
 
         if ( renbr == TRUE && gen_nbr_list == FALSE )
         {
-            ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            ret = Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists );
 
             if ( ret == SUCCESS )
             {
@@ -575,7 +582,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
             }
             else
             {
-                Cuda_Estimate_Num_Neighbors( system, data );
+                Cuda_Estimate_Num_Neighbors( system, control, data );
             }
         }
 
@@ -587,8 +594,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
 
     if ( ret == SUCCESS )
     {
-        /* velocity verlet, 2nd part */
-        Velocity_Verlet_Part2( system, dt );
+        Velocity_Verlet_Part2( system, control, dt );
 
         /* temperature scaler */
         Cuda_Compute_Kinetic_Energy( system, control, workspace,
@@ -606,7 +612,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
         lambda = SQRT( lambda );
 
         /* Scale velocities and positions at t+dt */
-        Cuda_Scale_Velocities_Berendsen_NVT( system, lambda );
+        Cuda_Scale_Velocities_Berendsen_NVT( system, control, lambda );
 
         Cuda_Compute_Kinetic_Energy( system, control, workspace,
                 data, mpi_data->comm_mesh3D );
@@ -638,7 +644,7 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
 
     if ( verlet_part1_done == FALSE )
     {
-        Velocity_Verlet_Part1( system, dt );
+        Velocity_Verlet_Part1( system, control, dt );
 
         Cuda_Reallocate_Part1( system, control, data, workspace, lists, mpi_data );
 
@@ -681,7 +687,7 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
 
         if ( renbr == TRUE && gen_nbr_list == FALSE )
         {
-            ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            ret = Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists );
 
             if ( ret == SUCCESS )
             {
@@ -689,7 +695,7 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
             }
             else
             {
-                Cuda_Estimate_Num_Neighbors( system, data );
+                Cuda_Estimate_Num_Neighbors( system, control, data );
             }
         }
 
@@ -701,7 +707,7 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
 
     if ( ret == SUCCESS )
     {
-        Velocity_Verlet_Part2( system, dt );
+        Velocity_Verlet_Part2( system, control, dt );
 
         Cuda_Compute_Kinetic_Energy( system, control, workspace,
                 data, mpi_data->comm_mesh3D );
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_integrate.h
index 20c742fe..8201f7e7 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.h
+++ b/PG-PuReMD/src/cuda/cuda_integrate.h
@@ -25,7 +25,7 @@
 
 #include "../reax_types.h"
 
-void Cuda_Scale_Velocities_NPT( reax_system *, real, rvec );
+void Cuda_Scale_Velocities_NPT( reax_system *, control_params *, real, rvec );
 
 int Cuda_Velocity_Verlet_NVE( reax_system*, control_params*,
         simulation_data*, storage*, reax_list**, output_controls*,
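The integrator helpers above change signature only to thread control through
to control->streams[0]; they are also marked static now that they are called
solely within cuda_integrate.cu. The shape they all share, sketched with a
hypothetical kernel k_step (the block count is an integer ceiling of
n / DEF_BLOCK_SIZE):

    static void Example_Step( reax_system *system, control_params *control, real dt )
    {
        int blocks;

        /* ceil( system->n / DEF_BLOCK_SIZE ) without floating point */
        blocks = system->n / DEF_BLOCK_SIZE
            + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

        k_step <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
            ( system->d_my_atoms, dt, system->n );
        cudaCheckError( );
    }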
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
index abac8b22..9ab23c44 100644
--- a/PG-PuReMD/src/cuda/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -564,7 +564,7 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control,
             0, sizeof(real), "Cuda_Compute_Atom_Energy::e_un" );
 #endif
 
-//    k_atom_energy_part1 <<< control->blocks, control->block_size >>>
+//    k_atom_energy_part1 <<< control->blocks, control->block_size, 0, control->streams[0] >>>
 //        ( system->d_my_atoms, system->reax_param.d_gp,
 //          system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace),
 //          *(lists[BONDS]), system->n, system->reax_param.num_atom_types,
@@ -582,7 +582,8 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control,
         + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
     k_atom_energy_part1_opt <<< blocks, DEF_BLOCK_SIZE,
-            sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>>
+            sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32),
+            control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_gp,
           system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace),
           *(lists[BONDS]), system->n, system->reax_param.num_atom_types,
@@ -597,7 +598,8 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control,
     cudaCheckError( );
 
 #if !defined(CUDA_ACCUM_ATOMIC)
-    k_atom_energy_part2 <<< control->blocks, control->block_size >>>
+    k_atom_energy_part2 <<< control->blocks, control->block_size, 0,
+            control->streams[0] >>>
         ( *(lists[BONDS]), *(workspace->d_workspace), system->n );
     cudaCheckError( );
 
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index bba3f788..f62d3043 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -492,7 +492,8 @@ CUDA_GLOBAL void k_estimate_neighbors_full( reax_atom *my_atoms,
 
 
 extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
-        simulation_data *data, storage *workspace, reax_list **lists )
+        control_params *control, simulation_data *data, storage *workspace,
+        reax_list **lists )
 {
     int blocks, ret, ret_far_nbr;
 #if defined(LOG_PERFORMANCE)
@@ -517,7 +518,7 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
     blocks = (system->N / NBRS_BLOCK_SIZE)
         + ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
 
-    k_generate_neighbor_lists_full <<< blocks, NBRS_BLOCK_SIZE >>>
+    k_generate_neighbor_lists_full <<< blocks, NBRS_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, system->my_ext_box,
           system->d_my_grid, *(lists[FAR_NBRS]),
           system->n, system->N,
@@ -529,7 +530,7 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 //        (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
 //    k_mt_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE,
 //            //sizeof(int) * (NBRS_BLOCK_SIZE + NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM) >>>
-//            sizeof(int) * 2 * NBRS_BLOCK_SIZE >>>
+//            sizeof(int) * 2 * NBRS_BLOCK_SIZE, control->streams[0] >>>
 //        ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
 //          *(lists[FAR_NBRS]), system->n, system->N );
 //    cudaCheckError( );
@@ -565,7 +566,8 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 
 /* Estimate the number of far neighbors for each atoms 
 *
 * system: atomic system info */
-void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
+void Cuda_Estimate_Num_Neighbors( reax_system *system, control_params *control,
+        simulation_data *data )
 {
     int blocks;
 #if defined(LOG_PERFORMANCE)
@@ -583,14 +585,14 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
     blocks = system->total_cap / DEF_BLOCK_SIZE
         + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-    k_estimate_neighbors_full <<< blocks, DEF_BLOCK_SIZE >>>
+    k_estimate_neighbors_full <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( system->d_my_atoms, system->my_ext_box,
           system->d_my_grid, system->n, system->N, system->total_cap,
           system->d_far_nbrs, system->d_max_far_nbrs );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
-            system->total_cap );
+            system->total_cap, control->streams[0] );
 
     sCudaMemcpy( &system->total_far_nbrs, system->d_total_far_nbrs, sizeof(int),
             cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h
index e5c9b536..2051627d 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.h
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.h
@@ -9,14 +9,15 @@
 extern "C"  {
 #endif
 
-int Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *,
-        storage *, reax_list ** );
+int Cuda_Generate_Neighbor_Lists( reax_system *, control_params *,
+        simulation_data *, storage *, reax_list ** );
 
 #ifdef __cplusplus
 }
 #endif
 
-void Cuda_Estimate_Num_Neighbors( reax_system *, simulation_data * );
+void Cuda_Estimate_Num_Neighbors( reax_system *, control_params *,
+        simulation_data * );
 
 #endif
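As the commit message warns, the sCudaMemcpy calls that follow these
reductions (for example, the copy of d_total_far_nbrs above) are still
synchronous default-stream copies. They remain correct only because streams[0]
is assumed to be a blocking stream, so a legacy default-stream memcpy
implicitly waits for it. A sketch of the likely follow-up fix, which makes the
copy stream-ordered instead (error-check wrappers omitted; this is not yet in
the patch):

    cudaMemcpyAsync( &system->total_far_nbrs, system->d_total_far_nbrs,
            sizeof(int), cudaMemcpyDeviceToHost, control->streams[0] );
    /* block the host only on this stream, not on the whole device */
    cudaStreamSynchronize( control->streams[0] );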
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index c619f3eb..b0ed0d34 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -828,8 +828,8 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab_full( reax_atom *my_atoms, 
 }
 
 
-static void Cuda_Compute_Polarization_Energy( reax_system *system, storage *workspace,
-        simulation_data *data )
+static void Cuda_Compute_Polarization_Energy( reax_system *system,
+        control_params *control, storage *workspace, simulation_data *data )
 {
     int blocks;
 #if !defined(CUDA_ACCUM_ATOMIC)
@@ -847,7 +848,8 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system, storage *work
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_compute_polarization_energy <<< blocks, DEF_BLOCK_SIZE >>>
+    k_compute_polarization_energy <<< blocks, DEF_BLOCK_SIZE, 0,
+            control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, system->n,
 #if !defined(CUDA_ACCUM_ATOMIC)
@@ -909,7 +910,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     {
         if ( control->virial == 1 )
         {
-//            k_vdW_coulomb_energy_virial_full <<< control->blocks, control->block_size >>>
+//            k_vdW_coulomb_energy_virial_full <<< control->blocks, control->block_size,
+//                    0, control->streams[0] >>>
 //                ( system->d_my_atoms, system->reax_param.d_tbp,
 //                  system->reax_param.d_gp, (control_params *) control->d_control_params,
 //                  *(workspace->d_workspace), *(lists[FAR_NBRS]),
@@ -924,7 +926,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 //                );
 
             k_vdW_coulomb_energy_virial_full_opt <<< blocks, DEF_BLOCK_SIZE,
-                    sizeof(real) * (DEF_BLOCK_SIZE / 32) >>>
+                    sizeof(real) * (DEF_BLOCK_SIZE / 32),
+                    control->streams[0] >>>
                 ( system->d_my_atoms, system->reax_param.d_tbp,
                   system->reax_param.d_gp, (control_params *) control->d_control_params,
                   *(workspace->d_workspace), *(lists[FAR_NBRS]),
@@ -940,7 +943,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
         }
         else
         {
-//            k_vdW_coulomb_energy_full <<< control->blocks, control->block_size >>>
+//            k_vdW_coulomb_energy_full <<< control->blocks, control->block_size,
+//                    0, control->streams[0] >>>
 //                ( system->d_my_atoms, system->reax_param.d_tbp,
 //                  system->reax_param.d_gp, (control_params *) control->d_control_params,
 //                  *(workspace->d_workspace), *(lists[FAR_NBRS]),
@@ -954,7 +958,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 //                );
 
             k_vdW_coulomb_energy_full_opt <<< blocks, DEF_BLOCK_SIZE,
-                    sizeof(real) * (DEF_BLOCK_SIZE / 32) >>>
+                    sizeof(real) * (DEF_BLOCK_SIZE / 32),
+                    control->streams[0] >>>
                 ( system->d_my_atoms, system->reax_param.d_tbp,
                   system->reax_param.d_gp, (control_params *) control->d_control_params,
                   *(workspace->d_workspace), *(lists[FAR_NBRS]),
@@ -971,7 +976,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     }
     else
     {
-        k_vdW_coulomb_energy_tab_full <<< control->blocks, control->block_size >>>
+        k_vdW_coulomb_energy_tab_full <<< control->blocks, control->block_size,
+                0, control->streams[0] >>>
             ( system->d_my_atoms, system->reax_param.d_gp,
               (control_params *) control->d_control_params,
               *(workspace->d_workspace), *(lists[FAR_NBRS]),
@@ -1010,12 +1016,14 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
         /* reduction for ext_press */
         k_reduction_rvec <<< control->blocks, control->block_size,
-                sizeof(rvec) * (control->block_size / 32) >>>
+                sizeof(rvec) * (control->block_size / 32),
+                control->streams[0] >>>
             ( spad_rvec, &spad_rvec[system->n], system->n );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
-                sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                sizeof(rvec) * (control->blocks_pow_2 / 32),
+                control->streams[0] >>>
             ( &spad_rvec[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
@@ -1025,6 +1033,6 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 
     if ( update_energy == TRUE )
     {
-        Cuda_Compute_Polarization_Energy( system, workspace, data );
+        Cuda_Compute_Polarization_Energy( system, control, workspace, data );
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
index ea0e3f16..75e7ca2c 100644
--- a/PG-PuReMD/src/cuda/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
@@ -33,7 +33,8 @@ CUDA_GLOBAL void k_remove_center_of_mass_velocities( reax_atom *my_atoms,
 extern "C" void Cuda_Remove_CoM_Velocities( reax_system *system,
         control_params *control, simulation_data *data )
 {
-    k_remove_center_of_mass_velocities <<< control->blocks, control->block_size >>>
+    k_remove_center_of_mass_velocities <<< control->blocks, control->block_size,
+            0, control->streams[0] >>>
         ( system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n );
     cudaCheckError( );
 }
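The reduction wrappers below all follow CUB's two-phase convention: the first
call with a NULL temporary buffer only computes the required scratch size, and
the second call actually enqueues the work, now on the caller-supplied stream.
In outline (using plain cudaMalloc/cudaFree in place of the project's
allocation wrappers):

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    /* sizing pass: nothing is launched while d_temp_storage == NULL */
    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes, d_array, d_dest, n, s );
    cudaMalloc( &d_temp_storage, temp_storage_bytes );

    /* execution pass: the reduction is enqueued on stream s */
    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes, d_array, d_dest, n, s );
    cudaFree( d_temp_storage );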
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
index 6336f492..40021b21 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -30,14 +30,14 @@
 *
 * d_array: device array to reduce
 * d_dest: device pointer to hold result of reduction */
-void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
+void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n, cudaStream_t s )
 {
     void *d_temp_storage = NULL;
     size_t temp_storage_bytes = 0;
 
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
-            d_array, d_dest, n );
+            d_array, d_dest, n, s );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -46,7 +46,7 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
 
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
-            d_array, d_dest, n );
+            d_array, d_dest, n, s );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -58,14 +58,14 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
 *
 * d_array: device array to reduce
 * d_dest: device pointer to hold result of reduction */
-void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
+void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n, cudaStream_t s )
 {
     void *d_temp_storage = NULL;
     size_t temp_storage_bytes = 0;
 
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
-            d_array, d_dest, n );
+            d_array, d_dest, n, s );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -74,7 +74,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
-            d_array, d_dest, n );
+            d_array, d_dest, n, s );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -86,7 +86,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
//  *
//  * d_array: device array to reduce
//  * d_dest: device pointer to hold result of reduction */
-//void Cuda_Reduction_Sum( rvec *d_array, rvec *d_dest, size_t n )
+//void Cuda_Reduction_Sum( rvec *d_array, rvec *d_dest, size_t n, cudaStream_t s )
//{
//    void *d_temp_storage = NULL;
//    size_t temp_storage_bytes = 0;
@@ -95,7 +95,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
//
//    /* determine temporary device storage requirements */
//    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
-//            d_array, d_dest, n, sum_op, init );
+//            d_array, d_dest, n, sum_op, init, s );
//    cudaCheckError( );
//
//    /* allocate temporary storage */
@@ -104,7 +104,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
//
//    /* run sum-reduction */
//    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
-//            d_array, d_dest, n, sum_op, init );
+//            d_array, d_dest, n, sum_op, init, s );
//    cudaCheckError( );
//
//    /* deallocate temporary storage */
@@ -116,14 +116,14 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 *
 * d_array: device array to reduce
 * d_dest: device pointer to hold result of reduction */
-void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
+void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n, cudaStream_t s )
 {
     void *d_temp_storage = NULL;
     size_t temp_storage_bytes = 0;
 
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
-            d_array, d_dest, n );
+            d_array, d_dest, n, s );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -132,7 +132,7 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
 
     /* run exclusive prefix sum */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
-            d_array, d_dest, n );
+            d_array, d_dest, n, s );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -144,14 +144,14 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
 *
 * d_src: device array to scan
 * d_dest: device array to hold result of scan */
-void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
+void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n, cudaStream_t s )
 {
     void *d_temp_storage = NULL;
     size_t temp_storage_bytes = 0;
 
     /* determine temporary device storage requirements */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
-            d_src, d_dest, n );
+            d_src, d_dest, n, s );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -160,7 +160,7 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
 
     /* run exclusive prefix sum */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
-            d_src, d_dest, n );
+            d_src, d_dest, n, s );
     cudaCheckError( );
 
     /* deallocate temporary storage */
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.h b/PG-PuReMD/src/cuda/cuda_reduction.h
index 73a1513f..baf4cbea 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.h
+++ b/PG-PuReMD/src/cuda/cuda_reduction.h
@@ -5,15 +5,15 @@
 
 #include "../reax_types.h"
 
-void Cuda_Reduction_Sum( int *, int *, size_t );
+void Cuda_Reduction_Sum( int *, int *, size_t, cudaStream_t );
 
-void Cuda_Reduction_Sum( real *, real *, size_t );
+void Cuda_Reduction_Sum( real *, real *, size_t, cudaStream_t );
 
-//void Cuda_Reduction_Sum( rvec *, rvec *, size_t );
+//void Cuda_Reduction_Sum( rvec *, rvec *, size_t, cudaStream_t );
 
-void Cuda_Reduction_Max( int *, int *, size_t );
+void Cuda_Reduction_Max( int *, int *, size_t, cudaStream_t );
 
-void Cuda_Scan_Excl_Sum( int *, int *, size_t );
+void Cuda_Scan_Excl_Sum( int *, int *, size_t, cudaStream_t );
 
 CUDA_GLOBAL void k_reduction_rvec( rvec *, rvec *, size_t );
 
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
index a74c18b4..e43353a7 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -55,14 +55,15 @@ CUDA_GLOBAL void k_reset_hindex( reax_atom *my_atoms, single_body_parameters *sb
 #endif
 }
 
 
-void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
+void Cuda_Reset_Workspace( reax_system *system, control_params *control,
+        storage *workspace )
 {
     int blocks;
 
     blocks = system->total_cap / DEF_BLOCK_SIZE
         + ((system->total_cap % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
 
-    k_reset_workspace <<< blocks, DEF_BLOCK_SIZE >>>
+    k_reset_workspace <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
         ( *(workspace->d_workspace), system->total_cap );
     cudaCheckError( );
 }
@@ -80,7 +81,8 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro
     hindex = (int *) workspace->scratch;
 #endif
 
-    k_reset_hindex <<< control->blocks_n, control->block_size_n >>>
+    k_reset_hindex <<< control->blocks_n, control->block_size_n, 0,
+            control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp,
 #if !defined(CUDA_ACCUM_ATOMIC)
           hindex,
@@ -111,5 +113,5 @@ extern "C" void Cuda_Reset( reax_system *system, control_params *control,
         Reset_Pressures( data );
     }
 
-    Cuda_Reset_Workspace( system, workspace );
+    Cuda_Reset_Workspace( system, control, workspace );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.h b/PG-PuReMD/src/cuda/cuda_reset_tools.h
index ac9f841b..dd719a91 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.h
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.h
@@ -5,7 +5,7 @@
 
 #include "../reax_types.h"
 
-void Cuda_Reset_Workspace( reax_system *, storage * );
+void Cuda_Reset_Workspace( reax_system *, control_params *, storage * );
 
 void Cuda_Reset_Atoms_HBond_Indices( reax_system *, control_params *, storage * );
 
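The solver changes below depend on the reworked dense linear algebra
interfaces (cuda_dense_lin_alg.cu/.h in the diffstat): Dot_local and
Dot_local_rvec2 drop their control_params argument and instead take the stream
explicitly, and Dot additionally takes the stream after its MPI communicator.
Inferred from the call sites in this excerpt, the prototypes now look roughly
like the following; the exact qualifiers and integer types are guesses, only
the argument order is evidenced here:

    real Dot_local( storage * const workspace, real const * const v1,
            real const * const v2, unsigned int k, cudaStream_t s );

    void Dot_local_rvec2( storage * const workspace, rvec2 const * const v1,
            rvec2 const * const v2, unsigned int k, real * sum1, real * sum2,
            cudaStream_t s );

    real Dot( storage * const workspace, real const * const v1,
            real const * const v2, unsigned int k, MPI_Comm comm, cudaStream_t s );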
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index d638c044..27799cd4 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -546,28 +546,28 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start,
 
 
 void dual_jacobi_apply( real const * const Hdia_inv, rvec2 const * const y,
-        rvec2 * const x, int n )
+        rvec2 * const x, int n, cudaStream_t s )
 {
     int blocks;
 
     blocks = (n / DEF_BLOCK_SIZE)
         + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_dual_jacobi_apply <<< blocks, DEF_BLOCK_SIZE >>>
+    k_dual_jacobi_apply <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( Hdia_inv, y, x, n );
     cudaCheckError( );
 }
 
 
 void jacobi_apply( real const * const Hdia_inv, real const * const y,
-        real * const x, int n )
+        real * const x, int n, cudaStream_t s )
 {
     int blocks;
 
     blocks = (n / DEF_BLOCK_SIZE)
         + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_jacobi_apply <<< blocks, DEF_BLOCK_SIZE >>>
+    k_jacobi_apply <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( Hdia_inv, y, x, n );
     cudaCheckError( );
 }
@@ -623,7 +623,7 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system,
 */
 static void Dual_Sparse_MatVec_local( control_params const * const control,
         sparse_matrix const * const A, rvec2 const * const x,
-        rvec2 * const b, int n )
+        rvec2 * const b, int n, cudaStream_t s )
 {
     int blocks;
 
@@ -633,7 +633,7 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         cuda_memset( b, 0, sizeof(rvec2) * n, "Dual_Sparse_MatVec_local::b" );
 
         /* 1 thread per row implementation */
-//        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
+//        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size, 0, s >>>
 //            ( A->start, A->end, A->j, A->val, x, b, A->n );
 
         blocks = A->n * 32 / DEF_BLOCK_SIZE
@@ -641,13 +641,13 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
 
         /* 32 threads per row implementation
         * using registers to accumulate partial row sums */
-        k_dual_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+        k_dual_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
         /* 1 thread per row implementation */
-//        k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>>
+//        k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n, 0, s >>>
 //            ( *A, x, b, A->n );
 
         blocks = ((A->n * 32) / DEF_BLOCK_SIZE)
@@ -655,7 +655,7 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
 
         /* 32 threads per row implementation
         * using registers to accumulate partial row sums */
-        k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+        k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
     cudaCheckError( );
@@ -738,7 +738,7 @@ static void Dual_Sparse_MatVec( const reax_system * const system,
     Update_Timing_Info( &time, &data->timing.cm_solver_comm );
 #endif
 
-    Dual_Sparse_MatVec_local( control, A, x, b, n );
+    Dual_Sparse_MatVec_local( control, A, x, b, n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
@@ -800,7 +800,7 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system,
 */
 static void Sparse_MatVec_local( control_params const * const control,
         sparse_matrix const * const A, real const * const x,
-        real * const b, int n )
+        real * const b, int n, cudaStream_t s )
 {
     int blocks;
 
@@ -810,7 +810,7 @@ static void Sparse_MatVec_local( control_params const * const control,
         cuda_memset( b, 0, sizeof(real) * n, "Sparse_MatVec_local::b" );
 
         /* 1 thread per row implementation */
-//        k_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
+//        k_sparse_matvec_half_csr <<< control->blocks, control->block_size, 0, s >>>
 //            ( A->start, A->end, A->j, A->val, x, b, A->n );
 
         blocks = (A->n * 32 / DEF_BLOCK_SIZE)
@@ -818,13 +818,13 @@ static void Sparse_MatVec_local( control_params const * const control,
 
         /* 32 threads per row implementation
         * using registers to accumulate partial row sums */
-        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
         /* 1 thread per row implementation */
-//        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>>
+//        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size, 0, s >>>
 //            ( A->start, A->end, A->j, A->val, x, b, A->n );
 
         blocks = ((A->n * 32) / DEF_BLOCK_SIZE)
@@ -832,7 +832,7 @@ static void Sparse_MatVec_local( control_params const * const control,
 
         /* 32 threads per row implementation
         * using registers to accumulate partial row sums */
-        k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+        k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
     cudaCheckError( );
@@ -913,7 +913,7 @@ static void Sparse_MatVec( reax_system const * const system,
     Update_Timing_Info( &time, &data->timing.cm_solver_comm );
 #endif
 
-    Sparse_MatVec_local( control, A, x, b, n );
+    Sparse_MatVec_local( control, A, x, b, n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
@@ -957,15 +957,15 @@ int Cuda_dual_SDM( reax_system const * const system,
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n );
+            workspace->d_workspace->d2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    Dot_local_rvec2( control, workspace, b, b, system->n, &redux[0], &redux[1] );
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, &redux[2], &redux[3] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+            workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -997,10 +997,10 @@ int Cuda_dual_SDM( reax_system const * const system,
         time = Get_Time( );
 #endif
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->d2, system->n, &redux[0], &redux[1] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->d2,
-                workspace->d_workspace->q2, system->n, &redux[2], &redux[3] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+                workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->d2,
+                workspace->d_workspace->q2, system->n, &redux[2], &redux[3], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
@@ -1029,7 +1029,7 @@ int Cuda_dual_SDM( reax_system const * const system,
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-                workspace->d_workspace->d2, system->n );
+                workspace->d_workspace->d2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1108,15 +1108,15 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n );
+            workspace->d_workspace->d, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    redux[0] = Dot_local( workspace, b, b, system->n );
+    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n );
+            workspace->d_workspace->d, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1142,9 +1142,9 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->d, system->n );
+                workspace->d_workspace->d, system->n, control->streams[0] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->d,
-                workspace->d_workspace->q, system->n );
+                workspace->d_workspace->q, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
@@ -1170,7 +1170,7 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-                workspace->d_workspace->d, system->n );
+                workspace->d_workspace->d, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1219,17 +1219,17 @@ int Cuda_dual_CG( reax_system const * const system,
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n );
+            workspace->d_workspace->d2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, &redux[0], &redux[1] );
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->d2,
-            workspace->d_workspace->d2, system->n, &redux[2], &redux[3] );
-    Dot_local_rvec2( control, workspace, b, b, system->n, &redux[4], &redux[5] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+            workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->d2,
+            workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[0] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[4], &redux[5], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1262,8 +1262,8 @@ int Cuda_dual_CG( reax_system const * const system,
         time = Get_Time( );
 #endif
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->d2,
-                workspace->d_workspace->q2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->d2,
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1290,16 +1290,16 @@ int Cuda_dual_CG( reax_system const * const system,
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-                workspace->d_workspace->p2, system->n );
+                workspace->d_workspace->p2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->p2, system->n, &redux[0], &redux[1] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->p2,
-                workspace->d_workspace->p2, system->n, &redux[2], &redux[3] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+                workspace->d_workspace->p2, system->n, &redux[0], &redux[1], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->p2,
+                workspace->d_workspace->p2, system->n, &redux[2], &redux[3], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1403,17 +1403,17 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n );
+            workspace->d_workspace->d, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
     redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n );
+            workspace->d_workspace->d, system->n, control->streams[0] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->d,
-            workspace->d_workspace->d, system->n );
-    redux[2] = Dot_local( workspace, b, b, system->n );
+            workspace->d_workspace->d, system->n, control->streams[0] );
+    redux[2] = Dot_local( workspace, b, b, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1440,7 +1440,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
 
         tmp = Dot( workspace, workspace->d_workspace->d, workspace->d_workspace->q,
-                system->n, MPI_COMM_WORLD );
+                system->n, MPI_COMM_WORLD, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
@@ -1456,16 +1456,16 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-                workspace->d_workspace->p, system->n );
+                workspace->d_workspace->p, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->p, system->n );
+                workspace->d_workspace->p, system->n, control->streams[0] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->p,
-                workspace->d_workspace->p, system->n );
+                workspace->d_workspace->p, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1541,10 +1541,10 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
     Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, b, -1.0, -1.0,
             workspace->d_workspace->d2, system->n );
-    Dot_local_rvec2( control, workspace, b,
-            b, system->n, &redux[0], &redux[1] );
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->r2, system->n, &redux[2], &redux[3] );
+    Dot_local_rvec2( workspace, b,
+            b, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+            workspace->d_workspace->r2, system->n, &redux[2], &redux[3], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1588,8 +1588,8 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
             break;
         }
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->r_hat2,
-                workspace->d_workspace->r2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->r_hat2,
+                workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1631,7 +1631,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p2,
-                workspace->d_workspace->d2, system->n );
+                workspace->d_workspace->d2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1644,8 +1644,8 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         time = Get_Time( );
 #endif
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->r_hat2,
-                workspace->d_workspace->z2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->r_hat2,
+                workspace->d_workspace->z2, system->n, &redux[0], &redux[1], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1666,8 +1666,8 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         Vector_Sum_rvec2( workspace->d_workspace->q2, 1.0, 1.0,
                 workspace->d_workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1],
                 workspace->d_workspace->z2, system->n );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->q2,
-                workspace->d_workspace->q2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->q2,
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1695,7 +1695,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q2,
-                workspace->d_workspace->q_hat2, system->n );
+                workspace->d_workspace->q_hat2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1708,10 +1708,10 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         time = Get_Time( );
 #endif
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->y2,
-                workspace->d_workspace->q2, system->n, &redux[0], &redux[1] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->y2,
-                workspace->d_workspace->y2, system->n, &redux[2], &redux[3] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->y2,
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->y2,
+                workspace->d_workspace->y2, system->n, &redux[2], &redux[3], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1738,8 +1738,8 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0,
                 workspace->d_workspace->q2, -1.0 * omega[0], -1.0 * omega[1],
                 workspace->d_workspace->y2, system->n );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->r2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+                workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1848,9 +1848,9 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
     Vector_Sum( workspace->d_workspace->r, 1.0, b,
             -1.0, workspace->d_workspace->d, system->n );
-    redux[0] = Dot_local( workspace, b, b, system->n );
+    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->r, system->n );
+            workspace->d_workspace->r, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1882,7 +1882,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
     for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
     {
         redux[0] = Dot_local( workspace, workspace->d_workspace->r_hat,
-                workspace->d_workspace->r, system->n );
+                workspace->d_workspace->r, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1922,7 +1922,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p,
-                workspace->d_workspace->d, system->n );
+                workspace->d_workspace->d, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1936,7 +1936,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r_hat,
-                workspace->d_workspace->z, system->n );
+                workspace->d_workspace->z, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1956,7 +1956,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
                 1.0, workspace->d_workspace->r, -1.0 * alpha,
                 workspace->d_workspace->z, system->n );
         redux[0] = Dot_local( workspace, workspace->d_workspace->q,
-                workspace->d_workspace->q, system->n );
+                workspace->d_workspace->q, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1983,7 +1983,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q,
-                workspace->d_workspace->q_hat, system->n );
+                workspace->d_workspace->q_hat, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1997,9 +1997,9 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->y,
-                workspace->d_workspace->q, system->n );
+                workspace->d_workspace->q, system->n, control->streams[0] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->y,
-                workspace->d_workspace->y, system->n );
+                workspace->d_workspace->y, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2024,7 +2024,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
                 1.0, workspace->d_workspace->q, -1.0 * omega,
                 workspace->d_workspace->y, system->n );
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->r, system->n );
+                workspace->d_workspace->r, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2100,7 +2100,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n );
+            workspace->d_workspace->u2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2113,13 +2113,13 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
     time = Get_Time( );
 #endif
 
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->w2,
-            workspace->d_workspace->u2, system->n, &redux[0], &redux[1] );
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, &redux[2], &redux[3] );
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->u2,
-            workspace->d_workspace->u2, system->n, &redux[4], &redux[5] );
-    Dot_local_rvec2( control, workspace, b, b, system->n, &redux[6], &redux[7] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->w2,
+            workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+            workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->u2,
+            workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[6], &redux[7], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2130,7 +2130,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-            workspace->d_workspace->m2, system->n );
+            workspace->d_workspace->m2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2196,12 +2196,12 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
                 -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->z2, system->n );
         Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, workspace->d_workspace->r2,
                 -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->d2, system->n );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->w2,
-                workspace->d_workspace->u2, system->n, &redux[0], &redux[1] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->u2, system->n, &redux[2], &redux[3] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->u2,
-                workspace->d_workspace->u2, system->n, &redux[4], &redux[5] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->w2,
+                workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->r2,
+                workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->u2,
+                workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2212,7 +2212,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n );
+                workspace->d_workspace->m2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2323,7 +2323,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n );
+            workspace->d_workspace->u, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2337,12 +2337,12 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
 #endif
 
     redux[0] = Dot_local( workspace, workspace->d_workspace->w,
-            workspace->d_workspace->u, system->n );
+            workspace->d_workspace->u, system->n, control->streams[0] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n );
+            workspace->d_workspace->u, system->n, control->streams[0] );
     redux[2] = Dot_local( workspace, workspace->d_workspace->u,
-            workspace->d_workspace->u, system->n );
-    redux[3] = Dot_local( workspace, b, b, system->n );
+            workspace->d_workspace->u, system->n, control->streams[0] );
+    redux[3] = Dot_local( workspace, b, b, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2353,7 +2353,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-            workspace->d_workspace->m, system->n );
+            workspace->d_workspace->m, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2407,11 +2407,11 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
         Vector_Sum( workspace->d_workspace->r, 1.0, workspace->d_workspace->r,
                 -1.0 * alpha, workspace->d_workspace->d, system->n );
         redux[0] = Dot_local( workspace, workspace->d_workspace->w,
-                workspace->d_workspace->u, system->n );
+                workspace->d_workspace->u, system->n, control->streams[0] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->u, system->n );
+                workspace->d_workspace->u, system->n, control->streams[0] );
         redux[2] = Dot_local( workspace, workspace->d_workspace->u,
-                workspace->d_workspace->u, system->n );
+                workspace->d_workspace->u, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2422,7 +2422,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n );
+                workspace->d_workspace->m, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2495,15 +2495,15 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n );
+            workspace->d_workspace->u2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    Dot_local_rvec2( control, workspace, b, b, system->n, &redux[0], &redux[1] );
-    Dot_local_rvec2( control, workspace, workspace->d_workspace->u2,
-            workspace->d_workspace->u2, system->n, &redux[2], &redux[3] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, workspace->d_workspace->u2,
+            workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] );
 
     ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM,
             MPI_COMM_WORLD, &req );
@@ -2539,18 +2539,18 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
         }
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n );
+                workspace->d_workspace->m2, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->w2,
-                workspace->d_workspace->u2, system->n, &redux[0], &redux[1] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->m2,
-                workspace->d_workspace->w2, system->n, &redux[2], &redux[3] );
-        Dot_local_rvec2( control, workspace, workspace->d_workspace->u2,
-                workspace->d_workspace->u2, system->n, &redux[4], &redux[5] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->w2,
+                workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->m2,
+                workspace->d_workspace->w2, system->n, &redux[2], &redux[3], control->streams[0] );
+        Dot_local_rvec2( workspace, workspace->d_workspace->u2,
+                workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2698,15 +2698,15 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n );
+            workspace->d_workspace->u, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    redux[0] = Dot_local( workspace, b, b, system->n );
+    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->u,
-            workspace->d_workspace->u, system->n );
+            workspace->d_workspace->u, system->n, control->streams[0] );
 
     ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM,
             MPI_COMM_WORLD, &req );
@@ -2735,18 +2735,18 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
     for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
     {
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n );
+                workspace->d_workspace->m, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->w,
-                workspace->d_workspace->u, system->n );
+                workspace->d_workspace->u, system->n, control->streams[0] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->m,
-                workspace->d_workspace->w, system->n );
+                workspace->d_workspace->w, system->n, control->streams[0] );
         redux[2] = Dot_local( workspace, workspace->d_workspace->u,
-                workspace->d_workspace->u, system->n );
+                workspace->d_workspace->u, system->n, control->streams[0] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
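Everything in this commit targets streams[0], so no kernel overlap is possible
yet; the point of the port is that work on a single stream still executes in
issue order, which keeps behavior identical while later commits can split
independent force computations across additional streams. The stream setup
itself lands in cuda_environment.cu and reax_types.h (see the diffstat above);
a plausible sketch of that setup, with the loop bound and the name
MAX_CUDA_STREAMS assumed rather than taken from this excerpt:

    /* during initialization */
    for ( int i = 0; i < MAX_CUDA_STREAMS; ++i )
    {
        cudaStreamCreate( &control->streams[i] );
    }

    /* during teardown */
    for ( int i = 0; i < MAX_CUDA_STREAMS; ++i )
    {
        cudaStreamDestroy( control->streams[i] );
    }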
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index ee10c7ca..2658bf32 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -530,12 +530,14 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
             "Cuda_Compute_Momentum::spad" );
 
     k_center_of_mass_blocks_xcm <<< control->blocks, control->block_size,
-                                sizeof(rvec) * (control->block_size / 32) >>>
+                                sizeof(rvec) * (control->block_size / 32),
+                                control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
-                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                     sizeof(rvec) * (control->blocks_pow_2 / 32),
+                     control->streams[0] >>>
         ( spad, &spad[control->blocks], control->blocks );
     cudaCheckError( );
 
@@ -547,12 +549,14 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
             "Cuda_Compute_Momentum::spad" );
 
     k_center_of_mass_blocks_vcm <<< control->blocks, control->block_size,
-                                sizeof(rvec) * (control->block_size / 32) >>>
+                                sizeof(rvec) * (control->block_size / 32),
+                                control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
-                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                     sizeof(rvec) * (control->blocks_pow_2 / 32),
+                     control->streams[0] >>>
         ( spad, &spad[control->blocks], control->blocks );
     cudaCheckError( );
 
@@ -564,12 +568,14 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
             "Cuda_Compute_Momentum::spad");
 
     k_center_of_mass_blocks_amcm <<< control->blocks, control->block_size,
-                                 sizeof(rvec) * (control->block_size / 32) >>>
+                                 sizeof(rvec) * (control->block_size / 32),
+                                 control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
-                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                     sizeof(rvec) * (control->blocks_pow_2 / 32),
+                     control->streams[0] >>>
         ( spad, &spad[control->blocks], control->blocks );
     cudaCheckError( );
 
@@ -591,26 +597,30 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
             "Cuda_Compute_Intertial_Tensor::tmp" );
 
     k_compute_inertial_tensor_xx_xy <<< control->blocks, control->block_size,
-                                    sizeof(real) * 2 * (control->block_size / 32) >>>
+                                    sizeof(real) * 2 * (control->block_size / 32),
+                                    control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
     cudaCheckError( );
 
     k_compute_inertial_tensor_xz_yy <<< control->blocks, control->block_size,
-                                    sizeof(real) * 2 * (control->block_size / 32) >>>
+                                    sizeof(real) * 2 * (control->block_size / 32),
+                                    control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
          my_xcm[0], my_xcm[1], my_xcm[2], system->n );
     cudaCheckError( );
 
     k_compute_inertial_tensor_yz_zz <<< control->blocks, control->block_size,
-                                    sizeof(real) * 2 * (control->block_size / 32) >>>
+                                    sizeof(real) * 2 * (control->block_size / 32),
+                                    control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
          my_xcm[0], my_xcm[1], my_xcm[2], system->n );
     cudaCheckError( );
 
     /* reduction of block-level partial sums for inertial tensor */
     k_compute_inertial_tensor_blocks <<< 1, control->blocks_pow_2,
-                                     sizeof(real) * 6 * control->blocks_pow_2 >>>
+                                     sizeof(real) * 6 * control->blocks_pow_2,
+                                     control->streams[0] >>>
         ( spad, &spad[6 * control->blocks], control->blocks );
     cudaCheckError( );
 
@@ -647,7 +657,7 @@ void Cuda_Generate_Initial_Velocities( reax_system *system,
             MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
         }
 
-        k_atom_velocities_zero <<< blocks, DEF_BLOCK_SIZE >>>
+        k_atom_velocities_zero <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
             ( system->d_my_atoms, system->n );
     }
     else
@@ -661,7 +671,7 @@ void Cuda_Generate_Initial_Velocities( reax_system *system,
 
         Cuda_Randomize( );
 
-        k_atom_velocities_random <<< blocks, DEF_BLOCK_SIZE >>>
+        k_atom_velocities_random <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
             ( system->reax_param.d_sbp, system->d_my_atoms, T, system->n );
     }
 }
@@ -679,13 +689,14 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system,
             "Cuda_Compute_Kinetic_Energy::workspace->scratch" );
     kinetic_energy = (real *) workspace->scratch;
 
-    k_compute_kinetic_energy <<< control->blocks, control->block_size >>>
+    k_compute_kinetic_energy <<< control->blocks, control->block_size, 0, control->streams[0] >>>
        ( system->reax_param.d_sbp, system->d_my_atoms, kinetic_energy, system->n );
     cudaCheckError( );
 
     /* note: above kernel sums the kinetic energy contribution within blocks,
      * and this call finishes the global reduction across all blocks */
-    Cuda_Reduction_Sum( kinetic_energy, &kinetic_energy[system->n], system->n );
+    Cuda_Reduction_Sum( kinetic_energy, &kinetic_energy[system->n], system->n,
+            control->streams[0] );
 
     sCudaMemcpy( &data->my_en.e_kin, &kinetic_energy[system->n],
             sizeof(real), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
@@ -715,11 +726,11 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control,
             "Cuda_Compute_Total_Mass::workspace->scratch" );
     spad = (real *) workspace->scratch;
 
-    k_compute_total_mass <<< control->blocks, control->block_size >>>
+    k_compute_total_mass <<< control->blocks, control->block_size, 0, control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaCheckError( );
 
-    Cuda_Reduction_Sum( spad, &spad[system->n], system->n );
+    Cuda_Reduction_Sum( spad, &spad[system->n], system->n, control->streams[0] );
 
     sCudaMemcpy( &my_M, &spad[system->n], sizeof(real),
             cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
@@ -861,17 +872,20 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control,
             "Cuda_Compute_Pressure::workspace->scratch" );
     rvec_spad = (rvec *) workspace->scratch;
 
-    k_compute_pressure <<< control->blocks, control->block_size >>>
+    k_compute_pressure <<< control->blocks, control->block_size, 0,
+                       control->streams[0] >>>
         ( system->d_my_atoms, system->d_big_box, rvec_spad, system->n );
 
     k_reduction_rvec <<< control->blocks, control->block_size,
-                     sizeof(rvec) * (control->block_size / 32) >>>
+                     sizeof(rvec) * (control->block_size / 32),
+                     control->streams[0] >>>
         ( rvec_spad, &rvec_spad[system->n], system->n );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
-                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                     sizeof(rvec) * (control->blocks_pow_2 / 32),
+                     control->streams[0] >>>
         ( &rvec_spad[system->n], &rvec_spad[system->n + control->blocks],
           control->blocks );
     cudaCheckError( );
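
Every launch in the file above follows the same mechanical rewrite. The CUDA execution configuration takes up to four arguments, kernel <<< grid, block, smem_bytes, stream >>> ( ... ), so moving a kernel off the default stream forces the dynamic shared-memory size (0 when unused) to be spelled out as the third argument. The reduction pairs also show why both kernels must share a stream: operations on one stream execute in issue order, which sequences the per-block sum before the cross-block sum without a device-wide synchronization. A sketch of the total-mass pattern follows, with the device-to-host copy made stream-ordered as well; the async copy is an assumed variant, since the hunks above still use a blocking sCudaMemcpy.

    /* per-block partial sums of atom masses, enqueued on streams[0] */
    k_compute_total_mass <<< control->blocks, control->block_size,
                         0, control->streams[0] >>>
        ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );

    /* cross-block reduction, ordered after the kernel on the same stream */
    Cuda_Reduction_Sum( spad, &spad[system->n], system->n, control->streams[0] );

    /* stream-ordered copy of the scalar result back to the host, then a
     * sync on just this stream (illustrative; not what the patch does yet) */
    cudaMemcpyAsync( &my_M, &spad[system->n], sizeof(real),
            cudaMemcpyDeviceToHost, control->streams[0] );
    cudaStreamSynchronize( control->streams[0] );
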
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index be808aac..f7b9e777 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -1319,7 +1319,8 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
 
     if ( control->virial == 1 )
     {
-        k_torsion_angles_virial_part1 <<< control->blocks, control->block_size >>>
+        k_torsion_angles_virial_part1 <<< control->blocks, control->block_size,
+                                      0, control->streams[0] >>>
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
               (control_params *) control->d_control_params, *(lists[BONDS]),
               *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
@@ -1335,7 +1336,8 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
     }
     else
     {
-//        k_torsion_angles_part1 <<< control->blocks, control->block_size >>>
+//        k_torsion_angles_part1 <<< control->blocks, control->block_size,
+//                0, control->streams[0] >>>
 //            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
 //              (control_params *) control->d_control_params, *(lists[BONDS]),
 //              *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
@@ -1352,7 +1354,8 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
             + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
         k_torsion_angles_part1_opt <<< blocks, DEF_BLOCK_SIZE,
-                sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>>
+                sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32),
+                control->streams[0] >>>
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
               (control_params *) control->d_control_params, *(lists[BONDS]),
               *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
@@ -1384,12 +1387,14 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
         rvec_spad = (rvec *) (&spad[2 * system->n]);
 
         k_reduction_rvec <<< control->blocks, control->block_size,
-                         sizeof(rvec) * (control->block_size / 32) >>>
+                         sizeof(rvec) * (control->block_size / 32),
+                         control->streams[0] >>>
             ( rvec_spad, &rvec_spad[system->n], system->n );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
-                         sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                         sizeof(rvec) * (control->blocks_pow_2 / 32),
+                         control->streams[0] >>>
             ( &rvec_spad[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
@@ -1401,7 +1406,8 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
 #endif
 
 #if !defined(CUDA_ACCUM_ATOMIC)
-    k_torsion_angles_part2 <<< control->blocks_n, control->block_size_n >>>
+    k_torsion_angles_part2 <<< control->blocks_n, control->block_size_n, 0,
+                           control->streams[0] >>>
         ( system->d_my_atoms, *(workspace->d_workspace),
           *(lists[BONDS]), system->N );
     cudaCheckError( );
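
The warp-per-atom kernels above size their dynamic shared memory as sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32), that is, one cub temp-storage slot per 32-thread warp in the block, and that byte count is exactly what the third launch-configuration argument carries. A self-contained sketch of the pattern is below; the kernel itself is illustrative rather than one from this patch, and CUDA_GLOBAL is assumed to be this codebase's macro for __global__.

    #include <cub/warp/warp_reduce.cuh>

    /* illustrative warp-level sum: each warp reduces 32 values through one
     * dynamically allocated cub::WarpReduce temp-storage slot */
    CUDA_GLOBAL void k_warp_sum_sketch( double const * const in,
            double * const out, int n )
    {
        extern __shared__ cub::WarpReduce<double>::TempStorage temp[];
        int i, warp_id;
        double sum;

        i = blockIdx.x * blockDim.x + threadIdx.x;
        warp_id = threadIdx.x / 32;

        /* lane 0 of each warp receives that warp's sum */
        sum = cub::WarpReduce<double>(temp[warp_id]).Sum( i < n ? in[i] : 0.0 );

        if ( threadIdx.x % 32 == 0 )
        {
            out[blockIdx.x * (blockDim.x / 32) + warp_id] = sum;
        }
    }

    /* launch: one temp-storage slot per warp, on the non-default stream
     * (blocks, d_in, d_out, and n are placeholders for this sketch) */
    k_warp_sum_sketch <<< blocks, DEF_BLOCK_SIZE,
            sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32),
            control->streams[0] >>> ( d_in, d_out, n );
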
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index 989b26bc..85ff57d7 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -1349,7 +1349,7 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params
     cuda_memset( thbody, 0, system->total_bonds * sizeof(int),
             "Cuda_Estimate_Storage_Three_Body::thbody" );
 
-//    k_estimate_valence_angles <<< control->blocks_n, control->block_size_n >>>
+//    k_estimate_valence_angles <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>>
 //        ( system->d_my_atoms, (control_params *)control->d_control_params,
 //          *(lists[BONDS]), system->n, system->N, thbody );
 //    cudaCheckError( );
@@ -1358,12 +1358,14 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params
             + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
     k_estimate_valence_angles_opt <<< blocks, DEF_BLOCK_SIZE,
-            sizeof(cub::WarpReduce<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>>
+            sizeof(cub::WarpReduce<int>::TempStorage) * (DEF_BLOCK_SIZE / 32),
+            control->streams[0] >>>
         ( system->d_my_atoms, (control_params *)control->d_control_params,
           *(lists[BONDS]), system->n, system->N, thbody );
     cudaCheckError( );
 
-    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );
+    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds,
+            control->streams[0] );
 
     sCudaMemcpy( &system->total_thbodies, system->d_total_thbodies, sizeof(int),
             cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
@@ -1403,13 +1405,14 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params
  *
  * indices: list indices
  * entries: num. of entries in list */
-void Cuda_Init_Three_Body_Indices( int *indices, int entries, reax_list **lists )
+static void Cuda_Init_Three_Body_Indices( control_params *control, int *indices,
+        int entries, reax_list **lists )
 {
     reax_list *thbody;
 
     thbody = lists[THREE_BODIES];
 
-    Cuda_Scan_Excl_Sum( indices, thbody->index, entries );
+    Cuda_Scan_Excl_Sum( indices, thbody->index, entries, control->streams[0] );
 }
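
Cuda_Init_Three_Body_Indices is now static and takes control solely so that Cuda_Scan_Excl_Sum can enqueue its exclusive prefix sum on streams[0]; the scan converts per-bond three-body counts into list start offsets. A hedged sketch of a stream-aware exclusive scan in the same spirit, built on the documented cub::DeviceScan API, is below; the temp-buffer handling is illustrative, and the actual helper may manage its temporary storage differently.

    #include <cub/device/device_scan.cuh>

    /* illustrative: exclusive prefix sum of n ints, enqueued on stream s */
    static void scan_excl_sum_sketch( int const * const d_in,
            int * const d_out, int n, cudaStream_t s )
    {
        void *d_temp;
        size_t temp_bytes;

        d_temp = NULL;
        temp_bytes = 0;

        /* first call only computes the required temporary storage size */
        cub::DeviceScan::ExclusiveSum( d_temp, temp_bytes, d_in, d_out, n, s );

        cudaMalloc( &d_temp, temp_bytes );

        /* second call performs the scan, ordered on stream s */
        cub::DeviceScan::ExclusiveSum( d_temp, temp_bytes, d_in, d_out, n, s );

        cudaFree( d_temp );
    }
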
@@ -1447,7 +1450,7 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
 
     if ( ret == SUCCESS )
     {
-        Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices, lists );
+        Cuda_Init_Three_Body_Indices( control, thbody, system->total_thbodies_indices, lists );
 
 #if defined(CUDA_ACCUM_ATOMIC)
         cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
@@ -1462,7 +1465,8 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
 
         if ( control->virial == 1 )
         {
-            k_valence_angles_virial_part1 <<< control->blocks_n, control->block_size_n >>>
+            k_valence_angles_virial_part1 <<< control->blocks_n, control->block_size_n,
+                                          0, control->streams[0] >>>
                 ( system->d_my_atoms, system->reax_param.d_gp,
                   system->reax_param.d_sbp, system->reax_param.d_thbp,
                   (control_params *) control->d_control_params,
@@ -1480,7 +1484,8 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
         }
         else
         {
-//            k_valence_angles_part1 <<< control->blocks_n, control->block_size_n >>>
+//            k_valence_angles_part1 <<< control->blocks_n, control->block_size_n,
+//                    0, control->streams[0] >>>
 //                ( system->d_my_atoms, system->reax_param.d_gp,
 //                  system->reax_param.d_sbp, system->reax_param.d_thbp,
 //                  (control_params *) control->d_control_params,
@@ -1500,7 +1505,8 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
 
             k_valence_angles_part1_opt <<< blocks, DEF_BLOCK_SIZE,
                 (sizeof(cub::WarpScan<int>::TempStorage)
-                + sizeof(cub::WarpReduce<double>::TempStorage)) * (DEF_BLOCK_SIZE / 32) >>>
+                + sizeof(cub::WarpReduce<double>::TempStorage)) * (DEF_BLOCK_SIZE / 32),
+                control->streams[0] >>>
                 ( system->d_my_atoms, system->reax_param.d_gp,
                   system->reax_param.d_sbp, system->reax_param.d_thbp,
                   (control_params *) control->d_control_params,
@@ -1522,15 +1528,15 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
     {
         Cuda_Reduction_Sum( spad,
                 &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
-                system->N );
+                system->N, control->streams[0] );
         Cuda_Reduction_Sum( &spad[system->N],
                 &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
-                system->N );
+                system->N, control->streams[0] );
         Cuda_Reduction_Sum( &spad[2 * system->N],
                 &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
-                system->N );
+                system->N, control->streams[0] );
     }
 
     if ( control->virial == 1 )
@@ -1538,24 +1544,27 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
         rvec_spad = (rvec *) (&spad[3 * system->N]);
 
         k_reduction_rvec <<< control->blocks_n, control->block_size_n,
-                         sizeof(rvec) * (control->block_size_n / 32) >>>
+                         sizeof(rvec) * (control->block_size_n / 32),
+                         control->streams[0] >>>
             ( rvec_spad, &rvec_spad[system->N], system->N );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2_n,
-                         sizeof(rvec) * (control->blocks_pow_2_n / 32) >>>
+                         sizeof(rvec) * (control->blocks_pow_2_n / 32),
+                         control->streams[0] >>>
             ( &rvec_spad[system->N],
              &((simulation_data *)data->d_simulation_data)->my_ext_press,
              control->blocks_n );
        cudaCheckError( );
 
 //        Cuda_Reduction_Sum( rvec_spad,
 //                &((simulation_data *)data->d_simulation_data)->my_ext_press,
-//                system->N );
+//                system->N, control->streams[0] );
     }
 #endif
 
 #if !defined(CUDA_ACCUM_ATOMIC)
-    k_valence_angles_part2 <<< control->blocks_n, control->block_size_n >>>
+    k_valence_angles_part2 <<< control->blocks_n, control->block_size_n,
+                           0, control->streams[0] >>>
         ( system->d_my_atoms, (control_params *) control->d_control_params,
           *(workspace->d_workspace), *(lists[BONDS]), system->N );
     cudaCheckError( );
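
The remaining hunks wire the streams through the public entry points: Cuda_Setup_Environment now receives the whole reax_system and control_params (so it can populate control->streams), and Cuda_Cleanup_Environment receives control to tear them down. The reworked routines live in cuda_environment.cu, which is not reproduced in this excerpt, so the following is a plausible minimal shape offered as an assumption rather than the file's actual contents.

    /* sketch of stream setup/teardown matching the call sites below;
     * device-selection details are omitted */
    void Cuda_Setup_Environment( reax_system const * const system,
            control_params * const control )
    {
        int i;

        /* ... select CUDA device based on MPI rank and GPUs per node ... */

        for ( i = 0; i < CUDA_MAX_STREAMS; ++i )
        {
            cudaStreamCreate( &control->streams[i] );
        }
    }

    void Cuda_Cleanup_Environment( control_params const * const control )
    {
        int i;

        for ( i = 0; i < CUDA_MAX_STREAMS; ++i )
        {
            cudaStreamDestroy( control->streams[i] );
        }
    }
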
diff --git a/PG-PuReMD/src/puremd.c b/PG-PuReMD/src/puremd.c
index 0d1b8f57..be52e5d1 100644
--- a/PG-PuReMD/src/puremd.c
+++ b/PG-PuReMD/src/puremd.c
@@ -227,8 +227,7 @@ void* setup( const char * const geo_file, const char * const ffield_file,
             pmd_handle->workspace, pmd_handle->out_control, pmd_handle->mpi_data );
 
 #if defined(HAVE_CUDA)
-    Cuda_Setup_Environment( pmd_handle->system->my_rank,
-            pmd_handle->control->nprocs, pmd_handle->control->gpus_per_node );
+    Cuda_Setup_Environment( pmd_handle->system, pmd_handle->control );
 #endif
 
     return (void*) pmd_handle;
@@ -298,7 +297,7 @@ int simulate( const void * const handle )
 
         Cuda_Reset( system, control, data, workspace, lists );
 
-        Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+        Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists );
 
         Cuda_Compute_Forces( system, control, data, workspace, lists,
                 out_control, mpi_data );
@@ -501,7 +500,7 @@ int cleanup( const void * const handle )
 
 #if defined(HAVE_CUDA)
     //TODO: add Cuda_Finalize( ... )
-    Cuda_Cleanup_Environment( );
+    Cuda_Cleanup_Environment( pmd_handle->control );
 #else
     Finalize( pmd_handle->system, pmd_handle->control, pmd_handle->data,
             pmd_handle->workspace, pmd_handle->lists, pmd_handle->out_control,
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index 661926c1..cf3a055a 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -356,6 +356,9 @@
   #else
     #define VDW_BLOCK_SIZE (256)
   #endif
+
+  /* max. num. of active CUDA streams */
+  #define CUDA_MAX_STREAMS (5)
 #endif
 
 
@@ -1655,6 +1658,8 @@ struct control_params
     /* num. of CUDA blocks rounded up to the nearest power of 2
      * for kernels with 1 thread per atom (local AND ghost) */
     int blocks_pow_2_n;
+    /* CUDA streams */
+    cudaStream_t streams[CUDA_MAX_STREAMS];
 #endif
 };
 
-- 
GitLab