From 74f5c1c6e074a467883883b19fa32b744dae781b Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Fri, 19 Feb 2021 17:04:37 -0500 Subject: [PATCH] PG-PuReMD: fix atomic force reductions and reduce the number of atomic operations. --- PG-PuReMD/src/cuda/cuda_forces.cu | 13 +- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu | 65 ++--- PG-PuReMD/src/cuda/cuda_multi_body.cu | 54 ---- PG-PuReMD/src/cuda/cuda_nonbonded.cu | 322 ++++++++++------------ PG-PuReMD/src/cuda/cuda_nonbonded.h | 4 +- PG-PuReMD/src/cuda/cuda_torsion_angles.cu | 30 +- PG-PuReMD/src/cuda/cuda_valence_angles.cu | 13 +- 7 files changed, 192 insertions(+), 309 deletions(-) diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 6e59636a..e8895e60 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -2120,16 +2120,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, } -void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control, - mpi_datatypes *mpi_data ) -{ - Cuda_NonBonded_Energy( system, control, workspace, data, lists, out_control ); -} - - -void Cuda_Compute_Total_Force( reax_system *system, control_params *control, +static void Cuda_Compute_Total_Force( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, mpi_datatypes *mpi_data ) { @@ -2237,7 +2228,7 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control #endif Cuda_Compute_NonBonded_Forces( system, control, data, workspace, - lists, out_control, mpi_data ); + lists, out_control ); #if defined(LOG_PERFORMANCE) cudaEventRecord( time_event[4] ); diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index 075c9fc7..46cdc806 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -51,7 +51,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; rvec dvec_jk, temp, ext_press_l; #if defined(CUDA_ACCUM_ATOMIC) - rvec f_i_l, f_j_l, f_k_l; + rvec f_j_l; #endif hbond_parameters *hbp; bond_order_data *bo_ij; @@ -69,9 +69,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par e_hb_l = 0.0; rvec_MakeZero( ext_press_l ); #if defined(CUDA_ACCUM_ATOMIC) - rvec_MakeZero( f_i_l ); rvec_MakeZero( f_j_l ); - rvec_MakeZero( f_k_l ); #endif /* discover the Hydrogen bonds between i-j-k triplets. @@ -185,13 +183,13 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk ); #else /* dcos terms */ - rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di ); + atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk ); + atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk ); /* dr terms */ rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk ); + atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk ); #endif } else @@ -226,7 +224,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par * derivatives are added directly into pressure vector/tensor */ /* dcos terms */ rvec_Scale( temp, CEhb2, dcos_theta_di ); - rvec_Add( f_i_l, temp ); + atomic_rvecAdd( workspace.f[i], temp ); rvec_iMultiply( temp, pbond_ij->rel_box, temp ); rvec_Add( ext_press_l, temp ); @@ -235,7 +233,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); rvec_Scale( temp, CEhb2, dcos_theta_dk ); - rvec_Add( f_k_l, temp ); + atomic_rvecAdd( workspace.f[k], temp ); rvec_iMultiply( temp, rel_jk, temp ); rvec_Add( ext_press_l, temp ); @@ -243,7 +241,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); - rvec_Add( f_k_l, temp ); + atomic_rvecAdd( workspace.f[k], temp ); rvec_iMultiply( temp, rel_jk, temp ); rvec_Add( ext_press_l, temp ); #endif @@ -260,12 +258,11 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par #if !defined(CUDA_ACCUM_ATOMIC) /* write conflicts for accumulating partial forces resolved by subsequent kernels */ + rvecCopy( workspace.f[j], f_j_l ); e_hb_g[j] = e_hb_l; rvecCopy( ext_press_g[j], ext_press_l ); #else - atomic_rvecAdd( workspace.f[i], f_i_l ); atomic_rvecAdd( workspace.f[j], f_j_l ); - atomic_rvecAdd( workspace.f[k], f_k_l ); atomicAdd( (double *) e_hb_g, (double) e_hb_l ); atomic_rvecAdd( *ext_press_g, ext_press_l ); #endif @@ -289,11 +286,9 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb1_l, CEhb2, CEhb3; rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, temp, ext_press_l; + rvec dvec_jk, temp, f_j_l, ext_press_l; #if !defined(CUDA_ACCUM_ATOMIC) - rvec f_l, hb_f_l; -#else - rvec f_i_l, f_j_l, f_k_l; + rvec hb_f_l; #endif hbond_parameters *hbp; bond_order_data *bo_ij; @@ -322,14 +317,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body * so in this function i->X, j->H, k->Z when we map * variables onto the ones in the handout.*/ e_hb_l = 0.0; - rvec_MakeZero( ext_press_l ); -#if !defined(CUDA_ACCUM_ATOMIC) - rvec_MakeZero( f_l ); -#else - rvec_MakeZero( f_i_l ); rvec_MakeZero( f_j_l ); - rvec_MakeZero( f_k_l ); -#endif + rvec_MakeZero( ext_press_l ); /* j has to be of type H */ if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM ) @@ -435,21 +424,21 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body #if !defined(CUDA_ACCUM_ATOMIC) /* dcos terms */ rvec_ScaledAdd( hb_f_l, CEhb2, dcos_theta_di ); - rvec_ScaledAdd( f_l, CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk ); /* dr terms */ - rvec_ScaledAdd( f_l, -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk ); #else /* dcos terms */ - rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di ); + atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk ); + atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk ); /* dr terms */ rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk ); + atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk ); #endif } else @@ -484,7 +473,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body * derivatives are added directly into pressure vector/tensor */ /* dcos terms */ rvec_Scale( temp, CEhb2, dcos_theta_di ); - rvec_Add( f_i_l, temp ); + atomic_rvecAdd( workspace.f[i], temp ); rvec_iMultiply( temp, pbond_ij->rel_box, temp ); rvec_Add( ext_press_l, temp ); @@ -493,7 +482,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); rvec_Scale( temp, CEhb2, dcos_theta_dk ); - rvec_Add( f_k_l, temp ); + atomic_rvecAdd( workspace.f[k], temp ); rvec_iMultiply( temp, rel_jk, temp ); rvec_Add( ext_press_l, temp ); @@ -501,7 +490,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); - rvec_Add( f_k_l, temp ); + atomic_rvecAdd( workspace.f[k], temp ); rvec_iMultiply( temp, rel_jk, temp ); rvec_Add( ext_press_l, temp ); #endif @@ -539,21 +528,9 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body /* warp-level sums using registers within a warp */ for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) { -#if !defined(CUDA_ACCUM_ATOMIC) - f_l[0] += __shfl_down_sync( mask, f_l[0], offset ); - f_l[1] += __shfl_down_sync( mask, f_l[1], offset ); - f_l[2] += __shfl_down_sync( mask, f_l[2], offset ); -#else - f_i_l[0] += __shfl_down_sync( mask, f_i_l[0], offset ); - f_i_l[1] += __shfl_down_sync( mask, f_i_l[1], offset ); - f_i_l[2] += __shfl_down_sync( mask, f_i_l[2], offset ); f_j_l[0] += __shfl_down_sync( mask, f_j_l[0], offset ); f_j_l[1] += __shfl_down_sync( mask, f_j_l[1], offset ); f_j_l[2] += __shfl_down_sync( mask, f_j_l[2], offset ); - f_k_l[0] += __shfl_down_sync( mask, f_k_l[0], offset ); - f_k_l[1] += __shfl_down_sync( mask, f_k_l[1], offset ); - f_k_l[2] += __shfl_down_sync( mask, f_k_l[2], offset ); -#endif e_hb_l += __shfl_down_sync( mask, e_hb_l, offset ); ext_press_l[0] += __shfl_down_sync( mask, ext_press_l[0], offset ); ext_press_l[1] += __shfl_down_sync( mask, ext_press_l[1], offset ); @@ -564,13 +541,11 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body if ( lane_id == 0 ) { #if !defined(CUDA_ACCUM_ATOMIC) - rvec_Add( workspace.f[j], f_l ); + rvec_Add( workspace.f[j], f_j_l ); e_hb_g[j] = e_hb_l; rvecCopy( ext_press_g[j], ext_press_l ); #else - atomic_rvecAdd( workspace.f[i], f_i_l ); atomic_rvecAdd( workspace.f[j], f_j_l ); - atomic_rvecAdd( workspace.f[k], f_k_l ); atomicAdd( (double *) e_hb_g, (double) e_hb_l ); atomic_rvecAdd( *ext_press_g, ext_press_l ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu index b3b92fed..cabab1d4 100644 --- a/PG-PuReMD/src/cuda/cuda_multi_body.cu +++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu @@ -202,11 +202,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters workspace.CdDelta[i] += CEover3; // OvCoor - 2nd term workspace.CdDelta[i] += CEunder3; // UnCoor - 1st term -#if defined(TEST_FORCES) - Add_dDelta( system, lists, i, CEover3, workspace.f_ov ); // OvCoor 2nd - Add_dDelta( system, lists, i, CEunder3, workspace.f_un ); // UnCoor 1st -#endif - for ( pj = Start_Index(i, &bond_list); pj < End_Index(i, &bond_list); ++pj ) { pbond_ij = &bond_list.bond_list[pj]; @@ -240,55 +235,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters * workspace.Delta_lp_temp[j]); // UnCoor-2b bo_ij->Cdbopi2 += CEunder4 * (workspace.Delta[j] - dfvl * workspace.Delta_lp_temp[j]); // UnCoor-2b - -#if defined(TEST_ENERGY) - /* fprintf( out_control->eov, "%6d%12.6f\n", - workspace.reverse_map[j], - // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, - CEover4 * (1.0 - workspace.dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2) - */// /*CEover4 * (workspace.Delta[j]-workspace.Delta_lp[j])*/); - // fprintf( out_control->eov, "%6d%12.6f\n", - // fprintf( out_control->eov, "%6d%24.15e\n", - // system->my_atoms[j].orig_id, - // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, - // CEover4 * (1.0 - workspace.dDelta_lp[j]) * - // (bo_ij->BO_pi + bo_ij->BO_pi2) - // /*CEover4 * (workspace.Delta[j]-workspace.Delta_lp[j])*/); - - // CEunder4 * (1.0 - workspace.dDelta_lp[j]) * - // (bo_ij->BO_pi + bo_ij->BO_pi2), - // CEunder4 * (workspace.Delta[j] - workspace.Delta_lp[j]) ); -#endif - -#if defined(TEST_FORCES) - Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, - workspace.f_ov ); // OvCoor - 1st term - - Add_dDelta( system, lists, j, - CEover4 * (1.0 - dfvl*workspace.dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2), - workspace.f_ov ); // OvCoor - 3a - - Add_dBOpinpi2( system, lists, i, pj, - CEover4 * (workspace.Delta[j] - - dfvl * workspace.Delta_lp_temp[j]), - CEover4 * (workspace.Delta[j] - - dfvl * workspace.Delta_lp_temp[j]), - workspace.f_ov, workspace.f_ov ); // OvCoor - 3b - - Add_dDelta( system, lists, j, - CEunder4 * (1.0 - dfvl*workspace.dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2), - workspace.f_un ); // UnCoor - 2a - - Add_dBOpinpi2( system, lists, i, pj, - CEunder4 * (workspace.Delta[j] - - dfvl * workspace.Delta_lp_temp[j]), - CEunder4 * (workspace.Delta[j] - - dfvl * workspace.Delta_lp_temp[j]), - workspace.f_un, workspace.f_un ); // UnCoor - 2b -#endif } } diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu index a7aec169..3bc1e72d 100644 --- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu @@ -35,7 +35,7 @@ CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms, - single_body_parameters *sbp, int n, real *data_e_pol ) + single_body_parameters *sbp, int n, real *e_pol_g ) { int i, type_i; real q; @@ -50,7 +50,7 @@ CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms, q = my_atoms[i].q; type_i = my_atoms[i].type; - data_e_pol[i] = KCALpMOL_to_EV * (sbp[type_i].chi * q + e_pol_g[i] = KCALpMOL_to_EV * (sbp[type_i].chi * q + (sbp[type_i].eta / 2.0) * SQR(q)); } @@ -59,7 +59,7 @@ CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms, CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, two_body_parameters *tbp, global_parameters gp, control_params *control, storage workspace, reax_list far_nbr_list, int n, int num_atom_types, - real *data_e_vdW, real *data_e_ele, rvec *data_ext_press ) + real *e_vdW_g, real *e_ele_g, rvec *ext_press_g ) { int i, j, pj; int start_i, end_i, orig_i, orig_j; @@ -69,8 +69,8 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, real r_ij, fn13, exp1, exp2, e_base, de_base; real Tap, dTap, dfn13, CEvd, CEclmb; real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core, de_core, e_clb, de_clb; - rvec temp, ext_press; + real e_ele_l, e_vdW_l, e_core, de_core, e_clb, de_clb; + rvec temp, f_i_l, ext_press_l; two_body_parameters *twbp; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -82,9 +82,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, p_vdW1 = gp.l[28]; p_vdW1i = 1.0 / p_vdW1; - e_vdW = 0.0; - e_ele = 0.0; - rvec_MakeZero( ext_press ); + e_vdW_l = 0.0; + e_ele_l = 0.0; + rvec_MakeZero( f_i_l ); + rvec_MakeZero( ext_press_l ); start_i = Start_Index( i, &far_nbr_list ); end_i = End_Index( i, &far_nbr_list ); @@ -137,7 +138,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); e_base = twbp->D * (exp1 - 2.0 * exp2); - e_vdW += self_coef * (e_base * Tap); + e_vdW_l += self_coef * (e_base * Tap); dfn13 = POW( r_ij, p_vdW1 - 1.0 ) * POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0 ); @@ -150,7 +151,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); e_base = twbp->D * (exp1 - 2.0 * exp2); - e_vdW += self_coef * (e_base * Tap); + e_vdW_l += self_coef * (e_base * Tap); de_base = (twbp->D * twbp->alpha / twbp->r_vdW) * (exp2 - exp1); } @@ -159,7 +160,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, if ( gp.vdw_type == 2 || gp.vdw_type == 3 ) { e_core = twbp->ecore * EXP( twbp->acore * (1.0 - (r_ij / twbp->rcore)) ); - e_vdW += self_coef * (e_core * Tap); + e_vdW_l += self_coef * (e_core * Tap); de_core = -(twbp->acore / twbp->rcore) * e_core; } @@ -172,28 +173,17 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, CEvd = self_coef * ( (de_base + de_core) * Tap + (e_base + e_core) * dTap ); -#if defined(DEBUG_FOCUS) - printf( "%6d%6d%24.12f%24.12f%24.12f%24.12f\n", - i + 1, j + 1, - e_base, de_base, e_core, de_core ); -#endif - /* Coulomb Calculations */ dr3gamij_1 = r_ij * r_ij * r_ij + POW( twbp->gamma, -3.0 ); dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 ); e_clb = C_ELE * (my_atoms[i].q * my_atoms[j].q) / dr3gamij_3; - e_ele += self_coef * (e_clb * Tap); + e_ele_l += self_coef * (e_clb * Tap); de_clb = -C_ELE * (my_atoms[i].q * my_atoms[j].q) * (r_ij * r_ij) / POW( dr3gamij_1, 4.0 / 3.0 ); CEclmb = self_coef * (de_clb * Tap + e_clb * dTap); -#if defined(DEBUG_FOCUS) - printf( "%6d%6d%24.12f%24.12f\n", - i + 1, j + 1, e_clb, de_clb ); -#endif - if ( control->virial == 0 ) { if ( i < j ) @@ -204,7 +194,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, { rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); } - atomic_rvecAdd( workspace.f[i], temp ); + rvec_Add( f_i_l, temp ); rvec_Scale( temp, -1.0, temp ); atomic_rvecAdd( workspace.f[j], temp ); } @@ -223,48 +213,29 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); } - atomic_rvecAdd( workspace.f[i], temp ); + rvec_Add( f_i_l, temp ); rvec_Scale( temp, -1.0, temp ); atomic_rvecAdd( workspace.f[j], temp ); - rvec_iMultiply( ext_press, + rvec_iMultiply( temp, far_nbr_list.far_nbr_list.rel_box[pj], temp ); - rvec_Add( data_ext_press[i], ext_press ); + rvec_Add( ext_press_l, temp ); } - -#if defined(TEST_ENERGY) - // fprintf( out_control->evdw, - // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", - // workspace.Tap[7], workspace.Tap[6], workspace.Tap[5], - // workspace.Tap[4], workspace.Tap[3], workspace.Tap[2], - // workspace.Tap[1], Tap ); - //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", - my_atoms[i].orig_id, my_atoms[j].orig_id, - r_ij, e_vdW, data->my_en.e_vdW ); - //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - my_atoms[i].orig_id, my_atoms[j].orig_id, - r_ij, my_atoms[i].q, my_atoms[j].q, - e_ele, data->my_en.e_ele ); -#endif - -#if defined(TEST_FORCES) - rvec_ScaledAdd( workspace.f_vdw[i], -CEvd, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_vdw[j], +CEvd, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_ele[i], -CEclmb, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_ele[j], +CEclmb, - far_nbr_list.far_nbr_list.dvec[pj] ); -#endif } } - __syncthreads( ); - data_e_vdW[i] = e_vdW; - data_e_ele[i] = e_ele; + atomic_rvecAdd( workspace.f[i], f_i_l ); +#if !defined(CUDA_ACCUM_ATOMIC) + e_vdW_g[i] = e_vdW_l; + e_ele_g[i] = e_ele_l; + if ( control->virial == 1 ) + rvec_Copy( ext_press_g[j], ext_press_l ); +#else + atomicAdd( (double *) e_vdW_g, (double) e_vdW_l ); + atomicAdd( (double *) e_ele_g, (double) e_ele_l ); + if ( control->virial == 1 ) + atomic_rvecAdd( *ext_press_g, ext_press_l ); +#endif } @@ -272,7 +243,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, two_body_parameters *tbp, global_parameters gp, control_params *control, storage workspace, reax_list far_nbr_list, int n, int num_atom_types, - real *data_e_vdW, real *data_e_ele, rvec *data_ext_press ) + real *e_vdW_g, real *e_ele_g, rvec *ext_press_g ) { int i, j, pj; int start_i, end_i, orig_i, orig_j; @@ -282,12 +253,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, real r_ij, fn13, exp1, exp2, e_base, de_base; real Tap, dTap, dfn13, CEvd, CEclmb; real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core, de_core, e_clb, de_clb; - rvec temp, ext_press; + real e_vdW_l, e_ele_l, e_core, de_core, e_clb, de_clb; + rvec temp, f_i_l, ext_press_l; two_body_parameters *twbp; int thread_id, warp_id, lane_id, offset; - real e_vdW_l, e_ele_l; - rvec f_l; thread_id = blockIdx.x * blockDim.x + threadIdx.x; warp_id = thread_id >> 5; @@ -301,12 +270,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, i = warp_id; p_vdW1 = gp.l[28]; p_vdW1i = 1.0 / p_vdW1; - e_vdW = 0.0; - e_ele = 0.0; - rvec_MakeZero( ext_press ); e_vdW_l = 0.0; e_ele_l = 0.0; - rvec_MakeZero( f_l ); + rvec_MakeZero( f_i_l ); + rvec_MakeZero( ext_press_l ); start_i = Start_Index( i, &far_nbr_list ); end_i = End_Index( i, &far_nbr_list ); @@ -360,8 +327,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); e_base = twbp->D * (exp1 - 2.0 * exp2); - e_vdW = self_coef * (e_base * Tap); - e_vdW_l += e_vdW; + e_vdW_l += self_coef * (e_base * Tap); dfn13 = POW( r_ij, p_vdW1 - 1.0 ) * POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0 ); @@ -374,8 +340,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); e_base = twbp->D * (exp1 - 2.0 * exp2); - e_vdW = self_coef * (e_base * Tap); - e_vdW_l += e_vdW; + e_vdW_l += self_coef * (e_base * Tap); de_base = (twbp->D * twbp->alpha / twbp->r_vdW) * (exp2 - exp1); } @@ -384,8 +349,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, if ( gp.vdw_type == 2 || gp.vdw_type == 3 ) { e_core = twbp->ecore * EXP( twbp->acore * (1.0 - (r_ij / twbp->rcore)) ); - e_vdW += self_coef * (e_core * Tap); - e_vdW_l += (self_coef * (e_core * Tap)); + e_vdW_l += self_coef * (e_core * Tap); de_core = -(twbp->acore / twbp->rcore) * e_core; } @@ -398,29 +362,17 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, CEvd = self_coef * ( (de_base + de_core) * Tap + (e_base + e_core) * dTap ); -#if defined(DEBUG_FOCUS) - printf( "%6d%6d%24.12f%24.12f%24.12f%24.12f\n", - i + 1, j + 1, - e_base, de_base, e_core, de_core ); -#endif - /* Coulomb Calculations */ dr3gamij_1 = r_ij * r_ij * r_ij + POW( twbp->gamma, -3.0 ); dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 ); e_clb = C_ELE * (my_atoms[i].q * my_atoms[j].q) / dr3gamij_3; - e_ele = self_coef * (e_clb * Tap); - e_ele_l += e_ele; + e_ele_l += self_coef * (e_clb * Tap); de_clb = -C_ELE * (my_atoms[i].q * my_atoms[j].q) * (r_ij * r_ij) / POW( dr3gamij_1, 4.0 / 3.0 ); CEclmb = self_coef * (de_clb * Tap + e_clb * dTap); -#if defined(DEBUG_FOCUS) - printf( "%6d%6d%24.12f%24.12f\n", - i + 1, j + 1, e_clb, de_clb ); -#endif - if ( control->virial == 0 ) { if ( i < j ) @@ -431,7 +383,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, { rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); } - rvec_Add( f_l, temp ); + rvec_Add( f_i_l, temp ); rvec_Scale( temp, -1.0, temp ); atomic_rvecAdd( workspace.f[j], temp ); } @@ -450,42 +402,14 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); } - rvec_Add( f_l, temp ); + rvec_Add( f_i_l, temp ); rvec_Scale( temp, -1.0, temp ); atomic_rvecAdd( workspace.f[j], temp ); - rvec_iMultiply( ext_press, + rvec_iMultiply( temp, far_nbr_list.far_nbr_list.rel_box[pj], temp ); - rvec_Add( data_ext_press[i], ext_press ); + rvec_Add( ext_press_l, temp ); } - -#if defined(TEST_ENERGY) - // fprintf( out_control->evdw, - // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", - // workspace.Tap[7], workspace.Tap[6], workspace.Tap[5], - // workspace.Tap[4], workspace.Tap[3], workspace.Tap[2], - // workspace.Tap[1], Tap ); - //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", - my_atoms[i].orig_id, my_atoms[j].orig_id, - r_ij, e_vdW, data->my_en.e_vdW ); - //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - my_atoms[i].orig_id, my_atoms[j].orig_id, - r_ij, my_atoms[i].q, my_atoms[j].q, - e_ele, data->my_en.e_ele ); -#endif - -#if defined(TEST_FORCES) - rvec_ScaledAdd( workspace.f_vdw[i], -CEvd, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_vdw[j], +CEvd, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_ele[i], -CEclmb, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_ele[j], +CEclmb, - far_nbr_list.far_nbr_list.dvec[pj] ); -#endif } pj += warpSize; @@ -496,17 +420,26 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, { e_vdW_l += __shfl_down_sync( FULL_MASK, e_vdW_l, offset ); e_ele_l += __shfl_down_sync( FULL_MASK, e_ele_l, offset ); - f_l[0] += __shfl_down_sync( FULL_MASK, f_l[0], offset ); - f_l[1] += __shfl_down_sync( FULL_MASK, f_l[1], offset ); - f_l[2] += __shfl_down_sync( FULL_MASK, f_l[2], offset ); + f_i_l[0] += __shfl_down_sync( FULL_MASK, f_i_l[0], offset ); + f_i_l[1] += __shfl_down_sync( FULL_MASK, f_i_l[1], offset ); + f_i_l[2] += __shfl_down_sync( FULL_MASK, f_i_l[2], offset ); } /* first thread within a warp writes warp-level sum to global memory */ if ( lane_id == 0 ) { - data_e_vdW[i] = e_vdW_l; - data_e_ele[i] = e_ele_l; - atomic_rvecAdd( workspace.f[i], f_l ); + atomic_rvecAdd( workspace.f[i], f_i_l ); +#if !defined(CUDA_ACCUM_ATOMIC) + e_vdW_g[i] = e_vdW_l; + e_ele_g[i] = e_ele_l; + if ( control->virial == 1 ) + rvec_Copy( ext_press_g[j], ext_press_l ); +#else + atomicAdd( (double *) e_vdW_g, (double) e_vdW_l ); + atomicAdd( (double *) e_ele_g, (double) e_ele_l ); + if ( control->virial == 1 ) + atomic_rvecAdd( *ext_press_g, ext_press_l ); +#endif } } @@ -517,15 +450,15 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, storage workspace, reax_list far_nbr_list, LR_lookup_table *t_LR, int n, int num_atom_types, int step, int prev_steps, int energy_update_freq, - real *data_e_vdW, real *data_e_ele, rvec *data_ext_press ) + real *e_vdW_g, real *e_ele_g, rvec *ext_press_g ) { int i, j, pj, r, steps, update_freq, update_energies; int type_i, type_j, tmin, tmax; int start_i, end_i, orig_i, orig_j; real r_ij, self_coef, base, dif; - real e_vdW, e_ele; + real e_vdW_l, e_ele_l; real CEvd, CEclmb; - rvec temp, ext_press; + rvec temp, f_i_l, ext_press_l; LR_lookup_table *t; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -538,10 +471,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, steps = step - prev_steps; update_freq = energy_update_freq; update_energies = update_freq > 0 && steps % update_freq == 0; - e_ele = 0.0; - e_vdW = 0.0; - data_e_vdW[i] = 0.0; - data_e_ele[i] = 0.0; + e_ele_l = 0.0; + e_vdW_l = 0.0; + rvec_MakeZero( f_i_l ); + rvec_MakeZero( ext_press_l ); type_i = my_atoms[i].type; start_i = Start_Index( i, &far_nbr_list ); @@ -575,16 +508,11 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, if ( update_energies ) { - e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) - * dif + t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) - * dif + t->ele[r].a; - e_ele *= self_coef * my_atoms[i].q * my_atoms[j].q; + e_vdW_l += self_coef * (((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) + * dif + t->vdW[r].a); - data_e_vdW[i] += e_vdW; - data_e_ele[i] += e_ele; + e_ele_l += (((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) + * dif + t->ele[r].a) * self_coef * my_atoms[i].q * my_atoms[j].q; } CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) @@ -599,54 +527,56 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, { if ( i < j ) { - rvec_ScaledAdd( workspace.f[i], + rvec_ScaledAdd( temp, -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); } else { - rvec_ScaledAdd( workspace.f[i], + rvec_ScaledAdd( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); } + rvec_Add( f_i_l, temp ); + rvec_Scale( temp, -1.0, temp ); + atomic_rvecAdd( workspace.f[j], temp ); } /* NPT, iNPT or sNPT */ else { /* for pressure coupling, terms not related to bond order derivatives are added directly into pressure vector/tensor */ - rvec_Scale( temp, - (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); - - rvec_ScaledAdd( workspace.f[i], -1.0, temp ); - rvec_Add( workspace.f[j], temp ); + if ( i < j ) + { + rvec_Scale( temp, + -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); + } + else + { + rvec_Scale( temp, + (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] ); + } + rvec_Add( f_i_l, temp ); + rvec_ScaledAdd( temp, -1.0, temp ); + atomic_rvecAdd( workspace.f[j], temp ); - rvec_iMultiply( ext_press, far_nbr_list.far_nbr_list.rel_box[pj], temp ); - rvec_Add( data_ext_press[i], ext_press ); + rvec_iMultiply( temp, far_nbr_list.far_nbr_list.rel_box[pj], temp ); + rvec_Add( ext_press_l, temp ); } - -#if defined(TEST_ENERGY) - //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", - my_atoms[i].orig_id, my_atoms[j].orig_id, - r_ij, e_vdW, data->my_en.e_vdW ); - //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - my_atoms[i].orig_id, my_atoms[j].orig_id, - r_ij, my_atoms[i].q, my_atoms[j].q, - e_ele, data->my_en.e_ele ); -#endif - -#if defined(TEST_FORCES) - rvec_ScaledAdd( workspace.f_vdw[i], -CEvd, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_vdw[j], +CEvd, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_ele[i], -CEclmb, - far_nbr_list.far_nbr_list.dvec[pj] ); - rvec_ScaledAdd( workspace.f_ele[j], +CEclmb, - far_nbr_list.far_nbr_list.dvec[pj] ); -#endif } } + + atomic_rvecAdd( workspace.f[i], f_i_l ); +#if !defined(CUDA_ACCUM_ATOMIC) + __syncthreads( ); + e_vdW_g[i] = e_vdW_l; + e_ele_g[i] = e_ele_l; + if ( control->virial == 1 ) + rvec_Copy( ext_press_g[j], ext_press_l ); +#else + atomicAdd( (double *) e_vdW_g, (double) e_vdW_l ); + atomicAdd( (double *) e_ele_g, (double) e_ele_l ); + if ( control->virial == 1 ) + atomic_rvecAdd( *ext_press_g, ext_press_l ); +#endif } @@ -675,21 +605,34 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system, storage *work } -void Cuda_NonBonded_Energy( reax_system *system, control_params *control, - storage *workspace, simulation_data *data, reax_list **lists, +void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { int update_energy, blocks; +#if !defined(CUDA_ACCUM_ATOMIC) real *spad; rvec *spad_rvec; +#endif update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; +#if !defined(CUDA_ACCUM_ATOMIC) cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, - "Cuda_NonBonded_Energy::workspace->scratch" ); + "Cuda_Compute_NonBonded_Forces::workspace->scratch" ); spad = (real *) workspace->scratch; +#endif + +#if defined(CUDA_ACCUM_ATOMIC) + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_vdW" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ele, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_ele" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press, + 0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" ); +#endif blocks = system->n * 32 / DEF_BLOCK_SIZE + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); @@ -701,14 +644,28 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, // system->reax_param.d_gp, (control_params *) control->d_control_params, // *(workspace->d_workspace), *(lists[FAR_NBRS]), // system->n, system->reax_param.num_atom_types, -// spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) ); +//#if !defined(CUDA_ACCUM_ATOMIC) +// spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) +//#else +// &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, +// &((simulation_data *)data->d_simulation_data)->my_en.e_ele, +// &((simulation_data *)data->d_simulation_data)->my_ext_press +//#endif +// ); k_vdW_coulomb_energy_opt <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_tbp, system->reax_param.d_gp, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[FAR_NBRS]), system->n, system->reax_param.num_atom_types, - spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) ); +#if !defined(CUDA_ACCUM_ATOMIC) + spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, + &((simulation_data *)data->d_simulation_data)->my_en.e_ele, + &((simulation_data *)data->d_simulation_data)->my_ext_press +#endif + ); cudaCheckError( ); } else @@ -721,10 +678,18 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, system->reax_param.num_atom_types, data->step, data->prev_steps, out_control->energy_update_freq, - spad, &spad[system->n], (rvec *) (&spad[2 * system->n])); +#if !defined(CUDA_ACCUM_ATOMIC) + spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, + &((simulation_data *)data->d_simulation_data)->my_en.e_ele, + &((simulation_data *)data->d_simulation_data)->my_ext_press +#endif + ); cudaCheckError( ); } +#if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) { /* reduction for vdw */ @@ -755,6 +720,7 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, control->blocks ); cudaCheckError( ); } +#endif if ( update_energy == TRUE ) { diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.h b/PG-PuReMD/src/cuda/cuda_nonbonded.h index 038cb9e7..fbc2cf5b 100644 --- a/PG-PuReMD/src/cuda/cuda_nonbonded.h +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.h @@ -25,8 +25,8 @@ #include "../reax_types.h" -void Cuda_NonBonded_Energy( reax_system *, control_params *, - storage *, simulation_data *, reax_list **, +void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *, + simulation_data *, storage *, reax_list **, output_controls * ); diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu index 915ada83..0e832ba4 100644 --- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu @@ -170,9 +170,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; real Cconj, CEconj1, CEconj2, CEconj3; real CEconj4, CEconj5, CEconj6, e_tor_l, e_con_l; - rvec dvec_li, temp, ext_press_l; + rvec dvec_li, temp, f_j_l, ext_press_l; ivec rel_box_jl; - // rtensor total_rtensor, temp_rtensor; four_body_header *fbh; four_body_parameters *fbp; bond_data *pbond_ij, *pbond_jk, *pbond_kl; @@ -193,6 +192,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete p_cot2 = gp.l[27]; e_tor_l = 0.0; e_con_l = 0.0; + rvec_MakeZero( f_j_l ); rvec_MakeZero( ext_press_l ); #if defined(DEBUG_FOCUS) num_frb_intrs = 0; @@ -431,13 +431,13 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete /* dcos_theta_ijk */ atomic_rvecScaledAdd( pbond_ij->ta_f, CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors7 + CEconj4, p_ijk->dcos_dj ); atomic_rvecScaledAdd( pbond_jk->ta_f, CEtors7 + CEconj4, p_ijk->dcos_di ); /* dcos_theta_jkl */ - rvec_ScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors8 + CEconj5, p_jkl->dcos_di ); atomic_rvecScaledAdd( pbond_jk->ta_f, CEtors8 + CEconj5, p_jkl->dcos_dj ); @@ -447,7 +447,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete /* dcos_omega */ atomic_rvecScaledAdd( pbond_ij->ta_f, CEtors9 + CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors9 + CEconj6, dcos_omega_dj ); atomic_rvecScaledAdd( pbond_jk->ta_f, CEtors9 + CEconj6, dcos_omega_dk ); @@ -457,13 +457,13 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete /* dcos_theta_ijk */ atomic_rvecScaledAdd( workspace.f[i], CEtors7 + CEconj4, p_ijk->dcos_dk ); - atomic_rvecScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors7 + CEconj4, p_ijk->dcos_dj ); atomic_rvecScaledAdd( workspace.f[k], CEtors7 + CEconj4, p_ijk->dcos_di ); /* dcos_theta_jkl */ - atomic_rvecScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors8 + CEconj5, p_jkl->dcos_di ); atomic_rvecScaledAdd( workspace.f[k], CEtors8 + CEconj5, p_jkl->dcos_dj ); @@ -473,7 +473,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete /* dcos_omega */ atomic_rvecScaledAdd( workspace.f[i], CEtors9 + CEconj6, dcos_omega_di ); - atomic_rvecScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors9 + CEconj6, dcos_omega_dj ); atomic_rvecScaledAdd( workspace.f[k], CEtors9 + CEconj6, dcos_omega_dk ); @@ -492,7 +492,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_iMultiply( temp, pbond_ij->rel_box, temp ); rvec_Add( ext_press_l, temp ); - rvec_ScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors7 + CEconj4, p_ijk->dcos_dj ); rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di ); @@ -501,7 +501,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_Add( ext_press_l, temp ); /* dcos_theta_jkl */ - rvec_ScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors8 + CEconj5, p_jkl->dcos_di ); rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj ); @@ -520,7 +520,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_iMultiply( temp, pbond_ij->rel_box, temp ); rvec_Add( ext_press_l, temp ); - rvec_ScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors9 + CEconj6, dcos_omega_dj ); rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk ); @@ -541,7 +541,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_iMultiply( temp, pbond_ij->rel_box, temp ); rvec_Add( ext_press_l, temp ); - atomic_rvecScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors7 + CEconj4, p_ijk->dcos_dj ); rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di ); @@ -550,7 +550,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_Add( ext_press_l, temp ); /* dcos_theta_jkl */ - atomic_rvecScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors8 + CEconj5, p_jkl->dcos_di ); rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj ); @@ -569,7 +569,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_iMultiply( temp, pbond_ij->rel_box, temp ); rvec_Add( ext_press_l, temp ); - atomic_rvecScaledAdd( workspace.f[j], + rvec_ScaledAdd( f_j_l, CEtors9 + CEconj6, dcos_omega_dj ); rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk ); @@ -593,10 +593,12 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete // } // j loop #if !defined(CUDA_ACCUM_ATOMIC) + rvec_Add( workspace.f[j], f_j_l ); e_tor_g[j] = e_tor_l; e_con_g[j] = e_con_l; rvec_Copy( e_ext_press_g[j], e_ext_press_l ); #else + atomic_rvecAdd( workspace.f[j], f_j_l ); atomicAdd( (double *) e_tor_g, (double) e_tor_l ); atomicAdd( (double *) e_con_g, (double) e_con_l ); atomic_rvecAdd( *ext_press_g, ext_press_l ); diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu index fc51dfc9..5392a77b 100644 --- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu @@ -59,7 +59,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, real f7_ij, f7_jk, f8_Dj, f9_Dj; real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; real BOA_ij, BOA_jk; - rvec rvec_temp, ext_press_l; + rvec rvec_temp, f_j_l, ext_press_l; three_body_header *thbh; three_body_parameters *thbp; three_body_interaction_data *p_ijk; @@ -88,6 +88,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, e_ang_l = 0.0; e_coa_l = 0.0; e_pen_l = 0.0; + rvec_MakeZero( f_j_l ); rvec_MakeZero( ext_press_l ); type_j = my_atoms[j].type; @@ -406,11 +407,11 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, { #if !defined(CUDA_ACCUM_ATOMIC) rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj ); rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk ); #else atomic_rvecScaledAdd( workspace.f[i], CEval8, p_ijk->dcos_di ); - atomic_rvecScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj ); atomic_rvecScaledAdd( workspace.f[k], CEval8, p_ijk->dcos_dk ); #endif } @@ -424,7 +425,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp ); rvec_Add( ext_press_l, rvec_temp ); - rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj ); rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk ); rvec_Add( pbond_jk->va_f, rvec_temp ); @@ -438,7 +439,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp ); rvec_Add( ext_press_l, rvec_temp ); - rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj ); rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk ); atomic_rvecAdd( workspace.f[k], rvec_temp ); @@ -455,11 +456,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, } #if !defined(CUDA_ACCUM_ATOMIC) + rvec_Add( workspace.f[j], f_j_l ); e_ang_g[j] = e_ang_l; e_coa_g[j] = e_coa_l; e_pen_g[j] = e_pen_l; rvec_Copy( ext_press_g[j], ext_press_l ); #else + atomic_rvecAdd( workspace.f[j], f_j_l ); atomicAdd( (double *) e_ang_g, (double) e_ang_l ); atomicAdd( (double *) e_coa_g, (double) e_coa_l ); atomicAdd( (double *) e_pen_g, (double) e_pen_l ); -- GitLab