From 7c45199376ae600888d139a1b34bd79c3de0e71d Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Fri, 19 Feb 2021 13:39:57 -0500 Subject: [PATCH] PG-PuReMD: use atomics for both partial energy and force accumulation (small reworks to reduce number of atomic operations). Remove several unneccesary cudaMemset calls. Other code clean-up. --- PG-PuReMD/src/cuda/cuda_bond_orders.cu | 24 +- PG-PuReMD/src/cuda/cuda_bond_orders.h | 4 +- PG-PuReMD/src/cuda/cuda_bonds.cu | 48 +-- PG-PuReMD/src/cuda/cuda_forces.cu | 178 +++++--- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu | 317 +++++++------- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h | 2 +- PG-PuReMD/src/cuda/cuda_multi_body.cu | 81 ++-- PG-PuReMD/src/cuda/cuda_multi_body.h | 2 +- PG-PuReMD/src/cuda/cuda_torsion_angles.cu | 244 ++++------- PG-PuReMD/src/cuda/cuda_torsion_angles.h | 2 +- PG-PuReMD/src/cuda/cuda_valence_angles.cu | 487 +++++++++------------- PG-PuReMD/src/cuda/cuda_valence_angles.h | 11 +- PG-PuReMD/src/reax_types.h | 10 +- 13 files changed, 628 insertions(+), 782 deletions(-) diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu index 25efc930..1420659c 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu @@ -4,7 +4,7 @@ #include "cuda_list.h" #include "cuda_utils.h" #include "cuda_reduction.h" -#if defined(CUDA_ACCUM_FORCE_ATOMIC) +#if defined(CUDA_ACCUM_ATOMIC) #include "cuda_helpers.h" #endif @@ -65,7 +65,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, nbr_k = &bond_list->bond_list[pk]; k = nbr_k->nbr; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_MakeZero( nbr_k->tf_f ); #endif @@ -79,7 +79,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); /* force */ -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( nbr_k->tf_f, temp ); #else atomic_rvecAdd( workspace->f[k], temp ); @@ -129,7 +129,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, nbr_k = &bond_list->bond_list[pk]; k = nbr_k->nbr; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_MakeZero( nbr_k->tf_f ); #endif @@ -143,7 +143,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); /* force */ -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( nbr_k->tf_f, temp ); #else atomic_rvecAdd( workspace->f[k], temp ); @@ -183,7 +183,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, rvec_ScaledAdd( temp, coef.C4dbopi2, workspace->dDeltap_self[j] ); /* force */ -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( workspace->f[j], temp ); #else atomic_rvecAdd( workspace->f[j], temp ); @@ -250,7 +250,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, /* 3rd, dBOpi2 */ rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( nbr_k->tf_f, temp ); #else atomic_rvecAdd( workspace->f[nbr_k->nbr], temp ); @@ -281,7 +281,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, /* 3rd, dBO_pi2 */ rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( workspace->f[i], temp ); #else atomic_rvecAdd( workspace->f[i], temp ); @@ -302,7 +302,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, /* 4th, dBOpi2 */ rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( nbr_k->tf_f, temp ); #else atomic_rvecAdd( workspace->f[nbr_k->nbr], temp ); @@ -333,7 +333,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, /* 3rd, dBOpi2 */ rvec_ScaledAdd( temp, coef.C4dbopi2, workspace->dDeltap_self[i] ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_Add( workspace->f[i], temp ); #else atomic_rvecAdd( workspace->f[i], temp ); @@ -739,7 +739,7 @@ CUDA_GLOBAL void k_total_forces_part1( storage workspace, reax_list bond_list, } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void k_total_forces_part2( reax_atom *my_atoms, reax_list bond_list, storage workspace, int N ) { @@ -816,7 +816,7 @@ void Cuda_Total_Forces( reax_system *system, control_params *control, cudaCheckError( ); } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* post processing for the atomic forces */ k_total_forces_part2 <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, *(lists[BONDS]), *(workspace->d_workspace), system->N ); diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h index ccec119f..0212e42a 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.h +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h @@ -118,7 +118,7 @@ CUDA_DEVICE static inline int Cuda_BOp( reax_list bond_list, real bo_cut, bo_ij->Cdbopi = 0.0; bo_ij->Cdbopi2 = 0.0; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) ibond->ae_CdDelta = 0.0; ibond->va_CdDelta = 0.0; rvec_MakeZero( ibond->va_f ); @@ -174,7 +174,7 @@ CUDA_DEVICE static inline int Cuda_BOp( reax_list bond_list, real bo_cut, bo_ji->Cdbopi = 0.0; bo_ji->Cdbopi2 = 0.0; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) jbond->ae_CdDelta = 0.0; jbond->va_CdDelta = 0.0; rvec_MakeZero( jbond->va_f ); diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu index 9cd4e0a4..8ffe75d3 100644 --- a/PG-PuReMD/src/cuda/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_bonds.cu @@ -30,14 +30,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, single_body_parameters *sbp, two_body_parameters *tbp, storage p_workspace, reax_list p_bond_list, int n, int num_atom_types, - real *e_bond ) + real *e_bond_g ) { int i, j, pj; int start_i, end_i; int type_i, type_j; - real ebond, pow_BOs_be2, exp_be12, CEbo; + real pow_BOs_be2, exp_be12, CEbo, e_bond_l; real gp3, gp4, gp7, gp10, gp37; - real exphu, exphua1, exphub1, exphuov, hulpov, estriph; + real exphu, exphua1, exphub1, exphuov, hulpov; real decobdbo, decobdboua, decobdboub; single_body_parameters *sbp_i, *sbp_j; two_body_parameters *twbp; @@ -59,6 +59,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, gp7 = gp.l[7]; gp10 = gp.l[10]; gp37 = (int) gp.l[37]; + e_bond_l = 0.0; start_i = Start_Index( i, bond_list ); end_i = End_Index( i, bond_list ); @@ -82,32 +83,15 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, * (1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2); /* calculate bond energy */ - ebond = -twbp->De_s * bo_ij->BO_s * exp_be12 + e_bond_l += -twbp->De_s * bo_ij->BO_s * exp_be12 - twbp->De_p * bo_ij->BO_pi - twbp->De_pp * bo_ij->BO_pi2; - e_bond[i] += ebond; /* calculate derivatives of bond orders */ bo_ij->Cdbo += CEbo; bo_ij->Cdbopi -= CEbo + twbp->De_p; bo_ij->Cdbopi2 -= CEbo + twbp->De_pp; -#if defined(TEST_ENERGY) - //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->ebond, "%6d%6d%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - bo_ij->BO, ebond, data->my_en.e_bond ); -#endif - -#if defined(TEST_FORCES) - Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); - Add_dBOpinpi2( system, lists, i, pj, - -(CEbo + twbp->De_p), - -(CEbo + twbp->De_pp), - workspace->f_be, workspace->f_be ); -#endif - /* Stabilisation terminal triple bond */ if ( bo_ij->BO >= 1.00 ) { @@ -127,8 +111,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j])); hulpov = 1.0 / (1.0 + 25.0 * exphuov); - estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); - e_bond[i] += estriph; + e_bond_l += gp10 * exphu * hulpov * (exphua1 + exphub1); decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * ( gp3 - 2.0 * gp7 * (bo_ij->BO - 2.5) ); @@ -140,21 +123,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, bo_ij->Cdbo += decobdbo; workspace->CdDelta[i] += decobdboua; workspace->CdDelta[j] += decobdboub; - -#if defined(TEST_ENERGY) - //fprintf( out_control->ebond, - // "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - // system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - // estriph, decobdbo, decobdboua, decobdboub ); -#endif - -#if defined(TEST_FORCES) - Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); - Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); - Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); -#endif } } } } + +#if !defined(CUDA_ACCUM_ATOMIC) + e_bond_g[i] = e_bond_l; +#else + atomicAdd( (double *) e_bond_g, (double) e_bond_l ); +#endif } diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 6c658975..6e59636a 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -223,7 +223,6 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i { int i, j, pj, start_i, end_i; rvec x_i; - reax_atom *atom_j; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -240,19 +239,18 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i for ( pj = start_i; pj < end_i; ++pj ) { j = far_nbr_list.far_nbr_list.nbr[pj]; - atom_j = &my_atoms[j]; if ( i < j ) { - far_nbr_list.far_nbr_list.dvec[pj][0] = atom_j->x[0] - x_i[0]; - far_nbr_list.far_nbr_list.dvec[pj][1] = atom_j->x[1] - x_i[1]; - far_nbr_list.far_nbr_list.dvec[pj][2] = atom_j->x[2] - x_i[2]; + far_nbr_list.far_nbr_list.dvec[pj][0] = my_atoms[j].x[0] - x_i[0]; + far_nbr_list.far_nbr_list.dvec[pj][1] = my_atoms[j].x[1] - x_i[1]; + far_nbr_list.far_nbr_list.dvec[pj][2] = my_atoms[j].x[2] - x_i[2]; } else { - far_nbr_list.far_nbr_list.dvec[pj][0] = x_i[0] - atom_j->x[0]; - far_nbr_list.far_nbr_list.dvec[pj][1] = x_i[1] - atom_j->x[1]; - far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - atom_j->x[2]; + far_nbr_list.far_nbr_list.dvec[pj][0] = x_i[0] - my_atoms[j].x[0]; + far_nbr_list.far_nbr_list.dvec[pj][1] = x_i[1] - my_atoms[j].x[1]; + far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - my_atoms[j].x[2]; } far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] ); } @@ -263,7 +261,7 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i * in the far neighbors list if it's a NOT re-neighboring step */ CUDA_GLOBAL void k_init_distance_opt( reax_atom *my_atoms, reax_list far_nbr_list, int N ) { - int j, pj, start_i, end_i, thread_id, warp_id, lane_id; + int j, pj, start_i, end_i, thread_id, warp_id, lane_id, itr; __shared__ rvec x_i; thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -284,7 +282,7 @@ CUDA_GLOBAL void k_init_distance_opt( reax_atom *my_atoms, reax_list far_nbr_lis __syncthreads( ); /* update distance and displacement vector between atoms i and j (i-j) */ - for ( pj = start_i + lane_id; pj < end_i; pj += 32 ) + for ( itr = 0, pj = start_i + lane_id; itr < (end_i - start_i + 0x0000001F) >> 5; ++itr ) { j = far_nbr_list.far_nbr_list.nbr[pj]; @@ -301,6 +299,8 @@ CUDA_GLOBAL void k_init_distance_opt( reax_atom *my_atoms, reax_list far_nbr_lis far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - my_atoms[j].x[2]; } far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] ); + + pj += warpSize; } } @@ -694,7 +694,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp, hbond_list.hbond_list[ihb_top].scl = -1; hbond_list.hbond_list[ihb_top].ptr = pj; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) hbond_list.hbond_list[ihb_top].sym_index = -1; rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f ); #endif @@ -734,7 +734,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp, } hbond_list.hbond_list[ihb_top].ptr = pj; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) hbond_list.hbond_list[ihb_top].sym_index = -1; rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f ); #endif @@ -750,7 +750,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp, hbond_list.hbond_list[ihb_top].scl = -1; hbond_list.hbond_list[ihb_top].ptr = pj; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) hbond_list.hbond_list[ihb_top].sym_index = -1; rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f ); #endif @@ -1113,7 +1113,7 @@ CUDA_GLOBAL void k_update_sym_dbond_indices( reax_list bond_list, int N ) } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms, reax_list hbond_list, int N ) { @@ -1125,7 +1125,6 @@ CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms, thread_id = blockIdx.x * blockDim.x + threadIdx.x; warp_id = thread_id >> 5; - lane_id = thread_id & 0x0000001F; if ( warp_id > N ) { @@ -1133,6 +1132,7 @@ CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms, } i = warp_id; + lane_id = thread_id & 0x0000001F; start = Start_Index( my_atoms[i].Hindex, &hbond_list ); end = End_Index( my_atoms[i].Hindex, &hbond_list ); j = start + lane_id; @@ -1200,7 +1200,7 @@ CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list hbond_list, int k = hbond_list.hbond_list[pj].nbr; hbond_jk = &hbond_list.hbond_list[pj]; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n", rank, step, my_atoms[i].Hindex, k, hbond_jk->hb_f[0], @@ -1564,14 +1564,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, if ( renbr == FALSE && dist_done == FALSE ) { + blocks = system->N * 32 / DEF_BLOCK_SIZE + + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); -// blocks = system->N * 32 / DEF_BLOCK_SIZE -// + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); - - k_init_distance <<< control->blocks_n, control->block_size_n >>> - ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); -// k_init_distance_opt <<< blocks, DEF_BLOCK_SIZE >>> +// k_init_distance <<< control->blocks_n, control->block_size_n >>> // ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); + k_init_distance_opt <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N ); cudaCheckError( ); dist_done = TRUE; @@ -1713,7 +1712,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, ( *(lists[BONDS]), system->N ); cudaCheckError( ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) if ( control->hbond_cut > 0.0 && system->numH > 0 ) { blocks = system->N * 32 / DEF_BLOCK_SIZE @@ -1769,10 +1768,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, static int compute_bonded_part1 = FALSE; real *spad; rvec *rvec_spad; -#if defined(DEBUG_FOCUS) - real t_start, t_elapsed; -#endif +#if !defined(CUDA_ACCUM_ATOMIC) cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, MAX( sizeof(real) * system->n, MAX( sizeof(real) * 3 * system->n, @@ -1781,6 +1778,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks )))), "Cuda_Compute_Bonded_Forces::workspace->scratch" ); spad = (real *) workspace->scratch; +#endif update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; ret = SUCCESS; @@ -1810,55 +1808,76 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaCheckError( ); /* 2. Bond Energy Interactions */ - cuda_memset( spad, 0, sizeof(real) * system->n, - "Compute_Bonded_Forces::spad" ); +#if defined(CUDA_ACCUM_ATOMIC) + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_bond, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_bond" ); +#endif Cuda_Bonds <<< control->blocks, control->block_size, sizeof(real) * control->block_size >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), *(lists[BONDS]), - system->n, system->reax_param.num_atom_types, spad ); + system->n, system->reax_param.num_atom_types, +#if !defined(CUDA_ACCUM_ATOMIC) + spad +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_bond +#endif + ); cudaCheckError( ); - /* reduction for E_BE */ +#if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) { Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, system->n ); } +#endif /* 3. Atom Energy Interactions */ - cuda_memset( spad, 0, sizeof(real) * 3 * system->n, - "Compute_Bonded_Forces::spad" ); +#if defined(CUDA_ACCUM_ATOMIC) + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_lp, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_lp" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ov, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_ov" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_un, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_un" ); +#endif Cuda_Atom_Energy_Part1 <<< control->blocks, control->block_size >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace), *(lists[BONDS]), system->n, system->reax_param.num_atom_types, - spad, &spad[system->n], &spad[2 * system->n] ); +#if !defined(CUDA_ACCUM_ATOMIC) + spad, &spad[system->n], &spad[2 * system->n] +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_lp, + &((simulation_data *)data->d_simulation_data)->my_en.e_ov, + &((simulation_data *)data->d_simulation_data)->my_en.e_un +#endif + ); cudaCheckError( ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) Cuda_Atom_Energy_Part2 <<< control->blocks, control->block_size >>> ( *(lists[BONDS]), *(workspace->d_workspace), system->n ); cudaCheckError( ); #endif +#if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) { - /* reduction for E_Lp */ Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, system->n ); - /* reduction for E_Ov */ Cuda_Reduction_Sum( &spad[system->n], &((simulation_data *)data->d_simulation_data)->my_en.e_ov, system->n ); - /* reduction for E_Un */ Cuda_Reduction_Sum( &spad[2 * system->n], &((simulation_data *)data->d_simulation_data)->my_en.e_un, system->n ); } +#endif compute_bonded_part1 = TRUE; } @@ -1869,7 +1888,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, "Cuda_Compute_Bonded_Forces::workspace->scratch" ); thbody = (int *) workspace->scratch; - spad = (real *) workspace->scratch; /* in case scratch gets reallocated above, changing the pointer */ + /* in case scratch gets reallocated above, reassign scratch pointer */ + spad = (real *) workspace->scratch; ret = Cuda_Estimate_Storage_Three_Body( system, control, data, workspace, lists, thbody ); @@ -1878,32 +1898,45 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, { Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices, lists ); - cuda_memset( spad, 0, - (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n, - "Cuda_Compute_Bonded_Forces::spad" ); +#if defined(CUDA_ACCUM_ATOMIC) + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ang, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_ang" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_pen, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_pen" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_coa, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_coa" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press, + 0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" ); +#endif Cuda_Valence_Angles_Part1 <<< control->blocks_n, control->block_size_n >>> - ( system->d_my_atoms, system->reax_param.d_gp, + ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_thbp, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]), system->n, system->N, system->reax_param.num_atom_types, - spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) ); +#if !defined(CUDA_ACCUM_ATOMIC) + spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_ang, + &((simulation_data *)data->d_simulation_data)->my_en.e_pen, + &((simulation_data *)data->d_simulation_data)->my_en.e_coa, + &((simulation_data *)data->d_simulation_data)->my_ext_press +#endif + ); cudaCheckError( ); +#if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) { - /* reduction for E_Ang */ Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, system->N ); - /* reduction for E_Pen */ Cuda_Reduction_Sum( &spad[system->N], &((simulation_data *)data->d_simulation_data)->my_en.e_pen, system->N ); - /* reduction for E_Coa */ Cuda_Reduction_Sum( &spad[2 * system->N], &((simulation_data *)data->d_simulation_data)->my_en.e_coa, system->N ); @@ -1913,7 +1946,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, { rvec_spad = (rvec *) (&spad[3 * system->N]); - /* reduction for ext_pres */ k_reduction_rvec <<< control->blocks_n, control->block_size_n, sizeof(rvec) * (control->block_size_n / 32) >>> ( rvec_spad, &rvec_spad[system->N], system->N ); @@ -1929,8 +1961,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, // &((simulation_data *)data->d_simulation_data)->my_ext_press, // system->N ); } +#endif -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size_n >>> ( system->d_my_atoms, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[BONDS]), system->N ); @@ -1938,25 +1971,37 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, #endif /* 5. Torsion Angles Interactions */ - cuda_memset( spad, 0, (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, - "Cuda_Compute_Bonded_Forces::spad" ); +#if defined(CUDA_ACCUM_ATOMIC) + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_tor, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_tor" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_con, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_con" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press, + 0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" ); +#endif Cuda_Torsion_Angles_Part1 <<< control->blocks, control->block_size >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp, (control_params *) control->d_control_params, *(lists[BONDS]), *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n, system->reax_param.num_atom_types, - spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) ); +#if !defined(CUDA_ACCUM_ATOMIC) + spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_tor, + &((simulation_data *)data->d_simulation_data)->my_en.e_con, + &((simulation_data *)data->d_simulation_data)->my_ext_press +#endif + ); cudaCheckError( ); +#if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) { - /* reduction for E_Tor */ Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, system->n ); - /* reduction for E_Con */ Cuda_Reduction_Sum( &spad[system->n], &((simulation_data *)data->d_simulation_data)->my_en.e_con, system->n ); @@ -1966,7 +2011,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, { rvec_spad = (rvec *) (&spad[2 * system->n]); - /* reduction for ext_pres */ k_reduction_rvec <<< control->blocks, control->block_size, sizeof(rvec) * (control->block_size / 32) >>> ( rvec_spad, &rvec_spad[system->n], system->n ); @@ -1982,8 +2026,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, // &((simulation_data *)data->d_simulation_data)->my_ext_press, // system->n ); } +#endif -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size_n >>> ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]), system->N ); @@ -1993,9 +2038,12 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, /* 6. Hydrogen Bonds Interactions */ if ( control->hbond_cut > 0.0 && system->numH > 0 ) { - cuda_memset( spad, 0, - (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, - "Cuda_Compute_Bonded_Forces::spad" ); +#if defined(CUDA_ACCUM_ATOMIC) + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_hb, + 0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_hb" ); + cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press, + 0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" ); +#endif // hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + // (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); @@ -2009,12 +2057,18 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, *(workspace->d_workspace), *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]), system->n, system->reax_param.num_atom_types, - spad, (rvec *) (&spad[system->n]), system->my_rank ); +#if !defined(CUDA_ACCUM_ATOMIC) + spad, (rvec *) (&spad[system->n]), +#else + &((simulation_data *)data->d_simulation_data)->my_en.e_hb, + &((simulation_data *)data->d_simulation_data)->my_ext_press, +#endif + system->my_rank ); cudaCheckError( ); +#if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) { - /* reduction for E_HB */ Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_hb, system->n ); @@ -2024,7 +2078,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, { rvec_spad = (rvec *) (&spad[system->n]); - /* reduction for ext_pres */ k_reduction_rvec <<< control->blocks, control->block_size, sizeof(rvec) * (control->block_size / 32) >>> ( rvec_spad, &rvec_spad[system->n], system->n ); @@ -2040,8 +2093,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, // &((simulation_data *)data->d_simulation_data)->my_ext_press, // system->n ); } +#endif -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) Cuda_Hydrogen_Bonds_Part2 <<< control->blocks, control->block_size >>> ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]), system->n ); diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index c77f42ca..075c9fc7 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -38,7 +38,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par hbond_parameters *d_hbp, global_parameters gp, control_params *control, storage workspace, reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, int n, - int num_atom_types, real *data_e_hb, rvec *data_ext_press, int rank ) + int num_atom_types, real *e_hb_g, rvec *ext_press_g, int rank ) { int i, j, k, pi, pk; int type_i, type_j, type_k; @@ -47,9 +47,12 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par int itr, top; ivec rel_jk; real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; + rvec dvec_jk, temp, ext_press_l; +#if defined(CUDA_ACCUM_ATOMIC) + rvec f_i_l, f_j_l, f_k_l; +#endif hbond_parameters *hbp; bond_order_data *bo_ij; int nbr_jk; @@ -63,6 +66,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par return; } + e_hb_l = 0.0; + rvec_MakeZero( ext_press_l ); +#if defined(CUDA_ACCUM_ATOMIC) + rvec_MakeZero( f_i_l ); + rvec_MakeZero( f_j_l ); + rvec_MakeZero( f_k_l ); +#endif + /* discover the Hydrogen bonds between i-j-k triplets. * here j is H atom and there has to be some bond between i and j. * Hydrogen bond is between j and k. @@ -115,7 +126,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par rvec_Scale( dvec_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.dvec[nbr_jk] ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) rvec_MakeZero( phbond_jk->hb_f ); #endif @@ -150,7 +161,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par + r_jk / hbp->r0_hb - 2.0 ) ); e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - data_e_hb[j] += e_hb; + e_hb_l += e_hb; CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; CEhb2 = -0.5 * hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; @@ -163,116 +174,80 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par if ( control->virial == 0 ) { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* dcos terms */ rvec_ScaledAdd( pbond_ij->hb_f, CEhb2, dcos_theta_di ); - rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk ); /* dr terms */ rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk ); #else /* dcos terms */ - atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); - - atomic_rvecScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); - - atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk ); + rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di ); + rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk ); /* dr terms */ - atomic_rvecScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); - - atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk ); #endif } else { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* for pressure coupling, terms that are not related to bond order * derivatives are added directly into pressure vector/tensor */ /* dcos terms */ - rvec_Scale( force, CEhb2, dcos_theta_di ); - rvec_Add( pbond_ij->hb_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_di ); + rvec_Add( pbond_ij->hb_f, temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); - rvec_Scale( force, CEhb2, dcos_theta_dk ); - rvec_Add( phbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_dk ); + rvec_Add( phbond_jk->hb_f, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); /* dr terms */ rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_Scale( force, CEhb3 / r_jk, dvec_jk ); - rvec_Add( phbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); + rvec_Add( phbond_jk->hb_f, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); #else /* for pressure coupling, terms that are not related to bond order * derivatives are added directly into pressure vector/tensor */ /* dcos terms */ - rvec_Scale( force, CEhb2, dcos_theta_di ); - atomic_rvecAdd( workspace.f[i], force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_di ); + rvec_Add( f_i_l, temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); - atomic_rvecScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); - rvec_Scale( force, CEhb2, dcos_theta_dk ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_dk ); + rvec_Add( f_k_l, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); /* dr terms */ - atomic_rvecScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_Scale( force, CEhb3 / r_jk, dvec_jk ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); + rvec_Add( f_k_l, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); #endif } - -#if defined(TEST_ENERGY) - /* fprintf( out_control->ehb, - "%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n", - dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], - dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], - dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); - fprintf( out_control->ehb, "%24.15e%24.15e%24.15e\n", - CEhb1, CEhb2, CEhb3 ); */ - fprintf( out_control->ehb, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb ); -#endif - -#if defined(TEST_FORCES) - /* dbo term */ - Add_dBO( system, lists, j, pi, CEhb1, workspace.f_hb ); - - /* dcos terms */ - rvec_ScaledAdd( workspace.f_hb[i], CEhb2, dcos_theta_di ); - rvec_ScaledAdd( workspace.f_hb[j], CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( workspace.f_hb[k], CEhb2, dcos_theta_dk ); - - /* dr terms */ - rvec_ScaledAdd( workspace.f_hb[j], -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_ScaledAdd( workspace.f_hb[k], CEhb3 / r_jk, dvec_jk ); -#endif } } } @@ -282,6 +257,18 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par free( hblist ); } } + +#if !defined(CUDA_ACCUM_ATOMIC) + /* write conflicts for accumulating partial forces resolved by subsequent kernels */ + e_hb_g[j] = e_hb_l; + rvecCopy( ext_press_g[j], ext_press_l ); +#else + atomic_rvecAdd( workspace.f[i], f_i_l ); + atomic_rvecAdd( workspace.f[j], f_j_l ); + atomic_rvecAdd( workspace.f[k], f_k_l ); + atomicAdd( (double *) e_hb_g, (double) e_hb_l ); + atomic_rvecAdd( *ext_press_g, ext_press_l ); +#endif } @@ -289,7 +276,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body_parameters *sbp, hbond_parameters *d_hbp, global_parameters gp, control_params *control, storage workspace, reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, int n, - int num_atom_types, real *data_e_hb, rvec *data_ext_press ) + int num_atom_types, real *e_hb_g, rvec *ext_press_g ) { int i, j, k, pi, pk; int type_i, type_j, type_k; @@ -300,9 +287,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body int loopcount, count; ivec rel_jk; real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb1_l, CEhb2, CEhb3; rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; + rvec dvec_jk, temp, ext_press_l; +#if !defined(CUDA_ACCUM_ATOMIC) + rvec f_l, hb_f_l; +#else + rvec f_i_l, f_j_l, f_k_l; +#endif hbond_parameters *hbp; bond_order_data *bo_ij; int nbr_jk; @@ -311,8 +303,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body /* thread-local variables */ int thread_id, warp_id, lane_id, offset; unsigned int mask; - real e_hb_s, CEhb1_s; - rvec f_s, hb_f_s; thread_id = blockIdx.x * blockDim.x + threadIdx.x; warp_id = thread_id >> 5; @@ -331,8 +321,15 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body * Hydrogen bond is between j and k. * so in this function i->X, j->H, k->Z when we map * variables onto the ones in the handout.*/ - e_hb_s = 0.0; - rvec_MakeZero( f_s ); + e_hb_l = 0.0; + rvec_MakeZero( ext_press_l ); +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_MakeZero( f_l ); +#else + rvec_MakeZero( f_i_l ); + rvec_MakeZero( f_j_l ); + rvec_MakeZero( f_k_l ); +#endif /* j has to be of type H */ if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM ) @@ -366,8 +363,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body pi = hblist[itr]; pbond_ij = &bond_list.bond_list[pi]; i = pbond_ij->nbr; - rvec_MakeZero( hb_f_s ); - CEhb1_s = 0.0; +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_MakeZero( hb_f_l ); +#endif + CEhb1_l = 0.0; //for( pk = hb_start_j; pk < hb_end_j; ++pk ) { loopcount = (hb_end_j - hb_start_j) / warpSize + @@ -420,7 +419,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body + r_jk / hbp->r0_hb - 2.0 ) ); e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - e_hb_s += e_hb; + e_hb_l += e_hb; CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; CEhb2 = -0.5 * hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; @@ -429,88 +428,82 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body /* hydrogen bond forces */ /* dbo term */ - CEhb1_s += CEhb1; + CEhb1_l += CEhb1; if ( control->virial == 0 ) { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* dcos terms */ - rvec_ScaledAdd( hb_f_s, CEhb2, dcos_theta_di ); - - rvec_ScaledAdd( f_s, CEhb2, dcos_theta_dj ); - + rvec_ScaledAdd( hb_f_l, CEhb2, dcos_theta_di ); + rvec_ScaledAdd( f_l, CEhb2, dcos_theta_dj ); rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk ); /* dr terms */ - rvec_ScaledAdd( f_s, -1.0 * CEhb3 / r_jk, dvec_jk ); - + rvec_ScaledAdd( f_l, -1.0 * CEhb3 / r_jk, dvec_jk ); rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk ); #else /* dcos terms */ - rvec_ScaledAdd( hb_f_s, CEhb2, dcos_theta_di ); - - rvec_ScaledAdd( f_s, CEhb2, dcos_theta_dj ); - - atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk ); + rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di ); + rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk ); /* dr terms */ - rvec_ScaledAdd( f_s, -1.0 * CEhb3 / r_jk, dvec_jk ); - - atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk ); #endif } else { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* for pressure coupling, terms that are not related to bond order * derivatives are added directly into pressure vector/tensor */ /* dcos terms */ - rvec_Scale( force, CEhb2, dcos_theta_di ); - rvec_Add( pbond_ij->hb_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_di ); + rvec_Add( pbond_ij->hb_f, temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); - rvec_Scale( force, CEhb2, dcos_theta_dk ); - rvec_Add( phbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_dk ); + rvec_Add( phbond_jk->hb_f, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); /* dr terms */ rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_Scale( force, CEhb3 / r_jk, dvec_jk ); - rvec_Add( phbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); + rvec_Add( phbond_jk->hb_f, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); #else /* for pressure coupling, terms that are not related to bond order * derivatives are added directly into pressure vector/tensor */ /* dcos terms */ - rvec_Scale( force, CEhb2, dcos_theta_di ); - atomic_rvecAdd( workspace.f[i], force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_di ); + rvec_Add( f_i_l, temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); - atomic_rvecScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); - rvec_Scale( force, CEhb2, dcos_theta_dk ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb2, dcos_theta_dk ); + rvec_Add( f_k_l, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); /* dr terms */ - atomic_rvecScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_Scale( force, CEhb3 / r_jk, dvec_jk ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); + rvec_Add( f_k_l, temp ); + rvec_iMultiply( temp, rel_jk, temp ); + rvec_Add( ext_press_l, temp ); #endif } @@ -524,20 +517,20 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body /* warp-level sums using registers within a warp */ for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) { - CEhb1_s += __shfl_down_sync( mask, CEhb1_s, offset ); - hb_f_s[0] += __shfl_down_sync( mask, hb_f_s[0], offset ); - hb_f_s[1] += __shfl_down_sync( mask, hb_f_s[1], offset ); - hb_f_s[2] += __shfl_down_sync( mask, hb_f_s[2], offset ); + CEhb1_l += __shfl_down_sync( mask, CEhb1_l, offset ); +#if !defined(CUDA_ACCUM_ATOMIC) + hb_f_l[0] += __shfl_down_sync( mask, hb_f_l[0], offset ); + hb_f_l[1] += __shfl_down_sync( mask, hb_f_l[1], offset ); + hb_f_l[2] += __shfl_down_sync( mask, hb_f_l[2], offset ); +#endif } /* first thread within a warp writes warp-level sum to shared memory */ if ( lane_id == 0 ) { - bo_ij->Cdbo += CEhb1_s ; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) - rvec_Add( pbond_ij->hb_f, hb_f_s ); -#else - atomic_rvecAdd( workspace.f[i], hb_f_s ); + bo_ij->Cdbo += CEhb1_l ; +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_Add( pbond_ij->hb_f, hb_f_l ); #endif } } // for loop hbonds end @@ -546,26 +539,46 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body /* warp-level sums using registers within a warp */ for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) { - e_hb_s += __shfl_down_sync( mask, e_hb_s, offset ); - f_s[0] += __shfl_down_sync( mask, f_s[0], offset ); - f_s[1] += __shfl_down_sync( mask, f_s[1], offset ); - f_s[2] += __shfl_down_sync( mask, f_s[2], offset ); +#if !defined(CUDA_ACCUM_ATOMIC) + f_l[0] += __shfl_down_sync( mask, f_l[0], offset ); + f_l[1] += __shfl_down_sync( mask, f_l[1], offset ); + f_l[2] += __shfl_down_sync( mask, f_l[2], offset ); +#else + f_i_l[0] += __shfl_down_sync( mask, f_i_l[0], offset ); + f_i_l[1] += __shfl_down_sync( mask, f_i_l[1], offset ); + f_i_l[2] += __shfl_down_sync( mask, f_i_l[2], offset ); + f_j_l[0] += __shfl_down_sync( mask, f_j_l[0], offset ); + f_j_l[1] += __shfl_down_sync( mask, f_j_l[1], offset ); + f_j_l[2] += __shfl_down_sync( mask, f_j_l[2], offset ); + f_k_l[0] += __shfl_down_sync( mask, f_k_l[0], offset ); + f_k_l[1] += __shfl_down_sync( mask, f_k_l[1], offset ); + f_k_l[2] += __shfl_down_sync( mask, f_k_l[2], offset ); +#endif + e_hb_l += __shfl_down_sync( mask, e_hb_l, offset ); + ext_press_l[0] += __shfl_down_sync( mask, ext_press_l[0], offset ); + ext_press_l[1] += __shfl_down_sync( mask, ext_press_l[1], offset ); + ext_press_l[2] += __shfl_down_sync( mask, ext_press_l[2], offset ); } /* first thread within a warp writes warp-level sums to global memory */ if ( lane_id == 0 ) { - data_e_hb[j] += e_hb_s; -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) - rvec_Add( workspace.f[j], f_s ); +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_Add( workspace.f[j], f_l ); + e_hb_g[j] = e_hb_l; + rvecCopy( ext_press_g[j], ext_press_l ); #else - atomic_rvecAdd( workspace.f[j], f_s ); + atomic_rvecAdd( workspace.f[i], f_i_l ); + atomic_rvecAdd( workspace.f[j], f_j_l ); + atomic_rvecAdd( workspace.f[k], f_k_l ); + atomicAdd( (double *) e_hb_g, (double) e_hb_l ); + atomic_rvecAdd( *ext_press_g, ext_press_l ); #endif } } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* Accumulate forces stored in the bond list * using a one thread per atom implementation */ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part2( reax_atom *atoms, @@ -691,7 +704,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part3_opt( reax_atom *atoms, /* thread-local variables */ int thread_id, warp_id, lane_id, offset; unsigned int mask; - rvec hb_f_s; + rvec hb_f_l; thread_id = blockIdx.x * blockDim.x + threadIdx.x; warp_id = thread_id >> 5; @@ -707,14 +720,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part3_opt( reax_atom *atoms, start = Start_Index( atoms[j].Hindex, &hbond_list ); end = End_Index( atoms[j].Hindex, &hbond_list ); pj = start + lane_id; - rvec_MakeZero( hb_f_s ); + rvec_MakeZero( hb_f_l ); while ( pj < end ) { nbr_pj = &hbond_list.hbond_list[pj]; sym_index_nbr = &hbond_list.hbond_list[ nbr_pj->sym_index ]; - rvec_Add( hb_f_s, sym_index_nbr->hb_f ); + rvec_Add( hb_f_l, sym_index_nbr->hb_f ); pj += warpSize; } @@ -723,15 +736,15 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part3_opt( reax_atom *atoms, /* warp-level sums using registers within a warp */ for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) { - hb_f_s[0] += __shfl_down_sync( mask, hb_f_s[0], offset ); - hb_f_s[1] += __shfl_down_sync( mask, hb_f_s[1], offset ); - hb_f_s[2] += __shfl_down_sync( mask, hb_f_s[2], offset ); + hb_f_l[0] += __shfl_down_sync( mask, hb_f_l[0], offset ); + hb_f_l[1] += __shfl_down_sync( mask, hb_f_l[1], offset ); + hb_f_l[2] += __shfl_down_sync( mask, hb_f_l[2], offset ); } /* first thread within a warp writes warp-level sums to global memory */ if ( lane_id == 0 ) { - rvec_Add( workspace.f[j], hb_f_s ); + rvec_Add( workspace.f[j], hb_f_l ); } } #endif diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h index ff3a68d6..3c481134 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h @@ -36,7 +36,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *, single_body_paramet reax_list, reax_list, reax_list, int, int, real *, rvec * ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part2( reax_atom *, storage, reax_list, int ); diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu index 67f49c3d..b3b92fed 100644 --- a/PG-PuReMD/src/cuda/cuda_multi_body.cu +++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu @@ -30,13 +30,13 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters gp, single_body_parameters *sbp, two_body_parameters *tbp, storage workspace, reax_list bond_list, int n, int num_atom_types, - real *data_e_lp, real *dat_e_ov, real *data_e_un ) + real *e_lp_g, real *e_ov_g, real *e_un_g ) { int i, j, pj, type_i, type_j; real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; + real expvd2, inv_expvd2, dElp, CElp, DlpVi; + real Di, vov3, deahu2dbo, deahu2dsbo; + real CEover1, CEover2, CEover3, CEover4, e_lp_l; real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; real exp_ovun2n, exp_ovun6, exp_ovun8; real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; @@ -71,8 +71,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters inv_expvd2 = 1.0 / (1.0 + expvd2 ); /* calculate the energy */ - e_lp = p_lp2 * workspace.Delta_lp[i] * inv_expvd2; - data_e_lp[i] += e_lp; + e_lp_l = p_lp2 * workspace.Delta_lp[i] * inv_expvd2; dElp = p_lp2 * inv_expvd2 + 75.0 * p_lp2 * workspace.Delta_lp[i] * expvd2 * SQR(inv_expvd2); @@ -80,18 +79,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters workspace.CdDelta[i] += CElp; // lp - 1st term -#if defined(TEST_ENERGY) - fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", - p_lp2, workspace.Delta_lp_temp[i], expvd2, dElp ); - fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, workspace.nlp[i], - e_lp, data->my_en.e_lp ); -#endif - -#if defined(TEST_FORCES) - Add_dDelta( system, lists, i, CElp, workspace.f_lp ); // lp - 1st term -#endif - /* correction for C2 */ if ( gp.l[5] > 0.001 && Cuda_strncmp( sbp[ type_i ].name, "C", @@ -115,8 +102,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters if ( vov3 > 3.0 ) { - e_lph = p_lp3 * SQR( vov3 - 3.0 ); - data_e_lp[i] += e_lph; + e_lp_l += p_lp3 * SQR( vov3 - 3.0 ); deahu2dbo = 2.0 * p_lp3 * (vov3 - 3.0); deahu2dsbo = 2.0 * p_lp3 * (vov3 - 3.0) @@ -124,25 +110,20 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters bo_ij->Cdbo += deahu2dbo; workspace.CdDelta[i] += deahu2dsbo; - -#if defined(TEST_ENERGY) - fprintf( out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - e_lph, deahu2dbo, deahu2dsbo ); -#endif - -#if defined(TEST_FORCES) - Add_dBO( system, lists, i, pj, deahu2dbo, workspace.f_lp ); - Add_dDelta( system, lists, i, deahu2dsbo, workspace.f_lp ); -#endif } } } } } +#if !defined(CUDA_ACCUM_ATOMIC) + e_lp_g[i] = e_lp_l; +#else + atomicAdd( (double *) e_lp_g, (double) e_lp_l ); +#endif + /* over-coordination energy */ - if( sbp_i->mass > 21.0 ) + if ( sbp_i->mass > 21.0 ) { dfvl = 0.0; } @@ -179,8 +160,11 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1.0e-8); CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - e_ov = sum_ovun1 * CEover1; - dat_e_ov[i] += e_ov; +#if !defined(CUDA_ACCUM_ATOMIC) + e_ov_g[i] = sum_ovun1 * CEover1; +#else + atomicAdd( (double *) e_ov_g, (double) (sum_ovun1 * CEover1) ); +#endif CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 )); @@ -201,7 +185,11 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - data_e_un[i] += e_un; +#if !defined(CUDA_ACCUM_ATOMIC) + e_un_g[i] = e_un; +#else + atomicAdd( (double *) e_un_g, (double) e_un ); +#endif CEunder1 = inv_exp_ovun2n * ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 + p_ovun2 * e_un * exp_ovun2n ); @@ -229,7 +217,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters num_atom_types) ]; bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) pbond_ij->ae_CdDelta += CEover4 * (1.0 - dfvl * workspace.dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a #else @@ -241,7 +229,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters bo_ij->Cdbopi2 += CEover4 * (workspace.Delta[j] - dfvl * workspace.Delta_lp_temp[j]); // OvCoor-3b -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) pbond_ij->ae_CdDelta += CEunder4 * (1.0 - dfvl * workspace.dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a #else @@ -302,27 +290,10 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters workspace.f_un, workspace.f_un ); // UnCoor - 2b #endif } - -#if defined(TEST_ENERGY) - //fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n", - //fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n", - // system->my_atoms[i].orig_id, workspace.nlp[i], - // e_lp, data->my_en.e_lp ); - - //fprintf( out_control->eov, "%6d%24.15e%24.15e\n", - fprintf( out_control->eov, "%6d%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - e_ov, data->my_en.e_ov + data->my_en.e_un ); - - //fprintf( out_control->eun, "%6d%24.15e%24.15e\n", - fprintf( out_control->eun, "%6d%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - e_un, data->my_en.e_ov + data->my_en.e_un ); -#endif } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* Traverse bond list and accumulate lone pair contributions from bonded neighbors */ CUDA_GLOBAL void Cuda_Atom_Energy_Part2( reax_list bond_list, storage workspace, int n ) diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_multi_body.h index 16a6a938..156f07c5 100644 --- a/PG-PuReMD/src/cuda/cuda_multi_body.h +++ b/PG-PuReMD/src/cuda/cuda_multi_body.h @@ -29,7 +29,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *, global_parameters, single_body_parameters *, two_body_parameters *, storage, reax_list, int, int, real *, real *, real *); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void Cuda_Atom_Energy_Part2( reax_list, storage, int ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu index e0e10fe2..915ada83 100644 --- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu @@ -144,7 +144,7 @@ CUDA_DEVICE static real Calculate_Omega( const rvec dvec_ij, real r_ij, const rv CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_parameters gp, four_body_header *d_fbp, control_params *control, reax_list bond_list, reax_list thb_list, storage workspace, int n, int num_atom_types, - real *data_e_tor, real *data_e_con, rvec *data_ext_press ) + real *e_tor_g, real *e_con_g, rvec *ext_press_g ) { int i, j, k, l, pi, pj, pk, pl, pij, plk; int type_i, type_j, type_k, type_l; @@ -169,10 +169,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; real Cconj, CEconj1, CEconj2, CEconj3; - real CEconj4, CEconj5, CEconj6; - real e_tor, e_con; - rvec dvec_li; - rvec force, ext_press; + real CEconj4, CEconj5, CEconj6, e_tor_l, e_con_l; + rvec dvec_li, temp, ext_press_l; ivec rel_box_jl; // rtensor total_rtensor, temp_rtensor; four_body_header *fbh; @@ -193,6 +191,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete p_tor3 = gp.l[24]; p_tor4 = gp.l[25]; p_cot2 = gp.l[27]; + e_tor_l = 0.0; + e_con_l = 0.0; + rvec_MakeZero( ext_press_l ); #if defined(DEBUG_FOCUS) num_frb_intrs = 0; #endif @@ -349,8 +350,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete // + fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) // + fbp->V3 * (0.5 + 2.0 * CUBE(cos_omega) - 1.5 * cos_omega); - e_tor = fn10 * sin_ijk * sin_jkl * CV; - data_e_tor[j] += e_tor; + e_tor_l += fn10 * sin_ijk * sin_jkl * CV; dfn11 = (-p_tor3 * exp_tor3_DjDk + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) @@ -386,9 +386,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete /* 4-body conjugation energy */ fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; - e_con = fbp->p_cot1 * fn12 + e_con_l += fbp->p_cot1 * fn12 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); - data_e_con[j] += e_con; Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); @@ -410,7 +409,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete /* end 4-body conjugation energy */ /* forces */ -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) bo_jk->Cdbopi += CEtors2; workspace.CdDelta[j] += CEtors3; pbond_jk->ta_CdDelta += CEtors3; @@ -428,7 +427,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete if ( control->virial == 0 ) { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) /* dcos_theta_ijk */ atomic_rvecScaledAdd( pbond_ij->ta_f, CEtors7 + CEconj4, p_ijk->dcos_dk ); @@ -484,203 +483,106 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete } else { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) ivec_Sum( rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box ); /* dcos_theta_ijk */ - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); - atomic_rvecAdd( pbond_ij->ta_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_dk ); + atomic_rvecAdd( pbond_ij->ta_f, temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); rvec_ScaledAdd( workspace.f[j], CEtors7 + CEconj4, p_ijk->dcos_dj ); - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); - atomic_rvecAdd( pbond_jk->ta_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di ); + atomic_rvecAdd( pbond_jk->ta_f, temp ); + rvec_iMultiply( temp, pbond_jk->rel_box, temp ); + rvec_Add( ext_press_l, temp ); /* dcos_theta_jkl */ rvec_ScaledAdd( workspace.f[j], CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); - atomic_rvecAdd( pbond_jk->ta_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj ); + atomic_rvecAdd( pbond_jk->ta_f, temp ); + rvec_iMultiply( temp, pbond_jk->rel_box, temp ); + rvec_Add( ext_press_l, temp ); - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); - rvec_Add( pbond_kl->ta_f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dk ); + rvec_Add( pbond_kl->ta_f, temp ); + rvec_iMultiply( temp, rel_box_jl, temp ); + rvec_Add( ext_press_l, temp ); /* dcos_omega */ - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); - atomic_rvecAdd( pbond_ij->ta_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_di ); + atomic_rvecAdd( pbond_ij->ta_f, temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); rvec_ScaledAdd( workspace.f[j], CEtors9 + CEconj6, dcos_omega_dj ); - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); - rvec_Add( pbond_jk->ta_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk ); + rvec_Add( pbond_jk->ta_f, temp ); + rvec_iMultiply( temp, pbond_jk->rel_box, temp ); + rvec_Add( ext_press_l, temp ); - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); - rvec_Add( pbond_kl->ta_f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dl ); + rvec_Add( pbond_kl->ta_f, temp ); + rvec_iMultiply( temp, rel_box_jl, temp ); + rvec_Add( ext_press_l, temp ); #else ivec_Sum( rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box ); /* dcos_theta_ijk */ - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); - atomic_rvecAdd( workspace.f[i], force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_dk ); + atomic_rvecAdd( workspace.f[i], temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); atomic_rvecScaledAdd( workspace.f[j], CEtors7 + CEconj4, p_ijk->dcos_dj ); - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di ); + atomic_rvecAdd( workspace.f[k], temp ); + rvec_iMultiply( temp, pbond_jk->rel_box, temp ); + rvec_Add( ext_press_l, temp ); /* dcos_theta_jkl */ atomic_rvecScaledAdd( workspace.f[j], CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj ); + atomic_rvecAdd( workspace.f[k], temp ); + rvec_iMultiply( temp, pbond_jk->rel_box, temp ); + rvec_Add( ext_press_l, temp ); - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); - atomic_rvecAdd( workspace.f[l], force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dk ); + atomic_rvecAdd( workspace.f[l], temp ); + rvec_iMultiply( temp, rel_box_jl, temp ); + rvec_Add( ext_press_l, temp ); /* dcos_omega */ - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); - atomic_rvecAdd( workspace.f[i], force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_di ); + atomic_rvecAdd( workspace.f[i], temp ); + rvec_iMultiply( temp, pbond_ij->rel_box, temp ); + rvec_Add( ext_press_l, temp ); atomic_rvecScaledAdd( workspace.f[j], CEtors9 + CEconj6, dcos_omega_dj ); - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk ); + atomic_rvecAdd( workspace.f[k], temp ); + rvec_iMultiply( temp, pbond_jk->rel_box, temp ); + rvec_Add( ext_press_l, temp ); - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); - rvec_Add( workspace.f[l], force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press[j], ext_press ); + rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dl ); + rvec_Add( workspace.f[l], temp ); + rvec_iMultiply( temp, rel_box_jl, temp ); + rvec_Add( ext_press_l, temp ); #endif } - -#if defined(TEST_ENERGY) - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - r_ij, r_jk, r_kl, cos_ijk, cos_jkl, sin_ijk, sin_jkl ); - fprintf( out_control->etor, "%12.8f\n", dfn11 ); */ - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEtors2, CEtors3, CEtors4, CEtors5, CEtors6, - CEtors7, CEtors8, CEtors9 ); */ - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); */ - - /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n", - fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ - - fprintf(out_control->etor, - //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, - RAD2DEG(omega), BOA_jk, e_tor, data->my_en.e_tor ); - - fprintf(out_control->econ, - //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, - RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, - e_con, data->my_en.e_con ); -#endif - -#if defined(TEST_FORCES) - /* Torsion Forces */ - Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.0, - workspace.f_tor, workspace.f_tor ); - Add_dDelta( system, lists, j, CEtors3, workspace.f_tor ); - Add_dDelta( system, lists, k, CEtors3, workspace.f_tor ); - Add_dBO( system, lists, j, pij, CEtors4, workspace.f_tor ); - Add_dBO( system, lists, j, pk, CEtors5, workspace.f_tor ); - Add_dBO( system, lists, k, plk, CEtors6, workspace.f_tor ); - - rvec_ScaledAdd( workspace.f_tor[i], - CEtors7, p_ijk->dcos_dk ); - rvec_ScaledAdd( workspace.f_tor[j], - CEtors7, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace.f_tor[k], - CEtors7, p_ijk->dcos_di ); - - rvec_ScaledAdd( workspace.f_tor[j], - CEtors8, p_jkl->dcos_di ); - rvec_ScaledAdd( workspace.f_tor[k], - CEtors8, p_jkl->dcos_dj ); - rvec_ScaledAdd( workspace.f_tor[l], - CEtors8, p_jkl->dcos_dk ); - - rvec_ScaledAdd( workspace.f_tor[i], - CEtors9, dcos_omega_di ); - rvec_ScaledAdd( workspace.f_tor[j], - CEtors9, dcos_omega_dj ); - rvec_ScaledAdd( workspace.f_tor[k], - CEtors9, dcos_omega_dk ); - rvec_ScaledAdd( workspace.f_tor[l], - CEtors9, dcos_omega_dl ); - - /* Conjugation Forces */ - Add_dBO( system, lists, j, pij, CEconj1, workspace.f_con ); - Add_dBO( system, lists, j, pk, CEconj2, workspace.f_con ); - Add_dBO( system, lists, k, plk, CEconj3, workspace.f_con ); - - rvec_ScaledAdd( workspace.f_con[i], - CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( workspace.f_con[j], - CEconj4, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace.f_con[k], - CEconj4, p_ijk->dcos_di ); - - rvec_ScaledAdd( workspace.f_con[j], - CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( workspace.f_con[k], - CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( workspace.f_con[l], - CEconj5, p_jkl->dcos_dk ); - - rvec_ScaledAdd( workspace.f_con[i], - CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace.f_con[j], - CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( workspace.f_con[k], - CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( workspace.f_con[l], - CEconj6, dcos_omega_dl ); -#endif } // pl check ends } // pl loop ends } // pi check ends @@ -689,10 +591,20 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete } // j<k && j-k neighbor check ends } // pk loop ends // } // j loop + +#if !defined(CUDA_ACCUM_ATOMIC) + e_tor_g[j] = e_tor_l; + e_con_g[j] = e_con_l; + rvec_Copy( e_ext_press_g[j], e_ext_press_l ); +#else + atomicAdd( (double *) e_tor_g, (double) e_tor_l ); + atomicAdd( (double *) e_con_g, (double) e_con_l ); + atomic_rvecAdd( *ext_press_g, ext_press_l ); +#endif } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void Cuda_Torsion_Angles_Part2( reax_atom *my_atoms, storage workspace, reax_list bond_list, int N ) { diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_torsion_angles.h index fbf4ca96..71fa7614 100644 --- a/PG-PuReMD/src/cuda/cuda_torsion_angles.h +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.h @@ -29,7 +29,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *, global_parameters, four_body_header *, control_params *, reax_list, reax_list, storage, int, int, real *, real *, rvec * ); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void Cuda_Torsion_Angles_Part2( reax_atom *, storage, reax_list, int ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu index 32ac5543..fc51dfc9 100644 --- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu @@ -21,7 +21,7 @@ #include "cuda_valence_angles.h" -#if defined(CUDA_ACCUM_FORCE_ATOMIC) +#if defined(CUDA_ACCUM_ATOMIC) #include "cuda_helpers.h" #endif #include "cuda_list.h" @@ -36,7 +36,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, global_parameters gp, single_body_parameters *sbp, three_body_header *d_thbh, control_params *control, storage workspace, reax_list bond_list, reax_list thb_list, int n, int N, int num_atom_types, - real *data_e_ang, real *data_e_pen, real *data_e_coa, rvec *my_ext_press ) + real *e_ang_g, real *e_pen_g, real *e_coa_g, rvec *ext_press_g ) { int i, j, pi, k, pk, t; int type_i, type_j, type_k; @@ -53,13 +53,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj; real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; real CEpen1, CEpen2, CEpen3; - real e_ang, e_coa, e_pen; + real e_ang_l, e_coa, e_coa_l, e_pen, e_pen_l; real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; real Cf7ij, Cf7jk, Cf8j, Cf9j; real f7_ij, f7_jk, f8_Dj, f9_Dj; real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; real BOA_ij, BOA_jk; - rvec force, ext_press; + rvec rvec_temp, ext_press_l; three_body_header *thbh; three_body_parameters *thbp; three_body_interaction_data *p_ijk; @@ -85,12 +85,17 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, p_val9 = gp.l[16]; p_val10 = gp.l[17]; //num_thb_intrs = j * THREE_BODY_OFFSET; + e_ang_l = 0.0; + e_coa_l = 0.0; + e_pen_l = 0.0; + rvec_MakeZero( ext_press_l ); type_j = my_atoms[j].type; start_j = Start_Index( j, &bond_list ); end_j = End_Index( j, &bond_list ); p_val3 = sbp[ type_j ].p_val3; p_val5 = sbp[ type_j ].p_val5; + /* sum of pi and pi-pi BO terms for all neighbors of atom j, * used in determining the equilibrium angle between i-j-k */ SBOp = 0.0; @@ -101,7 +106,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, for ( t = start_j; t < end_j; ++t ) { bo_jt = &bond_list.bond_list[t].bo_data; - SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); + SBOp += bo_jt->BO_pi + bo_jt->BO_pi2; temp = SQR( bo_jt->BO ); temp *= temp; temp *= temp; @@ -236,275 +241,149 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, /* Fortran ReaxFF code hard-codes the constant below * as of 2019-02-27, so use that for now */ - if ( j < n && BOA_jk >= 0.0 && (bo_ij->BO * bo_jk->BO) >= 0.00001 ) -// if ( j < n && BOA_jk >= 0.0 && (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut) ) + if ( j >= n || BOA_jk < 0.0 || (bo_ij->BO * bo_jk->BO) < 0.00001 ) +// if ( j >= n || BOA_jk < 0.0 || (bo_ij->BO * bo_jk->BO) < SQR(control->thb_cut) ) { - thbh = &d_thbh[ - index_thbp(type_i, type_j, type_k, num_atom_types) ]; + continue; + } - for ( cnt = 0; cnt < thbh->cnt; ++cnt ) - { - /* valence angle does not exist in the force field */ - if ( FABS(thbh->prm[cnt].p_val1) < 0.001 ) - { - continue; - } + thbh = &d_thbh[ + index_thbp(type_i, type_j, type_k, num_atom_types) ]; - thbp = &thbh->prm[cnt]; - - /* calculate valence angle energy */ - p_val1 = thbp->p_val1; - p_val2 = thbp->p_val2; - p_val4 = thbp->p_val4; - p_val7 = thbp->p_val7; - theta_00 = thbp->theta_00; - - exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); - f7_ij = 1.0 - exp3ij; - Cf7ij = p_val3 * p_val4 - * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; - - exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); - f7_jk = 1.0 - exp3jk; - Cf7jk = p_val3 * p_val4 - * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; - - expval7 = EXP( -p_val7 * workspace.Delta_boc[j] ); - trm8 = 1.0 + expval6 + expval7; - f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); - Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) - * (p_val6 * expval6 * trm8 - - (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7) ); - - theta_0 = 180.0 - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); - theta_0 = DEG2RAD( theta_0 ); - - expval2theta = p_val1 * EXP(-p_val2 * SQR(theta_0 - theta)); - if ( p_val1 >= 0.0 ) - { - expval12theta = p_val1 - expval2theta; - } - /* to avoid linear Me-H-Me angles (6/6/06) */ - else - { - expval12theta = -expval2theta; - } + for ( cnt = 0; cnt < thbh->cnt; ++cnt ) + { + /* valence angle does not exist in the force field */ + if ( FABS(thbh->prm[cnt].p_val1) < 0.001 ) + { + continue; + } - CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; - CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; - CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; - CEval4 = 2.0 * p_val2 * f7_ij * f7_jk * f8_Dj - * expval2theta * (theta_0 - theta); + thbp = &thbh->prm[cnt]; + + /* calculate valence angle energy */ + p_val1 = thbp->p_val1; + p_val2 = thbp->p_val2; + p_val4 = thbp->p_val4; + p_val7 = thbp->p_val7; + theta_00 = thbp->theta_00; + + exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); + f7_ij = 1.0 - exp3ij; + Cf7ij = p_val3 * p_val4 + * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; + + exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); + f7_jk = 1.0 - exp3jk; + Cf7jk = p_val3 * p_val4 + * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; + + expval7 = EXP( -p_val7 * workspace.Delta_boc[j] ); + trm8 = 1.0 + expval6 + expval7; + f8_Dj = p_val5 - (p_val5 - 1.0) * (2.0 + expval6) / trm8; + Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) + * (p_val6 * expval6 * trm8 + - (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7) ); + + theta_0 = 180.0 - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); + theta_0 = DEG2RAD( theta_0 ); + + expval2theta = p_val1 * EXP(-p_val2 * SQR(theta_0 - theta)); + if ( p_val1 >= 0.0 ) + { + expval12theta = p_val1 - expval2theta; + } + /* to avoid linear Me-H-Me angles (6/6/06) */ + else + { + expval12theta = -expval2theta; + } - Ctheta_0 = p_val10 * DEG2RAD(theta_00) - * EXP( -p_val10 * (2.0 - SBO2) ); + CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; + CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; + CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; + CEval4 = 2.0 * p_val2 * f7_ij * f7_jk * f8_Dj + * expval2theta * (theta_0 - theta); - CEval5 = CEval4 * Ctheta_0 * CSBO2; - CEval6 = CEval5 * dSBO1; - CEval7 = CEval5 * dSBO2; - CEval8 = CEval4 / sin_theta; + Ctheta_0 = p_val10 * DEG2RAD(theta_00) + * EXP( -p_val10 * (2.0 - SBO2) ); - if ( pk < pi ) - { - e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; - data_e_ang[j] += e_ang; - } + CEval5 = CEval4 * Ctheta_0 * CSBO2; + CEval6 = CEval5 * dSBO1; + CEval7 = CEval5 * dSBO2; + CEval8 = CEval4 / sin_theta; - /* calculate penalty for double bonds in valency angles */ - p_pen1 = thbp->p_pen1; - - exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); - exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); - exp_pen3 = EXP( -p_pen3 * workspace.Delta[j] ); - exp_pen4 = EXP( p_pen4 * workspace.Delta[j] ); - trm_pen34 = 1.0 + exp_pen3 + exp_pen4; - f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; - Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 - + p_pen4 * exp_pen4 )) / SQR( trm_pen34 ); - /* very important: since each kernel generates all interactions, - * need to prevent all energies becoming duplicates */ - if ( pk < pi ) - { - e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; - data_e_pen[j] += e_pen; - } + if ( pk < pi ) + { + e_ang_l += f7_ij * f7_jk * f8_Dj * expval12theta; + } - CEpen1 = e_pen * Cf9j / f9_Dj; - temp = -2.0 * p_pen2 * e_pen; - CEpen2 = temp * (BOA_ij - 2.0); - CEpen3 = temp * (BOA_jk - 2.0); - - /* calculate valency angle conjugation energy */ - p_coa1 = thbp->p_coa1; - - exp_coa2 = EXP( p_coa2 * workspace.Delta_boc[j] ); - e_coa = p_coa1 - * EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) - * EXP( -p_coa4 * SQR(BOA_jk - 1.5) ) - * EXP( -p_coa3 * SQR(workspace.total_bond_order[i] - BOA_ij) ) - * EXP( -p_coa3 * SQR(workspace.total_bond_order[k] - BOA_jk) ) - / (1.0 + exp_coa2); - /* similar to above comment regarding if statement */ - if ( pk < pi ) - { - data_e_coa[j] += e_coa; - } + /* calculate penalty for double bonds in valency angles */ + p_pen1 = thbp->p_pen1; + + exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); + exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); + exp_pen3 = EXP( -p_pen3 * workspace.Delta[j] ); + exp_pen4 = EXP( p_pen4 * workspace.Delta[j] ); + trm_pen34 = 1.0 + exp_pen3 + exp_pen4; + f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; + Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 + - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + + p_pen4 * exp_pen4 )) / SQR( trm_pen34 ); + + /* very important: since each kernel generates all interactions, + * need to prevent all energies becoming duplicates */ + if ( pk < pi ) + { + e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; + e_pen_l += e_pen; + } - CEcoa1 = -2.0 * p_coa4 * (BOA_ij - 1.5) * e_coa; - CEcoa2 = -2.0 * p_coa4 * (BOA_jk - 1.5) * e_coa; - CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1.0 + exp_coa2); - CEcoa4 = -2.0 * p_coa3 * (workspace.total_bond_order[i] - BOA_ij) * e_coa; - CEcoa5 = -2.0 * p_coa3 * (workspace.total_bond_order[k] - BOA_jk) * e_coa; + CEpen1 = e_pen * Cf9j / f9_Dj; + temp = -2.0 * p_pen2 * e_pen; + CEpen2 = temp * (BOA_ij - 2.0); + CEpen3 = temp * (BOA_jk - 2.0); + + /* calculate valency angle conjugation energy */ + p_coa1 = thbp->p_coa1; + + exp_coa2 = EXP( p_coa2 * workspace.Delta_boc[j] ); + e_coa = p_coa1 + * EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) + * EXP( -p_coa4 * SQR(BOA_jk - 1.5) ) + * EXP( -p_coa3 * SQR(workspace.total_bond_order[i] - BOA_ij) ) + * EXP( -p_coa3 * SQR(workspace.total_bond_order[k] - BOA_jk) ) + / (1.0 + exp_coa2); + /* similar to above comment regarding if statement */ + if ( pk < pi ) + { + e_coa_l += e_coa; + } - /* calculate force contributions */ - /* we must again check for pk < pi for entire forces part */ - if ( pk < pi ) - { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); - workspace.CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); - pbond_ij->va_CdDelta += CEcoa4; - pbond_jk->va_CdDelta += CEcoa5; -#else - atomicAdd( &bo_ij->Cdbo, CEval1 + CEpen2 + (CEcoa1 - CEcoa4) ); - atomicAdd( &bo_jk->Cdbo, CEval2 + CEpen3 + (CEcoa2 - CEcoa5) ); - atomicAdd( &workspace.CdDelta[j], (CEval3 + CEval7) + CEpen1 + CEcoa3 ); - atomicAdd( &workspace.CdDelta[i], CEcoa4 ); - atomicAdd( &workspace.CdDelta[k], CEcoa5 ); -#endif + CEcoa1 = -2.0 * p_coa4 * (BOA_ij - 1.5) * e_coa; + CEcoa2 = -2.0 * p_coa4 * (BOA_jk - 1.5) * e_coa; + CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1.0 + exp_coa2); + CEcoa4 = -2.0 * p_coa3 * (workspace.total_bond_order[i] - BOA_ij) * e_coa; + CEcoa5 = -2.0 * p_coa3 * (workspace.total_bond_order[k] - BOA_jk) * e_coa; - for ( t = start_j; t < end_j; ++t ) - { - pbond_jt = &bond_list.bond_list[t]; - bo_jt = &pbond_jt->bo_data; - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) - bo_jt->Cdbo += (CEval6 * pBOjt7); - bo_jt->Cdbopi += CEval5; - bo_jt->Cdbopi2 += CEval5; -#else - atomicAdd( &bo_jt->Cdbo, CEval6 * pBOjt7 ); - atomicAdd( &bo_jt->Cdbopi, CEval5 ); - atomicAdd( &bo_jt->Cdbopi2, CEval5 ); -#endif - } - - if ( control->virial == 0 ) - { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) - rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk ); -#else - atomic_rvecScaledAdd( workspace.f[i], CEval8, p_ijk->dcos_di ); - atomic_rvecScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); - atomic_rvecScaledAdd( workspace.f[k], CEval8, p_ijk->dcos_dk ); -#endif - } - else - { -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) - /* terms not related to bond order derivatives are - * added directly into forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - rvec_Add( pbond_ij->va_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( my_ext_press[j], ext_press ); - - rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, CEval8, p_ijk->dcos_dk ); - rvec_Add( pbond_jk->va_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( my_ext_press[j], ext_press ); + /* calculate force contributions */ + /* we must again check for pk < pi for entire forces part */ + if ( pk < pi ) + { +#if !defined(CUDA_ACCUM_ATOMIC) + bo_ij->Cdbo += CEval1 + CEpen2 + (CEcoa1 - CEcoa4); + bo_jk->Cdbo += CEval2 + CEpen3 + (CEcoa2 - CEcoa5); + workspace.CdDelta[j] += (CEval3 + CEval7) + CEpen1 + CEcoa3; + pbond_ij->va_CdDelta += CEcoa4; + pbond_jk->va_CdDelta += CEcoa5; #else - /* terms not related to bond order derivatives are - * added directly into forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - atomic_rvecAdd( workspace.f[i], force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( my_ext_press[j], ext_press ); - - rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, CEval8, p_ijk->dcos_dk ); - atomic_rvecAdd( workspace.f[k], force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( my_ext_press[j], ext_press ); + atomicAdd( &bo_ij->Cdbo, CEval1 + CEpen2 + (CEcoa1 - CEcoa4) ); + atomicAdd( &bo_jk->Cdbo, CEval2 + CEpen3 + (CEcoa2 - CEcoa5) ); + atomicAdd( &workspace.CdDelta[j], (CEval3 + CEval7) + CEpen1 + CEcoa3 ); + atomicAdd( &workspace.CdDelta[i], CEcoa4 ); + atomicAdd( &workspace.CdDelta[k], CEcoa5 ); #endif - } - } - -#if defined(TEST_ENERGY) - /*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n", - p_val3, p_val4, BOA_ij, BOA_jk ); - fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n", - workspace.Delta_e[j], workspace.vlpex[j], - dSBO1, dSBO2, vlpadj ); - fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n", - f7_ij, f7_jk, f8_Dj, expval12theta ); - fprintf( out_control->eval, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEval1, CEval2, CEval3, CEval4, - CEval5, CEval6, CEval7, CEval8 ); - - fprintf( out_control->eval, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta, - p_ijk->dcos_di[2]/sin_theta, - p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta, - p_ijk->dcos_dj[2]/sin_theta, - p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta, - p_ijk->dcos_dk[2]/sin_theta); - - fprintf( out_control->eval, - "%6d%6d%6d%15.8f%15.8f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), e_ang );*/ - - fprintf( out_control->eval, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), theta_0, BOA_ij, BOA_jk, - e_ang, data->my_en.e_ang ); - - fprintf( out_control->epen, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), BOA_ij, BOA_jk, e_pen, - data->my_en.e_pen ); - - fprintf( out_control->ecoa, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), BOA_ij, BOA_jk, - e_coa, data->my_en.e_coa ); -#endif - -#if defined(TEST_FORCES) - /* angle forces */ - Add_dBO( system, lists, j, pi, CEval1, workspace.f_ang ); - Add_dBO( system, lists, j, pk, CEval2, workspace.f_ang ); - Add_dDelta( system, lists, j, - CEval3 + CEval7, workspace.f_ang ); - for( t = start_j; t < end_j; ++t ) + for ( t = start_j; t < end_j; ++t ) { pbond_jt = &bond_list.bond_list[t]; bo_jt = &pbond_jt->bo_data; @@ -512,33 +391,61 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, temp = CUBE( temp_bo_jt ); pBOjt7 = temp * temp * temp_bo_jt; - Add_dBO( system, lists, j, t, pBOjt7 * CEval6, - workspace.f_ang ); - Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5, - workspace.f_ang, workspace.f_ang ); +#if !defined(CUDA_ACCUM_ATOMIC) + bo_jt->Cdbo += (CEval6 * pBOjt7); + bo_jt->Cdbopi += CEval5; + bo_jt->Cdbopi2 += CEval5; +#else + atomicAdd( &bo_jt->Cdbo, CEval6 * pBOjt7 ); + atomicAdd( &bo_jt->Cdbopi, CEval5 ); + atomicAdd( &bo_jt->Cdbopi2, CEval5 ); +#endif } - rvec_ScaledAdd( workspace.f_ang[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace.f_ang[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace.f_ang[k], CEval8, p_ijk->dcos_dk ); - /* end angle forces */ - - /* penalty forces */ - Add_dDelta( system, lists, j, CEpen1, workspace.f_pen ); - Add_dBO( system, lists, j, pi, CEpen2, workspace.f_pen ); - Add_dBO( system, lists, j, pk, CEpen3, workspace.f_pen ); - /* end penalty forces */ - - /* coalition forces */ - Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4, - workspace.f_coa ); - Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5, - workspace.f_coa ); - Add_dDelta( system, lists, j, CEcoa3, workspace.f_coa ); - Add_dDelta( system, lists, i, CEcoa4, workspace.f_coa ); - Add_dDelta( system, lists, k, CEcoa5, workspace.f_coa ); - /* end coalition forces */ + if ( control->virial == 0 ) + { +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk ); +#else + atomic_rvecScaledAdd( workspace.f[i], CEval8, p_ijk->dcos_di ); + atomic_rvecScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + atomic_rvecScaledAdd( workspace.f[k], CEval8, p_ijk->dcos_dk ); #endif + } + else + { +#if !defined(CUDA_ACCUM_ATOMIC) + /* terms not related to bond order derivatives are + * added directly into forces and pressure vector/tensor */ + rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_di ); + rvec_Add( pbond_ij->va_f, rvec_temp ); + rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp ); + rvec_Add( ext_press_l, rvec_temp ); + + rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + + rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk ); + rvec_Add( pbond_jk->va_f, rvec_temp ); + rvec_iMultiply( rvec_temp, pbond_jk->rel_box, rvec_temp ); + rvec_Add( ext_press_l, rvec_temp ); +#else + /* terms not related to bond order derivatives are + * added directly into forces and pressure vector/tensor */ + rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_di ); + atomic_rvecAdd( workspace.f[i], rvec_temp ); + rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp ); + rvec_Add( ext_press_l, rvec_temp ); + + rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj ); + + rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk ); + atomic_rvecAdd( workspace.f[k], rvec_temp ); + rvec_iMultiply( rvec_temp, pbond_jk->rel_box, rvec_temp ); + rvec_Add( ext_press_l, rvec_temp ); +#endif + } } } } @@ -546,10 +453,22 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms, Set_End_Index( pi, num_thb_intrs, &thb_list ); } + +#if !defined(CUDA_ACCUM_ATOMIC) + e_ang_g[j] = e_ang_l; + e_coa_g[j] = e_coa_l; + e_pen_g[j] = e_pen_l; + rvec_Copy( ext_press_g[j], ext_press_l ); +#else + atomicAdd( (double *) e_ang_g, (double) e_ang_l ); + atomicAdd( (double *) e_coa_g, (double) e_coa_l ); + atomicAdd( (double *) e_pen_g, (double) e_pen_l ); + atomic_rvecAdd( *ext_press_g, ext_press_l ); +#endif } -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void Cuda_Valence_Angles_Part2( reax_atom *atoms, control_params *control, storage workspace, reax_list bond_list, int N ) diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.h b/PG-PuReMD/src/cuda/cuda_valence_angles.h index d91387d2..ae30aa71 100644 --- a/PG-PuReMD/src/cuda/cuda_valence_angles.h +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.h @@ -32,7 +32,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *, global_parameters, storage, reax_list, reax_list, int, int, int, real *, real *, real *, rvec *); -#if !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if !defined(CUDA_ACCUM_ATOMIC) CUDA_GLOBAL void Cuda_Valence_Angles_Part2 ( reax_atom *, control_params *, storage , reax_list, int ); #endif @@ -42,8 +42,8 @@ CUDA_GLOBAL void Cuda_Estimate_Valence_Angles( reax_atom *, control_params *, /* calculates the theta angle between atom triplet i-j-k */ -CUDA_DEVICE static inline void Calculate_Theta( const rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, - real * const theta, real * const cos_theta ) +CUDA_DEVICE static inline void Calculate_Theta( const rvec dvec_ji, real d_ji, + const rvec dvec_jk, real d_jk, real * const theta, real * const cos_theta ) { assert( d_ji > 0.0 ); assert( d_jk > 0.0 ); @@ -64,8 +64,9 @@ CUDA_DEVICE static inline void Calculate_Theta( const rvec dvec_ji, real d_ji, r /* calculates the derivative of the cosine of the angle between atom triplet i-j-k */ -CUDA_DEVICE static inline void Calculate_dCos_Theta( const rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, - rvec * const dcos_theta_di, rvec * const dcos_theta_dj, rvec * const dcos_theta_dk ) +CUDA_DEVICE static inline void Calculate_dCos_Theta( const rvec dvec_ji, + real d_ji, const rvec dvec_jk, real d_jk, rvec * const dcos_theta_di, + rvec * const dcos_theta_dj, rvec * const dcos_theta_dk ) { int t; real sqr_d_ji, sqr_d_jk, inv_dists, inv_dists3, dot_dvecs, Cdot_inv3; diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h index ef2fbaa1..b2936f81 100644 --- a/PG-PuReMD/src/reax_types.h +++ b/PG-PuReMD/src/reax_types.h @@ -82,8 +82,8 @@ #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT #define CUDA_DEVICE_PACK #endif -/* aggregate atomic forces using atomic operations */ -#define CUDA_ACCUM_FORCE_ATOMIC +/* aggregate atomic energies and forces using atomic operations */ +#define CUDA_ACCUM_ATOMIC /* disable assertions if NOT compiling with debug support -- * the definition (or lack thereof) controls how the assert macro is defined */ @@ -131,7 +131,7 @@ #if defined(USE_REF_FORTRAN_REAXFF_CONSTANTS) /* transcendental constant pi */ - #define PI (3.14159265) +// #define PI (3.14159265) /* unit conversion from ??? to kcal / mol */ #define C_ELE (332.0638) /* Boltzmann constant, AMU * A^2 / (ps^2 * K) */ @@ -1933,7 +1933,7 @@ struct hbond_data int scl; /* position of neighbor in far neighbor list */ int ptr; -#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_ATOMIC) /**/ int sym_index; /**/ @@ -2035,7 +2035,7 @@ struct bond_data rvec dvec; /* bond order data */ bond_order_data bo_data; -#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_FORCE_ATOMIC) +#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_ATOMIC) /**/ real ae_CdDelta; /**/ -- GitLab