From 53d6e789468f1a8fea152c3ff33c9cc2c0eafa78 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Tue, 20 Apr 2021 16:06:34 -0400 Subject: [PATCH] PG-PuReMD: use a warp of threads for hydrogen bonds. --- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu | 521 +++++++++------------- 1 file changed, 203 insertions(+), 318 deletions(-) diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index 88763478..1196c9a0 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -51,9 +51,7 @@ CUDA_GLOBAL void k_hydrogen_bonds_part1( reax_atom *my_atoms, single_body_parame real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; rvec dvec_jk; -#if defined(CUDA_ACCUM_ATOMIC) - rvec f_j_l; -#endif + rvec f_j_l, f_k_l; hbond_parameters *hbp; bond_order_data *bo_ij; bond_data *pbond_ij; @@ -66,11 +64,6 @@ CUDA_GLOBAL void k_hydrogen_bonds_part1( reax_atom *my_atoms, single_body_parame return; } - e_hb_l = 0.0; -#if defined(CUDA_ACCUM_ATOMIC) - rvec_MakeZero( f_j_l ); -#endif - /* discover the Hydrogen bonds between i-j-k triplets. * here j is H atom and there has to be some bond between i and j. * Hydrogen bond is between j and k. @@ -88,6 +81,8 @@ CUDA_GLOBAL void k_hydrogen_bonds_part1( reax_atom *my_atoms, single_body_parame hb_start_j = Start_Index( my_atoms[j].Hindex, &hbond_list ); hb_end_j = End_Index( my_atoms[j].Hindex, &hbond_list ); top = 0; + e_hb_l = 0.0; + rvec_MakeZero( f_j_l ); if ( Num_Entries( j, &bond_list ) > hblist_size ) { @@ -120,13 +115,11 @@ CUDA_GLOBAL void k_hydrogen_bonds_part1( reax_atom *my_atoms, single_body_parame nbr_jk = phbond_jk->ptr; r_jk = far_nbr_list.far_nbr_list.d[nbr_jk]; + rvec_MakeZero( f_k_l ); + rvec_Scale( dvec_jk, hbond_list.hbond_list[pk].scl, far_nbr_list.far_nbr_list.dvec[nbr_jk] ); -#if !defined(CUDA_ACCUM_ATOMIC) - rvec_MakeZero( phbond_jk->hb_f ); -#endif - /* find matching hbond to atoms j and k */ for ( itr = 0; itr < top; ++itr ) { @@ -169,43 +162,201 @@ CUDA_GLOBAL void k_hydrogen_bonds_part1( reax_atom *my_atoms, single_body_parame /* dbo term */ bo_ij->Cdbo += CEhb1; -#if !defined(CUDA_ACCUM_ATOMIC) /* dcos terms */ +#if !defined(CUDA_ACCUM_ATOMIC) rvec_ScaledAdd( pbond_ij->hb_f, CEhb2, dcos_theta_di ); - rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk ); - - /* dr terms */ - rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); - rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk ); #else - /* dcos terms */ atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); +#endif rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); - atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk ); + rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk ); /* dr terms */ - rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); - atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk ); -#endif + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk ); } } + +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_Copy( phbond_jk->hb_f, f_k_l ); +#else + atomic_rvecAdd( workspace.f[k], f_k_l ); +#endif } if ( hblist != NULL ) { free( hblist ); } + +#if !defined(CUDA_ACCUM_ATOMIC) + /* write conflicts for accumulating partial forces resolved by subsequent kernels */ + rvecCopy( workspace.f[j], f_j_l ); + e_hb_g[j] = e_hb_l; +#else + atomic_rvecAdd( workspace.f[j], f_j_l ); + atomicAdd( (double *) e_hb_g, (double) e_hb_l ); +#endif } +} + + +/* one thread per atom implementation */ +CUDA_GLOBAL void k_hydrogen_bonds_part1_opt( reax_atom *my_atoms, single_body_parameters *sbp, + hbond_parameters *d_hbp, global_parameters gp, + control_params *control, storage workspace, + reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, int n, + int num_atom_types, real *e_hb_g ) +{ + extern __shared__ cub::WarpReduce<double>::TempStorage temp_d[]; + int i, j, k, pi, pk, thread_id, lane_id, itr; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int nbr_jk; + real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk; + rvec f_j_l, f_k_l; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + hbond_data *phbond_jk; + + thread_id = blockIdx.x * blockDim.x + threadIdx.x; + j = thread_id / warpSize; + + if ( j >= n ) + { + return; + } + + lane_id = thread_id % warpSize; + + /* discover the Hydrogen bonds between i-j-k triplets. + * here j is H atom and there has to be some bond between i and j. + * Hydrogen bond is between j and k. + * so in this function i->X, j->H, k->Z when we map + * variables onto the ones in the handout. */ + + /* j must be a hydrogen atom */ + if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM ) + { + type_j = my_atoms[j].type; + start_j = Start_Index( j, &bond_list ); + end_j = End_Index( j, &bond_list ); + hb_start_j = Start_Index( my_atoms[j].Hindex, &hbond_list ); + hb_end_j = End_Index( my_atoms[j].Hindex, &hbond_list ); + e_hb_l = 0.0; + rvec_MakeZero( f_j_l ); + + /* for each hbond of atom j */ + for ( pk = hb_start_j; pk < hb_end_j; ++pk ) + { + phbond_jk = &hbond_list.hbond_list[pk]; + k = phbond_jk->nbr; + type_k = my_atoms[k].type; + nbr_jk = phbond_jk->ptr; + r_jk = far_nbr_list.far_nbr_list.d[nbr_jk]; + + rvec_MakeZero( f_k_l ); + + rvec_Scale( dvec_jk, hbond_list.hbond_list[pk].scl, + far_nbr_list.far_nbr_list.dvec[nbr_jk] ); + + /* search bonded atoms i to atom j (hydrogen atom) for potential hydrogen bonding */ + for ( itr = 0, pi = start_j + lane_id; itr < (end_j - start_j + warpSize - 1) / warpSize; ++itr ) + { + if ( pi < end_j ) + { + pbond_ij = &bond_list.bond_list[pi]; + i = pbond_ij->nbr; + bo_ij = &pbond_ij->bo_data; + type_i = my_atoms[i].type; + if ( sbp[type_i].p_hbond == H_BONDING_ATOM + && bo_ij->BO >= HB_THRESHOLD + && my_atoms[i].orig_id != my_atoms[k].orig_id ) + { + hbp = &d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + + /* the derivative of cos(theta) */ + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk ); + + /* hydrogen bond energy */ + sin_theta2 = SIN( theta / 2.0 ); + sin_xhz4 = SQR( sin_theta2 ); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -1.0 * hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -1.0 * hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + e_hb_l += e_hb; + + CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; + CEhb2 = -0.5 * hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = hbp->p_hb3 * e_hb * (hbp->r0_hb / SQR( r_jk ) + + -1.0 / hbp->r0_hb); + + /* hydrogen bond forces */ + /* dbo term */ + bo_ij->Cdbo += CEhb1; + + /* dcos terms */ #if !defined(CUDA_ACCUM_ATOMIC) - /* write conflicts for accumulating partial forces resolved by subsequent kernels */ - rvecCopy( workspace.f[j], f_j_l ); - e_hb_g[j] = e_hb_l; + rvec_ScaledAdd( pbond_ij->hb_f, CEhb2, dcos_theta_di ); #else - atomic_rvecAdd( workspace.f[j], f_j_l ); - atomicAdd( (double *) e_hb_g, (double) e_hb_l ); + atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); +#endif + rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk ); + + /* dr terms */ + rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); + rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk ); + } + } + + pi += warpSize; + } + + f_k_l[0] = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(f_k_l[0]); + f_k_l[1] = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(f_k_l[1]); + f_k_l[2] = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(f_k_l[2]); + + if ( lane_id == 0 ) + { +#if !defined(CUDA_ACCUM_ATOMIC) + rvec_Copy( phbond_jk->hb_f, f_k_l ); +#else + atomic_rvecAdd( workspace.f[k], f_k_l ); #endif + } + } + + f_j_l[0] = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(f_j_l[0]); + f_j_l[1] = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(f_j_l[1]); + f_j_l[2] = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(f_j_l[2]); + e_hb_l = cub::WarpReduce<double>(temp_d[j % (blockDim.x / warpSize)]).Sum(e_hb_l); + + if ( lane_id == 0 ) + { +#if !defined(CUDA_ACCUM_ATOMIC) + /* write conflicts for accumulating partial forces resolved by subsequent kernels */ + rvecCopy( workspace.f[j], f_j_l ); + e_hb_g[j] = e_hb_l; +#else + atomic_rvecAdd( workspace.f[j], f_j_l ); + atomicAdd( (double *) e_hb_g, (double) e_hb_l ); +#endif + } + } } @@ -420,286 +571,6 @@ CUDA_GLOBAL void k_hydrogen_bonds_virial_part1( reax_atom *my_atoms, single_body } -/* one warp of threads per atom implementation */ -//CUDA_GLOBAL void k_hydrogen_bonds_part1_opt( reax_atom *my_atoms, single_body_parameters *sbp, -// hbond_parameters *d_hbp, global_parameters gp, control_params *control, storage workspace, -// reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, int n, -// int num_atom_types, real *e_hb_g, rvec *ext_press_g ) -//{ -// typedef cub::WarpReduce<double> WarpReduce; -// extern __shared__ typename cub::WarpReduce::TempStorage temp_storage[]; -// int i, j, k, pi, pk; -// int type_i, type_j, type_k; -// int start_j, end_j, hb_start_j, hb_end_j; -// //TODO: re-write and remove -// int hblist[30]; -// int itr, top; -// int loopcount, count; -// ivec rel_jk; -// real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; -// real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb1_l, CEhb2, CEhb3; -// rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; -// rvec dvec_jk, temp, f_j_l, ext_press_l; -//#if !defined(CUDA_ACCUM_ATOMIC) -// rvec hb_f_l; -//#endif -// double dtemp; -// hbond_parameters *hbp; -// bond_order_data *bo_ij; -// int nbr_jk; -// bond_data *pbond_ij; -// hbond_data *phbond_jk; -// /* thread-local variables */ -// int thread_id, warp_id, lane_id; -// -// thread_id = blockIdx.x * blockDim.x + threadIdx.x; -// warp_id = thread_id >> 5; -// lane_id = thread_id & 0x0000001F; -// -// if ( warp_id >= n ) -// { -// return; -// } -// -// j = warp_id; // group of threads assigned to atom j -// -// /* discover the Hydrogen bonds between i-j-k triplets. -// * here j is H atom and there has to be some bond between i and j. -// * Hydrogen bond is between j and k. -// * so in this function i->X, j->H, k->Z when we map -// * variables onto the ones in the handout.*/ -// e_hb_l = 0.0; -// rvec_MakeZero( f_j_l ); -// rvec_MakeZero( ext_press_l ); -// -// /* j has to be of type H */ -// if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM ) -// { -// type_j = my_atoms[j].type; -// start_j = Start_Index( j, &bond_list ); -// end_j = End_Index( j, &bond_list ); -// hb_start_j = Start_Index( my_atoms[j].Hindex, &hbond_list ); -// hb_end_j = End_Index( my_atoms[j].Hindex, &hbond_list ); -// top = 0; -// -// /* search bonded atoms i to atom j (hydrogen atom) for potential hydrogen bonding */ -// for ( pi = start_j; pi < end_j; ++pi ) -// { -// pbond_ij = &bond_list.bond_list[pi]; -// i = pbond_ij->nbr; -// bo_ij = &pbond_ij->bo_data; -// type_i = my_atoms[i].type; -// -// if ( sbp[type_i].p_hbond == H_BONDING_ATOM -// && bo_ij->BO >= HB_THRESHOLD ) -// { -// hblist[top] = pi; -// ++top; -// } -// } -// -// /* find matching hbond to atoms j and k */ -// for ( itr = 0; itr < top; ++itr ) -// { -// pi = hblist[itr]; -// pbond_ij = &bond_list.bond_list[pi]; -// i = pbond_ij->nbr; -//#if !defined(CUDA_ACCUM_ATOMIC) -// rvec_MakeZero( hb_f_l ); -//#endif -// CEhb1_l = 0.0; -// -// //for( pk = hb_start_j; pk < hb_end_j; ++pk ) { -// loopcount = (hb_end_j - hb_start_j) / warpSize + -// (((hb_end_j - hb_start_j) % warpSize == 0) ? 0 : 1); -// -// count = 0; -// pk = hb_start_j + lane_id; -// while ( count < loopcount ) -// { -// /* only allow threads with an actual hbond */ -// if ( pk < hb_end_j ) -// { -// phbond_jk = &hbond_list.hbond_list[pk]; -// -// /* set k's varibles */ -// k = hbond_list.hbond_list[pk].nbr; -// type_k = my_atoms[k].type; -// nbr_jk = hbond_list.hbond_list[pk].ptr; -// r_jk = far_nbr_list.far_nbr_list.d[nbr_jk]; -// -// rvec_Scale( dvec_jk, phbond_jk->scl, -// far_nbr_list.far_nbr_list.dvec[nbr_jk] ); -// } -// else -// { -// k = -1; -// } -// -// if ( my_atoms[i].orig_id != my_atoms[k].orig_id && k != -1 ) -// { -// bo_ij = &pbond_ij->bo_data; -// type_i = my_atoms[i].type; -// r_ij = pbond_ij->d; -// hbp = &d_hbp[ index_hbp(type_i,type_j,type_k,num_atom_types) ]; -// -// Calculate_Theta( pbond_ij->dvec, r_ij, dvec_jk, r_jk, -// &theta, &cos_theta ); -// -// /* the derivative of cos(theta) */ -// Calculate_dCos_Theta( pbond_ij->dvec, r_ij, dvec_jk, r_jk, -// &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk ); -// -// /* hydrogen bond energy */ -// sin_theta2 = SIN( theta / 2.0 ); -// sin_xhz4 = SQR( sin_theta2 ); -// sin_xhz4 *= sin_xhz4; -// cos_xhz1 = ( 1.0 - cos_theta ); -// exp_hb2 = EXP( -1.0 * hbp->p_hb2 * bo_ij->BO ); -// exp_hb3 = EXP( -1.0 * hbp->p_hb3 * ( hbp->r0_hb / r_jk -// + r_jk / hbp->r0_hb - 2.0 ) ); -// -// e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; -// e_hb_l += e_hb; -// -// CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; -// CEhb2 = -0.5 * hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; -// CEhb3 = hbp->p_hb3 * e_hb * (hbp->r0_hb / SQR( r_jk ) -// + -1.0 / hbp->r0_hb); -// -// /* hydrogen bond forces */ -// /* dbo term */ -// CEhb1_l += CEhb1; -// -// if ( control->virial == 0 ) -// { -//#if !defined(CUDA_ACCUM_ATOMIC) -// /* dcos terms */ -// rvec_ScaledAdd( hb_f_l, CEhb2, dcos_theta_di ); -// rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); -// rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk ); -// -// /* dr terms */ -// rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); -// rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk ); -//#else -// /* dcos terms */ -// atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); -// rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); -// atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk ); -// -// /* dr terms */ -// rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); -// atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk ); -//#endif -// } -// else -// { -//#if !defined(CUDA_ACCUM_ATOMIC) -// /* for pressure coupling, terms that are not related to bond order -// * derivatives are added directly into pressure vector/tensor */ -// /* dcos terms */ -// rvec_Scale( temp, CEhb2, dcos_theta_di ); -// rvec_Add( pbond_ij->hb_f, temp ); -// rvec_iMultiply( temp, pbond_ij->rel_box, temp ); -// rvec_Add( ext_press_l, temp ); -// -// rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj ); -// -// ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, -// far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); -// rvec_Scale( temp, CEhb2, dcos_theta_dk ); -// rvec_Add( phbond_jk->hb_f, temp ); -// rvec_iMultiply( temp, rel_jk, temp ); -// rvec_Add( ext_press_l, temp ); -// -// /* dr terms */ -// rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); -// -// rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); -// rvec_Add( phbond_jk->hb_f, temp ); -// rvec_iMultiply( temp, rel_jk, temp ); -// rvec_Add( ext_press_l, temp ); -//#else -// /* for pressure coupling, terms that are not related to bond order -// * derivatives are added directly into pressure vector/tensor */ -// /* dcos terms */ -// rvec_Scale( temp, CEhb2, dcos_theta_di ); -// atomic_rvecAdd( workspace.f[i], temp ); -// rvec_iMultiply( temp, pbond_ij->rel_box, temp ); -// rvec_Add( ext_press_l, temp ); -// -// rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj ); -// -// ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl, -// far_nbr_list.far_nbr_list.rel_box[nbr_jk] ); -// rvec_Scale( temp, CEhb2, dcos_theta_dk ); -// atomic_rvecAdd( workspace.f[k], temp ); -// rvec_iMultiply( temp, rel_jk, temp ); -// rvec_Add( ext_press_l, temp ); -// -// /* dr terms */ -// rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); -// -// rvec_Scale( temp, CEhb3 / r_jk, dvec_jk ); -// atomic_rvecAdd( workspace.f[k], temp ); -// rvec_iMultiply( temp, rel_jk, temp ); -// rvec_Add( ext_press_l, temp ); -//#endif -// } -// -// } //orid id end -// -// pk += warpSize; -// count++; -// -// } //for itr loop end -// -// CEhb1_l = WarpReduce(temp_storage[warp_id]).Sum(CEhb1_l); -//#if !defined(CUDA_ACCUM_ATOMIC) -// hb_f_l[0] = WarpReduce(temp_storage[warp_id]).Sum(hb_f_l[0]); -// hb_f_l[1] = WarpReduce(temp_storage[warp_id]).Sum(hb_f_l[1]); -// hb_f_l[2] = WarpReduce(temp_storage[warp_id]).Sum(hb_f_l[2]); -//#endif -// } -// -// /* first thread within a warp writes warp-level sum to shared memory */ -// if ( lane_id == 0 ) -// { -// bo_ij->Cdbo += CEhb1_l ; -//#if !defined(CUDA_ACCUM_ATOMIC) -// rvec_Add( pbond_ij->hb_f, hb_f_l ); -//#endif -// } -// } // for loop hbonds end -// } //if Hbond check end -// -// __syncthreads( ); -// -// f_j_l[0] = WarpReduce(temp_storage[warp_id]).Sum(f_j_l[0]); -// f_j_l[1] = WarpReduce(temp_storage[warp_id]).Sum(f_j_l[1]); -// f_j_l[2] = WarpReduce(temp_storage[warp_id]).Sum(f_j_l[2]); -// e_hb_l = WarpReduce(temp_storage[warp_id]).Sum(e_hb_l); -// ext_press_l[0] = WarpReduce(temp_storage[warp_id]).Sum(ext_press_l[0]); -// ext_press_l[1] = WarpReduce(temp_storage[warp_id]).Sum(ext_press_l[1]); -// ext_press_l[2] = WarpReduce(temp_storage[warp_id]).Sum(ext_press_l[2]); -// -// /* first thread within a warp writes warp-level sums to global memory */ -// if ( lane_id == 0 ) -// { -//#if !defined(CUDA_ACCUM_ATOMIC) -// rvec_Add( workspace.f[j], f_j_l ); -// e_hb_g[j] = e_hb_l; -// rvecCopy( ext_press_g[j], ext_press_l ); -//#else -// atomic_rvecAdd( workspace.f[j], f_j_l ); -// atomicAdd( (double *) e_hb_g, (double) e_hb_l ); -// atomic_rvecAdd( *ext_press_g, ext_press_l ); -//#endif -// } -//} - - #if !defined(CUDA_ACCUM_ATOMIC) /* Accumulate forces stored in the bond list * using a one thread per atom implementation */ @@ -871,6 +742,7 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { + int blocks; // int hbs, hnbrs_blocks; #if !defined(CUDA_ACCUM_ATOMIC) int update_energy; @@ -893,14 +765,9 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, } #endif -// hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + -// (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); - if ( control->virial == 1 ) { k_hydrogen_bonds_virial_part1 <<< control->blocks, control->block_size >>> -// k_hydrogen_bonds_virial_part1_opt <<< hbs, HB_BLOCK_SIZE, -// sizeof(real) * (hbs / warpSize) >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_hbp, system->reax_param.d_gp, (control_params *) control->d_control_params, @@ -914,12 +781,30 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, &((simulation_data *)data->d_simulation_data)->my_ext_press #endif ); + cudaCheckError( ); } else { - k_hydrogen_bonds_part1 <<< control->blocks, control->block_size >>> -// k_hydrogen_bonds_part1_opt <<< hbs, HB_BLOCK_SIZE, -// sizeof(real) * (hbs / warpSize) >>> +// k_hydrogen_bonds_part1 <<< control->blocks, control->block_size >>> +// ( system->d_my_atoms, system->reax_param.d_sbp, +// system->reax_param.d_hbp, system->reax_param.d_gp, +// (control_params *) control->d_control_params, +// *(workspace->d_workspace), +// *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]), +// system->n, system->reax_param.num_atom_types, +//#if !defined(CUDA_ACCUM_ATOMIC) +// spad +//#else +// &((simulation_data *)data->d_simulation_data)->my_en.e_hb +//#endif +// ); +// cudaCheckError( ); + + blocks = system->n * 32 / DEF_BLOCK_SIZE + + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); + + k_hydrogen_bonds_part1_opt <<< blocks, DEF_BLOCK_SIZE, + sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_hbp, system->reax_param.d_gp, (control_params *) control->d_control_params, @@ -932,8 +817,8 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control, &((simulation_data *)data->d_simulation_data)->my_en.e_hb #endif ); + cudaCheckError( ); } - cudaCheckError( ); #if !defined(CUDA_ACCUM_ATOMIC) if ( update_energy == TRUE ) -- GitLab