From 74f5c1c6e074a467883883b19fa32b744dae781b Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Fri, 19 Feb 2021 17:04:37 -0500
Subject: [PATCH] PG-PuReMD: fix atomic force reductions and reduce the number
 of atomic operations.

---
 PG-PuReMD/src/cuda/cuda_forces.cu         |  13 +-
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu |  65 ++---
 PG-PuReMD/src/cuda/cuda_multi_body.cu     |  54 ----
 PG-PuReMD/src/cuda/cuda_nonbonded.cu      | 322 ++++++++++------------
 PG-PuReMD/src/cuda/cuda_nonbonded.h       |   4 +-
 PG-PuReMD/src/cuda/cuda_torsion_angles.cu |  30 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu |  13 +-
 7 files changed, 192 insertions(+), 309 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 6e59636a..e8895e60 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -2120,16 +2120,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 }
 
 
-void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, storage *workspace, 
-        reax_list **lists, output_controls *out_control,
-        mpi_datatypes *mpi_data )
-{
-    Cuda_NonBonded_Energy( system, control, workspace, data, lists, out_control );
-}
-
-
-void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
+static void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         reax_list **lists, mpi_datatypes *mpi_data )
 {
@@ -2237,7 +2228,7 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
 #endif
 
         Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
-                lists, out_control, mpi_data );
+                lists, out_control );
 
 #if defined(LOG_PERFORMANCE)
         cudaEventRecord( time_event[4] );
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index 075c9fc7..46cdc806 100644
--- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -51,7 +51,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, temp, ext_press_l;
 #if defined(CUDA_ACCUM_ATOMIC)
-    rvec f_i_l, f_j_l, f_k_l;
+    rvec f_j_l;
 #endif
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
@@ -69,9 +69,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
     e_hb_l = 0.0;
     rvec_MakeZero( ext_press_l );
 #if defined(CUDA_ACCUM_ATOMIC)
-    rvec_MakeZero( f_i_l );
     rvec_MakeZero( f_j_l );
-    rvec_MakeZero( f_k_l );
 #endif
 
     /* discover the Hydrogen bonds between i-j-k triplets.
@@ -185,13 +183,13 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk );
 #else
                         /* dcos terms */
-                        rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di );
+                        atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di );
                         rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
-                        rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk );
+                        atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk );
 
                         /* dr terms */
                         rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk );
-                        rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk );
+                        atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk );
 #endif
                     }
                     else
@@ -226,7 +224,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
                          * derivatives are added directly into pressure vector/tensor */
                         /* dcos terms */
                         rvec_Scale( temp, CEhb2, dcos_theta_di );
-                        rvec_Add( f_i_l, temp );
+                        atomic_rvecAdd( workspace.f[i], temp );
                         rvec_iMultiply( temp, pbond_ij->rel_box, temp );
                         rvec_Add( ext_press_l, temp );
 
@@ -235,7 +233,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
                         ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl,
                                 far_nbr_list.far_nbr_list.rel_box[nbr_jk] );
                         rvec_Scale( temp, CEhb2, dcos_theta_dk );
-                        rvec_Add( f_k_l, temp );
+                        atomic_rvecAdd( workspace.f[k], temp );
                         rvec_iMultiply( temp, rel_jk, temp );
                         rvec_Add( ext_press_l, temp );
 
@@ -243,7 +241,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
                         rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
 
                         rvec_Scale( temp, CEhb3 / r_jk, dvec_jk );
-                        rvec_Add( f_k_l, temp );
+                        atomic_rvecAdd( workspace.f[k], temp );
                         rvec_iMultiply( temp, rel_jk, temp );
                         rvec_Add( ext_press_l, temp );
 #endif
@@ -260,12 +258,11 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
 
 #if !defined(CUDA_ACCUM_ATOMIC)
     /* write conflicts for accumulating partial forces resolved by subsequent kernels */
+    rvecCopy( workspace.f[j], f_j_l );
     e_hb_g[j] = e_hb_l;
     rvecCopy( ext_press_g[j], ext_press_l );
 #else
-    atomic_rvecAdd( workspace.f[i], f_i_l );
     atomic_rvecAdd( workspace.f[j], f_j_l );
-    atomic_rvecAdd( workspace.f[k], f_k_l );
     atomicAdd( (double *) e_hb_g, (double) e_hb_l );
     atomic_rvecAdd( *ext_press_g, ext_press_l );
 #endif
@@ -289,11 +286,9 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
     real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb1_l, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-    rvec dvec_jk, temp, ext_press_l;
+    rvec dvec_jk, temp, f_j_l, ext_press_l;
 #if !defined(CUDA_ACCUM_ATOMIC)
-    rvec f_l, hb_f_l;
-#else
-    rvec f_i_l, f_j_l, f_k_l;
+    rvec hb_f_l;
 #endif
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
@@ -322,14 +317,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
      * so in this function i->X, j->H, k->Z when we map 
      * variables onto the ones in the handout.*/
     e_hb_l = 0.0;
-    rvec_MakeZero( ext_press_l );
-#if !defined(CUDA_ACCUM_ATOMIC)
-    rvec_MakeZero( f_l );
-#else
-    rvec_MakeZero( f_i_l );
     rvec_MakeZero( f_j_l );
-    rvec_MakeZero( f_k_l );
-#endif
+    rvec_MakeZero( ext_press_l );
 
     /* j has to be of type H */
     if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM )
@@ -435,21 +424,21 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
 #if !defined(CUDA_ACCUM_ATOMIC)
                         /* dcos terms */
                         rvec_ScaledAdd( hb_f_l, CEhb2, dcos_theta_di ); 
-                        rvec_ScaledAdd( f_l, CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk );
 
                         /* dr terms */
-                        rvec_ScaledAdd( f_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk );
 #else
                         /* dcos terms */
-                        rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di ); 
+                        atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di ); 
                         rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
-                        rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk );
+                        atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk );
 
                         /* dr terms */
                         rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
-                        rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk );
+                        atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk );
 #endif
                     }
                     else
@@ -484,7 +473,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
                          * derivatives are added directly into pressure vector/tensor */
                         /* dcos terms */
                         rvec_Scale( temp, CEhb2, dcos_theta_di );
-                        rvec_Add( f_i_l, temp );
+                        atomic_rvecAdd( workspace.f[i], temp );
                         rvec_iMultiply( temp, pbond_ij->rel_box, temp );
                         rvec_Add( ext_press_l, temp );
 
@@ -493,7 +482,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
                         ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl,
                                 far_nbr_list.far_nbr_list.rel_box[nbr_jk] );
                         rvec_Scale( temp, CEhb2, dcos_theta_dk );
-                        rvec_Add( f_k_l, temp );
+                        atomic_rvecAdd( workspace.f[k], temp );
                         rvec_iMultiply( temp, rel_jk, temp );
                         rvec_Add( ext_press_l, temp );
 
@@ -501,7 +490,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
                         rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
 
                         rvec_Scale( temp, CEhb3 / r_jk, dvec_jk );
-                        rvec_Add( f_k_l, temp );
+                        atomic_rvecAdd( workspace.f[k], temp );
                         rvec_iMultiply( temp, rel_jk, temp );
                         rvec_Add( ext_press_l, temp );
 #endif
@@ -539,21 +528,9 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
     /* warp-level sums using registers within a warp */
     for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
     {
-#if !defined(CUDA_ACCUM_ATOMIC)
-        f_l[0] += __shfl_down_sync( mask, f_l[0], offset );
-        f_l[1] += __shfl_down_sync( mask, f_l[1], offset );
-        f_l[2] += __shfl_down_sync( mask, f_l[2], offset );
-#else
-        f_i_l[0] += __shfl_down_sync( mask, f_i_l[0], offset );
-        f_i_l[1] += __shfl_down_sync( mask, f_i_l[1], offset );
-        f_i_l[2] += __shfl_down_sync( mask, f_i_l[2], offset );
         f_j_l[0] += __shfl_down_sync( mask, f_j_l[0], offset );
         f_j_l[1] += __shfl_down_sync( mask, f_j_l[1], offset );
         f_j_l[2] += __shfl_down_sync( mask, f_j_l[2], offset );
-        f_k_l[0] += __shfl_down_sync( mask, f_k_l[0], offset );
-        f_k_l[1] += __shfl_down_sync( mask, f_k_l[1], offset );
-        f_k_l[2] += __shfl_down_sync( mask, f_k_l[2], offset );
-#endif
         e_hb_l += __shfl_down_sync( mask, e_hb_l, offset );
         ext_press_l[0] += __shfl_down_sync( mask, ext_press_l[0], offset );
         ext_press_l[1] += __shfl_down_sync( mask, ext_press_l[1], offset );
@@ -564,13 +541,11 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
     if ( lane_id == 0 )
     {
 #if !defined(CUDA_ACCUM_ATOMIC)
-        rvec_Add( workspace.f[j], f_l );
+        rvec_Add( workspace.f[j], f_j_l );
         e_hb_g[j] = e_hb_l;
         rvecCopy( ext_press_g[j], ext_press_l );
 #else
-        atomic_rvecAdd( workspace.f[i], f_i_l );
         atomic_rvecAdd( workspace.f[j], f_j_l );
-        atomic_rvecAdd( workspace.f[k], f_k_l );
         atomicAdd( (double *) e_hb_g, (double) e_hb_l );
         atomic_rvecAdd( *ext_press_g, ext_press_l );
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
index b3b92fed..cabab1d4 100644
--- a/PG-PuReMD/src/cuda/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -202,11 +202,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
     workspace.CdDelta[i] += CEover3;   // OvCoor - 2nd term
     workspace.CdDelta[i] += CEunder3;  // UnCoor - 1st term
 
-#if defined(TEST_FORCES)
-    Add_dDelta( system, lists, i, CEover3, workspace.f_ov ); // OvCoor 2nd
-    Add_dDelta( system, lists, i, CEunder3, workspace.f_un ); // UnCoor 1st
-#endif
-
     for ( pj = Start_Index(i, &bond_list); pj < End_Index(i, &bond_list); ++pj )
     {
         pbond_ij = &bond_list.bond_list[pj];
@@ -240,55 +235,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
                 * workspace.Delta_lp_temp[j]);  // UnCoor-2b
         bo_ij->Cdbopi2 += CEunder4 * (workspace.Delta[j] - dfvl
                 * workspace.Delta_lp_temp[j]);  // UnCoor-2b
-
-#if defined(TEST_ENERGY)
-        /*      fprintf( out_control->eov, "%6d%12.6f\n", 
-              workspace.reverse_map[j], 
-        // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, 
-        CEover4 * (1.0 - workspace.dDelta_lp[j]) * 
-        (bo_ij->BO_pi + bo_ij->BO_pi2)
-         *///           /*CEover4 * (workspace.Delta[j]-workspace.Delta_lp[j])*/);
-        //      fprintf( out_control->eov, "%6d%12.6f\n", 
-        //      fprintf( out_control->eov, "%6d%24.15e\n", 
-        //           system->my_atoms[j].orig_id, 
-        // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, 
-        //           CEover4 * (1.0 - workspace.dDelta_lp[j]) * 
-        //           (bo_ij->BO_pi + bo_ij->BO_pi2)
-        //           /*CEover4 * (workspace.Delta[j]-workspace.Delta_lp[j])*/);
-
-        // CEunder4 * (1.0 - workspace.dDelta_lp[j]) * 
-        // (bo_ij->BO_pi + bo_ij->BO_pi2),
-        // CEunder4 * (workspace.Delta[j] - workspace.Delta_lp[j]) );
-#endif
-
-#if defined(TEST_FORCES)
-        Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s,
-                workspace.f_ov ); // OvCoor - 1st term
-
-        Add_dDelta( system, lists, j,
-                CEover4 * (1.0 - dfvl*workspace.dDelta_lp[j]) * 
-                (bo_ij->BO_pi + bo_ij->BO_pi2),
-                workspace.f_ov );   // OvCoor - 3a
-
-        Add_dBOpinpi2( system, lists, i, pj, 
-                CEover4 * (workspace.Delta[j] - 
-                    dfvl * workspace.Delta_lp_temp[j]),
-                CEover4 * (workspace.Delta[j] - 
-                    dfvl * workspace.Delta_lp_temp[j]),
-                workspace.f_ov, workspace.f_ov ); // OvCoor - 3b
-
-        Add_dDelta( system, lists, j,
-                CEunder4 * (1.0 - dfvl*workspace.dDelta_lp[j]) * 
-                (bo_ij->BO_pi + bo_ij->BO_pi2),
-                workspace.f_un ); // UnCoor - 2a
-
-        Add_dBOpinpi2( system, lists, i, pj, 
-                CEunder4 * (workspace.Delta[j] - 
-                    dfvl * workspace.Delta_lp_temp[j]),
-                CEunder4 * (workspace.Delta[j] - 
-                    dfvl * workspace.Delta_lp_temp[j]),
-                workspace.f_un, workspace.f_un ); // UnCoor - 2b
-#endif
     }
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index a7aec169..3bc1e72d 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -35,7 +35,7 @@
 
 
 CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms, 
-        single_body_parameters *sbp, int n, real *data_e_pol )
+        single_body_parameters *sbp, int n, real *e_pol_g )
 {
     int i, type_i;
     real q;
@@ -50,7 +50,7 @@ CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms,
     q = my_atoms[i].q;
     type_i = my_atoms[i].type;
 
-    data_e_pol[i] = KCALpMOL_to_EV * (sbp[type_i].chi * q
+    e_pol_g[i] = KCALpMOL_to_EV * (sbp[type_i].chi * q
             + (sbp[type_i].eta / 2.0) * SQR(q));
 }
 
@@ -59,7 +59,7 @@ CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms,
 CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, int n, int num_atom_types, 
-        real *data_e_vdW, real *data_e_ele, rvec *data_ext_press )
+        real *e_vdW_g, real *e_ele_g, rvec *ext_press_g )
 {
     int i, j, pj;
     int start_i, end_i, orig_i, orig_j;
@@ -69,8 +69,8 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
     real r_ij, fn13, exp1, exp2, e_base, de_base;
     real Tap, dTap, dfn13, CEvd, CEclmb;
     real dr3gamij_1, dr3gamij_3;
-    real e_ele, e_vdW, e_core, de_core, e_clb, de_clb;
-    rvec temp, ext_press;
+    real e_ele_l, e_vdW_l, e_core, de_core, e_clb, de_clb;
+    rvec temp, f_i_l, ext_press_l;
     two_body_parameters *twbp;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -82,9 +82,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
 
     p_vdW1 = gp.l[28];
     p_vdW1i = 1.0 / p_vdW1;
-    e_vdW = 0.0;
-    e_ele = 0.0;
-    rvec_MakeZero( ext_press );
+    e_vdW_l = 0.0;
+    e_ele_l = 0.0;
+    rvec_MakeZero( f_i_l );
+    rvec_MakeZero( ext_press_l );
 
     start_i = Start_Index( i, &far_nbr_list );
     end_i = End_Index( i, &far_nbr_list );
@@ -137,7 +138,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
                 exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
                 e_base = twbp->D * (exp1 - 2.0 * exp2);
 
-                e_vdW += self_coef * (e_base * Tap);
+                e_vdW_l += self_coef * (e_base * Tap);
 
                 dfn13 = POW( r_ij, p_vdW1 - 1.0 )
                     * POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0 );
@@ -150,7 +151,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
                 exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
                 e_base = twbp->D * (exp1 - 2.0 * exp2);
 
-                e_vdW += self_coef * (e_base * Tap);
+                e_vdW_l += self_coef * (e_base * Tap);
 
                 de_base = (twbp->D * twbp->alpha / twbp->r_vdW) * (exp2 - exp1);
             }
@@ -159,7 +160,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
             if ( gp.vdw_type == 2 || gp.vdw_type == 3 )
             {
                 e_core = twbp->ecore * EXP( twbp->acore * (1.0 - (r_ij / twbp->rcore)) );
-                e_vdW += self_coef * (e_core * Tap);
+                e_vdW_l += self_coef * (e_core * Tap);
 
                 de_core = -(twbp->acore / twbp->rcore) * e_core;
             }
@@ -172,28 +173,17 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
             CEvd = self_coef * ( (de_base + de_core) * Tap
                     + (e_base + e_core) * dTap );
 
-#if defined(DEBUG_FOCUS)
-            printf( "%6d%6d%24.12f%24.12f%24.12f%24.12f\n",
-                    i + 1, j + 1, 
-                    e_base, de_base, e_core, de_core );
-#endif
-
             /* Coulomb Calculations */
             dr3gamij_1 = r_ij * r_ij * r_ij
                 + POW( twbp->gamma, -3.0 );
             dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 );
             e_clb = C_ELE * (my_atoms[i].q * my_atoms[j].q) / dr3gamij_3;
-            e_ele += self_coef * (e_clb * Tap);
+            e_ele_l += self_coef * (e_clb * Tap);
 
             de_clb = -C_ELE * (my_atoms[i].q * my_atoms[j].q)
                     * (r_ij * r_ij) / POW( dr3gamij_1, 4.0 / 3.0 );
             CEclmb = self_coef * (de_clb * Tap + e_clb * dTap);
 
-#if defined(DEBUG_FOCUS)
-            printf( "%6d%6d%24.12f%24.12f\n",
-                    i + 1, j + 1, e_clb, de_clb );
-#endif
-
             if ( control->virial == 0 )
             {
                 if ( i < j ) 
@@ -204,7 +194,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
                 {
                     rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 }
-                atomic_rvecAdd( workspace.f[i], temp );
+                rvec_Add( f_i_l, temp );
                 rvec_Scale( temp, -1.0, temp );
                 atomic_rvecAdd( workspace.f[j], temp );
             }
@@ -223,48 +213,29 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
                     rvec_Scale( temp,
                             (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 }
-                atomic_rvecAdd( workspace.f[i], temp );
+                rvec_Add( f_i_l, temp );
                 rvec_Scale( temp, -1.0, temp );
                 atomic_rvecAdd( workspace.f[j], temp );
 
-                rvec_iMultiply( ext_press,
+                rvec_iMultiply( temp,
                         far_nbr_list.far_nbr_list.rel_box[pj], temp );
-                rvec_Add( data_ext_press[i], ext_press );
+                rvec_Add( ext_press_l, temp );
             }
-
-#if defined(TEST_ENERGY)
-            // fprintf( out_control->evdw, 
-            // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", 
-            // workspace.Tap[7], workspace.Tap[6], workspace.Tap[5],
-            // workspace.Tap[4], workspace.Tap[3], workspace.Tap[2], 
-            // workspace.Tap[1], Tap );
-            //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-                    my_atoms[i].orig_id, my_atoms[j].orig_id, 
-                    r_ij, e_vdW, data->my_en.e_vdW );
-            //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                    my_atoms[i].orig_id, my_atoms[j].orig_id,
-                    r_ij, my_atoms[i].q, my_atoms[j].q, 
-                    e_ele, data->my_en.e_ele );
-#endif
-
-#if defined(TEST_FORCES)
-            rvec_ScaledAdd( workspace.f_vdw[i], -CEvd,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_vdw[j], +CEvd,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_ele[i], -CEclmb,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_ele[j], +CEclmb,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-#endif
         }
     }
 
-    __syncthreads( );
-    data_e_vdW[i] = e_vdW;
-    data_e_ele[i] = e_ele;
+    atomic_rvecAdd( workspace.f[i], f_i_l );
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_vdW_g[i] = e_vdW_l;
+    e_ele_g[i] = e_ele_l;
+    if ( control->virial == 1 )
+        rvec_Copy( ext_press_g[j], ext_press_l );
+#else
+    atomicAdd( (double *) e_vdW_g, (double) e_vdW_l );
+    atomicAdd( (double *) e_ele_g, (double) e_ele_l );
+    if ( control->virial == 1 )
+        atomic_rvecAdd( *ext_press_g, ext_press_l );
+#endif
 }
 
 
@@ -272,7 +243,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
 CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, int n, int num_atom_types, 
-        real *data_e_vdW, real *data_e_ele, rvec *data_ext_press )
+        real *e_vdW_g, real *e_ele_g, rvec *ext_press_g )
 {
     int i, j, pj;
     int start_i, end_i, orig_i, orig_j;
@@ -282,12 +253,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
     real r_ij, fn13, exp1, exp2, e_base, de_base;
     real Tap, dTap, dfn13, CEvd, CEclmb;
     real dr3gamij_1, dr3gamij_3;
-    real e_ele, e_vdW, e_core, de_core, e_clb, de_clb;
-    rvec temp, ext_press;
+    real e_vdW_l, e_ele_l, e_core, de_core, e_clb, de_clb;
+    rvec temp, f_i_l, ext_press_l;
     two_body_parameters *twbp;
     int thread_id, warp_id, lane_id, offset;
-    real e_vdW_l, e_ele_l;
-    rvec f_l;
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -301,12 +270,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
     i = warp_id;
     p_vdW1 = gp.l[28];
     p_vdW1i = 1.0 / p_vdW1;
-    e_vdW = 0.0;
-    e_ele = 0.0;
-    rvec_MakeZero( ext_press );
     e_vdW_l = 0.0;
     e_ele_l = 0.0;
-    rvec_MakeZero( f_l );
+    rvec_MakeZero( f_i_l );
+    rvec_MakeZero( ext_press_l );
 
     start_i = Start_Index( i, &far_nbr_list );
     end_i = End_Index( i, &far_nbr_list );
@@ -360,8 +327,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
                 exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
                 e_base = twbp->D * (exp1 - 2.0 * exp2);
 
-                e_vdW = self_coef * (e_base * Tap);
-                e_vdW_l += e_vdW;
+                e_vdW_l += self_coef * (e_base * Tap);
 
                 dfn13 = POW( r_ij, p_vdW1 - 1.0 )
                     * POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0 );
@@ -374,8 +340,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
                 exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
                 e_base = twbp->D * (exp1 - 2.0 * exp2);
 
-                e_vdW = self_coef * (e_base * Tap);
-                e_vdW_l += e_vdW;
+                e_vdW_l += self_coef * (e_base * Tap);
 
                 de_base = (twbp->D * twbp->alpha / twbp->r_vdW) * (exp2 - exp1);
             }
@@ -384,8 +349,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
             if ( gp.vdw_type == 2 || gp.vdw_type == 3 )
             {
                 e_core = twbp->ecore * EXP( twbp->acore * (1.0 - (r_ij / twbp->rcore)) );
-                e_vdW += self_coef * (e_core * Tap);
-                e_vdW_l += (self_coef * (e_core * Tap));
+                e_vdW_l += self_coef * (e_core * Tap);
 
                 de_core = -(twbp->acore / twbp->rcore) * e_core;
             }
@@ -398,29 +362,17 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
             CEvd = self_coef * ( (de_base + de_core) * Tap
                     + (e_base + e_core) * dTap );
 
-#if defined(DEBUG_FOCUS)
-            printf( "%6d%6d%24.12f%24.12f%24.12f%24.12f\n",
-                    i + 1, j + 1, 
-                    e_base, de_base, e_core, de_core );
-#endif
-
             /* Coulomb Calculations */
             dr3gamij_1 = r_ij * r_ij * r_ij
                 + POW( twbp->gamma, -3.0 );
             dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 );
             e_clb = C_ELE * (my_atoms[i].q * my_atoms[j].q) / dr3gamij_3;
-            e_ele = self_coef * (e_clb * Tap);
-            e_ele_l += e_ele;
+            e_ele_l += self_coef * (e_clb * Tap);
 
             de_clb = -C_ELE * (my_atoms[i].q * my_atoms[j].q)
                     * (r_ij * r_ij) / POW( dr3gamij_1, 4.0 / 3.0 );
             CEclmb = self_coef * (de_clb * Tap + e_clb * dTap);
 
-#if defined(DEBUG_FOCUS)
-            printf( "%6d%6d%24.12f%24.12f\n",
-                    i + 1, j + 1, e_clb, de_clb );
-#endif
-
             if ( control->virial == 0 )
             {
                 if ( i < j ) 
@@ -431,7 +383,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
                 {
                     rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 }
-                rvec_Add( f_l, temp );
+                rvec_Add( f_i_l, temp );
                 rvec_Scale( temp, -1.0, temp );
                 atomic_rvecAdd( workspace.f[j], temp );
             }
@@ -450,42 +402,14 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
                     rvec_Scale( temp,
                             (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 }
-                rvec_Add( f_l, temp );
+                rvec_Add( f_i_l, temp );
                 rvec_Scale( temp, -1.0, temp );
                 atomic_rvecAdd( workspace.f[j], temp );
 
-                rvec_iMultiply( ext_press,
+                rvec_iMultiply( temp,
                         far_nbr_list.far_nbr_list.rel_box[pj], temp );
-                rvec_Add( data_ext_press[i], ext_press );
+                rvec_Add( ext_press_l, temp );
             }
-
-#if defined(TEST_ENERGY)
-            // fprintf( out_control->evdw, 
-            // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", 
-            // workspace.Tap[7], workspace.Tap[6], workspace.Tap[5],
-            // workspace.Tap[4], workspace.Tap[3], workspace.Tap[2], 
-            // workspace.Tap[1], Tap );
-            //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-                    my_atoms[i].orig_id, my_atoms[j].orig_id, 
-                    r_ij, e_vdW, data->my_en.e_vdW );
-            //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                    my_atoms[i].orig_id, my_atoms[j].orig_id,
-                    r_ij, my_atoms[i].q, my_atoms[j].q, 
-                    e_ele, data->my_en.e_ele );
-#endif
-
-#if defined(TEST_FORCES)
-            rvec_ScaledAdd( workspace.f_vdw[i], -CEvd,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_vdw[j], +CEvd,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_ele[i], -CEclmb,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_ele[j], +CEclmb,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-#endif
         }
 
         pj += warpSize;
@@ -496,17 +420,26 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
     {
         e_vdW_l += __shfl_down_sync( FULL_MASK, e_vdW_l, offset );
         e_ele_l += __shfl_down_sync( FULL_MASK, e_ele_l, offset );
-        f_l[0] += __shfl_down_sync( FULL_MASK, f_l[0], offset );
-        f_l[1] += __shfl_down_sync( FULL_MASK, f_l[1], offset );
-        f_l[2] += __shfl_down_sync( FULL_MASK, f_l[2], offset );
+        f_i_l[0] += __shfl_down_sync( FULL_MASK, f_i_l[0], offset );
+        f_i_l[1] += __shfl_down_sync( FULL_MASK, f_i_l[1], offset );
+        f_i_l[2] += __shfl_down_sync( FULL_MASK, f_i_l[2], offset );
     }
 
     /* first thread within a warp writes warp-level sum to global memory */
     if ( lane_id == 0 )
     {
-        data_e_vdW[i] = e_vdW_l;
-        data_e_ele[i] = e_ele_l;
-        atomic_rvecAdd( workspace.f[i], f_l );
+        atomic_rvecAdd( workspace.f[i], f_i_l );
+#if !defined(CUDA_ACCUM_ATOMIC)
+        e_vdW_g[i] = e_vdW_l;
+        e_ele_g[i] = e_ele_l;
+        if ( control->virial == 1 )
+            rvec_Copy( ext_press_g[j], ext_press_l );
+#else
+        atomicAdd( (double *) e_vdW_g, (double) e_vdW_l );
+        atomicAdd( (double *) e_ele_g, (double) e_ele_l );
+        if ( control->virial == 1 )
+            atomic_rvecAdd( *ext_press_g, ext_press_l );
+#endif
     }
 }
 
@@ -517,15 +450,15 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
         storage workspace, reax_list far_nbr_list, 
         LR_lookup_table *t_LR, int n, int num_atom_types, 
         int step, int prev_steps, int energy_update_freq, 
-        real *data_e_vdW, real *data_e_ele, rvec *data_ext_press )
+        real *e_vdW_g, real *e_ele_g, rvec *ext_press_g )
 {
     int i, j, pj, r, steps, update_freq, update_energies;
     int type_i, type_j, tmin, tmax;
     int start_i, end_i, orig_i, orig_j;
     real r_ij, self_coef, base, dif;
-    real e_vdW, e_ele;
+    real e_vdW_l, e_ele_l;
     real CEvd, CEclmb;
-    rvec temp, ext_press;
+    rvec temp, f_i_l, ext_press_l;
     LR_lookup_table *t;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -538,10 +471,10 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
     steps = step - prev_steps;
     update_freq = energy_update_freq;
     update_energies = update_freq > 0 && steps % update_freq == 0;
-    e_ele = 0.0;
-    e_vdW = 0.0;
-    data_e_vdW[i] = 0.0;
-    data_e_ele[i] = 0.0;
+    e_ele_l = 0.0;
+    e_vdW_l = 0.0;
+    rvec_MakeZero( f_i_l );
+    rvec_MakeZero( ext_press_l );
 
     type_i = my_atoms[i].type;
     start_i = Start_Index( i, &far_nbr_list );
@@ -575,16 +508,11 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
 
             if ( update_energies )
             {
-                e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b)
-                    * dif + t->vdW[r].a;
-                e_vdW *= self_coef;
-
-                e_ele = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b)
-                    * dif + t->ele[r].a;
-                e_ele *= self_coef * my_atoms[i].q * my_atoms[j].q;
+                e_vdW_l += self_coef * (((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b)
+                    * dif + t->vdW[r].a);
 
-                data_e_vdW[i] += e_vdW;
-                data_e_ele[i] += e_ele;
+                e_ele_l += (((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b)
+                    * dif + t->ele[r].a) * self_coef * my_atoms[i].q * my_atoms[j].q;
             }    
 
             CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b)
@@ -599,54 +527,56 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
             {
                 if ( i < j ) 
                 {
-                    rvec_ScaledAdd( workspace.f[i],
+                    rvec_ScaledAdd( temp,
                             -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 }
                 else 
                 {
-                    rvec_ScaledAdd( workspace.f[i],
+                    rvec_ScaledAdd( temp,
                             (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 }
+                rvec_Add( f_i_l, temp );
+                rvec_Scale( temp, -1.0, temp );
+                atomic_rvecAdd( workspace.f[j], temp );
             }
             /* NPT, iNPT or sNPT */
             else
             {
                 /* for pressure coupling, terms not related to bond order derivatives
                    are added directly into pressure vector/tensor */
-                rvec_Scale( temp,
-                        (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-
-                rvec_ScaledAdd( workspace.f[i], -1.0, temp );
-                rvec_Add( workspace.f[j], temp );
+                if ( i < j ) 
+                {
+                    rvec_Scale( temp,
+                            -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
+                }
+                else 
+                {
+                    rvec_Scale( temp,
+                            (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
+                }
+                rvec_Add( f_i_l, temp );
+                rvec_ScaledAdd( temp, -1.0, temp );
+                atomic_rvecAdd( workspace.f[j], temp );
 
-                rvec_iMultiply( ext_press, far_nbr_list.far_nbr_list.rel_box[pj], temp );
-                rvec_Add( data_ext_press[i], ext_press );
+                rvec_iMultiply( temp, far_nbr_list.far_nbr_list.rel_box[pj], temp );
+                rvec_Add( ext_press_l, temp );
             }
-
-#if defined(TEST_ENERGY)
-            //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-                    my_atoms[i].orig_id, my_atoms[j].orig_id, 
-                    r_ij, e_vdW, data->my_en.e_vdW );
-            //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                    my_atoms[i].orig_id, my_atoms[j].orig_id,
-                    r_ij, my_atoms[i].q, my_atoms[j].q, 
-                    e_ele, data->my_en.e_ele );
-#endif
-
-#if defined(TEST_FORCES)
-            rvec_ScaledAdd( workspace.f_vdw[i], -CEvd,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_vdw[j], +CEvd,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_ele[i], -CEclmb,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-            rvec_ScaledAdd( workspace.f_ele[j], +CEclmb,
-                    far_nbr_list.far_nbr_list.dvec[pj] );
-#endif
         }
     }
+
+    atomic_rvecAdd( workspace.f[i], f_i_l );
+#if !defined(CUDA_ACCUM_ATOMIC)
+    __syncthreads( );
+    e_vdW_g[i] = e_vdW_l;
+    e_ele_g[i] = e_ele_l;
+    if ( control->virial == 1 )
+        rvec_Copy( ext_press_g[j], ext_press_l );
+#else
+    atomicAdd( (double *) e_vdW_g, (double) e_vdW_l );
+    atomicAdd( (double *) e_ele_g, (double) e_ele_l );
+    if ( control->virial == 1 )
+        atomic_rvecAdd( *ext_press_g, ext_press_l );
+#endif
 }
 
 
@@ -675,21 +605,34 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system, storage *work
 }
 
 
-void Cuda_NonBonded_Energy( reax_system *system, control_params *control, 
-        storage *workspace, simulation_data *data, reax_list **lists,
+void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control )
 {
     int update_energy, blocks;
+#if !defined(CUDA_ACCUM_ATOMIC)
     real *spad;
     rvec *spad_rvec;
+#endif
 
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 
+#if !defined(CUDA_ACCUM_ATOMIC)
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
             (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
-            "Cuda_NonBonded_Energy::workspace->scratch" );
+            "Cuda_Compute_NonBonded_Forces::workspace->scratch" );
     spad = (real *) workspace->scratch;
+#endif
+
+#if defined(CUDA_ACCUM_ATOMIC)
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_vdW" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_ele" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" );
+#endif
 
     blocks = system->n * 32 / DEF_BLOCK_SIZE
         + (system->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
@@ -701,14 +644,28 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
 //              system->reax_param.d_gp, (control_params *) control->d_control_params, 
 //              *(workspace->d_workspace), *(lists[FAR_NBRS]), 
 //              system->n, system->reax_param.num_atom_types, 
-//              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
+//#if !defined(CUDA_ACCUM_ATOMIC)
+//              spad, &spad[system->n], (rvec *) (&spad[2 * system->n])
+//#else
+//              &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+//              &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+//              &((simulation_data *)data->d_simulation_data)->my_ext_press
+//#endif
+//            );
 
         k_vdW_coulomb_energy_opt <<< blocks, DEF_BLOCK_SIZE >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
               system->reax_param.d_gp, (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
               system->n, system->reax_param.num_atom_types, 
-              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
+#if !defined(CUDA_ACCUM_ATOMIC)
+              spad, &spad[system->n], (rvec *) (&spad[2 * system->n])
+#else
+              &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+              &((simulation_data *)data->d_simulation_data)->my_ext_press
+#endif
+            );
         cudaCheckError( );
     }
     else
@@ -721,10 +678,18 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
               system->reax_param.num_atom_types, 
               data->step, data->prev_steps, 
               out_control->energy_update_freq,
-              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]));
+#if !defined(CUDA_ACCUM_ATOMIC)
+              spad, &spad[system->n], (rvec *) (&spad[2 * system->n])
+#else
+              &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+              &((simulation_data *)data->d_simulation_data)->my_ext_press
+#endif
+            );
         cudaCheckError( );
     }
 
+#if !defined(CUDA_ACCUM_ATOMIC)
     if ( update_energy == TRUE )
     {
         /* reduction for vdw */
@@ -755,6 +720,7 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
               control->blocks );
         cudaCheckError( );
     }
+#endif
 
     if ( update_energy == TRUE )
     {
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.h b/PG-PuReMD/src/cuda/cuda_nonbonded.h
index 038cb9e7..fbc2cf5b 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.h
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.h
@@ -25,8 +25,8 @@
 #include "../reax_types.h"
 
 
-void Cuda_NonBonded_Energy( reax_system *, control_params *,
-        storage *, simulation_data *, reax_list **,
+void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *,
+        simulation_data *, storage *, reax_list **,
         output_controls * );
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index 915ada83..0e832ba4 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -170,9 +170,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
     real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
     real Cconj, CEconj1, CEconj2, CEconj3;
     real CEconj4, CEconj5, CEconj6, e_tor_l, e_con_l;
-    rvec dvec_li, temp, ext_press_l;
+    rvec dvec_li, temp, f_j_l, ext_press_l;
     ivec rel_box_jl;
-    // rtensor total_rtensor, temp_rtensor;
     four_body_header *fbh;
     four_body_parameters *fbp;
     bond_data *pbond_ij, *pbond_jk, *pbond_kl;
@@ -193,6 +192,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
     p_cot2 = gp.l[27];
     e_tor_l = 0.0;
     e_con_l = 0.0;
+    rvec_MakeZero( f_j_l );
     rvec_MakeZero( ext_press_l );
 #if defined(DEBUG_FOCUS)
     num_frb_intrs = 0;
@@ -431,13 +431,13 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     /* dcos_theta_ijk */
                                     atomic_rvecScaledAdd( pbond_ij->ta_f, 
                                             CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                    rvec_ScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
                                     atomic_rvecScaledAdd( pbond_jk->ta_f,
                                             CEtors7 + CEconj4, p_ijk->dcos_di );
 
                                     /* dcos_theta_jkl */
-                                    rvec_ScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors8 + CEconj5, p_jkl->dcos_di );
                                     atomic_rvecScaledAdd( pbond_jk->ta_f,
                                             CEtors8 + CEconj5, p_jkl->dcos_dj );
@@ -447,7 +447,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     /* dcos_omega */
                                     atomic_rvecScaledAdd( pbond_ij->ta_f,
                                             CEtors9 + CEconj6, dcos_omega_di );
-                                    rvec_ScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors9 + CEconj6, dcos_omega_dj );
                                     atomic_rvecScaledAdd( pbond_jk->ta_f,
                                             CEtors9 + CEconj6, dcos_omega_dk );
@@ -457,13 +457,13 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     /* dcos_theta_ijk */
                                     atomic_rvecScaledAdd( workspace.f[i], 
                                             CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                    atomic_rvecScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
                                     atomic_rvecScaledAdd( workspace.f[k],
                                             CEtors7 + CEconj4, p_ijk->dcos_di );
 
                                     /* dcos_theta_jkl */
-                                    atomic_rvecScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors8 + CEconj5, p_jkl->dcos_di );
                                     atomic_rvecScaledAdd( workspace.f[k],
                                             CEtors8 + CEconj5, p_jkl->dcos_dj );
@@ -473,7 +473,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     /* dcos_omega */
                                     atomic_rvecScaledAdd( workspace.f[i],
                                             CEtors9 + CEconj6, dcos_omega_di );
-                                    atomic_rvecScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors9 + CEconj6, dcos_omega_dj );
                                     atomic_rvecScaledAdd( workspace.f[k],
                                             CEtors9 + CEconj6, dcos_omega_dk );
@@ -492,7 +492,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_iMultiply( temp, pbond_ij->rel_box, temp );
                                     rvec_Add( ext_press_l, temp );
 
-                                    rvec_ScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
 
                                     rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di );
@@ -501,7 +501,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_Add( ext_press_l, temp );
 
                                     /* dcos_theta_jkl */
-                                    rvec_ScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors8 + CEconj5, p_jkl->dcos_di );
 
                                     rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj );
@@ -520,7 +520,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_iMultiply( temp, pbond_ij->rel_box, temp );
                                     rvec_Add( ext_press_l, temp );
 
-                                    rvec_ScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors9 + CEconj6, dcos_omega_dj );
 
                                     rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk );
@@ -541,7 +541,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_iMultiply( temp, pbond_ij->rel_box, temp );
                                     rvec_Add( ext_press_l, temp );
 
-                                    atomic_rvecScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
 
                                     rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di );
@@ -550,7 +550,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_Add( ext_press_l, temp );
 
                                     /* dcos_theta_jkl */
-                                    atomic_rvecScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors8 + CEconj5, p_jkl->dcos_di );
 
                                     rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj );
@@ -569,7 +569,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_iMultiply( temp, pbond_ij->rel_box, temp );
                                     rvec_Add( ext_press_l, temp );
 
-                                    atomic_rvecScaledAdd( workspace.f[j], 
+                                    rvec_ScaledAdd( f_j_l, 
                                             CEtors9 + CEconj6, dcos_omega_dj );
 
                                     rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk );
@@ -593,10 +593,12 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
     //  } // j loop
 
 #if !defined(CUDA_ACCUM_ATOMIC)
+    rvec_Add( workspace.f[j], f_j_l );
     e_tor_g[j] = e_tor_l;
     e_con_g[j] = e_con_l;
     rvec_Copy( e_ext_press_g[j], e_ext_press_l );
 #else
+    atomic_rvecAdd( workspace.f[j], f_j_l );
     atomicAdd( (double *) e_tor_g, (double) e_tor_l );
     atomicAdd( (double *) e_con_g, (double) e_con_l );
     atomic_rvecAdd( *ext_press_g, ext_press_l );
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index fc51dfc9..5392a77b 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -59,7 +59,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
     real f7_ij, f7_jk, f8_Dj, f9_Dj;
     real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
     real BOA_ij, BOA_jk;
-    rvec rvec_temp, ext_press_l;
+    rvec rvec_temp, f_j_l, ext_press_l;
     three_body_header *thbh;
     three_body_parameters *thbp;
     three_body_interaction_data *p_ijk;
@@ -88,6 +88,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
     e_ang_l = 0.0;
     e_coa_l = 0.0;
     e_pen_l = 0.0;
+    rvec_MakeZero( f_j_l );
     rvec_MakeZero( ext_press_l );
 
     type_j = my_atoms[j].type;
@@ -406,11 +407,11 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
                         {
 #if !defined(CUDA_ACCUM_ATOMIC)
                             rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
-                            rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj );
                             rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
 #else
                             atomic_rvecScaledAdd( workspace.f[i], CEval8, p_ijk->dcos_di );
-                            atomic_rvecScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj );
                             atomic_rvecScaledAdd( workspace.f[k], CEval8, p_ijk->dcos_dk );
 #endif
                         }
@@ -424,7 +425,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
                             rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp );
                             rvec_Add( ext_press_l, rvec_temp );
 
-                            rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj );
 
                             rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk );
                             rvec_Add( pbond_jk->va_f, rvec_temp );
@@ -438,7 +439,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
                             rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp );
                             rvec_Add( ext_press_l, rvec_temp );
 
-                            rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( f_j_l, CEval8, p_ijk->dcos_dj );
 
                             rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk );
                             atomic_rvecAdd( workspace.f[k], rvec_temp );
@@ -455,11 +456,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
     }
 
 #if !defined(CUDA_ACCUM_ATOMIC)
+    rvec_Add( workspace.f[j], f_j_l );
     e_ang_g[j] = e_ang_l;
     e_coa_g[j] = e_coa_l;
     e_pen_g[j] = e_pen_l;
     rvec_Copy( ext_press_g[j], ext_press_l );
 #else
+    atomic_rvecAdd( workspace.f[j], f_j_l );
     atomicAdd( (double *) e_ang_g, (double) e_ang_l );
     atomicAdd( (double *) e_coa_g, (double) e_coa_l );
     atomicAdd( (double *) e_pen_g, (double) e_pen_l );
-- 
GitLab