From 7c45199376ae600888d139a1b34bd79c3de0e71d Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Fri, 19 Feb 2021 13:39:57 -0500
Subject: [PATCH] PG-PuReMD: use atomics for both partial energy and force
 accumulation (small reworks to reduce number of atomic operations). Remove
 several unneccesary cudaMemset calls. Other code clean-up.

---
 PG-PuReMD/src/cuda/cuda_bond_orders.cu    |  24 +-
 PG-PuReMD/src/cuda/cuda_bond_orders.h     |   4 +-
 PG-PuReMD/src/cuda/cuda_bonds.cu          |  48 +--
 PG-PuReMD/src/cuda/cuda_forces.cu         | 178 +++++---
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu | 317 +++++++-------
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h  |   2 +-
 PG-PuReMD/src/cuda/cuda_multi_body.cu     |  81 ++--
 PG-PuReMD/src/cuda/cuda_multi_body.h      |   2 +-
 PG-PuReMD/src/cuda/cuda_torsion_angles.cu | 244 ++++-------
 PG-PuReMD/src/cuda/cuda_torsion_angles.h  |   2 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu | 487 +++++++++-------------
 PG-PuReMD/src/cuda/cuda_valence_angles.h  |  11 +-
 PG-PuReMD/src/reax_types.h                |  10 +-
 13 files changed, 628 insertions(+), 782 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 25efc930..1420659c 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -4,7 +4,7 @@
 #include "cuda_list.h"
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
-#if defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if defined(CUDA_ACCUM_ATOMIC)
 #include "cuda_helpers.h"
 #endif
 
@@ -65,7 +65,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
             nbr_k = &bond_list->bond_list[pk];
             k = nbr_k->nbr;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_MakeZero( nbr_k->tf_f );
 #endif
 
@@ -79,7 +79,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
             rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
 
             /* force */
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_Add( nbr_k->tf_f, temp );
 #else
             atomic_rvecAdd( workspace->f[k], temp );
@@ -129,7 +129,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
             nbr_k = &bond_list->bond_list[pk];
             k = nbr_k->nbr;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_MakeZero( nbr_k->tf_f );
 #endif
 
@@ -143,7 +143,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
             rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
 
             /* force */
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_Add( nbr_k->tf_f, temp );
 #else
             atomic_rvecAdd( workspace->f[k], temp );
@@ -183,7 +183,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
         rvec_ScaledAdd( temp, coef.C4dbopi2, workspace->dDeltap_self[j] );
 
         /* force */
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         rvec_Add( workspace->f[j], temp );
 #else
         atomic_rvecAdd( workspace->f[j], temp );
@@ -250,7 +250,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
             /* 3rd, dBOpi2 */
             rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_Add( nbr_k->tf_f, temp );
 #else
             atomic_rvecAdd( workspace->f[nbr_k->nbr], temp );
@@ -281,7 +281,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
         /* 3rd, dBO_pi2 */
         rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         rvec_Add( workspace->f[i], temp );
 #else
         atomic_rvecAdd( workspace->f[i], temp );
@@ -302,7 +302,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
             /* 4th, dBOpi2 */
             rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_Add( nbr_k->tf_f, temp );
 #else
             atomic_rvecAdd( workspace->f[nbr_k->nbr], temp );
@@ -333,7 +333,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
         /* 3rd, dBOpi2 */
         rvec_ScaledAdd( temp, coef.C4dbopi2, workspace->dDeltap_self[i] );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         rvec_Add( workspace->f[i], temp );
 #else
         atomic_rvecAdd( workspace->f[i], temp );
@@ -739,7 +739,7 @@ CUDA_GLOBAL void k_total_forces_part1( storage workspace, reax_list bond_list,
 }
 
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void k_total_forces_part2( reax_atom *my_atoms, reax_list bond_list,
         storage workspace, int N )
 {
@@ -816,7 +816,7 @@ void Cuda_Total_Forces( reax_system *system, control_params *control,
         cudaCheckError( ); 
     }
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
     /* post processing for the atomic forces */
     k_total_forces_part2  <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *(lists[BONDS]), *(workspace->d_workspace), system->N );
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h
index ccec119f..0212e42a 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.h
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h
@@ -118,7 +118,7 @@ CUDA_DEVICE static inline int Cuda_BOp( reax_list bond_list, real bo_cut,
             bo_ij->Cdbopi = 0.0;
             bo_ij->Cdbopi2 = 0.0;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             ibond->ae_CdDelta = 0.0;
             ibond->va_CdDelta = 0.0;
             rvec_MakeZero( ibond->va_f );
@@ -174,7 +174,7 @@ CUDA_DEVICE static inline int Cuda_BOp( reax_list bond_list, real bo_cut,
             bo_ji->Cdbopi = 0.0;
             bo_ji->Cdbopi2 = 0.0;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             jbond->ae_CdDelta = 0.0;
             jbond->va_CdDelta = 0.0;
             rvec_MakeZero( jbond->va_f );
diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
index 9cd4e0a4..8ffe75d3 100644
--- a/PG-PuReMD/src/cuda/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -30,14 +30,14 @@
 CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
         storage p_workspace, reax_list p_bond_list, int n, int num_atom_types, 
-        real *e_bond )
+        real *e_bond_g )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    real ebond, pow_BOs_be2, exp_be12, CEbo;
+    real pow_BOs_be2, exp_be12, CEbo, e_bond_l;
     real gp3, gp4, gp7, gp10, gp37;
-    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+    real exphu, exphua1, exphub1, exphuov, hulpov;
     real decobdbo, decobdboua, decobdboub;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
@@ -59,6 +59,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
     gp7 = gp.l[7];
     gp10 = gp.l[10];
     gp37 = (int) gp.l[37];
+    e_bond_l = 0.0;
 
     start_i = Start_Index( i, bond_list );
     end_i = End_Index( i, bond_list );
@@ -82,32 +83,15 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                 * (1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2);
 
             /* calculate bond energy */
-            ebond = -twbp->De_s * bo_ij->BO_s * exp_be12
+            e_bond_l += -twbp->De_s * bo_ij->BO_s * exp_be12
                 - twbp->De_p * bo_ij->BO_pi
                 - twbp->De_pp * bo_ij->BO_pi2;
-            e_bond[i] += ebond;
 
             /* calculate derivatives of bond orders */
             bo_ij->Cdbo += CEbo;
             bo_ij->Cdbopi -= CEbo + twbp->De_p;
             bo_ij->Cdbopi2 -= CEbo + twbp->De_pp;
 
-#if defined(TEST_ENERGY)
-            //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e%24.15e\n",
-            fprintf( out_control->ebond, "%6d%6d%12.4f%12.4f%12.4f\n",
-                    system->my_atoms[i].orig_id, 
-                    system->my_atoms[j].orig_id, 
-                    bo_ij->BO, ebond, data->my_en.e_bond );
-#endif
-
-#if defined(TEST_FORCES)
-            Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-            Add_dBOpinpi2( system, lists, i, pj,
-                    -(CEbo + twbp->De_p),
-                    -(CEbo + twbp->De_pp),
-                    workspace->f_be, workspace->f_be );
-#endif
-
             /* Stabilisation terminal triple bond */
             if ( bo_ij->BO >= 1.00 )
             {
@@ -127,8 +111,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                     exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
                     hulpov = 1.0 / (1.0 + 25.0 * exphuov);
 
-                    estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-                    e_bond[i] += estriph;
+                    e_bond_l += gp10 * exphu * hulpov * (exphua1 + exphub1);
 
                     decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1)
                         * ( gp3 - 2.0 * gp7 * (bo_ij->BO - 2.5) );
@@ -140,21 +123,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                     bo_ij->Cdbo += decobdbo;
                     workspace->CdDelta[i] += decobdboua;
                     workspace->CdDelta[j] += decobdboub;
-
-#if defined(TEST_ENERGY)
-                    //fprintf( out_control->ebond, 
-                    //  "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                    //  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-                    //  estriph, decobdbo, decobdboua, decobdboub );
-#endif
-
-#if defined(TEST_FORCES)
-                    Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-                    Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-                    Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
-#endif
                 }
             }
         }
     }
+
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_bond_g[i] = e_bond_l;
+#else
+    atomicAdd( (double *) e_bond_g, (double) e_bond_l );
+#endif
 }
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 6c658975..6e59636a 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -223,7 +223,6 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i
 {
     int i, j, pj, start_i, end_i;
     rvec x_i;
-    reax_atom *atom_j;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -240,19 +239,18 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i
     for ( pj = start_i; pj < end_i; ++pj )
     {
         j = far_nbr_list.far_nbr_list.nbr[pj];
-        atom_j = &my_atoms[j];
 
         if ( i < j )
         {
-            far_nbr_list.far_nbr_list.dvec[pj][0] = atom_j->x[0] - x_i[0];
-            far_nbr_list.far_nbr_list.dvec[pj][1] = atom_j->x[1] - x_i[1];
-            far_nbr_list.far_nbr_list.dvec[pj][2] = atom_j->x[2] - x_i[2];
+            far_nbr_list.far_nbr_list.dvec[pj][0] = my_atoms[j].x[0] - x_i[0];
+            far_nbr_list.far_nbr_list.dvec[pj][1] = my_atoms[j].x[1] - x_i[1];
+            far_nbr_list.far_nbr_list.dvec[pj][2] = my_atoms[j].x[2] - x_i[2];
         }
         else
         {
-            far_nbr_list.far_nbr_list.dvec[pj][0] = x_i[0] - atom_j->x[0];
-            far_nbr_list.far_nbr_list.dvec[pj][1] = x_i[1] - atom_j->x[1];
-            far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - atom_j->x[2];
+            far_nbr_list.far_nbr_list.dvec[pj][0] = x_i[0] - my_atoms[j].x[0];
+            far_nbr_list.far_nbr_list.dvec[pj][1] = x_i[1] - my_atoms[j].x[1];
+            far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - my_atoms[j].x[2];
         }
         far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] );
     }
@@ -263,7 +261,7 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i
  * in the far neighbors list if it's a NOT re-neighboring step */
 CUDA_GLOBAL void k_init_distance_opt( reax_atom *my_atoms, reax_list far_nbr_list, int N )
 {
-    int j, pj, start_i, end_i, thread_id, warp_id, lane_id;
+    int j, pj, start_i, end_i, thread_id, warp_id, lane_id, itr;
     __shared__ rvec x_i;
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -284,7 +282,7 @@ CUDA_GLOBAL void k_init_distance_opt( reax_atom *my_atoms, reax_list far_nbr_lis
     __syncthreads( );
 
     /* update distance and displacement vector between atoms i and j (i-j) */
-    for ( pj = start_i + lane_id; pj < end_i; pj += 32 )
+    for ( itr = 0, pj = start_i + lane_id; itr < (end_i - start_i + 0x0000001F) >> 5; ++itr )
     {
         j = far_nbr_list.far_nbr_list.nbr[pj];
 
@@ -301,6 +299,8 @@ CUDA_GLOBAL void k_init_distance_opt( reax_atom *my_atoms, reax_list far_nbr_lis
             far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - my_atoms[j].x[2];
         }
         far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] );
+
+        pj += warpSize;
     }
 }
 
@@ -694,7 +694,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
                 hbond_list.hbond_list[ihb_top].scl = -1;
                 hbond_list.hbond_list[ihb_top].ptr = pj;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                 hbond_list.hbond_list[ihb_top].sym_index = -1;
                 rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
 #endif
@@ -734,7 +734,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
                         }
                         hbond_list.hbond_list[ihb_top].ptr = pj;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                         hbond_list.hbond_list[ihb_top].sym_index = -1;
                         rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
 #endif
@@ -750,7 +750,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
                         hbond_list.hbond_list[ihb_top].scl = -1;
                         hbond_list.hbond_list[ihb_top].ptr = pj;
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                         hbond_list.hbond_list[ihb_top].sym_index = -1;
                         rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
 #endif
@@ -1113,7 +1113,7 @@ CUDA_GLOBAL void k_update_sym_dbond_indices( reax_list bond_list, int N )
 }
 
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms,
         reax_list hbond_list, int N )
 {
@@ -1125,7 +1125,6 @@ CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms,
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     warp_id = thread_id >> 5;
-    lane_id = thread_id & 0x0000001F; 
 
     if ( warp_id > N )
     {
@@ -1133,6 +1132,7 @@ CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms,
     }
 
     i = warp_id;
+    lane_id = thread_id & 0x0000001F; 
     start = Start_Index( my_atoms[i].Hindex, &hbond_list );
     end = End_Index( my_atoms[i].Hindex, &hbond_list );
     j = start + lane_id;
@@ -1200,7 +1200,7 @@ CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list hbond_list, int
         k = hbond_list.hbond_list[pj].nbr;
         hbond_jk = &hbond_list.hbond_list[pj];
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n",
                 rank, step, my_atoms[i].Hindex, k,
                 hbond_jk->hb_f[0],
@@ -1564,14 +1564,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
     if ( renbr == FALSE && dist_done == FALSE )
     {
+        blocks = system->N * 32 / DEF_BLOCK_SIZE
+            + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-//        blocks = system->N * 32 / DEF_BLOCK_SIZE
-//            + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
-
-        k_init_distance <<< control->blocks_n, control->block_size_n >>>
-            ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
-//        k_init_distance_opt <<< blocks, DEF_BLOCK_SIZE >>>
+//        k_init_distance <<< control->blocks_n, control->block_size_n >>>
 //            ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
+        k_init_distance_opt <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
         cudaCheckError( );
 
         dist_done = TRUE;
@@ -1713,7 +1712,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             ( *(lists[BONDS]), system->N );
         cudaCheckError( );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
         {
             blocks = system->N * 32 / DEF_BLOCK_SIZE
@@ -1769,10 +1768,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     static int compute_bonded_part1 = FALSE;
     real *spad;
     rvec *rvec_spad;
-#if defined(DEBUG_FOCUS)
-    real t_start, t_elapsed;
-#endif
 
+#if !defined(CUDA_ACCUM_ATOMIC)
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
             MAX( sizeof(real) * system->n,
                 MAX( sizeof(real) * 3 * system->n,
@@ -1781,6 +1778,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                             (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks )))),
             "Cuda_Compute_Bonded_Forces::workspace->scratch" );
     spad = (real *) workspace->scratch;
+#endif
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
     ret = SUCCESS;
@@ -1810,55 +1808,76 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaCheckError( );
 
         /* 2. Bond Energy Interactions */
-        cuda_memset( spad, 0, sizeof(real) * system->n,
-                "Compute_Bonded_Forces::spad" );
+#if defined(CUDA_ACCUM_ATOMIC)
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_bond" );
+#endif
 
         Cuda_Bonds <<< control->blocks, control->block_size, sizeof(real) * control->block_size >>>
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
               *(workspace->d_workspace), *(lists[BONDS]), 
-              system->n, system->reax_param.num_atom_types, spad );
+              system->n, system->reax_param.num_atom_types,
+#if !defined(CUDA_ACCUM_ATOMIC)
+              spad
+#else
+              &((simulation_data *)data->d_simulation_data)->my_en.e_bond
+#endif
+            );
         cudaCheckError( );
 
-        /* reduction for E_BE */
+#if !defined(CUDA_ACCUM_ATOMIC)
         if ( update_energy == TRUE )
         {
             Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
                     system->n );
         }
+#endif
 
         /* 3. Atom Energy Interactions */
-        cuda_memset( spad, 0, sizeof(real) * 3 * system->n,
-                "Compute_Bonded_Forces::spad" );
+#if defined(CUDA_ACCUM_ATOMIC)
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_lp" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_ov" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_un,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_un" );
+#endif
 
         Cuda_Atom_Energy_Part1 <<< control->blocks, control->block_size >>>
             ( system->d_my_atoms, system->reax_param.d_gp,
               system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace),
               *(lists[BONDS]), system->n, system->reax_param.num_atom_types,
-              spad, &spad[system->n], &spad[2 * system->n] );
+#if !defined(CUDA_ACCUM_ATOMIC)
+              spad, &spad[system->n], &spad[2 * system->n]
+#else
+              &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_un
+#endif
+             );
         cudaCheckError( );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         Cuda_Atom_Energy_Part2 <<< control->blocks, control->block_size >>>
             ( *(lists[BONDS]), *(workspace->d_workspace), system->n );
         cudaCheckError( );
 #endif
 
+#if !defined(CUDA_ACCUM_ATOMIC)
         if ( update_energy == TRUE )
         {
-            /* reduction for E_Lp */
             Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
                     system->n );
 
-            /* reduction for E_Ov */
             Cuda_Reduction_Sum( &spad[system->n],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
                     system->n );
 
-            /* reduction for E_Un */
             Cuda_Reduction_Sum( &spad[2 * system->n],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_un,
                     system->n );
         }
+#endif
 
         compute_bonded_part1 = TRUE;
     }
@@ -1869,7 +1888,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             "Cuda_Compute_Bonded_Forces::workspace->scratch" );
 
     thbody = (int *) workspace->scratch;
-    spad = (real *) workspace->scratch; /* in case scratch gets reallocated above, changing the pointer */
+    /* in case scratch gets reallocated above, reassign scratch pointer */
+    spad = (real *) workspace->scratch;
 
     ret = Cuda_Estimate_Storage_Three_Body( system, control, data, workspace,
             lists, thbody );
@@ -1878,32 +1898,45 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     {
         Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices, lists );
 
-        cuda_memset( spad, 0,
-                (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n,
-                "Cuda_Compute_Bonded_Forces::spad" );
+#if defined(CUDA_ACCUM_ATOMIC)
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_ang" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_pen" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_coa" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" );
+#endif
 
         Cuda_Valence_Angles_Part1 <<< control->blocks_n, control->block_size_n >>>
-            ( system->d_my_atoms, system->reax_param.d_gp, 
+            ( system->d_my_atoms, system->reax_param.d_gp,
               system->reax_param.d_sbp, system->reax_param.d_thbp, 
               (control_params *) control->d_control_params,
               *(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]),
               system->n, system->N, system->reax_param.num_atom_types, 
-              spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) );
+#if !defined(CUDA_ACCUM_ATOMIC)
+              spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N])
+#else
+              &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
+              &((simulation_data *)data->d_simulation_data)->my_ext_press
+#endif
+            );
         cudaCheckError( );
 
+#if !defined(CUDA_ACCUM_ATOMIC)
         if ( update_energy == TRUE )
         {
-            /* reduction for E_Ang */
             Cuda_Reduction_Sum( spad,
                     &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
                     system->N );
 
-            /* reduction for E_Pen */
             Cuda_Reduction_Sum( &spad[system->N],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
                     system->N );
 
-            /* reduction for E_Coa */
             Cuda_Reduction_Sum( &spad[2 * system->N],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
                     system->N );
@@ -1913,7 +1946,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         {
             rvec_spad = (rvec *) (&spad[3 * system->N]);
 
-            /* reduction for ext_pres */
             k_reduction_rvec <<< control->blocks_n, control->block_size_n,
                              sizeof(rvec) * (control->block_size_n / 32) >>>
                 ( rvec_spad, &rvec_spad[system->N], system->N );
@@ -1929,8 +1961,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 //                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
 //                    system->N );
         }
+#endif
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, (control_params *) control->d_control_params,
               *(workspace->d_workspace), *(lists[BONDS]), system->N );
@@ -1938,25 +1971,37 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 #endif
 
         /* 5. Torsion Angles Interactions */
-        cuda_memset( spad, 0, (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
-                "Cuda_Compute_Bonded_Forces::spad" );
+#if defined(CUDA_ACCUM_ATOMIC)
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_tor" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_con,
+                0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_con" );
+        cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" );
+#endif
 
         Cuda_Torsion_Angles_Part1 <<< control->blocks, control->block_size >>>
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
               (control_params *) control->d_control_params, *(lists[BONDS]),
               *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
               system->reax_param.num_atom_types, 
-              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
+#if !defined(CUDA_ACCUM_ATOMIC)
+              spad, &spad[system->n], (rvec *) (&spad[2 * system->n])
+#else
+              &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
+              &((simulation_data *)data->d_simulation_data)->my_en.e_con,
+              &((simulation_data *)data->d_simulation_data)->my_ext_press
+#endif
+            );
         cudaCheckError( );
 
+#if !defined(CUDA_ACCUM_ATOMIC)
         if ( update_energy == TRUE )
         {
-            /* reduction for E_Tor */
             Cuda_Reduction_Sum( spad,
                     &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
                     system->n );
 
-            /* reduction for E_Con */
             Cuda_Reduction_Sum( &spad[system->n],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_con,
                     system->n );
@@ -1966,7 +2011,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         {
             rvec_spad = (rvec *) (&spad[2 * system->n]);
 
-            /* reduction for ext_pres */
             k_reduction_rvec <<< control->blocks, control->block_size,
                              sizeof(rvec) * (control->block_size / 32) >>>
                 ( rvec_spad, &rvec_spad[system->n], system->n );
@@ -1982,8 +2026,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 //                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
 //                    system->n );
         }
+#endif
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
                 ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]),
                   system->N );
@@ -1993,9 +2038,12 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         /* 6. Hydrogen Bonds Interactions */
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
         {
-            cuda_memset( spad, 0,
-                    (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
-                    "Cuda_Compute_Bonded_Forces::spad" );
+#if defined(CUDA_ACCUM_ATOMIC)
+            cuda_memset( &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
+                    0, sizeof(real), "Cuda_Compute_Bonded_Forces::e_hb" );
+            cuda_memset( &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                    0, sizeof(rvec), "Cuda_Compute_Bonded_Forces::my_ext_press" );
+#endif
 
 //            hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
 //                (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
@@ -2009,12 +2057,18 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                       *(workspace->d_workspace),
                       *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
                       system->n, system->reax_param.num_atom_types,
-                      spad, (rvec *) (&spad[system->n]), system->my_rank );
+#if !defined(CUDA_ACCUM_ATOMIC)
+                      spad, (rvec *) (&spad[system->n]),
+#else
+                      &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
+                      &((simulation_data *)data->d_simulation_data)->my_ext_press,
+#endif
+                      system->my_rank );
             cudaCheckError( );
 
+#if !defined(CUDA_ACCUM_ATOMIC)
             if ( update_energy == TRUE )
             {
-                /* reduction for E_HB */
                 Cuda_Reduction_Sum( spad,
                         &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
                         system->n );
@@ -2024,7 +2078,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             {
                 rvec_spad = (rvec *) (&spad[system->n]);
 
-                /* reduction for ext_pres */
                 k_reduction_rvec <<< control->blocks, control->block_size,
                                  sizeof(rvec) * (control->block_size / 32) >>>
                     ( rvec_spad, &rvec_spad[system->n], system->n );
@@ -2040,8 +2093,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 //                        &((simulation_data *)data->d_simulation_data)->my_ext_press,
 //                        system->n );
             }
+#endif
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             Cuda_Hydrogen_Bonds_Part2 <<< control->blocks, control->block_size >>>
                 ( system->d_my_atoms, *(workspace->d_workspace),
                   *(lists[BONDS]), system->n );
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index c77f42ca..075c9fc7 100644
--- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -38,7 +38,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
         hbond_parameters *d_hbp, global_parameters gp,
         control_params *control, storage workspace,
         reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, int n, 
-        int num_atom_types, real *data_e_hb, rvec *data_ext_press, int rank )
+        int num_atom_types, real *e_hb_g, rvec *ext_press_g, int rank )
 {
     int i, j, k, pi, pk;
     int type_i, type_j, type_k;
@@ -47,9 +47,12 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
     int itr, top;
     ivec rel_jk;
     real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-    rvec dvec_jk, force, ext_press;
+    rvec dvec_jk, temp, ext_press_l;
+#if defined(CUDA_ACCUM_ATOMIC)
+    rvec f_i_l, f_j_l, f_k_l;
+#endif
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     int nbr_jk;
@@ -63,6 +66,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
         return;
     }
 
+    e_hb_l = 0.0;
+    rvec_MakeZero( ext_press_l );
+#if defined(CUDA_ACCUM_ATOMIC)
+    rvec_MakeZero( f_i_l );
+    rvec_MakeZero( f_j_l );
+    rvec_MakeZero( f_k_l );
+#endif
+
     /* discover the Hydrogen bonds between i-j-k triplets.
      * here j is H atom and there has to be some bond between i and j.
      * Hydrogen bond is between j and k.
@@ -115,7 +126,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
             rvec_Scale( dvec_jk, hbond_list.hbond_list[pk].scl,
                     far_nbr_list.far_nbr_list.dvec[nbr_jk] );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
             rvec_MakeZero( phbond_jk->hb_f );
 #endif
 
@@ -150,7 +161,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
                                 + r_jk / hbp->r0_hb - 2.0 ) );
 
                     e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                    data_e_hb[j] += e_hb;
+                    e_hb_l += e_hb;
 
                     CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
                     CEhb2 = -0.5 * hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
@@ -163,116 +174,80 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
 
                     if ( control->virial == 0 )
                     {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                         /* dcos terms */
                         rvec_ScaledAdd( pbond_ij->hb_f, CEhb2, dcos_theta_di ); 
-
                         rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj );
-
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk );
 
                         /* dr terms */
                         rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); 
-
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk );
 #else
                         /* dcos terms */
-                        atomic_rvecScaledAdd( workspace.f[i], CEhb2, dcos_theta_di );
-
-                        atomic_rvecScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj );
-
-                        atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk );
+                        rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di );
+                        rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk );
 
                         /* dr terms */
-                        atomic_rvecScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk );
-
-                        atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk );
+                        rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk );
+                        rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk );
 #endif
                     }
                     else
                     {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                         /* for pressure coupling, terms that are not related to bond order
                          * derivatives are added directly into pressure vector/tensor */
                         /* dcos terms */
-                        rvec_Scale( force, CEhb2, dcos_theta_di );
-                        rvec_Add( pbond_ij->hb_f, force );
-                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_di );
+                        rvec_Add( pbond_ij->hb_f, temp );
+                        rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                        rvec_Add( ext_press_l, temp );
 
                         rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj );
 
                         ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl,
                                 far_nbr_list.far_nbr_list.rel_box[nbr_jk] );
-                        rvec_Scale( force, CEhb2, dcos_theta_dk );
-                        rvec_Add( phbond_jk->hb_f, force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_dk );
+                        rvec_Add( phbond_jk->hb_f, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 
                         /* dr terms */
                         rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); 
 
-                        rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
-                        rvec_Add( phbond_jk->hb_f, force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb3 / r_jk, dvec_jk );
+                        rvec_Add( phbond_jk->hb_f, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 #else
                         /* for pressure coupling, terms that are not related to bond order
                          * derivatives are added directly into pressure vector/tensor */
                         /* dcos terms */
-                        rvec_Scale( force, CEhb2, dcos_theta_di );
-                        atomic_rvecAdd( workspace.f[i], force );
-                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_di );
+                        rvec_Add( f_i_l, temp );
+                        rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                        rvec_Add( ext_press_l, temp );
 
-                        atomic_rvecScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
 
                         ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl,
                                 far_nbr_list.far_nbr_list.rel_box[nbr_jk] );
-                        rvec_Scale( force, CEhb2, dcos_theta_dk );
-                        atomic_rvecAdd( workspace.f[k], force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_dk );
+                        rvec_Add( f_k_l, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 
                         /* dr terms */
-                        atomic_rvecScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
 
-                        rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
-                        atomic_rvecAdd( workspace.f[k], force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb3 / r_jk, dvec_jk );
+                        rvec_Add( f_k_l, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 #endif
                     }
-
-#if defined(TEST_ENERGY)
-                    /* fprintf( out_control->ehb, 
-                       "%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n",
-                       dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
-                       dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
-                       dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
-                       fprintf( out_control->ehb, "%24.15e%24.15e%24.15e\n",
-                       CEhb1, CEhb2, CEhb3 ); */
-                    fprintf( out_control->ehb, 
-                            //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                            "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                            system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-                            system->my_atoms[k].orig_id, 
-                            r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb );       
-#endif
-
-#if defined(TEST_FORCES)
-                    /* dbo term */
-                    Add_dBO( system, lists, j, pi, CEhb1, workspace.f_hb );
-
-                    /* dcos terms */
-                    rvec_ScaledAdd( workspace.f_hb[i], CEhb2, dcos_theta_di );
-                    rvec_ScaledAdd( workspace.f_hb[j], CEhb2, dcos_theta_dj );
-                    rvec_ScaledAdd( workspace.f_hb[k], CEhb2, dcos_theta_dk );
-
-                    /* dr terms */
-                    rvec_ScaledAdd( workspace.f_hb[j], -1.0 * CEhb3 / r_jk, dvec_jk ); 
-                    rvec_ScaledAdd( workspace.f_hb[k], CEhb3 / r_jk, dvec_jk );
-#endif
                 }
             }
         }
@@ -282,6 +257,18 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
             free( hblist );
         }
     }
+
+#if !defined(CUDA_ACCUM_ATOMIC)
+    /* write conflicts for accumulating partial forces resolved by subsequent kernels */
+    e_hb_g[j] = e_hb_l;
+    rvecCopy( ext_press_g[j], ext_press_l );
+#else
+    atomic_rvecAdd( workspace.f[i], f_i_l );
+    atomic_rvecAdd( workspace.f[j], f_j_l );
+    atomic_rvecAdd( workspace.f[k], f_k_l );
+    atomicAdd( (double *) e_hb_g, (double) e_hb_l );
+    atomic_rvecAdd( *ext_press_g, ext_press_l );
+#endif
 }
 
 
@@ -289,7 +276,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1( reax_atom *my_atoms, single_body_par
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body_parameters *sbp, 
         hbond_parameters *d_hbp, global_parameters gp, control_params *control, storage workspace,
         reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, int n, 
-        int num_atom_types, real *data_e_hb, rvec *data_ext_press )
+        int num_atom_types, real *e_hb_g, rvec *ext_press_g )
 {
     int i, j, k, pi, pk;
     int type_i, type_j, type_k;
@@ -300,9 +287,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
     int loopcount, count;
     ivec rel_jk;
     real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    real e_hb, e_hb_l, exp_hb2, exp_hb3, CEhb1, CEhb1_l, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-    rvec dvec_jk, force, ext_press;
+    rvec dvec_jk, temp, ext_press_l;
+#if !defined(CUDA_ACCUM_ATOMIC)
+    rvec f_l, hb_f_l;
+#else
+    rvec f_i_l, f_j_l, f_k_l;
+#endif
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     int nbr_jk;
@@ -311,8 +303,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
     /* thread-local variables */
     int thread_id, warp_id, lane_id, offset;
     unsigned int mask;
-    real e_hb_s, CEhb1_s;
-    rvec f_s, hb_f_s;
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -331,8 +321,15 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
      * Hydrogen bond is between j and k.
      * so in this function i->X, j->H, k->Z when we map 
      * variables onto the ones in the handout.*/
-    e_hb_s = 0.0;
-    rvec_MakeZero( f_s );
+    e_hb_l = 0.0;
+    rvec_MakeZero( ext_press_l );
+#if !defined(CUDA_ACCUM_ATOMIC)
+    rvec_MakeZero( f_l );
+#else
+    rvec_MakeZero( f_i_l );
+    rvec_MakeZero( f_j_l );
+    rvec_MakeZero( f_k_l );
+#endif
 
     /* j has to be of type H */
     if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM )
@@ -366,8 +363,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
             pi = hblist[itr];
             pbond_ij = &bond_list.bond_list[pi];
             i = pbond_ij->nbr;
-            rvec_MakeZero( hb_f_s );
-            CEhb1_s = 0.0;
+#if !defined(CUDA_ACCUM_ATOMIC)
+            rvec_MakeZero( hb_f_l );
+#endif
+            CEhb1_l = 0.0;
 
             //for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
             loopcount = (hb_end_j - hb_start_j) / warpSize + 
@@ -420,7 +419,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
                                 + r_jk / hbp->r0_hb - 2.0 ) );
 
                     e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                    e_hb_s += e_hb;
+                    e_hb_l += e_hb;
 
                     CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
                     CEhb2 = -0.5 * hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
@@ -429,88 +428,82 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
 
                     /* hydrogen bond forces */
                     /* dbo term */
-                    CEhb1_s += CEhb1;
+                    CEhb1_l += CEhb1;
 
                     if ( control->virial == 0 )
                     {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                         /* dcos terms */
-                        rvec_ScaledAdd( hb_f_s, CEhb2, dcos_theta_di ); 
-
-                        rvec_ScaledAdd( f_s, CEhb2, dcos_theta_dj );
-
+                        rvec_ScaledAdd( hb_f_l, CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( f_l, CEhb2, dcos_theta_dj );
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb2, dcos_theta_dk );
 
                         /* dr terms */
-                        rvec_ScaledAdd( f_s, -1.0 * CEhb3 / r_jk, dvec_jk ); 
-
+                        rvec_ScaledAdd( f_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
                         rvec_ScaledAdd( phbond_jk->hb_f, CEhb3 / r_jk, dvec_jk );
 #else
                         /* dcos terms */
-                        rvec_ScaledAdd( hb_f_s, CEhb2, dcos_theta_di ); 
-
-                        rvec_ScaledAdd( f_s, CEhb2, dcos_theta_dj );
-
-                        atomic_rvecScaledAdd( workspace.f[k], CEhb2, dcos_theta_dk );
+                        rvec_ScaledAdd( f_i_l, CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( f_k_l, CEhb2, dcos_theta_dk );
 
                         /* dr terms */
-                        rvec_ScaledAdd( f_s, -1.0 * CEhb3 / r_jk, dvec_jk ); 
-
-                        atomic_rvecScaledAdd( workspace.f[k], CEhb3 / r_jk, dvec_jk );
+                        rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( f_k_l, CEhb3 / r_jk, dvec_jk );
 #endif
                     }
                     else
                     {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                         /* for pressure coupling, terms that are not related to bond order
                          * derivatives are added directly into pressure vector/tensor */
                         /* dcos terms */
-                        rvec_Scale( force, CEhb2, dcos_theta_di );
-                        rvec_Add( pbond_ij->hb_f, force );
-                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                        rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_di );
+                        rvec_Add( pbond_ij->hb_f, temp );
+                        rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                        rvec_Add( ext_press_l, temp );
 
                         rvec_ScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj );
 
                         ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl,
                                 far_nbr_list.far_nbr_list.rel_box[nbr_jk] );
-                        rvec_Scale( force, CEhb2, dcos_theta_dk );
-                        rvec_Add( phbond_jk->hb_f, force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_dk );
+                        rvec_Add( phbond_jk->hb_f, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 
                         /* dr terms */
                         rvec_ScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); 
 
-                        rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
-                        rvec_Add( phbond_jk->hb_f, force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb3 / r_jk, dvec_jk );
+                        rvec_Add( phbond_jk->hb_f, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 #else
                         /* for pressure coupling, terms that are not related to bond order
                          * derivatives are added directly into pressure vector/tensor */
                         /* dcos terms */
-                        rvec_Scale( force, CEhb2, dcos_theta_di );
-                        atomic_rvecAdd( workspace.f[i], force );
-                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                        rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_di );
+                        rvec_Add( f_i_l, temp );
+                        rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                        rvec_Add( ext_press_l, temp );
 
-                        atomic_rvecScaledAdd( workspace.f[j], CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( f_j_l, CEhb2, dcos_theta_dj );
 
                         ivec_Scale( rel_jk, hbond_list.hbond_list[pk].scl,
                                 far_nbr_list.far_nbr_list.rel_box[nbr_jk] );
-                        rvec_Scale( force, CEhb2, dcos_theta_dk );
-                        atomic_rvecAdd( workspace.f[k], force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb2, dcos_theta_dk );
+                        rvec_Add( f_k_l, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 
                         /* dr terms */
-                        atomic_rvecScaledAdd( workspace.f[j], -1.0 * CEhb3 / r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( f_j_l, -1.0 * CEhb3 / r_jk, dvec_jk ); 
 
-                        rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
-                        atomic_rvecAdd( workspace.f[k], force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        rvec_Scale( temp, CEhb3 / r_jk, dvec_jk );
+                        rvec_Add( f_k_l, temp );
+                        rvec_iMultiply( temp, rel_jk, temp );
+                        rvec_Add( ext_press_l, temp );
 #endif
                     }
 
@@ -524,20 +517,20 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
             /* warp-level sums using registers within a warp */
             for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
             {
-                CEhb1_s += __shfl_down_sync( mask, CEhb1_s, offset );
-                hb_f_s[0] += __shfl_down_sync( mask, hb_f_s[0], offset );
-                hb_f_s[1] += __shfl_down_sync( mask, hb_f_s[1], offset );
-                hb_f_s[2] += __shfl_down_sync( mask, hb_f_s[2], offset );
+                CEhb1_l += __shfl_down_sync( mask, CEhb1_l, offset );
+#if !defined(CUDA_ACCUM_ATOMIC)
+                hb_f_l[0] += __shfl_down_sync( mask, hb_f_l[0], offset );
+                hb_f_l[1] += __shfl_down_sync( mask, hb_f_l[1], offset );
+                hb_f_l[2] += __shfl_down_sync( mask, hb_f_l[2], offset );
+#endif
             }
 
             /* first thread within a warp writes warp-level sum to shared memory */
             if ( lane_id == 0 )
             {
-                bo_ij->Cdbo += CEhb1_s ;
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
-                rvec_Add( pbond_ij->hb_f, hb_f_s );
-#else
-                atomic_rvecAdd( workspace.f[i], hb_f_s );
+                bo_ij->Cdbo += CEhb1_l ;
+#if !defined(CUDA_ACCUM_ATOMIC)
+                rvec_Add( pbond_ij->hb_f, hb_f_l );
 #endif
             }
         } // for loop hbonds end
@@ -546,26 +539,46 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *my_atoms, single_body
     /* warp-level sums using registers within a warp */
     for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
     {
-        e_hb_s += __shfl_down_sync( mask, e_hb_s, offset );
-        f_s[0] += __shfl_down_sync( mask, f_s[0], offset );
-        f_s[1] += __shfl_down_sync( mask, f_s[1], offset );
-        f_s[2] += __shfl_down_sync( mask, f_s[2], offset );
+#if !defined(CUDA_ACCUM_ATOMIC)
+        f_l[0] += __shfl_down_sync( mask, f_l[0], offset );
+        f_l[1] += __shfl_down_sync( mask, f_l[1], offset );
+        f_l[2] += __shfl_down_sync( mask, f_l[2], offset );
+#else
+        f_i_l[0] += __shfl_down_sync( mask, f_i_l[0], offset );
+        f_i_l[1] += __shfl_down_sync( mask, f_i_l[1], offset );
+        f_i_l[2] += __shfl_down_sync( mask, f_i_l[2], offset );
+        f_j_l[0] += __shfl_down_sync( mask, f_j_l[0], offset );
+        f_j_l[1] += __shfl_down_sync( mask, f_j_l[1], offset );
+        f_j_l[2] += __shfl_down_sync( mask, f_j_l[2], offset );
+        f_k_l[0] += __shfl_down_sync( mask, f_k_l[0], offset );
+        f_k_l[1] += __shfl_down_sync( mask, f_k_l[1], offset );
+        f_k_l[2] += __shfl_down_sync( mask, f_k_l[2], offset );
+#endif
+        e_hb_l += __shfl_down_sync( mask, e_hb_l, offset );
+        ext_press_l[0] += __shfl_down_sync( mask, ext_press_l[0], offset );
+        ext_press_l[1] += __shfl_down_sync( mask, ext_press_l[1], offset );
+        ext_press_l[2] += __shfl_down_sync( mask, ext_press_l[2], offset );
     }
 
     /* first thread within a warp writes warp-level sums to global memory */
     if ( lane_id == 0 )
     {
-        data_e_hb[j] += e_hb_s;
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
-        rvec_Add( workspace.f[j], f_s );
+#if !defined(CUDA_ACCUM_ATOMIC)
+        rvec_Add( workspace.f[j], f_l );
+        e_hb_g[j] = e_hb_l;
+        rvecCopy( ext_press_g[j], ext_press_l );
 #else
-        atomic_rvecAdd( workspace.f[j], f_s );
+        atomic_rvecAdd( workspace.f[i], f_i_l );
+        atomic_rvecAdd( workspace.f[j], f_j_l );
+        atomic_rvecAdd( workspace.f[k], f_k_l );
+        atomicAdd( (double *) e_hb_g, (double) e_hb_l );
+        atomic_rvecAdd( *ext_press_g, ext_press_l );
 #endif
     }
 }
 
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 /* Accumulate forces stored in the bond list
  * using a one thread per atom implementation */
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part2( reax_atom *atoms,
@@ -691,7 +704,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part3_opt( reax_atom *atoms,
     /* thread-local variables */
     int thread_id, warp_id, lane_id, offset;
     unsigned int mask;
-    rvec hb_f_s;
+    rvec hb_f_l;
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -707,14 +720,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part3_opt( reax_atom *atoms,
     start = Start_Index( atoms[j].Hindex, &hbond_list );
     end = End_Index( atoms[j].Hindex, &hbond_list );
     pj = start + lane_id;
-    rvec_MakeZero( hb_f_s );
+    rvec_MakeZero( hb_f_l );
 
     while ( pj < end )
     {
         nbr_pj = &hbond_list.hbond_list[pj];
         sym_index_nbr = &hbond_list.hbond_list[ nbr_pj->sym_index ];
 
-        rvec_Add( hb_f_s, sym_index_nbr->hb_f );
+        rvec_Add( hb_f_l, sym_index_nbr->hb_f );
 
         pj += warpSize;
     }
@@ -723,15 +736,15 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part3_opt( reax_atom *atoms,
     /* warp-level sums using registers within a warp */
     for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
     {
-        hb_f_s[0] += __shfl_down_sync( mask, hb_f_s[0], offset );
-        hb_f_s[1] += __shfl_down_sync( mask, hb_f_s[1], offset );
-        hb_f_s[2] += __shfl_down_sync( mask, hb_f_s[2], offset );
+        hb_f_l[0] += __shfl_down_sync( mask, hb_f_l[0], offset );
+        hb_f_l[1] += __shfl_down_sync( mask, hb_f_l[1], offset );
+        hb_f_l[2] += __shfl_down_sync( mask, hb_f_l[2], offset );
     }
 
     /* first thread within a warp writes warp-level sums to global memory */
     if ( lane_id == 0 )
     {
-        rvec_Add( workspace.f[j], hb_f_s );
+        rvec_Add( workspace.f[j], hb_f_l );
     }
 }
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
index ff3a68d6..3c481134 100644
--- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
@@ -36,7 +36,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part1_opt( reax_atom *, single_body_paramet
         reax_list, reax_list, reax_list, int,
         int, real *, rvec * );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds_Part2( reax_atom *,
         storage, reax_list, int );
 
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
index 67f49c3d..b3b92fed 100644
--- a/PG-PuReMD/src/cuda/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -30,13 +30,13 @@
 CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters gp, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
         storage workspace, reax_list bond_list, int n, int num_atom_types,
-        real *data_e_lp, real *dat_e_ov, real *data_e_un )
+        real *e_lp_g, real *e_ov_g, real *e_un_g )
 {
     int i, j, pj, type_i, type_j;
     real Delta_lpcorr, dfvl;
-    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real Di, vov3, deahu2dbo, deahu2dsbo;
+    real CEover1, CEover2, CEover3, CEover4, e_lp_l;
     real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
     real exp_ovun2n, exp_ovun6, exp_ovun8;
     real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
@@ -71,8 +71,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
     inv_expvd2 = 1.0 / (1.0 + expvd2 );
 
     /* calculate the energy */
-    e_lp = p_lp2 * workspace.Delta_lp[i] * inv_expvd2;
-    data_e_lp[i] += e_lp;
+    e_lp_l = p_lp2 * workspace.Delta_lp[i] * inv_expvd2;
 
     dElp = p_lp2 * inv_expvd2 + 75.0 * p_lp2 * workspace.Delta_lp[i]
         * expvd2 * SQR(inv_expvd2);
@@ -80,18 +79,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
 
     workspace.CdDelta[i] += CElp;  // lp - 1st term  
 
-#if defined(TEST_ENERGY)
-        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n",
-                 p_lp2, workspace.Delta_lp_temp[i], expvd2, dElp );
-    fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n",
-            system->my_atoms[i].orig_id, workspace.nlp[i], 
-            e_lp, data->my_en.e_lp );
-#endif
-
-#if defined(TEST_FORCES)
-    Add_dDelta( system, lists, i, CElp, workspace.f_lp );  // lp - 1st term
-#endif
-
     /* correction for C2 */
     if ( gp.l[5] > 0.001
             && Cuda_strncmp( sbp[ type_i ].name, "C",
@@ -115,8 +102,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
 
                     if ( vov3 > 3.0 )
                     {
-                        e_lph = p_lp3 * SQR( vov3 - 3.0 );
-                        data_e_lp[i] += e_lph;
+                        e_lp_l += p_lp3 * SQR( vov3 - 3.0 );
 
                         deahu2dbo = 2.0 * p_lp3 * (vov3 - 3.0);
                         deahu2dsbo = 2.0 * p_lp3 * (vov3 - 3.0)
@@ -124,25 +110,20 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
 
                         bo_ij->Cdbo += deahu2dbo;
                         workspace.CdDelta[i] += deahu2dsbo;
-
-#if defined(TEST_ENERGY)
-                        fprintf( out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n",
-                                system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-                                e_lph, deahu2dbo, deahu2dsbo );
-#endif
-
-#if defined(TEST_FORCES)
-                        Add_dBO( system, lists, i, pj, deahu2dbo, workspace.f_lp );
-                        Add_dDelta( system, lists, i, deahu2dsbo, workspace.f_lp );
-#endif
                     }
                 }    
             }
         }
     }
 
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_lp_g[i] = e_lp_l;
+#else
+    atomicAdd( (double *) e_lp_g, (double) e_lp_l );
+#endif
+
     /* over-coordination energy */
-    if( sbp_i->mass > 21.0 ) 
+    if ( sbp_i->mass > 21.0 ) 
     {
         dfvl = 0.0;
     }
@@ -179,8 +160,11 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
     DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1.0e-8);
     CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-    e_ov = sum_ovun1 * CEover1;
-    dat_e_ov[i] += e_ov;
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_ov_g[i] = sum_ovun1 * CEover1;
+#else
+    atomicAdd( (double *) e_ov_g, (double) (sum_ovun1 * CEover1) );
+#endif
 
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * (1.0 - Delta_lpcorr
             * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
@@ -201,7 +185,11 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
     inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
-    data_e_un[i] += e_un;
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_un_g[i] = e_un;
+#else
+    atomicAdd( (double *) e_un_g, (double) e_un );
+#endif
 
     CEunder1 = inv_exp_ovun2n * ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8
             + p_ovun2 * e_un * exp_ovun2n );
@@ -229,7 +217,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
                     num_atom_types) ];
 
         bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         pbond_ij->ae_CdDelta += CEover4 * (1.0 - dfvl * workspace.dDelta_lp[j])
             * (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a
 #else
@@ -241,7 +229,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
         bo_ij->Cdbopi2 += CEover4 * (workspace.Delta[j] - dfvl
                 * workspace.Delta_lp_temp[j]);  // OvCoor-3b
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
         pbond_ij->ae_CdDelta += CEunder4 * (1.0 - dfvl * workspace.dDelta_lp[j])
             * (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
 #else
@@ -302,27 +290,10 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *my_atoms, global_parameters
                 workspace.f_un, workspace.f_un ); // UnCoor - 2b
 #endif
     }
-
-#if defined(TEST_ENERGY)
-    //fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n",
-    //fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n",
-    //     system->my_atoms[i].orig_id, workspace.nlp[i], 
-    //     e_lp, data->my_en.e_lp );
-
-    //fprintf( out_control->eov, "%6d%24.15e%24.15e\n", 
-    fprintf( out_control->eov, "%6d%12.4f%12.4f\n", 
-            system->my_atoms[i].orig_id, 
-            e_ov, data->my_en.e_ov + data->my_en.e_un );
-
-    //fprintf( out_control->eun, "%6d%24.15e%24.15e\n", 
-    fprintf( out_control->eun, "%6d%12.4f%12.4f\n", 
-            system->my_atoms[i].orig_id, 
-            e_un, data->my_en.e_ov + data->my_en.e_un );
-#endif
 }
 
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 /* Traverse bond list and accumulate lone pair contributions from bonded neighbors */
 CUDA_GLOBAL void Cuda_Atom_Energy_Part2( reax_list bond_list, 
         storage workspace, int n )
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_multi_body.h
index 16a6a938..156f07c5 100644
--- a/PG-PuReMD/src/cuda/cuda_multi_body.h
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.h
@@ -29,7 +29,7 @@ CUDA_GLOBAL void Cuda_Atom_Energy_Part1( reax_atom *, global_parameters,
         single_body_parameters *, two_body_parameters *, storage,
         reax_list, int, int, real *, real *, real *);
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void Cuda_Atom_Energy_Part2( reax_list, storage, int );
 #endif
 
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index e0e10fe2..915ada83 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -144,7 +144,7 @@ CUDA_DEVICE static real Calculate_Omega( const rvec dvec_ij, real r_ij, const rv
 CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_parameters gp, 
         four_body_header *d_fbp, control_params *control, reax_list bond_list,
         reax_list thb_list, storage workspace, int n, int num_atom_types, 
-        real *data_e_tor, real *data_e_con, rvec *data_ext_press )
+        real *e_tor_g, real *e_con_g, rvec *ext_press_g )
 {
     int i, j, k, l, pi, pj, pk, pl, pij, plk;
     int type_i, type_j, type_k, type_l;
@@ -169,10 +169,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
     real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
     real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
     real Cconj, CEconj1, CEconj2, CEconj3;
-    real CEconj4, CEconj5, CEconj6;
-    real e_tor, e_con;
-    rvec dvec_li;
-    rvec force, ext_press;
+    real CEconj4, CEconj5, CEconj6, e_tor_l, e_con_l;
+    rvec dvec_li, temp, ext_press_l;
     ivec rel_box_jl;
     // rtensor total_rtensor, temp_rtensor;
     four_body_header *fbh;
@@ -193,6 +191,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
     p_tor3 = gp.l[24];
     p_tor4 = gp.l[25];
     p_cot2 = gp.l[27];
+    e_tor_l = 0.0;
+    e_con_l = 0.0;
+    rvec_MakeZero( ext_press_l );
 #if defined(DEBUG_FOCUS)
     num_frb_intrs = 0;
 #endif
@@ -349,8 +350,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
 //                                    + fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega))
 //                                    + fbp->V3 * (0.5 + 2.0 * CUBE(cos_omega) - 1.5 * cos_omega);
 
-                                e_tor = fn10 * sin_ijk * sin_jkl * CV;
-                                data_e_tor[j] += e_tor;
+                                e_tor_l += fn10 * sin_ijk * sin_jkl * CV;
 
                                 dfn11 = (-p_tor3 * exp_tor3_DjDk
                                         + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk)
@@ -386,9 +386,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
 
                                 /* 4-body conjugation energy */
                                 fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                e_con = fbp->p_cot1 * fn12
+                                e_con_l += fbp->p_cot1 * fn12
                                     * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
-                                data_e_con[j] += e_con;
 
                                 Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2
                                     * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
@@ -410,7 +409,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                 /* end 4-body conjugation energy */
 
                                 /* forces */
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                                 bo_jk->Cdbopi += CEtors2;
                                 workspace.CdDelta[j] += CEtors3;
                                 pbond_jk->ta_CdDelta += CEtors3;
@@ -428,7 +427,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
 
                                 if ( control->virial == 0 )
                                 {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                                     /* dcos_theta_ijk */
                                     atomic_rvecScaledAdd( pbond_ij->ta_f, 
                                             CEtors7 + CEconj4, p_ijk->dcos_dk );
@@ -484,203 +483,106 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                 }
                                 else
                                 {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
                                     ivec_Sum( rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box );
 
                                     /* dcos_theta_ijk */
-                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                    atomic_rvecAdd( pbond_ij->ta_f, force );
-                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    atomic_rvecAdd( pbond_ij->ta_f, temp );
+                                    rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     rvec_ScaledAdd( workspace.f[j], 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
 
-                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-                                    atomic_rvecAdd( pbond_jk->ta_f, force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                    atomic_rvecAdd( pbond_jk->ta_f, temp );
+                                    rvec_iMultiply( temp, pbond_jk->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     /* dcos_theta_jkl */
                                     rvec_ScaledAdd( workspace.f[j], 
                                             CEtors8 + CEconj5, p_jkl->dcos_di );
 
-                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                    atomic_rvecAdd( pbond_jk->ta_f, force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    atomic_rvecAdd( pbond_jk->ta_f, temp );
+                                    rvec_iMultiply( temp, pbond_jk->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
-                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-                                    rvec_Add( pbond_kl->ta_f, force );
-                                    rvec_iMultiply( ext_press, rel_box_jl, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                    rvec_Add( pbond_kl->ta_f, temp );
+                                    rvec_iMultiply( temp, rel_box_jl, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     /* dcos_omega */                      
-                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-                                    atomic_rvecAdd( pbond_ij->ta_f, force );
-                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_di );
+                                    atomic_rvecAdd( pbond_ij->ta_f, temp );
+                                    rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     rvec_ScaledAdd( workspace.f[j], 
                                             CEtors9 + CEconj6, dcos_omega_dj );
 
-                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-                                    rvec_Add( pbond_jk->ta_f, force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk );
+                                    rvec_Add( pbond_jk->ta_f, temp );
+                                    rvec_iMultiply( temp, pbond_jk->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
-                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-                                    rvec_Add( pbond_kl->ta_f, force );
-                                    rvec_iMultiply( ext_press, rel_box_jl, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dl );
+                                    rvec_Add( pbond_kl->ta_f, temp );
+                                    rvec_iMultiply( temp, rel_box_jl, temp );
+                                    rvec_Add( ext_press_l, temp );
 #else
                                     ivec_Sum( rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box );
 
                                     /* dcos_theta_ijk */
-                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                    atomic_rvecAdd( workspace.f[i], force );
-                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    atomic_rvecAdd( workspace.f[i], temp );
+                                    rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     atomic_rvecScaledAdd( workspace.f[j], 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
 
-                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-                                    atomic_rvecAdd( workspace.f[k], force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                    atomic_rvecAdd( workspace.f[k], temp );
+                                    rvec_iMultiply( temp, pbond_jk->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     /* dcos_theta_jkl */
                                     atomic_rvecScaledAdd( workspace.f[j], 
                                             CEtors8 + CEconj5, p_jkl->dcos_di );
 
-                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                    atomic_rvecAdd( workspace.f[k], force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    atomic_rvecAdd( workspace.f[k], temp );
+                                    rvec_iMultiply( temp, pbond_jk->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
-                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-                                    atomic_rvecAdd( workspace.f[l], force );
-                                    rvec_iMultiply( ext_press, rel_box_jl, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                    atomic_rvecAdd( workspace.f[l], temp );
+                                    rvec_iMultiply( temp, rel_box_jl, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     /* dcos_omega */                      
-                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-                                    atomic_rvecAdd( workspace.f[i], force );
-                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_di );
+                                    atomic_rvecAdd( workspace.f[i], temp );
+                                    rvec_iMultiply( temp, pbond_ij->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
                                     atomic_rvecScaledAdd( workspace.f[j], 
                                             CEtors9 + CEconj6, dcos_omega_dj );
 
-                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-                                    atomic_rvecAdd( workspace.f[k], force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dk );
+                                    atomic_rvecAdd( workspace.f[k], temp );
+                                    rvec_iMultiply( temp, pbond_jk->rel_box, temp );
+                                    rvec_Add( ext_press_l, temp );
 
-                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-                                    rvec_Add( workspace.f[l], force );
-                                    rvec_iMultiply( ext_press, rel_box_jl, force );
-                                    rvec_Add( data_ext_press[j], ext_press );
+                                    rvec_Scale( temp, CEtors9 + CEconj6, dcos_omega_dl );
+                                    rvec_Add( workspace.f[l], temp );
+                                    rvec_iMultiply( temp, rel_box_jl, temp );
+                                    rvec_Add( ext_press_l, temp );
 #endif
                                 }
-
-#if defined(TEST_ENERGY)
-                                /* fprintf( out_control->etor, 
-                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                   r_ij, r_jk, r_kl, cos_ijk, cos_jkl, sin_ijk, sin_jkl );
-                                   fprintf( out_control->etor, "%12.8f\n", dfn11 ); */
-                                /* fprintf( out_control->etor, 
-                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                   CEtors2, CEtors3, CEtors4, CEtors5, CEtors6, 
-                                   CEtors7, CEtors8, CEtors9 ); */
-                                /* fprintf( out_control->etor, 
-                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
-                                /* fprintf( out_control->etor, 
-                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                   CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); */
-
-                                /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n",
-                                   fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
-
-                                fprintf(out_control->etor, 
-                                        //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e\n", 
-                                        "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f\n", 
-                                        system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, 
-                                        system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, 
-                                        RAD2DEG(omega), BOA_jk, e_tor, data->my_en.e_tor );
-
-                                fprintf(out_control->econ, 
-                                        //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", 
-                                        "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", 
-                                        system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, 
-                                        system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, 
-                                        RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-                                        e_con, data->my_en.e_con );
-#endif
-
-#if defined(TEST_FORCES)
-                                /* Torsion Forces */
-                                Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.0, 
-                                        workspace.f_tor, workspace.f_tor );
-                                Add_dDelta( system, lists, j, CEtors3, workspace.f_tor );
-                                Add_dDelta( system, lists, k, CEtors3, workspace.f_tor );
-                                Add_dBO( system, lists, j, pij, CEtors4, workspace.f_tor );
-                                Add_dBO( system, lists, j, pk, CEtors5, workspace.f_tor );
-                                Add_dBO( system, lists, k, plk, CEtors6, workspace.f_tor );
-
-                                rvec_ScaledAdd( workspace.f_tor[i], 
-                                        CEtors7, p_ijk->dcos_dk );
-                                rvec_ScaledAdd( workspace.f_tor[j], 
-                                        CEtors7, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( workspace.f_tor[k], 
-                                        CEtors7, p_ijk->dcos_di );
-
-                                rvec_ScaledAdd( workspace.f_tor[j], 
-                                        CEtors8, p_jkl->dcos_di );
-                                rvec_ScaledAdd( workspace.f_tor[k], 
-                                        CEtors8, p_jkl->dcos_dj );
-                                rvec_ScaledAdd( workspace.f_tor[l], 
-                                        CEtors8, p_jkl->dcos_dk );
-
-                                rvec_ScaledAdd( workspace.f_tor[i], 
-                                        CEtors9, dcos_omega_di );
-                                rvec_ScaledAdd( workspace.f_tor[j], 
-                                        CEtors9, dcos_omega_dj );
-                                rvec_ScaledAdd( workspace.f_tor[k], 
-                                        CEtors9, dcos_omega_dk );
-                                rvec_ScaledAdd( workspace.f_tor[l], 
-                                        CEtors9, dcos_omega_dl );
-
-                                /* Conjugation Forces */
-                                Add_dBO( system, lists, j, pij, CEconj1, workspace.f_con );
-                                Add_dBO( system, lists, j, pk, CEconj2, workspace.f_con );
-                                Add_dBO( system, lists, k, plk, CEconj3, workspace.f_con );
-
-                                rvec_ScaledAdd( workspace.f_con[i], 
-                                        CEconj4, p_ijk->dcos_dk );
-                                rvec_ScaledAdd( workspace.f_con[j], 
-                                        CEconj4, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( workspace.f_con[k], 
-                                        CEconj4, p_ijk->dcos_di );
-
-                                rvec_ScaledAdd( workspace.f_con[j], 
-                                        CEconj5, p_jkl->dcos_di );
-                                rvec_ScaledAdd( workspace.f_con[k], 
-                                        CEconj5, p_jkl->dcos_dj );
-                                rvec_ScaledAdd( workspace.f_con[l], 
-                                        CEconj5, p_jkl->dcos_dk );
-
-                                rvec_ScaledAdd( workspace.f_con[i], 
-                                        CEconj6, dcos_omega_di );
-                                rvec_ScaledAdd( workspace.f_con[j], 
-                                        CEconj6, dcos_omega_dj );
-                                rvec_ScaledAdd( workspace.f_con[k], 
-                                        CEconj6, dcos_omega_dk );
-                                rvec_ScaledAdd( workspace.f_con[l], 
-                                        CEconj6, dcos_omega_dl );
-#endif
                             } // pl check ends
                         } // pl loop ends
                     } // pi check ends
@@ -689,10 +591,20 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
         } // j<k && j-k neighbor check ends
     } // pk loop ends
     //  } // j loop
+
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_tor_g[j] = e_tor_l;
+    e_con_g[j] = e_con_l;
+    rvec_Copy( e_ext_press_g[j], e_ext_press_l );
+#else
+    atomicAdd( (double *) e_tor_g, (double) e_tor_l );
+    atomicAdd( (double *) e_con_g, (double) e_con_l );
+    atomic_rvecAdd( *ext_press_g, ext_press_l );
+#endif
 }
 
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void Cuda_Torsion_Angles_Part2( reax_atom *my_atoms, 
         storage workspace, reax_list bond_list, int N )
 {
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
index fbf4ca96..71fa7614 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
@@ -29,7 +29,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *, global_parameters,
         four_body_header *, control_params *, reax_list, reax_list,
         storage, int, int, real *, real *, rvec * );
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void Cuda_Torsion_Angles_Part2( reax_atom *,
         storage, reax_list, int );
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index 32ac5543..fc51dfc9 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -21,7 +21,7 @@
 
 #include "cuda_valence_angles.h"
 
-#if defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if defined(CUDA_ACCUM_ATOMIC)
 #include "cuda_helpers.h"
 #endif
 #include "cuda_list.h"
@@ -36,7 +36,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
         global_parameters gp, single_body_parameters *sbp, three_body_header *d_thbh,
         control_params *control, storage workspace, reax_list bond_list,
         reax_list thb_list, int n, int N, int num_atom_types,
-        real *data_e_ang, real *data_e_pen, real *data_e_coa, rvec *my_ext_press )
+        real *e_ang_g, real *e_pen_g, real *e_coa_g, rvec *ext_press_g )
 {
     int i, j, pi, k, pk, t;
     int type_i, type_j, type_k;
@@ -53,13 +53,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
     real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
     real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
     real CEpen1, CEpen2, CEpen3;
-    real e_ang, e_coa, e_pen;
+    real e_ang_l, e_coa, e_coa_l, e_pen, e_pen_l;
     real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
     real Cf7ij, Cf7jk, Cf8j, Cf9j;
     real f7_ij, f7_jk, f8_Dj, f9_Dj;
     real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
     real BOA_ij, BOA_jk;
-    rvec force, ext_press;
+    rvec rvec_temp, ext_press_l;
     three_body_header *thbh;
     three_body_parameters *thbp;
     three_body_interaction_data *p_ijk;
@@ -85,12 +85,17 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
     p_val9 = gp.l[16];
     p_val10 = gp.l[17];
     //num_thb_intrs = j * THREE_BODY_OFFSET;
+    e_ang_l = 0.0;
+    e_coa_l = 0.0;
+    e_pen_l = 0.0;
+    rvec_MakeZero( ext_press_l );
 
     type_j = my_atoms[j].type;
     start_j = Start_Index( j, &bond_list );
     end_j = End_Index( j, &bond_list );
     p_val3 = sbp[ type_j ].p_val3;
     p_val5 = sbp[ type_j ].p_val5;
+
     /* sum of pi and pi-pi BO terms for all neighbors of atom j,
      * used in determining the equilibrium angle between i-j-k */
     SBOp = 0.0;
@@ -101,7 +106,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
     for ( t = start_j; t < end_j; ++t )
     {
         bo_jt = &bond_list.bond_list[t].bo_data;
-        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+        SBOp += bo_jt->BO_pi + bo_jt->BO_pi2;
         temp = SQR( bo_jt->BO );
         temp *= temp;
         temp *= temp;
@@ -236,275 +241,149 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
 
                 /* Fortran ReaxFF code hard-codes the constant below
                  * as of 2019-02-27, so use that for now */
-                if ( j < n && BOA_jk >= 0.0 && (bo_ij->BO * bo_jk->BO) >= 0.00001 )
-//                if ( j < n && BOA_jk >= 0.0 && (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut) )
+                if ( j >= n || BOA_jk < 0.0 || (bo_ij->BO * bo_jk->BO) < 0.00001 )
+//                if ( j >= n || BOA_jk < 0.0 || (bo_ij->BO * bo_jk->BO) < SQR(control->thb_cut) )
                 {
-                    thbh = &d_thbh[
-                        index_thbp(type_i, type_j, type_k, num_atom_types) ];
+                    continue;
+                }
 
-                    for ( cnt = 0; cnt < thbh->cnt; ++cnt )
-                    {
-                        /* valence angle does not exist in the force field */
-                        if ( FABS(thbh->prm[cnt].p_val1) < 0.001 )
-                        {
-                            continue;
-                        }
+                thbh = &d_thbh[
+                    index_thbp(type_i, type_j, type_k, num_atom_types) ];
 
-                        thbp = &thbh->prm[cnt];
-
-                        /* calculate valence angle energy */
-                        p_val1 = thbp->p_val1;
-                        p_val2 = thbp->p_val2;
-                        p_val4 = thbp->p_val4;
-                        p_val7 = thbp->p_val7;
-                        theta_00 = thbp->theta_00;
-
-                        exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-                        f7_ij = 1.0 - exp3ij;
-                        Cf7ij = p_val3 * p_val4
-                            * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-
-                        exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-                        f7_jk = 1.0 - exp3jk;
-                        Cf7jk = p_val3 * p_val4
-                            * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-
-                        expval7 = EXP( -p_val7 * workspace.Delta_boc[j] );
-                        trm8 = 1.0 + expval6 + expval7;
-                        f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-                        Cf8j = ( (1.0 - p_val5) / SQR(trm8) )
-                            * (p_val6 * expval6 * trm8
-                                    - (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7) );
-
-                        theta_0 = 180.0 - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-                        theta_0 = DEG2RAD( theta_0 );
-
-                        expval2theta = p_val1 * EXP(-p_val2 * SQR(theta_0 - theta));
-                        if ( p_val1 >= 0.0 )
-                        {
-                            expval12theta = p_val1 - expval2theta;
-                        }
-                        /* to avoid linear Me-H-Me angles (6/6/06) */
-                        else
-                        {
-                            expval12theta = -expval2theta;
-                        }
+                for ( cnt = 0; cnt < thbh->cnt; ++cnt )
+                {
+                    /* valence angle does not exist in the force field */
+                    if ( FABS(thbh->prm[cnt].p_val1) < 0.001 )
+                    {
+                        continue;
+                    }
 
-                        CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-                        CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-                        CEval3 = Cf8j * f7_ij * f7_jk * expval12theta;
-                        CEval4 = 2.0 * p_val2 * f7_ij * f7_jk * f8_Dj
-                            * expval2theta * (theta_0 - theta);
+                    thbp = &thbh->prm[cnt];
+
+                    /* calculate valence angle energy */
+                    p_val1 = thbp->p_val1;
+                    p_val2 = thbp->p_val2;
+                    p_val4 = thbp->p_val4;
+                    p_val7 = thbp->p_val7;
+                    theta_00 = thbp->theta_00;
+
+                    exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                    f7_ij = 1.0 - exp3ij;
+                    Cf7ij = p_val3 * p_val4
+                        * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+
+                    exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                    f7_jk = 1.0 - exp3jk;
+                    Cf7jk = p_val3 * p_val4
+                        * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+
+                    expval7 = EXP( -p_val7 * workspace.Delta_boc[j] );
+                    trm8 = 1.0 + expval6 + expval7;
+                    f8_Dj = p_val5 - (p_val5 - 1.0) * (2.0 + expval6) / trm8;
+                    Cf8j = ( (1.0 - p_val5) / SQR(trm8) )
+                        * (p_val6 * expval6 * trm8
+                                - (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7) );
+
+                    theta_0 = 180.0 - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                    theta_0 = DEG2RAD( theta_0 );
+
+                    expval2theta = p_val1 * EXP(-p_val2 * SQR(theta_0 - theta));
+                    if ( p_val1 >= 0.0 )
+                    {
+                        expval12theta = p_val1 - expval2theta;
+                    }
+                    /* to avoid linear Me-H-Me angles (6/6/06) */
+                    else
+                    {
+                        expval12theta = -expval2theta;
+                    }
 
-                        Ctheta_0 = p_val10 * DEG2RAD(theta_00)
-                            * EXP( -p_val10 * (2.0 - SBO2) );
+                    CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                    CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                    CEval3 = Cf8j * f7_ij * f7_jk * expval12theta;
+                    CEval4 = 2.0 * p_val2 * f7_ij * f7_jk * f8_Dj
+                        * expval2theta * (theta_0 - theta);
 
-                        CEval5 = CEval4 * Ctheta_0 * CSBO2;
-                        CEval6 = CEval5 * dSBO1;
-                        CEval7 = CEval5 * dSBO2;
-                        CEval8 = CEval4 / sin_theta;
+                    Ctheta_0 = p_val10 * DEG2RAD(theta_00)
+                        * EXP( -p_val10 * (2.0 - SBO2) );
 
-                        if ( pk < pi )
-                        {
-                            e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
-                            data_e_ang[j] += e_ang;
-                        }
+                    CEval5 = CEval4 * Ctheta_0 * CSBO2;
+                    CEval6 = CEval5 * dSBO1;
+                    CEval7 = CEval5 * dSBO2;
+                    CEval8 = CEval4 / sin_theta;
 
-                        /* calculate penalty for double bonds in valency angles */
-                        p_pen1 = thbp->p_pen1;
-
-                        exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-                        exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-                        exp_pen3 = EXP( -p_pen3 * workspace.Delta[j] );
-                        exp_pen4 = EXP(  p_pen4 * workspace.Delta[j] );
-                        trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-                        f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-                        Cf9j = (-p_pen3 * exp_pen3 * trm_pen34
-                                - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3
-                                    + p_pen4 * exp_pen4 )) / SQR( trm_pen34 );
-                        /* very important: since each kernel generates all interactions,
-                         * need to prevent all energies becoming duplicates */
-                        if ( pk < pi )
-                        {
-                            e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-                            data_e_pen[j] += e_pen;
-                        }
+                    if ( pk < pi )
+                    {
+                        e_ang_l += f7_ij * f7_jk * f8_Dj * expval12theta;
+                    }
 
-                        CEpen1 = e_pen * Cf9j / f9_Dj;
-                        temp = -2.0 * p_pen2 * e_pen;
-                        CEpen2 = temp * (BOA_ij - 2.0);
-                        CEpen3 = temp * (BOA_jk - 2.0);
-
-                        /* calculate valency angle conjugation energy */
-                        p_coa1 = thbp->p_coa1;
-
-                        exp_coa2 = EXP( p_coa2 * workspace.Delta_boc[j] );
-                        e_coa = p_coa1
-                            * EXP( -p_coa4 * SQR(BOA_ij - 1.5) )
-                            * EXP( -p_coa4 * SQR(BOA_jk - 1.5) )
-                            * EXP( -p_coa3 * SQR(workspace.total_bond_order[i] - BOA_ij) )
-                            * EXP( -p_coa3 * SQR(workspace.total_bond_order[k] - BOA_jk) )
-                            / (1.0 + exp_coa2);
-                        /* similar to above comment regarding if statement */
-                        if ( pk < pi )
-                        {
-                            data_e_coa[j] += e_coa;
-                        }
+                    /* calculate penalty for double bonds in valency angles */
+                    p_pen1 = thbp->p_pen1;
+
+                    exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                    exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                    exp_pen3 = EXP( -p_pen3 * workspace.Delta[j] );
+                    exp_pen4 = EXP(  p_pen4 * workspace.Delta[j] );
+                    trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                    f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                    Cf9j = (-p_pen3 * exp_pen3 * trm_pen34
+                            - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3
+                                + p_pen4 * exp_pen4 )) / SQR( trm_pen34 );
+
+                    /* very important: since each kernel generates all interactions,
+                     * need to prevent all energies becoming duplicates */
+                    if ( pk < pi )
+                    {
+                        e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                        e_pen_l += e_pen;
+                    }
 
-                        CEcoa1 = -2.0 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-                        CEcoa2 = -2.0 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-                        CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1.0 + exp_coa2);
-                        CEcoa4 = -2.0 * p_coa3 * (workspace.total_bond_order[i] - BOA_ij) * e_coa;
-                        CEcoa5 = -2.0 * p_coa3 * (workspace.total_bond_order[k] - BOA_jk) * e_coa;
+                    CEpen1 = e_pen * Cf9j / f9_Dj;
+                    temp = -2.0 * p_pen2 * e_pen;
+                    CEpen2 = temp * (BOA_ij - 2.0);
+                    CEpen3 = temp * (BOA_jk - 2.0);
+
+                    /* calculate valency angle conjugation energy */
+                    p_coa1 = thbp->p_coa1;
+
+                    exp_coa2 = EXP( p_coa2 * workspace.Delta_boc[j] );
+                    e_coa = p_coa1
+                        * EXP( -p_coa4 * SQR(BOA_ij - 1.5) )
+                        * EXP( -p_coa4 * SQR(BOA_jk - 1.5) )
+                        * EXP( -p_coa3 * SQR(workspace.total_bond_order[i] - BOA_ij) )
+                        * EXP( -p_coa3 * SQR(workspace.total_bond_order[k] - BOA_jk) )
+                        / (1.0 + exp_coa2);
+                    /* similar to above comment regarding if statement */
+                    if ( pk < pi )
+                    {
+                        e_coa_l += e_coa;
+                    }
 
-                        /* calculate force contributions */
-                        /* we must again check for pk < pi for entire forces part */
-                        if ( pk < pi )
-                        {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
-                            bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-                            bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-                            workspace.CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
-                            pbond_ij->va_CdDelta += CEcoa4;
-                            pbond_jk->va_CdDelta += CEcoa5;
-#else
-                            atomicAdd( &bo_ij->Cdbo, CEval1 + CEpen2 + (CEcoa1 - CEcoa4) );
-                            atomicAdd( &bo_jk->Cdbo, CEval2 + CEpen3 + (CEcoa2 - CEcoa5) );
-                            atomicAdd( &workspace.CdDelta[j], (CEval3 + CEval7) + CEpen1 + CEcoa3 );
-                            atomicAdd( &workspace.CdDelta[i], CEcoa4 );
-                            atomicAdd( &workspace.CdDelta[k], CEcoa5 );
-#endif
+                    CEcoa1 = -2.0 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                    CEcoa2 = -2.0 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                    CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1.0 + exp_coa2);
+                    CEcoa4 = -2.0 * p_coa3 * (workspace.total_bond_order[i] - BOA_ij) * e_coa;
+                    CEcoa5 = -2.0 * p_coa3 * (workspace.total_bond_order[k] - BOA_jk) * e_coa;
 
-                            for ( t = start_j; t < end_j; ++t )
-                            {
-                                pbond_jt = &bond_list.bond_list[t];
-                                bo_jt = &pbond_jt->bo_data;
-                                temp_bo_jt = bo_jt->BO;
-                                temp = CUBE( temp_bo_jt );
-                                pBOjt7 = temp * temp * temp_bo_jt;
-
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
-                                bo_jt->Cdbo += (CEval6 * pBOjt7);
-                                bo_jt->Cdbopi += CEval5;
-                                bo_jt->Cdbopi2 += CEval5;
-#else
-                                atomicAdd( &bo_jt->Cdbo, CEval6 * pBOjt7 );
-                                atomicAdd( &bo_jt->Cdbopi, CEval5 );
-                                atomicAdd( &bo_jt->Cdbopi2, CEval5 );
-#endif
-                            }
-
-                            if ( control->virial == 0 )
-                            {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
-                                rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
-                                rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
-#else
-                                atomic_rvecScaledAdd( workspace.f[i], CEval8, p_ijk->dcos_di );
-                                atomic_rvecScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
-                                atomic_rvecScaledAdd( workspace.f[k], CEval8, p_ijk->dcos_dk );
-#endif
-                            }
-                            else
-                            {
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
-                                /* terms not related to bond order derivatives are
-                                 * added directly into forces and pressure vector/tensor */
-                                rvec_Scale( force, CEval8, p_ijk->dcos_di );
-                                rvec_Add( pbond_ij->va_f, force );
-                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                rvec_Add( my_ext_press[j], ext_press );
-
-                                rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
-
-                                rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-                                rvec_Add( pbond_jk->va_f, force );
-                                rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                rvec_Add( my_ext_press[j], ext_press );
+                    /* calculate force contributions */
+                    /* we must again check for pk < pi for entire forces part */
+                    if ( pk < pi )
+                    {
+#if !defined(CUDA_ACCUM_ATOMIC)
+                        bo_ij->Cdbo += CEval1 + CEpen2 + (CEcoa1 - CEcoa4);
+                        bo_jk->Cdbo += CEval2 + CEpen3 + (CEcoa2 - CEcoa5);
+                        workspace.CdDelta[j] += (CEval3 + CEval7) + CEpen1 + CEcoa3;
+                        pbond_ij->va_CdDelta += CEcoa4;
+                        pbond_jk->va_CdDelta += CEcoa5;
 #else
-                                /* terms not related to bond order derivatives are
-                                 * added directly into forces and pressure vector/tensor */
-                                rvec_Scale( force, CEval8, p_ijk->dcos_di );
-                                atomic_rvecAdd( workspace.f[i], force );
-                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                rvec_Add( my_ext_press[j], ext_press );
-
-                                rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
-
-                                rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-                                atomic_rvecAdd( workspace.f[k], force );
-                                rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                rvec_Add( my_ext_press[j], ext_press );
+                        atomicAdd( &bo_ij->Cdbo, CEval1 + CEpen2 + (CEcoa1 - CEcoa4) );
+                        atomicAdd( &bo_jk->Cdbo, CEval2 + CEpen3 + (CEcoa2 - CEcoa5) );
+                        atomicAdd( &workspace.CdDelta[j], (CEval3 + CEval7) + CEpen1 + CEcoa3 );
+                        atomicAdd( &workspace.CdDelta[i], CEcoa4 );
+                        atomicAdd( &workspace.CdDelta[k], CEcoa5 );
 #endif
-                            }
-                        }
-
-#if defined(TEST_ENERGY)
-                        /*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
-                          p_val3, p_val4, BOA_ij, BOA_jk );
-                          fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n",
-                          workspace.Delta_e[j], workspace.vlpex[j],
-                          dSBO1, dSBO2, vlpadj );
-                          fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
-                          f7_ij, f7_jk, f8_Dj, expval12theta );
-                          fprintf( out_control->eval,
-                          "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                          CEval1, CEval2, CEval3, CEval4,
-                          CEval5, CEval6, CEval7, CEval8 );
-
-                          fprintf( out_control->eval,
-                          "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                          p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta,
-                          p_ijk->dcos_di[2]/sin_theta,
-                          p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta,
-                          p_ijk->dcos_dj[2]/sin_theta,
-                          p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta,
-                          p_ijk->dcos_dk[2]/sin_theta);
-
-                          fprintf( out_control->eval,
-                          "%6d%6d%6d%15.8f%15.8f\n",
-                          system->my_atoms[i].orig_id,
-                          system->my_atoms[j].orig_id,
-                          system->my_atoms[k].orig_id,
-                          RAD2DEG(theta), e_ang );*/
-
-                        fprintf( out_control->eval,
-                                //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                                "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                                system->my_atoms[i].orig_id,
-                                system->my_atoms[j].orig_id,
-                                system->my_atoms[k].orig_id,
-                                RAD2DEG(theta), theta_0, BOA_ij, BOA_jk,
-                                e_ang, data->my_en.e_ang );
-
-                        fprintf( out_control->epen,
-                                //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                                "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                                system->my_atoms[i].orig_id,
-                                system->my_atoms[j].orig_id,
-                                system->my_atoms[k].orig_id,
-                                RAD2DEG(theta), BOA_ij, BOA_jk, e_pen,
-                                data->my_en.e_pen );
-
-                        fprintf( out_control->ecoa,
-                                //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                                "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-                                system->my_atoms[i].orig_id,
-                                system->my_atoms[j].orig_id,
-                                system->my_atoms[k].orig_id,
-                                RAD2DEG(theta), BOA_ij, BOA_jk,
-                                e_coa, data->my_en.e_coa );
-#endif
-
-#if defined(TEST_FORCES)
-                        /* angle forces */
-                        Add_dBO( system, lists, j, pi, CEval1, workspace.f_ang );
-                        Add_dBO( system, lists, j, pk, CEval2, workspace.f_ang );
-                        Add_dDelta( system, lists, j,
-                                CEval3 + CEval7, workspace.f_ang );
 
-                        for( t = start_j; t < end_j; ++t )
+                        for ( t = start_j; t < end_j; ++t )
                         {
                             pbond_jt = &bond_list.bond_list[t];
                             bo_jt = &pbond_jt->bo_data;
@@ -512,33 +391,61 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
                             temp = CUBE( temp_bo_jt );
                             pBOjt7 = temp * temp * temp_bo_jt;
 
-                            Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-                                    workspace.f_ang );
-                            Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5,
-                                    workspace.f_ang, workspace.f_ang );
+#if !defined(CUDA_ACCUM_ATOMIC)
+                            bo_jt->Cdbo += (CEval6 * pBOjt7);
+                            bo_jt->Cdbopi += CEval5;
+                            bo_jt->Cdbopi2 += CEval5;
+#else
+                            atomicAdd( &bo_jt->Cdbo, CEval6 * pBOjt7 );
+                            atomicAdd( &bo_jt->Cdbopi, CEval5 );
+                            atomicAdd( &bo_jt->Cdbopi2, CEval5 );
+#endif
                         }
 
-                        rvec_ScaledAdd( workspace.f_ang[i], CEval8, p_ijk->dcos_di );
-                        rvec_ScaledAdd( workspace.f_ang[j], CEval8, p_ijk->dcos_dj );
-                        rvec_ScaledAdd( workspace.f_ang[k], CEval8, p_ijk->dcos_dk );
-                        /* end angle forces */
-
-                        /* penalty forces */
-                        Add_dDelta( system, lists, j, CEpen1, workspace.f_pen );
-                        Add_dBO( system, lists, j, pi, CEpen2, workspace.f_pen );
-                        Add_dBO( system, lists, j, pk, CEpen3, workspace.f_pen );
-                        /* end penalty forces */
-
-                        /* coalition forces */
-                        Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4,
-                                workspace.f_coa );
-                        Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5,
-                                workspace.f_coa );
-                        Add_dDelta( system, lists, j, CEcoa3, workspace.f_coa );
-                        Add_dDelta( system, lists, i, CEcoa4, workspace.f_coa );
-                        Add_dDelta( system, lists, k, CEcoa5, workspace.f_coa );
-                        /* end coalition forces */
+                        if ( control->virial == 0 )
+                        {
+#if !defined(CUDA_ACCUM_ATOMIC)
+                            rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
+                            rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
+#else
+                            atomic_rvecScaledAdd( workspace.f[i], CEval8, p_ijk->dcos_di );
+                            atomic_rvecScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+                            atomic_rvecScaledAdd( workspace.f[k], CEval8, p_ijk->dcos_dk );
 #endif
+                        }
+                        else
+                        {
+#if !defined(CUDA_ACCUM_ATOMIC)
+                            /* terms not related to bond order derivatives are
+                             * added directly into forces and pressure vector/tensor */
+                            rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_di );
+                            rvec_Add( pbond_ij->va_f, rvec_temp );
+                            rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp );
+                            rvec_Add( ext_press_l, rvec_temp );
+
+                            rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+
+                            rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk );
+                            rvec_Add( pbond_jk->va_f, rvec_temp );
+                            rvec_iMultiply( rvec_temp, pbond_jk->rel_box, rvec_temp );
+                            rvec_Add( ext_press_l, rvec_temp );
+#else
+                            /* terms not related to bond order derivatives are
+                             * added directly into forces and pressure vector/tensor */
+                            rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_di );
+                            atomic_rvecAdd( workspace.f[i], rvec_temp );
+                            rvec_iMultiply( rvec_temp, pbond_ij->rel_box, rvec_temp );
+                            rvec_Add( ext_press_l, rvec_temp );
+
+                            rvec_ScaledAdd( workspace.f[j], CEval8, p_ijk->dcos_dj );
+
+                            rvec_Scale( rvec_temp, CEval8, p_ijk->dcos_dk );
+                            atomic_rvecAdd( workspace.f[k], rvec_temp );
+                            rvec_iMultiply( rvec_temp, pbond_jk->rel_box, rvec_temp );
+                            rvec_Add( ext_press_l, rvec_temp );
+#endif
+                        }
                     }
                 }
             }
@@ -546,10 +453,22 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *my_atoms,
 
         Set_End_Index( pi, num_thb_intrs, &thb_list );
     }
+
+#if !defined(CUDA_ACCUM_ATOMIC)
+    e_ang_g[j] = e_ang_l;
+    e_coa_g[j] = e_coa_l;
+    e_pen_g[j] = e_pen_l;
+    rvec_Copy( ext_press_g[j], ext_press_l );
+#else
+    atomicAdd( (double *) e_ang_g, (double) e_ang_l );
+    atomicAdd( (double *) e_coa_g, (double) e_coa_l );
+    atomicAdd( (double *) e_pen_g, (double) e_pen_l );
+    atomic_rvecAdd( *ext_press_g, ext_press_l );
+#endif
 }
 
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void Cuda_Valence_Angles_Part2( reax_atom *atoms,
         control_params *control, storage workspace,
         reax_list bond_list, int N )
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.h b/PG-PuReMD/src/cuda/cuda_valence_angles.h
index d91387d2..ae30aa71 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.h
@@ -32,7 +32,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_Part1( reax_atom *, global_parameters,
         storage, reax_list, reax_list, int, int, int, real *,
         real *, real *, rvec *);
 
-#if !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if !defined(CUDA_ACCUM_ATOMIC)
 CUDA_GLOBAL void Cuda_Valence_Angles_Part2 ( reax_atom *, control_params *,
         storage , reax_list, int );
 #endif
@@ -42,8 +42,8 @@ CUDA_GLOBAL void Cuda_Estimate_Valence_Angles( reax_atom *, control_params *,
 
 
 /* calculates the theta angle between atom triplet i-j-k */
-CUDA_DEVICE static inline void Calculate_Theta( const rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-        real * const theta, real * const cos_theta )
+CUDA_DEVICE static inline void Calculate_Theta( const rvec dvec_ji, real d_ji,
+        const rvec dvec_jk, real d_jk, real * const theta, real * const cos_theta )
 {
     assert( d_ji > 0.0 );
     assert( d_jk > 0.0 );
@@ -64,8 +64,9 @@ CUDA_DEVICE static inline void Calculate_Theta( const rvec dvec_ji, real d_ji, r
 
 
 /* calculates the derivative of the cosine of the angle between atom triplet i-j-k */
-CUDA_DEVICE static inline void Calculate_dCos_Theta( const rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
-        rvec * const dcos_theta_di, rvec * const dcos_theta_dj, rvec * const dcos_theta_dk )
+CUDA_DEVICE static inline void Calculate_dCos_Theta( const rvec dvec_ji,
+        real d_ji, const rvec dvec_jk, real d_jk, rvec * const dcos_theta_di,
+        rvec * const dcos_theta_dj, rvec * const dcos_theta_dk )
 {
     int t;
     real sqr_d_ji, sqr_d_jk, inv_dists, inv_dists3, dot_dvecs, Cdot_inv3;
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index ef2fbaa1..b2936f81 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -82,8 +82,8 @@
 #if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
   #define CUDA_DEVICE_PACK
 #endif
-/* aggregate atomic forces using atomic operations */
-#define CUDA_ACCUM_FORCE_ATOMIC
+/* aggregate atomic energies and forces using atomic operations */
+#define CUDA_ACCUM_ATOMIC
 
 /* disable assertions if NOT compiling with debug support --
  * the definition (or lack thereof) controls how the assert macro is defined */
@@ -131,7 +131,7 @@
 
 #if defined(USE_REF_FORTRAN_REAXFF_CONSTANTS)
   /* transcendental constant pi */
-  #define PI (3.14159265)
+//  #define PI (3.14159265)
   /* unit conversion from ??? to kcal / mol */
   #define C_ELE (332.0638)
   /* Boltzmann constant, AMU * A^2 / (ps^2 * K) */
@@ -1933,7 +1933,7 @@ struct hbond_data
     int scl;
     /* position of neighbor in far neighbor list */
     int ptr;
-#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_ATOMIC)
     /**/
     int sym_index;
     /**/
@@ -2035,7 +2035,7 @@ struct bond_data
     rvec dvec;
     /* bond order data */
     bond_order_data bo_data;
-#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_FORCE_ATOMIC)
+#if defined(HAVE_CUDA) && !defined(CUDA_ACCUM_ATOMIC)
     /**/
     real ae_CdDelta;
     /**/
-- 
GitLab