From add5b60ada27d874c89f34cad6a901dcbed8e8bb Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Fri, 9 Apr 2021 13:21:39 -0400
Subject: [PATCH] PG-PuReMD: split bond and hbond initialization kernels for
 future optimizations. Cleanup CUDA code related to far neighbor list being in
 full format (remove unnecessary conditionals and arithmetic).

---
 PG-PuReMD/src/allocate.c                  |   45 +-
 PG-PuReMD/src/cuda/cuda_allocate.cu       |  285 +++---
 PG-PuReMD/src/cuda/cuda_allocate.h        |    4 +-
 PG-PuReMD/src/cuda/cuda_bond_orders.cu    |    6 +-
 PG-PuReMD/src/cuda/cuda_bond_orders.h     |  225 ++---
 PG-PuReMD/src/cuda/cuda_forces.cu         |  892 +++++++++++-------
 PG-PuReMD/src/cuda/cuda_init_md.cu        |    4 +-
 PG-PuReMD/src/cuda/cuda_integrate.cu      |   16 +-
 PG-PuReMD/src/cuda/cuda_neighbors.cu      |  138 +--
 PG-PuReMD/src/cuda/cuda_nonbonded.cu      |  111 +--
 PG-PuReMD/src/cuda/cuda_reset_tools.cu    |   26 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu |   92 +-
 PG-PuReMD/src/forces.c                    | 1027 ++++++++++++---------
 PG-PuReMD/src/forces.h                    |    2 +-
 PG-PuReMD/src/init_md.c                   |    5 +-
 PG-PuReMD/src/reset_tools.c               |   52 +-
 PG-PuReMD/src/reset_tools.h               |    3 -
 17 files changed, 1500 insertions(+), 1433 deletions(-)

diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 62704ed3..9a02f538 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -605,14 +605,11 @@ static void Reallocate_Matrix( sparse_matrix * const H, int n, int n_max, int m
 }
 
 
-void Reallocate_HBonds_List( reax_system * const system, reax_list * const hbond_list )
+static void Reallocate_List( reax_list * const list, int n, int max_intrs,
+        int type, int format )
 {
-    int format;
-
-    format = hbond_list->format;
-
-    Delete_List( hbond_list );
-    Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, format, hbond_list );
+    Delete_List( list );
+    Make_List( n, max_intrs, type, format, list );
 }
 
 
@@ -864,7 +861,7 @@ void Reallocate_Part2( reax_system * const system, control_params * const contro
         mpi_datatypes * const mpi_data )
 {
     int nflag, Nflag;
-    int renbr, format;
+    int renbr;
     reallocate_data * const realloc = &workspace->realloc;
     sparse_matrix * const H = &workspace->H;
 
@@ -930,36 +927,36 @@ void Reallocate_Part2( reax_system * const system, control_params * const contro
 #else
         Reallocate_Matrix( H, system->n, system->local_cap, system->total_cm_entries );
 #endif
-        Init_Matrix_Row_Indices( H, system->max_cm_entries );
-        realloc->cm = FALSE;
-    }
 
-    /* hydrogen bonds list */
-    if ( control->hbond_cut > 0.0
-            && (Nflag == TRUE || realloc->hbonds == TRUE) )
-    {
-        Reallocate_HBonds_List( system, lists[HBONDS] );
-        Init_List_Indices( lists[HBONDS], system->max_hbonds );
-        realloc->hbonds = FALSE;
+        realloc->cm = FALSE;
     }
 
     /* bonds list */
     if ( Nflag == TRUE || realloc->bonds == TRUE )
     {
-        Reallocate_Bonds_List( system, lists[BONDS] );
-        Init_List_Indices( lists[BONDS], system->max_bonds );
+        Reallocate_List( lists[BONDS], system->total_cap, system->total_bonds,
+               TYP_BOND, lists[BONDS]->format );
+
         realloc->bonds = FALSE;
         realloc->thbody = TRUE;
     }
 
+    /* hydrogen bonds list */
+    if ( control->hbond_cut > 0.0
+            && (Nflag == TRUE || realloc->hbonds == TRUE) )
+    {
+        Reallocate_List( lists[HBONDS], system->total_cap, system->total_hbonds,
+               TYP_HBOND, lists[HBONDS]->format );
+
+        realloc->hbonds = FALSE;
+    }
+
     /* 3-body list */
     if ( realloc->thbody == TRUE )
     {
-        format = lists[THREE_BODIES]->format;
+        Reallocate_List( lists[THREE_BODIES], system->total_bonds, system->total_thbodies,
+               TYP_THREE_BODY, lists[THREE_BODIES]->format );
 
-        Delete_List( lists[THREE_BODIES] );
-        Make_List( system->total_bonds, system->total_thbodies,
-                TYP_THREE_BODY, format, lists[THREE_BODIES] );
         realloc->thbody = FALSE;
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index a0f9e714..50f256e7 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -30,6 +30,120 @@ CUDA_GLOBAL void k_init_nbrs( ivec *nbrs, int N )
 }
 
 
+static void Cuda_Reallocate_List( reax_list *list, size_t n, size_t max_intrs, int type )
+{
+    Cuda_Delete_List( list );
+    Cuda_Make_List( n, max_intrs, type, list );
+}
+
+
+static void Cuda_Reallocate_System_Part1( reax_system *system, storage *workspace,
+        int local_cap_old )
+{
+    int *temp;
+
+    cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
+            sizeof(int) * local_cap_old,
+            "Cuda_Reallocate_System_Part1::workspace->scratch" );
+    temp = (int *) workspace->scratch;
+
+    copy_device( temp, system->d_cm_entries, sizeof(int) * local_cap_old,
+            "Cuda_Reallocate_System_Part1::temp" );
+    cuda_free( system->d_cm_entries, "Cuda_Reallocate_System_Part1::d_cm_entries" );
+    cuda_malloc( (void **) &system->d_cm_entries,
+            sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_cm_entries" );
+    copy_device( system->d_cm_entries, temp, sizeof(int) * local_cap_old,
+            "Cuda_Reallocate_System_Part1::temp" );
+
+    copy_device( temp, system->d_max_cm_entries, sizeof(int) * local_cap_old,
+            "Cuda_Reallocate_System_Part1::temp" );
+    cuda_free( system->d_max_cm_entries, "Cuda_Reallocate_System_Part1::d_max_cm_entries" );
+    cuda_malloc( (void **) &system->d_max_cm_entries,
+            sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_max_cm_entries" );
+    copy_device( system->d_max_cm_entries, temp, sizeof(int) * local_cap_old,
+            "Cuda_Reallocate_System_Part1::temp" );
+}
+
+
+static void Cuda_Reallocate_System_Part2( reax_system *system, storage *workspace,
+        int total_cap_old )
+{
+    int *temp;
+    reax_atom *temp_atom;
+
+    cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
+            MAX( sizeof(reax_atom), sizeof(int) ) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::workspace->scratch" );
+    temp = (int *) workspace->scratch;
+    temp_atom = (reax_atom *) workspace->scratch;
+
+    /* free the existing storage for atoms, leave other info allocated */
+    copy_device( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp_atom" );
+    cuda_free( system->d_my_atoms, "system::d_my_atoms" );
+    cuda_malloc( (void **) &system->d_my_atoms,
+            sizeof(reax_atom) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_my_atoms" );
+    copy_device( system->d_my_atoms, temp_atom, sizeof(reax_atom) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp_atom" );
+
+    /* list management */
+    copy_device( temp, system->d_far_nbrs, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+    cuda_free( system->d_far_nbrs, "Cuda_Reallocate_System_Part2::d_far_nbrs" );
+    cuda_malloc( (void **) &system->d_far_nbrs,
+            sizeof(int) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_far_nbrs" );
+    copy_device( system->d_far_nbrs, temp, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+
+    copy_device( temp, system->d_max_far_nbrs, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+    cuda_free( system->d_max_far_nbrs, "Cuda_Reallocate_System_Part2::d_max_far_nbrs" );
+    cuda_malloc( (void **) &system->d_max_far_nbrs,
+            sizeof(int) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_max_far_nbrs" );
+    copy_device( system->d_max_far_nbrs, temp, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+
+    copy_device( temp, system->d_bonds, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+    cuda_free( system->d_bonds, "Cuda_Reallocate_System_Part2::d_bonds" );
+    cuda_malloc( (void **) &system->d_bonds,
+            sizeof(int) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_bonds" );
+    copy_device( system->d_bonds, temp, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+
+    copy_device( temp, system->d_max_bonds, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+    cuda_free( system->d_max_bonds, "Cuda_Reallocate_System_Part2::d_max_bonds" );
+    cuda_malloc( (void **) &system->d_max_bonds,
+            sizeof(int) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_max_bonds" );
+    copy_device( system->d_max_bonds, temp, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+
+    copy_device( temp, system->d_hbonds, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+    cuda_free( system->d_hbonds, "system::d_hbonds" );
+    cuda_malloc( (void **) &system->d_hbonds,
+            sizeof(int) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_hbonds" );
+    copy_device( system->d_hbonds, temp, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+
+    copy_device( temp, system->d_max_hbonds, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+    cuda_free( system->d_max_hbonds, "system::d_max_hbonds" );
+    cuda_malloc( (void **) &system->d_max_hbonds,
+            sizeof(int) * system->total_cap, TRUE,
+            "Cuda_Reallocate_System_Part2::d_max_hbonds" );
+    copy_device( system->d_max_hbonds, temp, sizeof(int) * total_cap_old,
+            "Cuda_Reallocate_System_Part2::temp" );
+}
+
+
 void Cuda_Allocate_Control( control_params *control )
 {
     cuda_malloc( (void **)&control->d_control_params,
@@ -251,113 +365,6 @@ void Cuda_Allocate_System( reax_system *system )
 }
 
 
-static void Cuda_Reallocate_System_Part1( reax_system *system, storage *workspace,
-        int local_cap_old )
-{
-    int *temp;
-
-    cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::workspace->scratch" );
-    temp = (int *) workspace->scratch;
-
-    copy_device( temp, system->d_cm_entries, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
-    cuda_free( system->d_cm_entries, "Cuda_Reallocate_System_Part1::d_cm_entries" );
-    cuda_malloc( (void **) &system->d_cm_entries,
-            sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_cm_entries" );
-    copy_device( system->d_cm_entries, temp, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
-
-    copy_device( temp, system->d_max_cm_entries, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
-    cuda_free( system->d_max_cm_entries, "Cuda_Reallocate_System_Part1::d_max_cm_entries" );
-    cuda_malloc( (void **) &system->d_max_cm_entries,
-            sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_max_cm_entries" );
-    copy_device( system->d_max_cm_entries, temp, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
-}
-
-
-static void Cuda_Reallocate_System_Part2( reax_system *system, storage *workspace,
-        int total_cap_old )
-{
-    int *temp;
-    reax_atom *temp_atom;
-
-    cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            MAX( sizeof(reax_atom), sizeof(int) ) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::workspace->scratch" );
-    temp = (int *) workspace->scratch;
-    temp_atom = (reax_atom *) workspace->scratch;
-
-    /* free the existing storage for atoms, leave other info allocated */
-    copy_device( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp_atom" );
-    cuda_free( system->d_my_atoms, "system::d_my_atoms" );
-    cuda_malloc( (void **) &system->d_my_atoms,
-            sizeof(reax_atom) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_my_atoms" );
-    copy_device( system->d_my_atoms, temp_atom, sizeof(reax_atom) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp_atom" );
-
-    /* list management */
-    copy_device( temp, system->d_far_nbrs, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-    cuda_free( system->d_far_nbrs, "Cuda_Reallocate_System_Part2::d_far_nbrs" );
-    cuda_malloc( (void **) &system->d_far_nbrs,
-            sizeof(int) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_far_nbrs" );
-    copy_device( system->d_far_nbrs, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-
-    copy_device( temp, system->d_max_far_nbrs, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-    cuda_free( system->d_max_far_nbrs, "Cuda_Reallocate_System_Part2::d_max_far_nbrs" );
-    cuda_malloc( (void **) &system->d_max_far_nbrs,
-            sizeof(int) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_max_far_nbrs" );
-    copy_device( system->d_max_far_nbrs, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-
-    copy_device( temp, system->d_bonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-    cuda_free( system->d_bonds, "Cuda_Reallocate_System_Part2::d_bonds" );
-    cuda_malloc( (void **) &system->d_bonds,
-            sizeof(int) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_bonds" );
-    copy_device( system->d_bonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-
-    copy_device( temp, system->d_max_bonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-    cuda_free( system->d_max_bonds, "Cuda_Reallocate_System_Part2::d_max_bonds" );
-    cuda_malloc( (void **) &system->d_max_bonds,
-            sizeof(int) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_max_bonds" );
-    copy_device( system->d_max_bonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-
-    copy_device( temp, system->d_hbonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-    cuda_free( system->d_hbonds, "system::d_hbonds" );
-    cuda_malloc( (void **) &system->d_hbonds,
-            sizeof(int) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_hbonds" );
-    copy_device( system->d_hbonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-
-    copy_device( temp, system->d_max_hbonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-    cuda_free( system->d_max_hbonds, "system::d_max_hbonds" );
-    cuda_malloc( (void **) &system->d_max_hbonds,
-            sizeof(int) * system->total_cap, TRUE,
-            "Cuda_Reallocate_System_Part2::d_max_hbonds" );
-    copy_device( system->d_max_hbonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
-}
-
-
 void Cuda_Allocate_Simulation_Data( simulation_data *data )
 {
     cuda_malloc( (void **) &data->d_simulation_data,
@@ -729,35 +736,6 @@ void Cuda_Deallocate_Matrix( sparse_matrix *H )
 }
 
 
-void Cuda_Reallocate_Neighbor_List( reax_list *far_nbr_list, size_t n, size_t max_intrs )
-{
-    Cuda_Delete_List( far_nbr_list );
-    Cuda_Make_List( n, max_intrs, TYP_FAR_NEIGHBOR, far_nbr_list );
-}
-
-
-void Cuda_Reallocate_HBonds_List( reax_list *hbond_list, size_t n, size_t max_intrs )
-{
-    Cuda_Delete_List( hbond_list );
-    Cuda_Make_List( n, max_intrs, TYP_HBOND, hbond_list );
-}
-
-
-void Cuda_Reallocate_Bonds_List( reax_list *bond_list, size_t n, size_t max_intrs )
-{
-    Cuda_Delete_List( bond_list );
-    Cuda_Make_List( n, max_intrs, TYP_BOND, bond_list );
-}
-
-
-void Cuda_Reallocate_Thbodies_List( reax_list *thbodies, size_t n, size_t max_intrs )
-{
-    Cuda_Delete_List( thbodies );
-    Cuda_Make_List( n, max_intrs, TYP_THREE_BODY, thbodies );
-
-}
-
-
 void Cuda_Reallocate_Part1( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, reax_list **lists,
         mpi_datatypes *mpi_data )
@@ -851,8 +829,8 @@ void Cuda_Reallocate_Part2( reax_system *system, control_params *control,
     /* far neighbors */
     if ( renbr == TRUE && (Nflag == TRUE || realloc->far_nbrs == TRUE) )
     {
-        Cuda_Reallocate_Neighbor_List( lists[FAR_NBRS],
-                system->total_cap, system->total_far_nbrs );
+        Cuda_Reallocate_List( lists[FAR_NBRS], system->total_cap,
+                system->total_far_nbrs, TYP_FAR_NEIGHBOR );
         Cuda_Init_Neighbor_Indices( system, lists[FAR_NBRS] );
         realloc->far_nbrs = FALSE;
     }
@@ -865,34 +843,35 @@ void Cuda_Reallocate_Part2( reax_system *system, control_params *control,
         Cuda_Deallocate_Matrix( H );
         Cuda_Allocate_Matrix( H, system->n, system->local_cap,
                 system->total_cm_entries, format );
-        Cuda_Init_Sparse_Matrix_Indices( system, H );
+
         realloc->cm = FALSE;
     }
 
+    /* bonds list */
+    if ( Nflag == TRUE || realloc->bonds == TRUE )
+    {
+        Cuda_Reallocate_List( lists[BONDS], system->total_cap,
+                system->total_bonds, TYP_BOND );
+
+        realloc->bonds = FALSE;
+    }
+
     /* hydrogen bonds list */
     if ( control->hbond_cut > 0.0 && system->numH > 0
             && (Nflag == TRUE || realloc->hbonds == TRUE) )
     {
-        Cuda_Reallocate_HBonds_List( lists[HBONDS],
-                system->total_cap, system->total_hbonds );
-        Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS] );
-        realloc->hbonds = FALSE;
-    }
+        Cuda_Reallocate_List( lists[HBONDS], system->total_cap,
+                system->total_hbonds, TYP_HBOND );
 
-    /* bonds list */
-    if ( Nflag == TRUE || realloc->bonds == TRUE )
-    {
-        Cuda_Reallocate_Bonds_List( lists[BONDS],
-                system->total_cap, system->total_bonds );
-        Cuda_Init_Bond_Indices( system, lists[BONDS] );
-        realloc->bonds = FALSE;
+        realloc->hbonds = FALSE;
     }
 
     /* 3-body list */
     if ( Nflag == TRUE || realloc->thbody == TRUE )
     {
-        Cuda_Reallocate_Thbodies_List( lists[THREE_BODIES],
-                system->total_thbodies_indices, system->total_thbodies );
+        Cuda_Reallocate_List( lists[THREE_BODIES], system->total_thbodies_indices,
+                system->total_thbodies, TYP_THREE_BODY );
+
         realloc->thbody = FALSE;
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.h b/PG-PuReMD/src/cuda/cuda_allocate.h
index 82329aca..83e6a9a6 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.h
+++ b/PG-PuReMD/src/cuda/cuda_allocate.h
@@ -10,14 +10,14 @@ void Cuda_Allocate_Grid( reax_system * );
 
 void Cuda_Allocate_Simulation_Data( simulation_data * );
 
+void Cuda_Allocate_Control( control_params * );
+
 void Cuda_Allocate_Workspace_Part1( reax_system *, control_params *, storage *, int );
 
 void Cuda_Allocate_Workspace_Part2( reax_system *, control_params *, storage *, int );
 
 void Cuda_Allocate_Matrix( sparse_matrix * const, int, int, int, int );
 
-void Cuda_Allocate_Control( control_params * );
-
 void Cuda_Deallocate_Grid_Cell_Atoms( reax_system * );
 
 void Cuda_Allocate_Grid_Cell_Atoms( reax_system *, int );
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 540c7b40..c6d8b57b 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -410,7 +410,6 @@ CUDA_GLOBAL void k_bond_order_part2( reax_atom *my_atoms, global_parameters gp,
         type_j = my_atoms[j].type;
         bo_ij = &bond_list.bond_list[pj].bo_data;
 
-        //TODO
         //if ( i < j || workspace.bond_mark[j] > 3 )
         if ( i < j )
         {
@@ -592,11 +591,10 @@ CUDA_GLOBAL void k_bond_order_part3( storage workspace, reax_list bond_list, int
 
     for ( pj = start_i; pj < end_i; ++pj )
     {
-
         j = bond_list.bond_list[pj].nbr;
         bo_ij = &bond_list.bond_list[pj].bo_data;
 
-        //if ( i >= j || workspace.bond_mark [i] <= 3 )
+        //if ( i >= j || workspace.bond_mark[i] <= 3 )
         if ( i >= j )
         {
             /* We only need to update bond orders from bo_ji
@@ -684,7 +682,6 @@ CUDA_GLOBAL void k_total_forces_part1( storage workspace, reax_list bond_list,
         return;
     }
 
-    //if ( i < bond_list.bond_list[pj].nbr ) {
     if ( control->virial == 0 )
     {
         for ( pj = Start_Index( i, &bond_list ); pj < End_Index( i, &bond_list ); ++pj )
@@ -722,7 +719,6 @@ CUDA_GLOBAL void k_total_forces_part1_2( reax_atom *my_atoms, reax_list bond_lis
         nbr_k = &bond_list.bond_list[pk];
         nbr_k_sym = &bond_list.bond_list[nbr_k->sym_index];
 
-        //rvec_Add( atoms[i].f, nbr_k_sym->tf_f );
         rvec_Add( workspace.f[i], nbr_k_sym->tf_f );
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h
index d419b3b3..a896eee6 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.h
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h
@@ -4,6 +4,8 @@
 
 #include "../reax_types.h"
 
+#include "cuda_helpers.h"
+
 #include "../vector.h"
 
 
@@ -20,179 +22,72 @@ void Cuda_Total_Forces_Part2( reax_system *, storage * );
  * and if this term exceeds the cutoff bo_cut, then adds
  * BOTH atoms the bonds list (i.e., compute term once
  * and copy to avoid redundant computation) */
-CUDA_DEVICE static inline int Cuda_BOp( reax_list bond_list, real bo_cut,
-        int i, int btop_i, int j, ivec *rel_box,
-        real d, rvec *dvec, int format, single_body_parameters *sbp_i,
-        single_body_parameters *sbp_j, two_body_parameters *twbp,
-        rvec *dDeltap_self, real *total_bond_order )
+CUDA_DEVICE static inline void Cuda_Compute_BOp( reax_list bond_list, real bo_cut,
+        int i, int btop_i, int j, real C12, real C34, real C56, real BO_s,
+        real BO_pi, real BO_pi2, real BO, ivec *rel_box, real d, rvec *dvec,
+        int format, two_body_parameters *twbp, rvec *dDeltap_self,
+        real *total_bond_order )
 {
-    int btop_j, ret;
-    real r2, C12, C34, C56;
+    real r2;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-    real BO, BO_s, BO_pi, BO_pi2;
-    bond_data *ibond, *jbond;
-    bond_order_data *bo_ij, *bo_ji;
+    bond_data *ibond;
+    bond_order_data *bo_ij;
 
-    ret = FALSE;
     r2 = SQR( d );
 
-    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
-    {
-        C12 = twbp->p_bo1 * POW( d / twbp->r_s, twbp->p_bo2 );
-        BO_s = (1.0 + bo_cut) * exp( C12 );
-    }
-    else
-    {
-        C12 = 0.0;
-        BO_s = 0.0;
-    }
-
-    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
-    {
-        C34 = twbp->p_bo3 * POW( d / twbp->r_p, twbp->p_bo4 );
-        BO_pi = exp( C34 );
-    }
-    else
-    {
-        C34 = 0.0;
-        BO_pi = 0.0;
-    }
-
-    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
-    {
-        C56 = twbp->p_bo5 * POW( d / twbp->r_pp, twbp->p_bo6 );
-        BO_pi2 = exp( C56 );
-    }
-    else
-    {
-        C56 = 0.0;
-        BO_pi2 = 0.0;
-    }
-
-    /* Initially BO values are the uncorrected ones, page 1 */
-    BO = BO_s + BO_pi + BO_pi2;
-
-    if ( BO >= bo_cut )
-    {
-        /* Bond Order page2-3, derivative of total bond order prime */
-        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-
-        /****** bonds i-j and j-i ******/
-        if ( i < j )
-        {
-            /****** bond i-j ONLY ******/
-            ibond = &bond_list.bond_list[btop_i];
-            ibond->nbr = j;
-            ibond->d = d;
-
-            rvec_Copy( ibond->dvec, *dvec );
-            ivec_Copy( ibond->rel_box, *rel_box );
-
-            //ibond->dbond_index = btop_i;
-            //ibond->sym_index = btop_j;
-
-            bo_ij = &ibond->bo_data;
-            bo_ij->BO = BO;
-            bo_ij->BO_s = BO_s;
-            bo_ij->BO_pi = BO_pi;
-            bo_ij->BO_pi2 = BO_pi2;
-
-            /* Only dln_BOp_xx wrt. dr_i is stored here, note that
-             * dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-            rvec_Scale( bo_ij->dln_BOp_s, -1.0 * bo_ij->BO_s * Cln_BOp_s, ibond->dvec );
-            rvec_Scale( bo_ij->dln_BOp_pi, -1.0 * bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec );
-            rvec_Scale( bo_ij->dln_BOp_pi2, -1.0 * bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec );
-
-            /* Only dBOp wrt. dr_i is stored here, note that
-             * dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-            rvec_Scale( bo_ij->dBOp, -(bo_ij->BO_s * Cln_BOp_s
-                        + bo_ij->BO_pi * Cln_BOp_pi
-                        + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-
-            rvec_Add( dDeltap_self[i], bo_ij->dBOp );
-
-            bo_ij->BO_s -= bo_cut;
-            bo_ij->BO -= bo_cut;
-            /* currently total_BOp */
-            total_bond_order[i] += bo_ij->BO; 
-            bo_ij->Cdbo = 0.0;
-            bo_ij->Cdbopi = 0.0;
-            bo_ij->Cdbopi2 = 0.0;
-
-#if !defined(CUDA_ACCUM_ATOMIC)
-            ibond->ae_CdDelta = 0.0;
-            ibond->va_CdDelta = 0.0;
-            rvec_MakeZero( ibond->va_f );
-            ibond->ta_CdDelta = 0.0;
-            ibond->ta_Cdbo = 0.0;
-            rvec_MakeZero( ibond->ta_f );
-            rvec_MakeZero( ibond->hb_f );
-            rvec_MakeZero( ibond->tf_f );
-#endif
-        }
-        else
-        {
-            /****** bond j-i ONLY ******/
-            //btop_j = End_Index( j, &bond_list );
-            btop_j = btop_i;
-            jbond = &bond_list.bond_list[btop_j];
-
-            jbond->nbr = j;
-            jbond->d = d;
-            rvec_Scale( jbond->dvec, -1.0, *dvec );
-            ivec_Scale( jbond->rel_box, -1.0, *rel_box );
-            //jbond->dbond_index = btop_i;
-            //jbond->sym_index = btop_i;
-
-            //Set_End_Index( j, btop_j + 1, &bond_list );
-
-            bo_ji = &jbond->bo_data;
-            bo_ji->BO = BO;
-            bo_ji->BO_s = BO_s;
-            bo_ji->BO_pi = BO_pi;
-            bo_ji->BO_pi2 = BO_pi2;
-
-            /* Only dln_BOp_xx wrt. dr_i is stored here, note that
-             * dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-            rvec_Scale( bo_ji->dln_BOp_s, BO_s * Cln_BOp_s, *dvec );
-            rvec_Scale( bo_ji->dln_BOp_pi, BO_pi * Cln_BOp_pi, *dvec );
-            rvec_Scale( bo_ji->dln_BOp_pi2, BO_pi2 * Cln_BOp_pi2, *dvec );
-
-            /* Only dBOp wrt. dr_i is stored here, note that
-             * dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-            rvec_Scale( bo_ji->dBOp, BO_s * Cln_BOp_s
-                        + BO_pi * Cln_BOp_pi
-                        + BO_pi2 * Cln_BOp_pi2, *dvec );
-
-            rvec_Add( dDeltap_self[i], bo_ji->dBOp );
-
-            bo_ji->BO_s -= bo_cut;
-            bo_ji->BO -= bo_cut;
-
-            /* currently total_BOp */
-            total_bond_order[i] += bo_ji->BO;
-            bo_ji->Cdbo = 0.0;
-            bo_ji->Cdbopi = 0.0;
-            bo_ji->Cdbopi2 = 0.0;
+    /****** bond i-j ONLY ******/
+    ibond = &bond_list.bond_list[btop_i];
+    ibond->nbr = j;
+    ibond->d = d;
+
+    rvec_Copy( ibond->dvec, *dvec );
+    ivec_Copy( ibond->rel_box, *rel_box );
+
+    bo_ij = &ibond->bo_data;
+    bo_ij->BO = BO;
+    bo_ij->BO_s = BO_s;
+    bo_ij->BO_pi = BO_pi;
+    bo_ij->BO_pi2 = BO_pi2;
+
+    /* Bond Order page2-3, derivative of total bond order prime */
+    Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+    Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+    Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+
+    /* Only dln_BOp_xx wrt. dr_i is stored here, note that
+     * dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+    rvec_Scale( bo_ij->dln_BOp_s, -1.0 * bo_ij->BO_s * Cln_BOp_s, ibond->dvec );
+    rvec_Scale( bo_ij->dln_BOp_pi, -1.0 * bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec );
+    rvec_Scale( bo_ij->dln_BOp_pi2, -1.0 * bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec );
+
+    /* Only dBOp wrt. dr_i is stored here, note that
+     * dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+    rvec_Scale( bo_ij->dBOp, -1.0 * (bo_ij->BO_s * Cln_BOp_s
+                + bo_ij->BO_pi * Cln_BOp_pi
+                + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+
+    rvec_Add( dDeltap_self[i], bo_ij->dBOp );
+//    atomic_rvecAdd( dDeltap_self[i], bo_ij->dBOp );
+
+    bo_ij->BO_s -= bo_cut;
+    bo_ij->BO -= bo_cut;
+    /* currently total_BOp */
+    total_bond_order[i] += bo_ij->BO; 
+//    atomicAdd( (double *) &total_bond_order[i], bo_ij->BO ); 
+    bo_ij->Cdbo = 0.0;
+    bo_ij->Cdbopi = 0.0;
+    bo_ij->Cdbopi2 = 0.0;
 
 #if !defined(CUDA_ACCUM_ATOMIC)
-            jbond->ae_CdDelta = 0.0;
-            jbond->va_CdDelta = 0.0;
-            rvec_MakeZero( jbond->va_f );
-            jbond->ta_CdDelta = 0.0;
-            jbond->ta_Cdbo = 0.0;
-            rvec_MakeZero( jbond->ta_f );
-            rvec_MakeZero( jbond->hb_f );
-            rvec_MakeZero( jbond->tf_f );
+    ibond->ae_CdDelta = 0.0;
+    ibond->va_CdDelta = 0.0;
+    rvec_MakeZero( ibond->va_f );
+    ibond->ta_CdDelta = 0.0;
+    ibond->ta_Cdbo = 0.0;
+    rvec_MakeZero( ibond->ta_f );
+    rvec_MakeZero( ibond->hb_f );
+    rvec_MakeZero( ibond->tf_f );
 #endif
-        }
-
-        ret = TRUE;
-    }
-
-    return ret;
 }
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 8e4f5eb7..3826b36f 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -139,25 +139,10 @@ CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indice
 }
 
 
-CUDA_GLOBAL void k_init_hindex( reax_atom *my_atoms, int N )
-{
-    int i;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= N )
-    {
-        return;
-    }
-
-    my_atoms[i].Hindex = i;
-}
-
-
 CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp,
         int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N )
 {
-    int i, hindex, my_hbonds;
+    int i, hindex, flag;
 
     i = blockIdx.x * blockDim.x  + threadIdx.x;
 
@@ -168,20 +153,12 @@ CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters
 
     hindex = atoms[i].Hindex;
 
-    if ( sbp[ atoms[i].type ].p_hbond == H_ATOM
-            || sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM )
-    {
-        my_hbonds = hbonds[i];
-        indices[hindex] = max_hbonds[i];
-        end_indices[hindex] = indices[hindex] + hbonds[i];
-    }
-    else
-    {
-        my_hbonds = 0;
-        indices[hindex] = 0;
-        end_indices[hindex] = 0;
-    }
-    atoms[i].num_hbonds = my_hbonds;
+    flag = (sbp[ atoms[i].type ].p_hbond == H_ATOM
+            || sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM ? TRUE : FALSE);
+
+    indices[hindex] = (flag == TRUE ? max_hbonds[i] : 0);
+    end_indices[hindex] = (flag == TRUE ? indices[hindex] + hbonds[i] : 0);
+    atoms[i].num_hbonds = (flag == TRUE ? hbonds[i] : 0);
 }
 
 
@@ -190,7 +167,6 @@ CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters
 {
     int i;
     int type_i;
-    int ihb, ihb_top;
     single_body_parameters *sbp_i;
     reax_atom *atom_i;
 
@@ -205,21 +181,8 @@ CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters
     type_i = atom_i->type;
     sbp_i = &sbp[type_i];
 
-    if ( control->hbond_cut > 0.0 )
-    {
-        ihb = sbp_i->p_hbond;
-
-        if ( ihb == H_ATOM  || ihb == H_BONDING_ATOM )
-        {
-            ihb_top = Start_Index( atom_i->Hindex, &hbond_list );
-        }
-        else
-        {
-            ihb_top = -1;
-        }
-    }
-
-    printf( "atom %6d: ihb = %2d, ihb_top = %2d\n", i, ihb, ihb_top );
+    printf( "atom %6d: ihb = %2d, ihb_top = %2d\n", i, sbp_i->p_hbond,
+            Start_Index( atom_i->Hindex, &hbond_list ) );
 }
 
 
@@ -247,18 +210,9 @@ CUDA_GLOBAL void k_init_dist( reax_atom *my_atoms, reax_list far_nbr_list, int N
     {
         j = far_nbr_list.far_nbr_list.nbr[pj];
 
-        if ( i < j )
-        {
-            far_nbr_list.far_nbr_list.dvec[pj][0] = my_atoms[j].x[0] - x_i[0];
-            far_nbr_list.far_nbr_list.dvec[pj][1] = my_atoms[j].x[1] - x_i[1];
-            far_nbr_list.far_nbr_list.dvec[pj][2] = my_atoms[j].x[2] - x_i[2];
-        }
-        else
-        {
-            far_nbr_list.far_nbr_list.dvec[pj][0] = x_i[0] - my_atoms[j].x[0];
-            far_nbr_list.far_nbr_list.dvec[pj][1] = x_i[1] - my_atoms[j].x[1];
-            far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - my_atoms[j].x[2];
-        }
+        far_nbr_list.far_nbr_list.dvec[pj][0] = my_atoms[j].x[0] - x_i[0];
+        far_nbr_list.far_nbr_list.dvec[pj][1] = my_atoms[j].x[1] - x_i[1];
+        far_nbr_list.far_nbr_list.dvec[pj][2] = my_atoms[j].x[2] - x_i[2];
         far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] );
     }
 }
@@ -269,44 +223,52 @@ CUDA_GLOBAL void k_init_dist( reax_atom *my_atoms, reax_list far_nbr_list, int N
  */
 CUDA_GLOBAL void k_init_dist_opt( reax_atom *my_atoms, reax_list far_nbr_list, int N )
 {
-    int j, pj, start_i, end_i, thread_id, warp_id, lane_id;
+    int j, pj, start_i, end_i, thread_id, i, lane_id;
     rvec x_i;
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    warp_id = thread_id >> 5;
+    i = thread_id / warpSize;
 
-    if ( warp_id >= N )
+    if ( i >= N )
     {
         return;
     }
 
-    lane_id = thread_id & 0x0000001F; 
-    start_i = Start_Index( warp_id, &far_nbr_list );
-    end_i = End_Index( warp_id, &far_nbr_list );
-    rvec_Copy( x_i, my_atoms[warp_id].x );
+    lane_id = thread_id % warpSize; 
+    start_i = Start_Index( i, &far_nbr_list );
+    end_i = End_Index( i, &far_nbr_list );
+    rvec_Copy( x_i, my_atoms[i].x );
 
     /* update distance and displacement vector between atoms i and j (i-j) */
     for ( pj = start_i + lane_id; pj < end_i; pj += warpSize )
     {
         j = far_nbr_list.far_nbr_list.nbr[pj];
 
-        if ( warp_id < j )
-        {
-            far_nbr_list.far_nbr_list.dvec[pj][0] = my_atoms[j].x[0] - x_i[0];
-            far_nbr_list.far_nbr_list.dvec[pj][1] = my_atoms[j].x[1] - x_i[1];
-            far_nbr_list.far_nbr_list.dvec[pj][2] = my_atoms[j].x[2] - x_i[2];
-        }
-        else
-        {
-            far_nbr_list.far_nbr_list.dvec[pj][0] = x_i[0] - my_atoms[j].x[0];
-            far_nbr_list.far_nbr_list.dvec[pj][1] = x_i[1] - my_atoms[j].x[1];
-            far_nbr_list.far_nbr_list.dvec[pj][2] = x_i[2] - my_atoms[j].x[2];
-        }
+        far_nbr_list.far_nbr_list.dvec[pj][0] = my_atoms[j].x[0] - x_i[0];
+        far_nbr_list.far_nbr_list.dvec[pj][1] = my_atoms[j].x[1] - x_i[1];
+        far_nbr_list.far_nbr_list.dvec[pj][2] = my_atoms[j].x[2] - x_i[2];
         far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] );
     }
 }
 
 
+
+CUDA_GLOBAL void k_reset_bond_orders( storage workspace, int N )
+{
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    workspace.total_bond_order[i] = 0.0;
+    rvec_MakeZero( workspace.dDeltap_self[i] );
+}
+
+
 /* Compute the charge matrix entries and store the matrix in half format
  * using the far neighbors list (stored in full format) and according to
  * the full shell communication method */
@@ -378,7 +340,7 @@ CUDA_GLOBAL void k_init_cm_half_fs( reax_atom *my_atoms, single_body_parameters
     H->end[i] = cm_top;
     num_cm_entries = cm_top - H->start[i];
 
-    /* reallocation checks */
+    /* reallocation check */
     if ( num_cm_entries > max_cm_entries[i] )
     {
         *realloc_cm_entries = TRUE;
@@ -456,7 +418,7 @@ CUDA_GLOBAL void k_init_cm_half_fs_tab( reax_atom *my_atoms, single_body_paramet
     H->end[i] = cm_top;
     num_cm_entries = cm_top - H->start[i];
 
-    /* reallocation checks */
+    /* reallocation check */
     if ( num_cm_entries > max_cm_entries[i] )
     {
         *realloc_cm_entries = TRUE;
@@ -531,7 +493,7 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
     H->end[i] = cm_top;
     num_cm_entries = cm_top - H->start[i];
 
-    /* reallocation checks */
+    /* reallocation check */
     if ( num_cm_entries > max_cm_entries[i] )
     {
         *realloc_cm_entries = TRUE;
@@ -621,7 +583,7 @@ CUDA_GLOBAL void k_init_cm_full_fs_opt( reax_atom *my_atoms, single_body_paramet
         H->end[i] = cm_top;
         num_cm_entries = cm_top - H->start[i];
 
-        /* reallocation checks */
+        /* reallocation check */
         if ( num_cm_entries > max_cm_entries[i] )
         {
             *realloc_cm_entries = TRUE;
@@ -696,7 +658,7 @@ CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_paramet
     H->end[i] = cm_top;
     num_cm_entries = cm_top - H->start[i];
 
-    /* reallocation checks */
+    /* reallocation check */
     if ( num_cm_entries > max_cm_entries[i] )
     {
         *realloc_cm_entries = TRUE;
@@ -706,17 +668,17 @@ CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_paramet
 
 CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp, 
         two_body_parameters *tbp, storage workspace, control_params *control, 
-        reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list, 
-        LR_lookup_table *t_LR, int n, int N, int num_atom_types,
-        int *max_bonds, int *realloc_bonds,
-        int *max_hbonds, int *realloc_hbonds )
+        reax_list far_nbr_list, reax_list bond_list, int n, int N,
+        int num_atom_types, int *max_bonds, int *realloc_bonds )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int btop_i, ihb, jhb, ihb_top;
-    int num_bonds, num_hbonds;
-    real cutoff;
+    int btop_i;
+    int num_bonds;
+    real cutoff, r_ij;
+    real C12, C34, C56;
+    real BO_s, BO_pi, BO_pi2, BO;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     reax_atom *atom_i, *atom_j;
@@ -734,12 +696,10 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
     end_i = End_Index( i, &far_nbr_list );
     btop_i = Start_Index( i, &bond_list );
     sbp_i = &sbp[type_i];
-    ihb = NON_H_BONDING_ATOM;
-    ihb_top = -1;
 
     if ( i < n )
     {
-        cutoff = control->nonb_cut;
+        cutoff = MIN( control->nonb_cut, control->bond_cut );
 //        workspace.bond_mark[i] = 0;
     }
     else
@@ -749,154 +709,358 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
 //        workspace.bond_mark[i] = 1000;
     }
 
-    if ( control->hbond_cut > 0.0 )
+    /* check if j is within cutoff */
+    for ( pj = start_i; pj < end_i; ++pj )
     {
-        ihb = sbp_i->p_hbond;
-
-        if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
-        {
-            ihb_top = Start_Index( atom_i->Hindex, &hbond_list );
-        }
-        else
+        /* uncorrected bond orders */
+        if ( far_nbr_list.far_nbr_list.d[pj] <= cutoff )
         {
-            ihb_top = -1;
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+            atom_j = &my_atoms[j];
+            type_j = atom_j->type;
+            r_ij = far_nbr_list.far_nbr_list.d[pj];
+            sbp_j = &sbp[type_j];
+            twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
+
+            /* uncorrected bond orders */
+            if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
+            {
+                C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+            }
+            else
+            {
+                C12 = 0.0;
+                BO_s = 0.0;
+            }
+
+            if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
+            {
+                C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                BO_pi = EXP( C34 );
+            }
+            else
+            {
+                C34 = 0.0;
+                BO_pi = 0.0;
+            }
+
+            if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
+            {
+                C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                BO_pi2 = EXP( C56 );
+            }
+            else
+            {
+                C56 = 0.0;
+                BO_pi2 = 0.0;
+            }
+
+            /* initially BO values are the uncorrected ones, page 1 */
+            BO = BO_s + BO_pi + BO_pi2;
+
+            if ( BO >= control->bo_cut )
+            {
+                /* compute and append bond info to list */
+                Cuda_Compute_BOp( bond_list, control->bo_cut, i, btop_i,
+                        far_nbr_list.far_nbr_list.nbr[pj],
+                        C12, C34, C56, BO_s, BO_pi, BO_pi2, BO,
+                        &far_nbr_list.far_nbr_list.rel_box[pj],
+                        far_nbr_list.far_nbr_list.d[pj],
+                        &far_nbr_list.far_nbr_list.dvec[pj], far_nbr_list.format,
+                        twbp, workspace.dDeltap_self, workspace.total_bond_order );
+
+                ++btop_i;
+
+                /* TODO: future optimization if bond_mark implemented */
+//                if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+//                {
+//                    workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+//                }
+//                else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
+//                {
+//                    workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+//                }
+            }
         }
     }
 
-    /* update i-j distance - check if j is within cutoff */
-    for ( pj = start_i; pj < end_i; ++pj )
+    Set_End_Index( i, btop_i, &bond_list );
+
+    num_bonds = btop_i - Start_Index( i, &bond_list );
+
+    /* copy bond info to atom structure
+     * (needed for atom ownership transfer via MPI) */
+    my_atoms[i].num_bonds = num_bonds;
+
+    /* reallocation check */
+    if ( num_bonds > max_bonds[i] )
     {
-        j = far_nbr_list.far_nbr_list.nbr[pj];
-        atom_j = &my_atoms[j];
+        *realloc_bonds = TRUE;
+    }
+}
+
+
+CUDA_GLOBAL void k_init_bonds_opt( reax_atom *my_atoms, single_body_parameters *sbp, 
+        two_body_parameters *tbp, storage workspace, control_params *control, 
+        reax_list far_nbr_list, reax_list bond_list, int n, int N,
+        int num_atom_types, int *max_bonds, int *realloc_bonds )
+{
+    extern __shared__ cub::WarpScan<int>::TempStorage temp[];
+    int i, j, pj, thread_id, lane_id, itr;
+    int start_i, end_i;
+    int type_i, type_j;
+    int btop_i, offset, flag;
+    int num_bonds;
+    real cutoff, r_ij;
+    real C12, C34, C56;
+    real BO_s, BO_pi, BO_pi2, BO;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    /* all threads within a warp are assigned the bonds
+     * for a unique atom */
+    i = thread_id / warpSize;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    lane_id = thread_id % warpSize;
+    atom_i = &my_atoms[i];
+    type_i = atom_i->type;
+    start_i = Start_Index( i, &far_nbr_list );
+    end_i = End_Index( i, &far_nbr_list );
+    btop_i = Start_Index( i, &bond_list );
+    sbp_i = &sbp[type_i];
+
+    if ( i < n )
+    {
+        cutoff = MIN( control->nonb_cut, control->bond_cut );
+//        workspace.bond_mark[i] = 0;
+    }
+    else
+    {
+        cutoff = control->bond_cut;
+        /* put ghost atoms to an infinite distance (i.e., 1000) */
+//        workspace.bond_mark[i] = 1000;
+    }
 
-        if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+    for ( itr = 0, pj = start_i + lane_id; itr < (end_i - start_i + warpSize - 1) / warpSize; ++itr )
+    {
+        /* uncorrected bond orders */
+        if ( pj < end_i && far_nbr_list.far_nbr_list.d[pj] <= cutoff )
         {
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+            atom_j = &my_atoms[j];
             type_j = atom_j->type;
+            r_ij = far_nbr_list.far_nbr_list.d[pj];
             sbp_j = &sbp[type_j];
-            ihb = sbp_i->p_hbond;
-            jhb = sbp_j->p_hbond;
-
-            /* atom i: H bonding, ghost
-             * atom j: H atom, native */
-            if ( control->hbond_cut > 0.0 && i >= n && j < n
-                    && ihb == H_BONDING_ATOM && jhb == H_ATOM
-                    && far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
+            twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
+
+            /* uncorrected bond orders */
+            if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
             {
-                hbond_list.hbond_list[ihb_top].nbr = j;
-                hbond_list.hbond_list[ihb_top].scl = -1;
-                hbond_list.hbond_list[ihb_top].ptr = pj;
+                C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+            }
+            else
+            {
+                C12 = 0.0;
+                BO_s = 0.0;
+            }
 
-#if !defined(CUDA_ACCUM_ATOMIC)
-                hbond_list.hbond_list[ihb_top].sym_index = -1;
-                rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
-#endif
+            if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
+            {
+                C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                BO_pi = EXP( C34 );
+            }
+            else
+            {
+                C34 = 0.0;
+                BO_pi = 0.0;
+            }
 
-                ++ihb_top;
+            if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
+            {
+                C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                BO_pi2 = EXP( C56 );
+            }
+            else
+            {
+                C56 = 0.0;
+                BO_pi2 = 0.0;
             }
+
+            /* initially BO values are the uncorrected ones, page 1 */
+            BO = BO_s + BO_pi + BO_pi2;
+        }
+        else
+        {
+            BO = 0.0;
         }
 
-        if ( far_nbr_list.far_nbr_list.d[pj] <= cutoff )
+        offset = (pj < end_i && far_nbr_list.far_nbr_list.d[pj] <= cutoff && BO >= control->bo_cut) ? 1 : 0;
+        flag = (offset == 1) ? TRUE : FALSE;
+        cub::WarpScan<int>(temp[i % (blockDim.x / warpSize)]).ExclusiveSum(offset, offset);
+
+        if ( flag == TRUE )
         {
-            type_j = atom_j->type;
-            sbp_j = &sbp[type_j];
-            twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
+            /* compute and append bond info to list */
+            Cuda_Compute_BOp( bond_list, control->bo_cut, i, btop_i + offset,
+                    far_nbr_list.far_nbr_list.nbr[pj],
+                    C12, C34, C56, BO_s, BO_pi, BO_pi2, BO,
+                    &far_nbr_list.far_nbr_list.rel_box[pj],
+                    far_nbr_list.far_nbr_list.d[pj],
+                    &far_nbr_list.far_nbr_list.dvec[pj], far_nbr_list.format,
+                    twbp, workspace.dDeltap_self, workspace.total_bond_order );
+
+            /* TODO: future optimization if bond_mark implemented */
+//            if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+//            {
+//                workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+//            }
+//            else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
+//            {
+//                workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+//            }
+        }
+
+        /* get btop_i from thread in last lane */
+        btop_i = btop_i + offset + (flag == TRUE ? 1 : 0);
+        btop_i = __shfl_sync( FULL_MASK, btop_i, warpSize - 1 );
+
+        pj += warpSize;
+    }
+
+    if ( lane_id == 0 )
+    {
+        Set_End_Index( i, btop_i, &bond_list );
+
+        num_bonds = btop_i - Start_Index( i, &bond_list );
+
+        /* copy bond info to atom structure
+         * (needed for atom ownership transfer via MPI) */
+        my_atoms[i].num_bonds = num_bonds;
+
+        /* reallocation check */
+        if ( num_bonds > max_bonds[i] )
+        {
+            *realloc_bonds = TRUE;
+        }
+    }
+}
+
+
+/* Construct the interaction list for hydrogen bonds */
+CUDA_GLOBAL void k_init_hbonds( reax_atom *my_atoms, single_body_parameters *sbp, 
+        control_params *control, reax_list far_nbr_list, reax_list hbond_list,
+        int n, int N, int num_atom_types, int *max_hbonds, int *realloc_hbonds )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb, ihb_top;
+    int num_hbonds;
+    real cutoff;
+    single_body_parameters *sbp_i, *sbp_j;
+    reax_atom *atom_i, *atom_j;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    atom_i = &my_atoms[i];
+    type_i = atom_i->type;
+    start_i = Start_Index( i, &far_nbr_list );
+    end_i = End_Index( i, &far_nbr_list );
+    sbp_i = &sbp[type_i];
+    ihb = sbp_i->p_hbond;
+
+    cutoff = MIN( control->nonb_cut, control->hbond_cut );
 
-            if ( i < n )
+    ihb_top = Start_Index( atom_i->Hindex, &hbond_list );
+
+    if ( i < n && ihb == H_ATOM || ihb == H_BONDING_ATOM )
+    {
+        /* check if j is within cutoff */
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbr_list.far_nbr_list.d[pj] <= cutoff )
             {
-                /* hydrogen bond lists */
-                if ( control->hbond_cut > 0.0
-                        && (ihb == H_ATOM || ihb == H_BONDING_ATOM)
-                        && far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
-                {
-                    jhb = sbp_j->p_hbond;
+                j = far_nbr_list.far_nbr_list.nbr[pj];
+                atom_j = &my_atoms[j];
+                type_j = atom_j->type;
+                sbp_j = &sbp[type_j];
+                jhb = sbp_j->p_hbond;
 
-                    /* atom i: H atom, native
-                     * atom j: H bonding atom */
-                    if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
-                    {
-                        hbond_list.hbond_list[ihb_top].nbr = j;
-
-                        if ( i < j )
-                        {
-                            hbond_list.hbond_list[ihb_top].scl = 1;
-                        }
-                        else
-                        {
-                            hbond_list.hbond_list[ihb_top].scl = -1;
-                        }
-                        hbond_list.hbond_list[ihb_top].ptr = pj;
+                /* atom i: H bonding, ghost
+                 * atom j: H atom, native */
+                if ( i >= n && j < n
+                        && ihb == H_BONDING_ATOM && jhb == H_ATOM )
+                {
+                    hbond_list.hbond_list[ihb_top].nbr = j;
+                    hbond_list.hbond_list[ihb_top].scl = -1;
+                    hbond_list.hbond_list[ihb_top].ptr = pj;
 
 #if !defined(CUDA_ACCUM_ATOMIC)
-                        hbond_list.hbond_list[ihb_top].sym_index = -1;
-                        rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
+                    hbond_list.hbond_list[ihb_top].sym_index = -1;
+                    rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
 #endif
 
-                        ++ihb_top;
-                    }
-                    /* atom i: H bonding atom, native
-                     * atom j: H atom, native */
-                    else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
-                    {
-                        //jhb_top = End_Index( atom_j->Hindex, &hbond_list );
-                        hbond_list.hbond_list[ihb_top].nbr = j;
-                        hbond_list.hbond_list[ihb_top].scl = -1;
-                        hbond_list.hbond_list[ihb_top].ptr = pj;
+                    ++ihb_top;
+                }
+                /* atom i: H atom, native
+                 * atom j: H bonding atom */
+                else if ( i < n
+                        && ihb == H_ATOM && jhb == H_BONDING_ATOM )
+                {
+                    hbond_list.hbond_list[ihb_top].nbr = j;
+                    hbond_list.hbond_list[ihb_top].scl = 1;
+                    hbond_list.hbond_list[ihb_top].ptr = pj;
 
 #if !defined(CUDA_ACCUM_ATOMIC)
-                        hbond_list.hbond_list[ihb_top].sym_index = -1;
-                        rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
+                    hbond_list.hbond_list[ihb_top].sym_index = -1;
+                    rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
 #endif
 
-                        ++ihb_top;
-                    }
+                    ++ihb_top;
                 }
-            }
+                /* atom i: H bonding atom, native
+                 * atom j: H atom, native */
+                else if ( i < n
+                        && ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
+                {
+                    hbond_list.hbond_list[ihb_top].nbr = j;
+                    hbond_list.hbond_list[ihb_top].scl = -1;
+                    hbond_list.hbond_list[ihb_top].ptr = pj;
 
-            /* uncorrected bond orders */
-            if ( far_nbr_list.far_nbr_list.d[pj] <= control->bond_cut
-                    && Cuda_BOp( bond_list, control->bo_cut,
-                        i, btop_i, far_nbr_list.far_nbr_list.nbr[pj],
-                        &far_nbr_list.far_nbr_list.rel_box[pj], far_nbr_list.far_nbr_list.d[pj],
-                        &far_nbr_list.far_nbr_list.dvec[pj], far_nbr_list.format,
-                        sbp_i, sbp_j, twbp, workspace.dDeltap_self,
-                        workspace.total_bond_order ) == TRUE )
-            {
-                ++btop_i;
+#if !defined(CUDA_ACCUM_ATOMIC)
+                    hbond_list.hbond_list[ihb_top].sym_index = -1;
+                    rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
+#endif
 
-                /* TODO: Need to do later... since i and j are parallel */
-//                if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
-//                {
-//                    workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
-//                }
-//                else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
-//                {
-//                    workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-//                }
+                    ++ihb_top;
+                }
             }
         }
     }
 
-    Set_End_Index( i, btop_i, &bond_list );
-//    if ( control->hbond_cut > 0.0 && ihb_top > 0
-//            && (ihb == H_ATOM || ihb == H_BONDING_ATOM) )
-//    {
-        Set_End_Index( atom_i->Hindex, ihb_top, &hbond_list );
-//    }
+    Set_End_Index( atom_i->Hindex, ihb_top, &hbond_list );
 
-    num_bonds = btop_i - Start_Index( i, &bond_list );
     num_hbonds = ihb_top - Start_Index( atom_i->Hindex, &hbond_list );
 
-    /* copy (h)bond info to atom structure
+    /* copy hbond info to atom structure
      * (needed for atom ownership transfer via MPI) */
-    my_atoms[i].num_bonds = num_bonds;
     my_atoms[i].num_hbonds = num_hbonds;
 
-    /* reallocation checks */
-    if ( num_bonds > max_bonds[i] )
-    {
-        *realloc_bonds = TRUE;
-    }
-
+    /* reallocation check */
     if ( num_hbonds > max_hbonds[i] )
     {
         *realloc_hbonds = TRUE;
@@ -904,6 +1068,7 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
 }
 
 
+/* Construct the interaction list for bonds */
 CUDA_GLOBAL void k_estimate_storages_cm_half( reax_atom *my_atoms,
         control_params *control, reax_list far_nbr_list, int cm_n, int cm_n_max,
         int *cm_entries, int *max_cm_entries )
@@ -947,7 +1112,7 @@ CUDA_GLOBAL void k_estimate_storages_cm_half( reax_atom *my_atoms,
     /* round up to the nearest multiple of 32 to ensure that reads along
      * rows can be coalesced for 1 warp per row SpMV implementation */
     max_cm_entries[i] = MAX( ((int) CEIL( num_cm_entries * SAFE_ZONE )
-                + 0x0000001F) & 0xFFFFFFE0, MIN_CM_ENTRIES );
+                + warpSize - 1) / warpSize * warpSize, MIN_CM_ENTRIES );
 }
 
 
@@ -991,26 +1156,23 @@ CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control,
     /* round up to the nearest multiple of 32 to ensure that reads along
      * rows can be coalesced for 1 warp per row SpMV implementation */
     max_cm_entries[i] = MAX( ((int) CEIL( num_cm_entries * SAFE_ZONE )
-                + 0x0000001F) & 0xFFFFFFE0, MIN_CM_ENTRIES );
+                + warpSize - 1) / warpSize * warpSize, MIN_CM_ENTRIES );
 }
 
 
-CUDA_GLOBAL void k_estimate_storages_bonds( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_estimate_storage_bonds( reax_atom *my_atoms, 
         single_body_parameters *sbp, two_body_parameters *tbp,
         control_params *control, reax_list far_nbr_list, 
         int num_atom_types, int n, int N, int total_cap,
-        int *bonds, int *max_bonds,
-        int *hbonds, int *max_hbonds  )
+        int *bonds, int *max_bonds )
 {
     int i, j, pj; 
     int start_i, end_i;
     int type_i, type_j;
-    int ihb, jhb;
-    int num_bonds, num_hbonds;
-    real cutoff;
-    real r_ij; 
+    int num_bonds;
+    real cutoff, r_ij; 
     real C12, C34, C56;
-    real BO, BO_s, BO_pi, BO_pi2;
+    real BO_s, BO_pi, BO_pi2;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     reax_atom *atom_i;
@@ -1023,7 +1185,6 @@ CUDA_GLOBAL void k_estimate_storages_bonds( reax_atom *my_atoms,
     }
 
     num_bonds = 0;
-    num_hbonds = 0;
 
     if ( i < N )
     {
@@ -1032,111 +1193,148 @@ CUDA_GLOBAL void k_estimate_storages_bonds( reax_atom *my_atoms,
         start_i = Start_Index( i, &far_nbr_list );
         end_i = End_Index( i, &far_nbr_list );
         sbp_i = &sbp[type_i];
-        ihb = NON_H_BONDING_ATOM; 
 
         if ( i < n )
-        { 
-            cutoff = control->nonb_cut;
-        }   
+        {
+            cutoff = MIN( control->nonb_cut, control->bond_cut );
+        }
         else
         {
             cutoff = control->bond_cut;
-        } 
+        }
 
         for ( pj = start_i; pj < end_i; ++pj )
         { 
-            j = far_nbr_list.far_nbr_list.nbr[pj];
-
-            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            if ( far_nbr_list.far_nbr_list.d[pj] <= cutoff )
             {
+                j = far_nbr_list.far_nbr_list.nbr[pj];
                 type_j = my_atoms[j].type;
+                r_ij = far_nbr_list.far_nbr_list.d[pj];
                 sbp_j = &sbp[type_j];
-                ihb = sbp_i->p_hbond;
-                jhb = sbp_j->p_hbond;
+                twbp = &tbp[ index_tbp(type_i ,type_j, num_atom_types) ];
 
-                /* atom i: H bonding, ghost
-                 * atom j: H atom, native */
-                if ( control->hbond_cut > 0.0 && i >= n && j < n
-                        && ihb == H_BONDING_ATOM && jhb == H_ATOM
-                        && far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
+                /* uncorrected bond orders */
+                if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
                 {
-                    ++num_hbonds;
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else
+                {
+                    C12 = 0.0;
+                    BO_s = 0.0;
                 }
-            }
 
-            if ( far_nbr_list.far_nbr_list.d[pj] <= cutoff )
-            {
-                type_j = my_atoms[j].type;
-                r_ij = far_nbr_list.far_nbr_list.d[pj];
-                sbp_j = &sbp[type_j];
-                twbp = &tbp[ index_tbp(type_i ,type_j, num_atom_types) ];
+                if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
+                {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else
+                {
+                    C34 = 0.0;
+                    BO_pi = 0.0;
+                }
 
-                if ( i < n )
+                if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
                 {
-                    /* atom i: H atom OR H bonding atom, native */
-                    if ( control->hbond_cut > 0.0
-                            && (ihb == H_ATOM || ihb == H_BONDING_ATOM)
-                            && far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
-                    {
-                        jhb = sbp_j->p_hbond;
-
-                        /* atom i: H atom, native
-                         * atom j: H bonding atom */
-                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
-                        {
-                            ++num_hbonds;
-                        }
-                        /* atom i: H bonding atom, native
-                         * atom j: H atom, native */
-                        else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
-                        {
-                            ++num_hbonds;
-                        }
-                    }
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                    BO_pi2= EXP( C56 );
+                }
+                else
+                {
+                    C56 = 0.0;
+                    BO_pi2 = 0.0;
                 }
 
-                /* uncorrected bond orders */
-                if ( far_nbr_list.far_nbr_list.d[pj] <= control->bond_cut )
+                /* initially BO values are the uncorrected ones, page 1 */
+                if ( BO_s + BO_pi + BO_pi2 >= control->bo_cut )
                 {
-                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
-                    {
-                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                    }
-                    else
-                    {
-                        C12 = 0.0;
-                        BO_s = 0.0;
-                    }
+                    ++num_bonds;
+                }
+            }
+        }
+    }
 
-                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
-                    {
-                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                        BO_pi = EXP( C34 );
-                    }
-                    else
-                    {
-                        C34 = 0.0;
-                        BO_pi = 0.0;
-                    }
+    __syncthreads( );
 
-                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
-                    {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
-                        BO_pi2= EXP( C56 );
-                    }
-                    else
-                    {
-                        C56 = 0.0;
-                        BO_pi2 = 0.0;
-                    }
+    bonds[i] = num_bonds;
+    max_bonds[i] = MAX( (int) CEIL(2 * num_bonds * SAFE_ZONE), MIN_BONDS );
+}
+
+
+CUDA_GLOBAL void k_estimate_storage_hbonds( reax_atom *my_atoms, 
+        single_body_parameters *sbp, control_params *control,
+        reax_list far_nbr_list, int num_atom_types, int n, int N,
+        int total_cap, int *hbonds, int *max_hbonds )
+{
+    int i, j, pj; 
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    int num_hbonds;
+    real cutoff;
+    single_body_parameters *sbp_i, *sbp_j;
+    reax_atom *atom_i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= total_cap )
+    {
+        return;
+    }
+
+    num_hbonds = 0;
+
+    if ( i < N )
+    {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+        ihb = sbp_i->p_hbond;
+
+        if ( i < n )
+        { 
+            cutoff = control->nonb_cut;
+        }   
+        else
+        {
+            cutoff = control->bond_cut;
+        } 
 
-                    /* initially BO values are the uncorrected ones, page 1 */
-                    BO = BO_s + BO_pi + BO_pi2;
+        if ( i < n && ihb == H_ATOM || ihb == H_BONDING_ATOM )
+        {
+            for ( pj = start_i; pj < end_i; ++pj )
+            { 
+                j = far_nbr_list.far_nbr_list.nbr[pj];
+                type_j = my_atoms[j].type;
+                sbp_j = &sbp[type_j];
+                jhb = sbp_j->p_hbond;
 
-                    if ( BO >= control->bo_cut )
+                /* atom i: H bonding, ghost
+                 * atom j: H atom, native */
+                if ( i >= n && j < n && ihb == H_BONDING_ATOM && jhb == H_ATOM
+                        && far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut
+                        && far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
+                {
+                    ++num_hbonds;
+                }
+                else if ( i < n && far_nbr_list.far_nbr_list.d[pj] <= cutoff
+                        && far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
+                {
+                    /* atom i: H atom, native
+                     * atom j: H bonding atom */
+                    if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                     {
-                        ++num_bonds;
+                        ++num_hbonds;
+                    }
+                    /* atom i: H bonding atom, native
+                     * atom j: H atom, native */
+                    else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
+                    {
+                        ++num_hbonds;
                     }
                 }
             }
@@ -1145,9 +1343,6 @@ CUDA_GLOBAL void k_estimate_storages_bonds( reax_atom *my_atoms,
 
     __syncthreads( );
 
-    bonds[i] = num_bonds;
-    max_bonds[i] = MAX( (int) CEIL(2 * num_bonds * SAFE_ZONE), MIN_BONDS );
-
     hbonds[i] = num_hbonds;
     max_hbonds[i] = MAX( (int) CEIL(num_hbonds * SAFE_ZONE), MIN_HBONDS );
 }
@@ -1199,6 +1394,7 @@ CUDA_GLOBAL void k_update_sym_dbond_indices( reax_list bond_list, int N )
 
                 ibond->sym_index = pk;
                 jbond->sym_index = pj;
+                break;
             }
         }
     }
@@ -1213,18 +1409,17 @@ CUDA_GLOBAL void k_update_sym_hbond_indices_opt( reax_atom *my_atoms,
     int nbr, nbrstart, nbrend;
     int start, end;
     hbond_data *ihbond, *jhbond;
-    int thread_id, warp_id, lane_id;
+    int thread_id, lane_id;
 
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    warp_id = thread_id >> 5;
+    i = thread_id / warpSize;
 
-    if ( warp_id > N )
+    if ( i > N )
     {
         return;
     }
 
-    i = warp_id;
-    lane_id = thread_id & 0x0000001F; 
+    lane_id = thread_id % warpSize; 
     start = Start_Index( my_atoms[i].Hindex, &hbond_list );
     end = End_Index( my_atoms[i].Hindex, &hbond_list );
     pj = start + lane_id;
@@ -1408,11 +1603,6 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace,
             "Cuda_Init_HBond_Indices::workspace->scratch" );
     temp = (int *) workspace->scratch;
 
-    /* init Hindices */
-    k_init_hindex <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->total_cap );
-    cudaCheckError( );
-
     /* init indices and end_indices */
     Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap );
 
@@ -1466,8 +1656,8 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
 
 
 void Cuda_Estimate_Storages( reax_system *system, control_params *control, 
-        storage *workspace, reax_list **lists,
-        int realloc_bonds, int realloc_hbonds, int realloc_cm, int step )
+        storage *workspace, reax_list **lists, int realloc_cm,
+        int realloc_bonds, int realloc_hbonds, int step )
 {
     int blocks;
 
@@ -1500,23 +1690,19 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
                 cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
     }
 
-    if ( realloc_bonds == TRUE || realloc_hbonds == TRUE )
+    if ( realloc_bonds == TRUE )
     {
         blocks = system->total_cap / DEF_BLOCK_SIZE
             + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-        k_estimate_storages_bonds <<< blocks, DEF_BLOCK_SIZE >>>
+        k_estimate_storage_bonds <<< blocks, DEF_BLOCK_SIZE >>>
             ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
               (control_params *) control->d_control_params,
               *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
               system->n, system->N, system->total_cap,
-              system->d_bonds, system->d_max_bonds,
-              system->d_hbonds, system->d_max_hbonds );
+              system->d_bonds, system->d_max_bonds );
         cudaCheckError( );
-    }
 
-    if ( realloc_bonds == TRUE )
-    {
         Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds,
                 system->total_cap );
         copy_host_device( &system->total_bonds, system->d_total_bonds, sizeof(int), 
@@ -1525,12 +1711,23 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 
     if ( system->numH > 0 && control->hbond_cut > 0.0 && realloc_hbonds == TRUE )
     {
+        blocks = system->total_cap / DEF_BLOCK_SIZE
+            + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
+
+        k_estimate_storage_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_sbp,
+              (control_params *) control->d_control_params,
+              *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
+              system->n, system->N, system->total_cap,
+              system->d_hbonds, system->d_max_hbonds );
+        cudaCheckError( );
+
         Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds,
                 system->total_cap );
         copy_host_device( &system->total_hbonds, system->d_total_hbonds, sizeof(int), 
                 cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
     }
-    else if ( step == 0 )
+    else if ( step == 0 && (system->numH == 0 || control->hbond_cut <= 0.0) )
     {
 #if defined(DEBUG_FOCUS)
         if ( system->numH == 0 )
@@ -1554,8 +1751,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         reax_list **lists, output_controls *out_control ) 
 {
-    int renbr, blocks, ret, realloc_bonds, realloc_hbonds, realloc_cm;
-    static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE;
+    int renbr, blocks, ret, realloc_cm, realloc_bonds, realloc_hbonds;
+    static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE, hbonds_done = FALSE;
 #if defined(LOG_PERFORMANCE)
     float time_elapsed;
     cudaEvent_t time_event[4];
@@ -1578,12 +1775,12 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 //    cudaCheckError( );
 
     /* reset reallocation flags on device */
+    cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_cm_entries" );
     cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), 
             "Cuda_Init_Forces::d_realloc_bonds" );
     cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), 
             "Cuda_Init_Forces::d_realloc_hbonds" );
-    cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
-            "Cuda_Init_Forces::d_realloc_cm_entries" );
 
 #if defined(LOG_PERFORMANCE)
     cudaEventRecord( time_event[0] );
@@ -1591,11 +1788,12 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
     if ( renbr == FALSE && dist_done == FALSE )
     {
+//        k_init_dist <<< control->blocks_n, control->block_size_n >>>
+//            ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
+
         blocks = system->N * 32 / DEF_BLOCK_SIZE
             + (system->N * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-//        k_init_dist <<< control->blocks_n, control->block_size_n >>>
-//            ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
         k_init_dist_opt <<< blocks, DEF_BLOCK_SIZE >>>
             ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
         cudaCheckError( );
@@ -1615,6 +1813,8 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
     if ( cm_done == FALSE )
     {
+        Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H );
+
         if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
         {
             if ( control->tabulate <= 0 )
@@ -1672,13 +1872,46 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
     if ( bonds_done == FALSE )
     {
+        blocks = system->total_cap / DEF_BLOCK_SIZE
+            + ((system->total_cap % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
+        k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE >>>
+            ( *(workspace->d_workspace), system->total_cap );
+        cudaCheckError( );
+
+        Cuda_Init_Bond_Indices( system, lists[BONDS] );
+
         k_init_bonds <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, system->reax_param.d_sbp,
               system->reax_param.d_tbp, *(workspace->d_workspace),
               (control_params *) control->d_control_params,
-              *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
-              workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
-              system->d_max_bonds, system->d_realloc_bonds,
+              *(lists[FAR_NBRS]), *(lists[BONDS]),
+              system->n, system->N, system->reax_param.num_atom_types,
+              system->d_max_bonds, system->d_realloc_bonds );
+
+//        blocks = control->block_size_n * 32 / DEF_BLOCK_SIZE
+//            + (control->block_size_n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
+//
+//        k_init_bonds_opt <<< blocks, DEF_BLOCK_SIZE,
+//                     sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32) >>>
+//            ( system->d_my_atoms, system->reax_param.d_sbp,
+//              system->reax_param.d_tbp, *(workspace->d_workspace),
+//              (control_params *) control->d_control_params,
+//              *(lists[FAR_NBRS]), *(lists[BONDS]),
+//              system->n, system->N, system->reax_param.num_atom_types,
+//              system->d_max_bonds, system->d_realloc_bonds );
+        cudaCheckError( );
+    }
+
+    if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE )
+    {
+        Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS] );
+
+        k_init_hbonds <<< control->blocks_n, control->block_size_n >>>
+            ( system->d_my_atoms, system->reax_param.d_sbp,
+              (control_params *) control->d_control_params,
+              *(lists[FAR_NBRS]), *(lists[HBONDS]),
+              system->n, system->N, system->reax_param.num_atom_types,
               system->d_max_hbonds, system->d_realloc_hbonds );
         cudaCheckError( );
     }
@@ -1731,17 +1964,21 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     }
 #endif
 
-    ret = (realloc_cm == FALSE && realloc_bonds == FALSE && realloc_hbonds == FALSE)
-        ? SUCCESS : FAILURE;
+    ret = (realloc_cm == FALSE && realloc_bonds == FALSE && realloc_hbonds == FALSE
+            ? SUCCESS : FAILURE);
 
     if ( realloc_cm == FALSE )
     {
         cm_done = TRUE;
     }
-    if ( realloc_bonds == FALSE && realloc_hbonds == FALSE )
+    if ( realloc_bonds == FALSE )
     {
         bonds_done = TRUE;
     }
+    if ( realloc_hbonds == FALSE )
+    {
+        hbonds_done = TRUE;
+    }
 
     if ( ret == SUCCESS )
     {
@@ -1769,17 +2006,18 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         dist_done = FALSE;
         cm_done = FALSE;
         bonds_done = FALSE;
+        hbonds_done = FALSE;
     }
     else
     {
         Cuda_Estimate_Storages( system, control, workspace, lists,
-               realloc_bonds, realloc_hbonds, realloc_cm,
+               realloc_cm, realloc_bonds, realloc_hbonds,
                data->step - data->prev_steps );
 
         /* schedule reallocations after updating allocation sizes */
+        workspace->d_workspace->realloc.cm = realloc_cm;
         workspace->d_workspace->realloc.bonds = realloc_bonds;
         workspace->d_workspace->realloc.hbonds = realloc_hbonds;
-        workspace->d_workspace->realloc.cm = realloc_cm;
     }
 
     return ret;
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index 68cef63a..1c4ee61d 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -168,15 +168,15 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control,
 
     workspace->realloc.far_nbrs = FALSE;
     workspace->realloc.cm = FALSE;
-    workspace->realloc.hbonds = FALSE;
     workspace->realloc.bonds = FALSE;
+    workspace->realloc.hbonds = FALSE;
     workspace->realloc.thbody = FALSE;
     workspace->realloc.gcell_atoms = 0;
 
     workspace->d_workspace->realloc.far_nbrs = FALSE;
     workspace->d_workspace->realloc.cm = FALSE;
-    workspace->d_workspace->realloc.hbonds = FALSE;
     workspace->d_workspace->realloc.bonds = FALSE;
+    workspace->d_workspace->realloc.hbonds = FALSE;
     workspace->d_workspace->realloc.thbody = FALSE;
     workspace->d_workspace->realloc.gcell_atoms = 0;
 
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index 8a32018a..9a4a50e6 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -332,11 +332,11 @@ int Cuda_Velocity_Verlet_NVE( reax_system *system, control_params *control,
             Cuda_Copy_Grid_Host_to_Device( &system->my_grid, &system->d_my_grid );
         }
 
+        Cuda_Reset( system, control, data, workspace, lists );
+
         cuda_copy = TRUE;
     }
 
-    Cuda_Reset( system, control, data, workspace, lists );
-
     if ( renbr == TRUE && gen_nbr_list == FALSE )
     {
         ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
@@ -430,11 +430,11 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
             Cuda_Copy_Grid_Host_to_Device( &system->my_grid, &system->d_my_grid );
         }
 
+        Cuda_Reset( system, control, data, workspace, lists );
+
         cuda_copy = TRUE;
     }
 
-    Cuda_Reset( system, control, data, workspace, lists );
-
     if ( renbr == TRUE && gen_nbr_list == FALSE )
     {
         ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
@@ -561,11 +561,11 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
             Cuda_Copy_Grid_Host_to_Device( &system->my_grid, &system->d_my_grid );
         }
 
+        Cuda_Reset( system, control, data, workspace, lists );
+
         cuda_copy = TRUE;
     }
 
-    Cuda_Reset( system, control, data, workspace, lists );
-
     if ( renbr == TRUE && gen_nbr_list == FALSE )
     {
         ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
@@ -675,11 +675,11 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
             Cuda_Copy_Grid_Host_to_Device( &system->my_grid, &system->d_my_grid );
         }
 
+        Cuda_Reset( system, control, data, workspace, lists );
+
         cuda_copy = TRUE;
     }
 
-    Cuda_Reset( system, control, data, workspace, lists );
-
     if ( renbr == TRUE && gen_nbr_list == FALSE )
     {
         ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index e94cd26b..1ef460f3 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -47,8 +47,9 @@ CUDA_DEVICE real Cuda_DistSqr_to_Special_Point( rvec cp, rvec x )
 }
 
 
-/* Generate far neighbor lists by scanning the atoms list and applying cutoffs */
-CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, 
+/* Generate far neighbor lists in full format
+ * by scanning the atoms list and applying cutoffs */
+CUDA_GLOBAL void k_generate_neighbor_lists_full( reax_atom *my_atoms, 
         simulation_box my_ext_box, grid g, reax_list far_nbr_list,
         int n, int N, int *far_nbrs, int *max_far_nbrs, int *realloc_far_nbrs )
 {
@@ -113,15 +114,16 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
         ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
 
         /* if neighboring grid cell is further in the "positive" direction AND within cutoff */
-        if ( g.str[index_grid_3d(i, j, k, &g)] <= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]
-                && Cuda_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff )
+        if ( //g.str[index_grid_3d(i, j, k, &g)] <= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && 
+                Cuda_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff )
         {
             /* pick up another atom from the neighbor cell */
             for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
                     m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
             {
                 /* prevent recounting same pairs within a gcell */
-                if ( l < m )
+//                if ( l < m )
+                if ( l != m )
                 {
                     atom2 = &my_atoms[m];
                     dvec[0] = atom2->x[0] - atom1->x[0];
@@ -148,53 +150,11 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
         ++itr;
     }   
 
-    /* scan neighboring grid cells within cutoff */
-    itr = 0;
-    while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 )
-    { 
-        ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
-        cutoff = SQR( g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] );
-
-        /* if neighboring grid cell is further in the "negative" direction AND within cutoff */
-        if ( g.str[index_grid_3d(i, j, k, &g)] >= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-                Cuda_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff )
-        {
-            /* pick up another atom from the neighbor cell */
-            for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-                    m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
-            {
-                /* prevent recounting same pairs within a gcell */
-                if ( l > m )
-                {
-                    atom2 = &my_atoms[m];
-                    dvec[0] = atom1->x[0] - atom2->x[0];
-                    dvec[1] = atom1->x[1] - atom2->x[1];
-                    dvec[2] = atom1->x[2] - atom2->x[2];
-                    d = rvec_Norm_Sqr( dvec );
-
-                    if ( d <= cutoff )
-                    {
-                        /* commit far neighbor to list */
-                        far_nbr_list.far_nbr_list.nbr[num_far] = m;
-                        far_nbr_list.far_nbr_list.d[num_far] = SQRT( d );
-                        rvec_Copy( far_nbr_list.far_nbr_list.dvec[num_far], dvec );
-                        ivec_ScaledSum( far_nbr_list.far_nbr_list.rel_box[num_far],
-                                1, g.rel_box[ index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ], 
-                                -1, g.rel_box[ index_grid_3d(i, j, k, &g) ] );
-
-                        ++num_far;
-                    }
-                }   
-            }
-        }
-
-        ++itr;
-    }   
-
     Set_End_Index( l, num_far, &far_nbr_list );
 
-    /* reallocation check */
     my_num_far = num_far - Start_Index( l, &far_nbr_list );
+
+    /* reallocation check */
     if ( my_num_far > max_far_nbrs[l] )
     {
         *realloc_far_nbrs = TRUE;
@@ -424,8 +384,8 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms,
 }
 
 
-/* Estimate the number of far neighbors per atom (GPU) */
-CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms, 
+/* Estimate the number of far neighbors in full format per atom */
+CUDA_GLOBAL void k_estimate_neighbors_full( reax_atom *my_atoms, 
         simulation_box my_ext_box, grid g, int n, int N, int total_cap,
         int *far_nbrs, int *max_far_nbrs )
 {
@@ -500,7 +460,8 @@ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms,
                         m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
                 {
                     /* prevent recounting same pairs within a gcell */
-                    if ( l < m )
+//                    if ( l < m )
+                    if ( l != m )
                     {
                         atom2 = &my_atoms[m];
                         dvec[0] = atom2->x[0] - atom1->x[0];
@@ -518,38 +479,6 @@ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms,
             ++itr;
 
         }   
-
-        itr = 0;
-        while ( g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)][0] >= 0 )
-        {
-            ivec_Copy( nbrs_x, g.nbrs_x[index_grid_nbrs(i, j, k, itr, &g)] );
-            cutoff = SQR( g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] );
-
-            if ( g.str[index_grid_3d(i, j, k, &g)] >= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-                    Cuda_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)],atom1->x) <= cutoff ) 
-            {
-                /* pick up another atom from the neighbor cell */
-                for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-                        m < g.end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
-                {
-                    /* prevent recounting same pairs within a gcell */
-                    if ( l > m )
-                    {
-                        atom2 = &my_atoms[m];
-                        dvec[0] = atom2->x[0] - atom1->x[0];
-                        dvec[1] = atom2->x[1] - atom1->x[1];
-                        dvec[2] = atom2->x[2] - atom1->x[2];
-                        d = rvec_Norm_Sqr( dvec );
-
-                        if ( d <= cutoff )
-                        { 
-                            num_far++;
-                        }
-                    }   
-                }
-            }
-            ++itr;
-        }   
     }
     else
     {
@@ -574,6 +503,8 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
     {
         cudaEventCreate( &time_event[i] );
     }
+
+    cudaEventRecord( time_event[0] );
 #endif
 
     /* reset reallocation flag on device */
@@ -586,11 +517,7 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
     blocks = (system->N / NBRS_BLOCK_SIZE) +
         ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
 
-#if defined(LOG_PERFORMANCE)
-    cudaEventRecord( time_event[0] );
-#endif
-
-    k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>>
+    k_generate_neighbor_lists_full <<< blocks, NBRS_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->my_ext_box,
           system->d_my_grid, *(lists[FAR_NBRS]),
           system->n, system->N,
@@ -607,10 +534,6 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 //              *(lists[FAR_NBRS]), system->n, system->N );
 //    cudaCheckError( );
 
-#if defined(LOG_PERFORMANCE)
-    cudaEventRecord( time_event[1] );
-#endif
-
     /* check reallocation flag on device */
     copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
@@ -619,6 +542,8 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
     workspace->d_workspace->realloc.far_nbrs = ret_far_nbr;
 
 #if defined(LOG_PERFORMANCE)
+    cudaEventRecord( time_event[1] );
+
     if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
     {
         cudaEventSynchronize( time_event[0] );
@@ -644,15 +569,21 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
 {
     int blocks;
 #if defined(LOG_PERFORMANCE)
-    double time;
+    float time_elapsed;
+    cudaEvent_t time_event[2];
     
-    time = Get_Time( );
+    for ( int i = 0; i < 2; ++i )
+    {
+        cudaEventCreate( &time_event[i] );
+    }
+
+    cudaEventRecord( time_event[0] );
 #endif
 
     blocks = system->total_cap / DEF_BLOCK_SIZE
         + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-    k_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>>
+    k_estimate_neighbors_full <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
           system->n, system->N, system->total_cap,
           system->d_far_nbrs, system->d_max_far_nbrs );
@@ -664,6 +595,19 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
             cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" );
 
 #if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.nbrs );
+    cudaEventRecord( time_event[1] );
+
+    if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[0] );
+    }
+
+    if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[1] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); 
+    data->timing.nbrs += (real) (time_elapsed / 1000.0);
 #endif
 }
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 15e56ea8..c619f3eb 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -59,8 +59,11 @@ CUDA_GLOBAL void k_compute_polarization_energy( reax_atom *my_atoms,
 }
 
 
-/* one thread per atom implementation */
-CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, 
+/* Compute energies and forces due to van der Waals and Coulomb interactions
+ * where the far neighbors list is in full format
+ *
+ * This implementation assigns one thread per atom */
+CUDA_GLOBAL void k_vdW_coulomb_energy_full( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, int n, int num_atom_types, 
         real *e_vdW_g, real *e_ele_g )
@@ -99,7 +102,6 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
         j = far_nbr_list.far_nbr_list.nbr[pj];
         orig_j = my_atoms[j].orig_id;
 
-        //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
         if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut 
                 && orig_i < orig_j )
         {
@@ -187,14 +189,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
                     * (r_ij * r_ij) / POW( dr3gamij_1, 4.0 / 3.0 );
             CEclmb = self_coef * (de_clb * Tap + e_clb * dTap);
 
-            if ( i < j ) 
-            {
-                rvec_Scale( temp, -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
-            else 
-            {
-                rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
+            rvec_Scale( temp, -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
             rvec_Add( f_i_l, temp );
             rvec_Scale( temp, -1.0, temp );
             atomic_rvecAdd( workspace.f[j], temp );
@@ -212,8 +207,11 @@ CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms,
 }
 
 
-/* one thread per atom implementation */
-CUDA_GLOBAL void k_vdW_coulomb_virial_energy( reax_atom *my_atoms, 
+/* Compute virial terms, energies, and forces due to van der Waals and Coulomb interactions
+ * where the far neighbors list is in full format
+ *
+ * This implementation assigns one thread per atom */
+CUDA_GLOBAL void k_vdW_coulomb_energy_virial_full( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, int n, int num_atom_types, 
         real *e_vdW_g, real *e_ele_g, rvec *ext_press_g )
@@ -253,7 +251,6 @@ CUDA_GLOBAL void k_vdW_coulomb_virial_energy( reax_atom *my_atoms,
         j = far_nbr_list.far_nbr_list.nbr[pj];
         orig_j = my_atoms[j].orig_id;
 
-        //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
         if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut 
                 && orig_i < orig_j )
         {
@@ -343,16 +340,8 @@ CUDA_GLOBAL void k_vdW_coulomb_virial_energy( reax_atom *my_atoms,
 
             /* for pressure coupling, terms not related to bond order 
                derivatives are added directly into pressure vector/tensor */
-            if ( i < j ) 
-            {
-                rvec_Scale( temp,
-                        -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
-            else 
-            {
-                rvec_Scale( temp,
-                        (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
+            rvec_Scale( temp,
+                    -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
             rvec_Add( f_i_l, temp );
             rvec_Scale( temp, -1.0, temp );
             atomic_rvecAdd( workspace.f[j], temp );
@@ -376,8 +365,11 @@ CUDA_GLOBAL void k_vdW_coulomb_virial_energy( reax_atom *my_atoms,
 }
 
 
-/* one warp of threads per atom implementation */
-CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, 
+/* Compute energies and forces due to van der Waals and Coulomb interactions
+ * where the far neighbors list is in full format
+ *
+ * This implementation assigns one warp of threads per atom */
+CUDA_GLOBAL void k_vdW_coulomb_energy_full_opt( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, int n, int num_atom_types, 
         real *e_vdW_g, real *e_ele_g )
@@ -423,7 +415,6 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
         j = far_nbr_list.far_nbr_list.nbr[pj];
         orig_j = my_atoms[j].orig_id;
 
-        //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
         if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut 
                 && orig_i < orig_j )
         {
@@ -511,14 +502,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
                     * (r_ij * r_ij) / POW( dr3gamij_1, 4.0 / 3.0 );
             CEclmb = self_coef * (de_clb * Tap + e_clb * dTap);
 
-            if ( i < j ) 
-            {
-                rvec_Scale( temp, -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
-            else 
-            {
-                rvec_Scale( temp, (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
+            rvec_Scale( temp, -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
             rvec_Add( f_i_l, temp );
             rvec_Scale( temp, -1.0, temp );
             atomic_rvecAdd( workspace.f[j], temp );
@@ -548,8 +532,11 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
 }
 
 
-/* one warp of threads per atom implementation */
-CUDA_GLOBAL void k_vdW_coulomb_energy_virial_opt( reax_atom *my_atoms, 
+/* Compute virial terms, energies, and forces due to van der Waals and Coulomb interactions
+ * where the far neighbors list is in full format
+ *
+ * This implementation assigns one warp of threads per atom */
+CUDA_GLOBAL void k_vdW_coulomb_energy_virial_full_opt( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, int n, int num_atom_types, 
         real *e_vdW_g, real *e_ele_g, rvec *ext_press_g )
@@ -596,7 +583,6 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_virial_opt( reax_atom *my_atoms,
         j = far_nbr_list.far_nbr_list.nbr[pj];
         orig_j = my_atoms[j].orig_id;
 
-        //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
         if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut 
                 && orig_i < orig_j )
         {
@@ -686,16 +672,8 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_virial_opt( reax_atom *my_atoms,
 
             /* for pressure coupling, terms not related to bond order 
                derivatives are added directly into pressure vector/tensor */
-            if ( i < j ) 
-            {
-                rvec_Scale( temp,
-                        -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
-            else 
-            {
-                rvec_Scale( temp,
-                        (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-            }
+            rvec_Scale( temp,
+                    -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
             rvec_Add( f_i_l, temp );
             rvec_Scale( temp, -1.0, temp );
             atomic_rvecAdd( workspace.f[j], temp );
@@ -732,7 +710,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_virial_opt( reax_atom *my_atoms,
 
 
 /* one thread per atom implementation */
-CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_vdW_coulomb_energy_tab_full( reax_atom *my_atoms, 
         global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, 
         LR_lookup_table *t_LR, int n, int num_atom_types, 
@@ -773,7 +751,6 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
         j = far_nbr_list.far_nbr_list.nbr[pj];
         orig_j = my_atoms[j].orig_id;
 
-        //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
         if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut
                 && orig_i < orig_j )
         {
@@ -812,16 +789,8 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
 
             if ( control->virial == 0 )
             {
-                if ( i < j ) 
-                {
-                    rvec_ScaledAdd( temp,
-                            -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-                }
-                else 
-                {
-                    rvec_ScaledAdd( temp,
-                            (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-                }
+                rvec_ScaledAdd( temp,
+                        -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 rvec_Add( f_i_l, temp );
                 rvec_Scale( temp, -1.0, temp );
                 atomic_rvecAdd( workspace.f[j], temp );
@@ -831,16 +800,8 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms,
             {
                 /* for pressure coupling, terms not related to bond order derivatives
                    are added directly into pressure vector/tensor */
-                if ( i < j ) 
-                {
-                    rvec_Scale( temp,
-                            -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-                }
-                else 
-                {
-                    rvec_Scale( temp,
-                            (CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
-                }
+                rvec_Scale( temp,
+                        -(CEvd + CEclmb) / r_ij, far_nbr_list.far_nbr_list.dvec[pj] );
                 rvec_Add( f_i_l, temp );
                 rvec_ScaledAdd( temp, -1.0, temp );
                 atomic_rvecAdd( workspace.f[j], temp );
@@ -948,7 +909,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     {
         if ( control->virial == 1 )
         {
-//            k_vdW_coulomb_energy_virial <<< control->blocks, control->block_size >>>
+//            k_vdW_coulomb_energy_virial_full <<< control->blocks, control->block_size >>>
 //                ( system->d_my_atoms, system->reax_param.d_tbp, 
 //                  system->reax_param.d_gp, (control_params *) control->d_control_params, 
 //                  *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -962,7 +923,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 //#endif
 //            );
 
-        k_vdW_coulomb_energy_virial_opt <<< blocks, DEF_BLOCK_SIZE,
+        k_vdW_coulomb_energy_virial_full_opt <<< blocks, DEF_BLOCK_SIZE,
                                  sizeof(real) * (DEF_BLOCK_SIZE / 32) >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
               system->reax_param.d_gp, (control_params *) control->d_control_params, 
@@ -979,7 +940,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
         }
         else
         {
-//            k_vdW_coulomb_energy <<< control->blocks, control->block_size >>>
+//            k_vdW_coulomb_energy_full <<< control->blocks, control->block_size >>>
 //                ( system->d_my_atoms, system->reax_param.d_tbp, 
 //                  system->reax_param.d_gp, (control_params *) control->d_control_params, 
 //                  *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -992,7 +953,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 //#endif
 //                );
 
-        k_vdW_coulomb_energy_opt <<< blocks, DEF_BLOCK_SIZE,
+        k_vdW_coulomb_energy_full_opt <<< blocks, DEF_BLOCK_SIZE,
                                  sizeof(real) * (DEF_BLOCK_SIZE / 32) >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
               system->reax_param.d_gp, (control_params *) control->d_control_params, 
@@ -1010,7 +971,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     }
     else
     {
-        k_vdW_coulomb_energy_tab <<< control->blocks, control->block_size >>>
+        k_vdW_coulomb_energy_tab_full <<< control->blocks, control->block_size >>>
             ( system->d_my_atoms, system->reax_param.d_gp, 
               (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
index 99d55dd0..532d643c 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -20,9 +20,7 @@ CUDA_GLOBAL void k_reset_workspace( storage workspace, int N )
         return;
     }
 
-    workspace.total_bond_order[i] = 0.0;
     workspace.CdDelta[i] = 0.0;
-    rvec_MakeZero( workspace.dDeltap_self[i] );
     rvec_MakeZero( workspace.f[i] );
 }
 
@@ -39,18 +37,22 @@ CUDA_GLOBAL void k_reset_hindex( reax_atom *my_atoms, single_body_parameters *sb
         return;
     }
 
+    my_atoms[i].Hindex = i;
+
     if ( sbp[ my_atoms[i].type ].p_hbond == H_ATOM
             || sbp[ my_atoms[i].type ].p_hbond == H_BONDING_ATOM )
     {
+#if !defined(CUDA_ACCUM_ATOMIC)
         hindex[i] = 1;
     }
     else
     {
         hindex[i] = 0;
     }
-
-//    my_atoms[i].Hindex = hindex[i];
-    my_atoms[i].Hindex = i;
+#else
+        atomicAdd( hindex, 1 );
+    }
+#endif
 }
 
 void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
@@ -69,18 +71,28 @@ void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
 void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *control,
         storage *workspace )
 {
+#if !defined(CUDA_ACCUM_ATOMIC)
     int *hindex;
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(int) * system->N,
+            sizeof(int) * system->total_cap,
             "Cuda_Reset_Atoms_HBond_Indices::workspace->scratch" );
     hindex = (int *) workspace->scratch;
+#endif
 
     k_reset_hindex <<< control->blocks_n, control->block_size_n >>>
-        ( system->d_my_atoms, system->reax_param.d_sbp, hindex, system->N );
+        ( system->d_my_atoms, system->reax_param.d_sbp, 
+#if !defined(CUDA_ACCUM_ATOMIC)
+          hindex, 
+#else
+          system->d_numH,
+#endif
+          system->total_cap );
     cudaCheckError( );
 
+#if !defined(CUDA_ACCUM_ATOMIC)
     Cuda_Reduction_Sum( hindex, system->d_numH, system->N );
+#endif
 
     copy_host_device( &system->numH, system->d_numH, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Reset_Atoms_HBond_Indices::d_numH" );
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index e31dc56d..fca9236d 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -43,7 +43,6 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
     int i, j, pi, k, pk, t;
     int type_i, type_j, type_k;
     int start_j, end_j;
-//    int start_pk, end_pk;
     int cnt, num_thb_intrs;
     real temp, temp_bo_jt, pBOjt7;
     real p_val1, p_val2, p_val3, p_val4, p_val5;
@@ -65,7 +64,6 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
     three_body_header *thbh;
     three_body_parameters *thbp;
     three_body_interaction_data *p_ijk;
-//    three_body_interaction_data *p_kji;
     bond_data *pbond_ij, *pbond_jk, *pbond_jt;
     bond_order_data *bo_ij, *bo_jk, *bo_jt;
 
@@ -86,7 +84,6 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
     p_val8 = gp.l[33];
     p_val9 = gp.l[16];
     p_val10 = gp.l[17];
-    //num_thb_intrs = j * THREE_BODY_OFFSET;
     e_ang_l = 0.0;
     e_coa_l = 0.0;
     e_pen_l = 0.0;
@@ -165,41 +162,7 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
             i = pbond_ij->nbr;
             type_i = my_atoms[i].type;
 
-            /* first copy 3-body intrs from previously computed ones where i > k.
-             * IMPORTANT: if it is less costly to compute theta and its
-             * derivative, we should definitely re-compute them,
-             * instead of copying!
-             * in the second for-loop below, we compute only new 3-body intrs
-             * where i < k */
-            // The copy loop commented out because strange asynchronous issues started to surface
-            // Each kernel now manually generates everything
-//            for( pk = start_j; pk < pi; ++pk )
-//            {
-//                start_pk = Start_Index( pk, &thb_list );
-//                end_pk = End_Index( pk, &thb_list );
-//
-//                for( t = start_pk; t < end_pk; ++t )
-//                {
-//                    if( thb_list.three_body_list[t].thb == i )
-//                    {
-//                        p_ijk = &thb_list.three_body_list[num_thb_intrs];
-//                        p_kji = &thb_list.three_body_list[t];
-//
-//                        p_ijk->thb = bond_list.bond_list[pk].nbr;
-//                        p_ijk->pthb  = pk;
-//                        p_ijk->theta = p_kji->theta;
-//                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-//                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-//                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-//
-//                        ++num_thb_intrs;
-//                        break;
-//                    }
-//                }
-//            }
-
-            /* and this is the second for loop mentioned above */
-            // Except that now the loop goes all the way from start_j to end_j
+            /* compute _ALL_ 3-body intrs */
             for ( pk = start_j; pk < end_j; ++pk )
             {
                 if ( pk == pi )
@@ -332,11 +295,9 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
                             - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3
                                 + p_pen4 * exp_pen4 )) / SQR( trm_pen34 );
 
-                    /* very important: since each kernel generates all interactions,
-                     * need to prevent all energies becoming duplicates */
+                    e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
                     if ( pk < pi )
                     {
-                        e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
                         e_pen_l += e_pen;
                     }
 
@@ -355,7 +316,7 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
                         * EXP( -p_coa3 * SQR(workspace.total_bond_order[i] - BOA_ij) )
                         * EXP( -p_coa3 * SQR(workspace.total_bond_order[k] - BOA_jk) )
                         / (1.0 + exp_coa2);
-                    /* similar to above comment regarding if statement */
+
                     if ( pk < pi )
                     {
                         e_coa_l += e_coa;
@@ -368,7 +329,6 @@ CUDA_GLOBAL void k_valence_angles_part1( reax_atom *my_atoms,
                     CEcoa5 = -2.0 * p_coa3 * (workspace.total_bond_order[k] - BOA_jk) * e_coa;
 
                     /* calculate force contributions */
-                    /* we must again check for pk < pi for entire forces part */
                     if ( pk < pi )
                     {
 #if !defined(CUDA_ACCUM_ATOMIC)
@@ -446,7 +406,6 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
     int i, j, pi, k, pk, t;
     int type_i, type_j, type_k;
     int start_j, end_j;
-//    int start_pk, end_pk;
     int cnt, num_thb_intrs;
     real temp, temp_bo_jt, pBOjt7;
     real p_val1, p_val2, p_val3, p_val4, p_val5;
@@ -468,7 +427,6 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
     three_body_header *thbh;
     three_body_parameters *thbp;
     three_body_interaction_data *p_ijk;
-//    three_body_interaction_data *p_kji;
     bond_data *pbond_ij, *pbond_jk, *pbond_jt;
     bond_order_data *bo_ij, *bo_jk, *bo_jt;
 
@@ -489,7 +447,6 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
     p_val8 = gp.l[33];
     p_val9 = gp.l[16];
     p_val10 = gp.l[17];
-    //num_thb_intrs = j * THREE_BODY_OFFSET;
     e_ang_l = 0.0;
     e_coa_l = 0.0;
     e_pen_l = 0.0;
@@ -569,41 +526,7 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
             i = pbond_ij->nbr;
             type_i = my_atoms[i].type;
 
-            /* first copy 3-body intrs from previously computed ones where i > k.
-             * IMPORTANT: if it is less costly to compute theta and its
-             * derivative, we should definitely re-compute them,
-             * instead of copying!
-             * in the second for-loop below, we compute only new 3-body intrs
-             * where i < k */
-            // The copy loop commented out because strange asynchronous issues started to surface
-            // Each kernel now manually generates everything
-//            for( pk = start_j; pk < pi; ++pk )
-//            {
-//                start_pk = Start_Index( pk, &thb_list );
-//                end_pk = End_Index( pk, &thb_list );
-//
-//                for( t = start_pk; t < end_pk; ++t )
-//                {
-//                    if( thb_list.three_body_list[t].thb == i )
-//                    {
-//                        p_ijk = &thb_list.three_body_list[num_thb_intrs];
-//                        p_kji = &thb_list.three_body_list[t];
-//
-//                        p_ijk->thb = bond_list.bond_list[pk].nbr;
-//                        p_ijk->pthb  = pk;
-//                        p_ijk->theta = p_kji->theta;
-//                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-//                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-//                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-//
-//                        ++num_thb_intrs;
-//                        break;
-//                    }
-//                }
-//            }
-
-            /* and this is the second for loop mentioned above */
-            // Except that now the loop goes all the way from start_j to end_j
+            /* compute _ALL_ 3-body intrs */
             for ( pk = start_j; pk < end_j; ++pk )
             {
                 if ( pk == pi )
@@ -736,11 +659,9 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
                             - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3
                                 + p_pen4 * exp_pen4 )) / SQR( trm_pen34 );
 
-                    /* very important: since each kernel generates all interactions,
-                     * need to prevent all energies becoming duplicates */
+                    e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
                     if ( pk < pi )
                     {
-                        e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
                         e_pen_l += e_pen;
                     }
 
@@ -759,7 +680,7 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
                         * EXP( -p_coa3 * SQR(workspace.total_bond_order[i] - BOA_ij) )
                         * EXP( -p_coa3 * SQR(workspace.total_bond_order[k] - BOA_jk) )
                         / (1.0 + exp_coa2);
-                    /* similar to above comment regarding if statement */
+
                     if ( pk < pi )
                     {
                         e_coa_l += e_coa;
@@ -772,7 +693,6 @@ CUDA_GLOBAL void k_valence_angles_virial_part1( reax_atom *my_atoms,
                     CEcoa5 = -2.0 * p_coa3 * (workspace.total_bond_order[k] - BOA_jk) * e_coa;
 
                     /* calculate force contributions */
-                    /* we must again check for pk < pi for entire forces part */
                     if ( pk < pi )
                     {
 #if !defined(CUDA_ACCUM_ATOMIC)
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index 29e220c7..bca63ec9 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -27,6 +27,7 @@
 #if defined(PURE_REAX)
   #include "forces.h"
 
+  #include "allocate.h"
   #include "bond_orders.h"
   #include "bonds.h"
   #include "basic_comm.h"
@@ -45,6 +46,7 @@
 #elif defined(LAMMPS_REAX)
   #include "reax_forces.h"
 
+  #include "reax_allocate.h"
   #include "reax_bond_orders.h"
   #include "reax_bonds.h"
   #include "reax_basic_comm.h"
@@ -435,32 +437,27 @@ static void Init_Distance( reax_system *system, control_params *control,
 {
     int i, j, pj;
     int start_i, end_i;
-    int renbr;
     reax_list *far_nbr_list;
     reax_atom *atom_i, *atom_j;
 
     far_nbr_list = lists[FAR_NBRS];
-    renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE;
 
-    if ( renbr == FALSE )
+    for ( i = 0; i < system->N; ++i )
     {
-        for ( i = 0; i < system->N; ++i )
-        {
-            atom_i = &system->my_atoms[i];
-            start_i = Start_Index( i, far_nbr_list );
-            end_i = End_Index( i, far_nbr_list );
+        atom_i = &system->my_atoms[i];
+        start_i = Start_Index( i, far_nbr_list );
+        end_i = End_Index( i, far_nbr_list );
 
-            /* update distance and displacement vector between atoms i and j (i-j) */
-            for ( pj = start_i; pj < end_i; ++pj )
-            {
-                j = far_nbr_list->far_nbr_list.nbr[pj];
-                atom_j = &system->my_atoms[j];
-                
-                far_nbr_list->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
-                far_nbr_list->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
-                far_nbr_list->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
-                far_nbr_list->far_nbr_list.d[pj] = rvec_Norm( far_nbr_list->far_nbr_list.dvec[pj] );
-            }
+        /* update distance and displacement vector between atoms i and j (i-j) */
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list->far_nbr_list.nbr[pj];
+            atom_j = &system->my_atoms[j];
+            
+            far_nbr_list->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
+            far_nbr_list->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
+            far_nbr_list->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
+            far_nbr_list->far_nbr_list.d[pj] = rvec_Norm( far_nbr_list->far_nbr_list.dvec[pj] );
         }
     }
 }
@@ -672,12 +669,13 @@ static void Init_CM_Half_NT( reax_system *system, control_params *control,
         }
     }
 
-    /* reallocation checks */
+    /* reallocation check */
     for ( i = 0; i < system->N; ++i )
     {
         if ( i < system->n && H->end[i] - H->start[i] > system->max_cm_entries[i] )
         {
             workspace->realloc.cm = TRUE;
+            break;
         }
     }
 }
@@ -887,12 +885,13 @@ static void Init_CM_Full_NT( reax_system *system, control_params *control,
         }
     }
 
-    /* reallocation checks */
+    /* reallocation check */
     for ( i = 0; i < system->N; ++i )
     {
         if ( i < system->n && H->end[i] - H->start[i] > system->max_cm_entries[i] )
         {
             workspace->realloc.cm = TRUE;
+            break;
         }
     }
 }
@@ -967,12 +966,13 @@ static void Init_CM_Half_FS( reax_system *system, control_params *control,
         H->end[i] = cm_top;
     }
 
-    /* reallocation checks */
+    /* reallocation check */
     for ( i = 0; i < system->n; ++i )
     {
         if ( H->end[i] - H->start[i] > system->max_cm_entries[i] )
         {
             workspace->realloc.cm = TRUE;
+            break;
         }
     }
 }
@@ -1038,12 +1038,13 @@ static void Init_CM_Full_FS( reax_system *system, control_params *control,
         H->end[i] = cm_top;
     }
 
-    /* reallocation checks */
+    /* reallocation check */
     for ( i = 0; i < system->n; ++i )
     {
         if ( H->end[i] - H->start[i] > system->max_cm_entries[i] )
         {
             workspace->realloc.cm = TRUE;
+            break;
         }
     }
 }
@@ -1210,14 +1211,18 @@ static void Init_Bond_Half( reax_system *system, control_params *control,
         if ( Num_Entries( i, bond_list ) > system->max_bonds[i] )
         {
             workspace->realloc.bonds = TRUE;
+            break;
         }
+    }
 
-        if ( i < system->n
-                && system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM
+    for ( i = 0; i < system->n; ++i )
+    {
+        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM
                 && Num_Entries( system->my_atoms[i].Hindex, hbond_list )
-                    > system->max_hbonds[system->my_atoms[i].Hindex] )
+                > system->max_hbonds[system->my_atoms[i].Hindex] )
         {
             workspace->realloc.hbonds = TRUE;
+            break;
         }
     }
 }
@@ -1387,14 +1392,18 @@ static void Init_Bond_Full( reax_system *system, control_params *control,
         if ( Num_Entries( i, bond_list ) > system->max_bonds[i] )
         {
             workspace->realloc.bonds = TRUE;
+            break;
         }
+    }
 
-        if ( i < system->n
-                && system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM
+    for ( i = 0; i < system->n; ++i )
+    {
+        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM
                 && Num_Entries( system->my_atoms[i].Hindex, hbond_list )
                 > system->max_hbonds[system->my_atoms[i].Hindex] )
         {
             workspace->realloc.hbonds = TRUE;
+            break;
         }
     }
 
@@ -1449,398 +1458,180 @@ static void Compute_NonBonded_Forces( reax_system * const system,
 }
 
 
-/* this version of Compute_Total_Force computes forces from
- * coefficients accumulated by all interaction functions.
- * Saves enormous time & space! */
-static void Compute_Total_Force( reax_system * const system,
-        control_params * const control,
-        simulation_data * const data, storage * const workspace, reax_list ** const lists,
-        mpi_datatypes * const mpi_data )
+
+static void Estimate_Storages_CM( reax_system * const system, control_params * const control,
+        reax_list ** const lists, int * const matrix_dim, int cm_format )
 {
-    int i, pj;
-    reax_list * const bond_list = lists[BONDS];
+    int i, j, pj;
+    int start_i, end_i;
+    int local;
+    real cutoff;
+    reax_list *far_nbr_list;
+    reax_atom *atom_i, *atom_j;
+#if defined(NEUTRAL_TERRITORY)
+    int mark[6] = {1, 1, 2, 2, 2, 2};
+#endif
 
-    for ( i = 0; i < system->N; ++i )
+    far_nbr_list = lists[FAR_NBRS];
+    *matrix_dim = 0;
+
+    for ( i = 0; i < system->total_cap; ++i )
     {
-        for ( pj = Start_Index(i, bond_list); pj < End_Index(i, bond_list); ++pj )
+        if ( i < system->local_cap )
         {
-            if ( i < bond_list->bond_list[pj].nbr )
-            {
-                if ( control->virial == 0 )
-                {
-                    Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
-                }
-                else
-                {
-                    Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
-                }
-            }
+            system->cm_entries[i] = 0;
         }
     }
 
-#if defined(PURE_REAX)
-    /* now all forces are computed to their partially-final values
-     * based on the neighbors information each processor has had.
-     * final values of force on each atom needs to be computed by adding up
-     * all partially-final pieces */
-    Coll_FS( system, mpi_data, workspace->f, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    for ( i = 0; i < system->n; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
-        rvec_Copy( system->my_atoms[i].f, workspace->f[i] );
-    }
-
-#if defined(TEST_FORCES)
-    Coll_FS( system, mpi_data, workspace->f_ele, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_vdw, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_be, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_lp, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_ov, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_un, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_ang, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_coa, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_pen, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_hb, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_tor, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-    Coll_FS( system, mpi_data, workspace->f_con, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
-#endif
+        atom_i = &system->my_atoms[i];
+        start_i = Start_Index( i, far_nbr_list );
+        end_i = End_Index( i, far_nbr_list );
 
+        if ( i < system->n )
+        {
+            local = 1;
+            cutoff = control->nonb_cut;
+            ++system->cm_entries[i];
+            ++(*matrix_dim);
+        }
+#if defined(NEUTRAL_TERRITORY)
+        else if ( atom_i->nt_dir != -1 )
+        {
+            local = 2;
+            cutoff = control->nonb_cut;
+            ++(*matrix_dim);
+        }
 #endif
-}
+        else
+        {
+            local = 0;
+            cutoff = control->bond_cut;
+        }
 
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list->far_nbr_list.nbr[pj];
 
-static int Init_Forces( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        output_controls *out_control, mpi_datatypes *mpi_data )
-{
-#if defined(LOG_PERFORMANCE)
-    double time;
-    
-    time = Get_Time( );
+#if !defined(NEUTRAL_TERRITORY)
+            if ( far_nbr_list->format == HALF_LIST )
 #endif
+            {
+                atom_j = &system->my_atoms[j];
+            }
 
-    Init_Distance( system, control, data, workspace, lists, out_control );
-
-#if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_dist );
+            if ( far_nbr_list->far_nbr_list.d[pj] <= cutoff )
+            {
+                if ( local == 1 )
+                {
+#if defined(NEUTRAL_TERRITORY)
+                    if( atom_j->nt_dir > 0 || j < system->n )
+                    {
+                        ++system->cm_entries[i];
+                    }
+#else
+                    if ( (far_nbr_list->format == HALF_LIST
+                                && (j < system->n || atom_i->orig_id < atom_j->orig_id))
+                            || far_nbr_list->format == FULL_LIST )
+                    {
+                        ++system->cm_entries[i];
+                    }
 #endif
+                }
 
 #if defined(NEUTRAL_TERRITORY)
-    if ( workspace->H.format == SYM_HALF_MATRIX )
-    {
-        Init_CM_Half_NT( system, control, data, workspace, lists, out_control );
-    }
-    else
-    {
-        Init_CM_Full_NT( system, control, data, workspace, lists, out_control );
-    }
-#else
-    if ( workspace->H.format == SYM_HALF_MATRIX )
-    {
-        Init_CM_Half_FS( system, control, data, workspace, lists, out_control );
+                else if ( local == 2 )
+                {
+                    if( ( atom_j->nt_dir != -1 && mark[atom_i->nt_dir] != mark[atom_j->nt_dir] ) 
+                            || ( j < system->n && atom_i->nt_dir != 0 ))
+                    {
+                        ++system->cm_entries[i];
+                    }
+                }
+#endif
+            }
+        }
     }
-    else
+
+#if defined(NEUTRAL_TERRITORY)
+    /* Since we don't know the NT atoms' position yet, Htop cannot be calculated accurately.
+     * Therefore, we assume it is full and divide 2 if necessary. */
+    if ( cm_format == SYM_HALF_MATRIX )
     {
-        Init_CM_Full_FS( system, control, data, workspace, lists, out_control );
+        for ( i = 0; i < system->local_cap; ++i )
+        {
+            system->cm_entries[i] = (system->cm_entries[i] + system->n + 1) / 2;
+        }
     }
 #endif
 
-#if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_cm );
+#if defined(NEUTRAL_TERRITORY)
+    *matrix_dim = (int) MAX( *matrix_dim * SAFE_ZONE_NT, MIN_CAP );
+#else
+    *matrix_dim = (int) MAX( *matrix_dim * SAFE_ZONE, MIN_CAP );
 #endif
 
-    if ( lists[FAR_NBRS]->format == HALF_LIST )
+    for ( i = 0; i < system->N; ++i )
     {
-        Init_Bond_Half( system, control, data, workspace, lists, out_control );
+        if ( i < system->local_cap )
+        {
+#if defined(NEUTRAL_TERRITORY)
+            system->max_cm_entries[i] = MAX( (int)(system->cm_entries[i] * SAFE_ZONE_NT), MIN_CM_ENTRIES );
+#else
+            system->max_cm_entries[i] = MAX( (int)(system->cm_entries[i] * SAFE_ZONE), MIN_CM_ENTRIES );
+#endif
+        }
     }
-    else
+
+    /* set currently unused space to min. capacity */
+    for ( i = system->N; i < system->local_cap; ++i )
     {
-        Init_Bond_Full( system, control, data, workspace, lists, out_control );
+        system->max_cm_entries[i] = MIN_CM_ENTRIES;
     }
 
-#if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_bond );
-#endif
+    /* reductions to get totals */
+    system->total_cm_entries = 0;
 
-    return (workspace->realloc.bonds == FALSE 
-            && workspace->realloc.hbonds == FALSE
-            && workspace->realloc.cm == FALSE) ? SUCCESS : FAILURE;
+    for ( i = 0; i < system->local_cap; ++i )
+    {
+        system->total_cm_entries += system->max_cm_entries[i];
+    }
+
+    system->total_cm_entries = MAX( system->total_cm_entries, MIN_CAP * MIN_CM_ENTRIES );
 }
 
 
-static int Init_Forces_No_Charges( reax_system * const system, control_params * const control,
-        simulation_data * const data, storage * const workspace, reax_list ** const lists,
-        output_controls * const out_control )
+static void Estimate_Storages_Bonds( reax_system * const system,
+        control_params * const control, reax_list ** const lists )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int btop_i;
-    int ihb, jhb, ihb_top;
-    int local, flag, renbr;
+    int ihb, jhb;
+    int local;
     real cutoff;
-    reax_list *far_nbr_list, *bond_list, *hbond_list;
+    real r_ij;
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    reax_list *far_nbr_list;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
     reax_atom *atom_i, *atom_j;
-    int jhb_top;
-    int start_j, end_j;
-    int btop_j;
+#if defined(NEUTRAL_TERRITORY)
+    int mark[6] = {1, 1, 2, 2, 2, 2};
+#endif
 
     far_nbr_list = lists[FAR_NBRS];
-    bond_list = lists[BONDS];
-    hbond_list = lists[HBONDS];
 
-    for ( i = 0; i < system->n; ++i )
-    {
-        workspace->bond_mark[i] = 0;
-    }
-    for ( i = system->n; i < system->N; ++i )
+    for ( i = 0; i < system->total_cap; ++i )
     {
-        /* put ghost atoms to an infinite distance */
-        workspace->bond_mark[i] = 1000;
+        system->bonds[i] = 0;
     }
 
-    renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE;
-
-    for ( i = 0; i < system->N; ++i )
-    {
-        atom_i = &system->my_atoms[i];
-        type_i = atom_i->type;
-        start_i = Start_Index( i, far_nbr_list );
-        end_i = End_Index( i, far_nbr_list );
-        if ( far_nbr_list->format == HALF_LIST )
-        {
-            /* start at end because other atoms
-             * can add to this atom's list (half-list) */
-            btop_i = End_Index( i, bond_list );
-        }
-        else if ( far_nbr_list->format == FULL_LIST )
-        {
-            btop_i = Start_Index( i, bond_list );
-        }
-        else
-        {
-            btop_i = 0;
-        }
-        sbp_i = &system->reax_param.sbp[type_i];
-        ihb = NON_H_BONDING_ATOM;
-        ihb_top = -1;
-
-        if ( i < system->n )
-        {
-            local = TRUE;
-            cutoff = MAX( control->hbond_cut, control->bond_cut );
-        }
-        else
-        {
-            local = FALSE;
-            cutoff = control->bond_cut;
-        }
-
-        if ( local == TRUE && control->hbond_cut > 0.0 )
-        {
-            ihb = sbp_i->p_hbond;
-
-            if ( ihb == H_ATOM )
-            {
-                if ( far_nbr_list->format == HALF_LIST )
-                {
-                    /* start at end because other atoms
-                     * can add to this atom's list (half-list) */
-                    ihb_top = End_Index( atom_i->Hindex, hbond_list );
-                }
-                else if ( far_nbr_list->format == FULL_LIST )
-                {
-                    ihb_top = Start_Index( atom_i->Hindex, hbond_list );
-                }
-            }
-            else
-            {
-                ihb_top = -1;
-            }
-        }
-
-        /* update i-j distance - check if j is within cutoff */
-        for ( pj = start_i; pj < end_i; ++pj )
-        {
-            j = far_nbr_list->far_nbr_list.nbr[pj];
-            atom_j = &system->my_atoms[j];
-
-            if ( renbr == TRUE )
-            {
-                if ( far_nbr_list->far_nbr_list.d[pj] <= cutoff )
-                {
-                    flag = TRUE;
-                }
-                else
-                {
-                    flag = FALSE;
-                }
-            }
-            else
-            {
-                far_nbr_list->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
-                far_nbr_list->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
-                far_nbr_list->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
-                far_nbr_list->far_nbr_list.d[pj] = rvec_Norm_Sqr( far_nbr_list->far_nbr_list.dvec[pj] );
-
-                if ( far_nbr_list->far_nbr_list.d[pj] <= SQR(cutoff) )
-                {
-                    far_nbr_list->far_nbr_list.d[pj] = SQRT( far_nbr_list->far_nbr_list.d[pj] );
-                    flag = TRUE;
-                }
-                else
-                {
-                    flag = FALSE;
-                }
-            }
-
-            if ( flag == TRUE )
-            {
-                type_j = atom_j->type;
-                sbp_j = &system->reax_param.sbp[type_j];
-                twbp = &system->reax_param.tbp[
-                    index_tbp(type_i, type_j, system->reax_param.num_atom_types) ];
-
-                if ( local == TRUE )
-                {
-                    /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0.0
-                            && (ihb == H_ATOM || ihb == H_BONDING_ATOM)
-                            && far_nbr_list->far_nbr_list.d[pj] <= control->hbond_cut )
-                    {
-                        jhb = sbp_j->p_hbond;
-
-                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
-                        {
-                            hbond_list->hbond_list[ihb_top].nbr = j;
-                            hbond_list->hbond_list[ihb_top].scl = 1;
-                            hbond_list->hbond_list[ihb_top].ptr = pj;
-                            ++ihb_top;
-                        }
-                        /* only add to list for local j (far nbrs is half-list) */
-                        else if ( far_nbr_list->format == HALF_LIST
-                                && ihb == H_BONDING_ATOM && jhb == H_ATOM )
-                        {
-                            jhb_top = End_Index( atom_j->Hindex, hbond_list );
-                            hbond_list->hbond_list[jhb_top].nbr = i;
-                            hbond_list->hbond_list[jhb_top].scl = -1;
-                            hbond_list->hbond_list[jhb_top].ptr = pj;
-                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbond_list );
-                        }
-                    }
-                }
-
-                /* uncorrected bond orders */
-                if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
-                    far_nbr_list->far_nbr_list.d[pj] <= control->bond_cut
-                    && BOp( workspace, bond_list, control->bo_cut,
-                         i, btop_i, far_nbr_list->far_nbr_list.nbr[pj],
-                         &far_nbr_list->far_nbr_list.rel_box[pj], far_nbr_list->far_nbr_list.d[pj],
-                         &far_nbr_list->far_nbr_list.dvec[pj], far_nbr_list->format,
-                         sbp_i, sbp_j, twbp ) == TRUE )
-                {
-                    ++btop_i;
-
-                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
-                    {
-                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
-                    }
-                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
-                    {
-                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                    }
-                }
-            }
-        }
-
-        Set_End_Index( i, btop_i, bond_list );
-
-        if ( local == TRUE && ihb == H_ATOM )
-        {
-            Set_End_Index( atom_i->Hindex, ihb_top, hbond_list );
-        }
-    }
-
-    if ( far_nbr_list->format == FULL_LIST )
-    {
-        /* set sym_index for bonds list (far_nbrs full list) */
-        for ( i = 0; i < system->N; ++i )
-        {
-            start_i = Start_Index( i, bond_list );
-            end_i = End_Index( i, bond_list );
-
-            for ( btop_i = start_i; btop_i < end_i; ++btop_i )
-            {
-                j = bond_list->bond_list[btop_i].nbr;
-                start_j = Start_Index( j, bond_list );
-                end_j = End_Index( j, bond_list );
-
-                for ( btop_j = start_j; btop_j < end_j; ++btop_j )
-                {
-                    if ( bond_list->bond_list[btop_j].nbr == i )
-                    {
-                        bond_list->bond_list[btop_i].sym_index = btop_j;
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-    /* reallocation checks */
-    for ( i = 0; i < system->N; ++i )
-    {
-        if ( Num_Entries( i, bond_list ) > system->max_bonds[i] )
-        {
-            workspace->realloc.bonds = TRUE;
-        }
-
-        if ( i < system->n
-                && system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM
-                && Num_Entries( system->my_atoms[i].Hindex, hbond_list )
-                > system->max_hbonds[system->my_atoms[i].Hindex] )
-        {
-            workspace->realloc.hbonds = TRUE;
-        }
-    }
-
-    return (workspace->realloc.bonds == TRUE 
-            || workspace->realloc.hbonds == TRUE) ? FAILURE : SUCCESS;
-}
-
-
-void Estimate_Storages( reax_system * const system, control_params * const control,
-        reax_list ** const lists, int * const matrix_dim, int cm_format )
-{
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int ihb, jhb;
-    int local;
-    real cutoff;
-    real r_ij;
-    real C12, C34, C56;
-    real BO, BO_s, BO_pi, BO_pi2;
-    reax_list *far_nbr_list;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    reax_atom *atom_i, *atom_j;
-#if defined(NEUTRAL_TERRITORY)
-    int mark[6] = {1, 1, 2, 2, 2, 2};
-#endif
-
-    far_nbr_list = lists[FAR_NBRS];
-    *matrix_dim = 0;
-
     for ( i = 0; i < system->total_cap; ++i )
     {
-        system->bonds[i] = 0;
         system->hbonds[i] = 0;
-        if ( i < system->local_cap )
-        {
-            system->cm_entries[i] = 0;
-        }
     }
 
     for ( i = 0; i < system->N; ++i )
@@ -1855,8 +1646,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
         {
             local = 1;
             cutoff = control->nonb_cut;
-            ++system->cm_entries[i];
-            ++(*matrix_dim);
             ihb = sbp_i->p_hbond;
         }
 #if defined(NEUTRAL_TERRITORY)
@@ -1864,7 +1653,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
         {
             local = 2;
             cutoff = control->nonb_cut;
-            ++(*matrix_dim);
             ihb = -1;
         }
 #endif
@@ -1896,20 +1684,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
 
                 if ( local == 1 )
                 {
-#if defined(NEUTRAL_TERRITORY)
-                    if( atom_j->nt_dir > 0 || j < system->n )
-                    {
-                        ++system->cm_entries[i];
-                    }
-#else
-                    if ( (far_nbr_list->format == HALF_LIST
-                                && (j < system->n || atom_i->orig_id < atom_j->orig_id))
-                            || far_nbr_list->format == FULL_LIST )
-                    {
-                        ++system->cm_entries[i];
-                    }
-#endif
-
                     /* hydrogen bond lists */
                     if ( control->hbond_cut > 0.1
                             && (ihb == H_ATOM || ihb == H_BONDING_ATOM)
@@ -1930,17 +1704,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
                     }
                 }
 
-#if defined(NEUTRAL_TERRITORY)
-                else if ( local == 2 )
-                {
-                    if( ( atom_j->nt_dir != -1 && mark[atom_i->nt_dir] != mark[atom_j->nt_dir] ) 
-                            || ( j < system->n && atom_i->nt_dir != 0 ))
-                    {
-                        ++system->cm_entries[i];
-                    }
-                }
-#endif
-
                 /* uncorrected bond orders */
                 if ( far_nbr_list->far_nbr_list.d[pj] <= control->bond_cut )
                 {
@@ -1993,24 +1756,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
         }
     }
 
-#if defined(NEUTRAL_TERRITORY)
-    /* Since we don't know the NT atoms' position yet, Htop cannot be calculated accurately.
-     * Therefore, we assume it is full and divide 2 if necessary. */
-    if ( cm_format == SYM_HALF_MATRIX )
-    {
-        for ( i = 0; i < system->local_cap; ++i )
-        {
-            system->cm_entries[i] = (system->cm_entries[i] + system->n + 1) / 2;
-        }
-    }
-#endif
-
-#if defined(NEUTRAL_TERRITORY)
-    *matrix_dim = (int) MAX( *matrix_dim * SAFE_ZONE_NT, MIN_CAP );
-#else
-    *matrix_dim = (int) MAX( *matrix_dim * SAFE_ZONE, MIN_CAP );
-#endif
-
     for ( i = 0; i < system->total_cap; ++i )
     {
         system->max_hbonds[i] = 0;
@@ -2031,14 +1776,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
             system->max_hbonds[ system->my_atoms[i].Hindex ] = MAX(
                     (int)(system->hbonds[ system->my_atoms[i].Hindex ] * SAFE_ZONE), MIN_HBONDS );
         }
-        if ( i < system->local_cap )
-        {
-#if defined(NEUTRAL_TERRITORY)
-            system->max_cm_entries[i] = MAX( (int)(system->cm_entries[i] * SAFE_ZONE_NT), MIN_CM_ENTRIES );
-#else
-            system->max_cm_entries[i] = MAX( (int)(system->cm_entries[i] * SAFE_ZONE), MIN_CM_ENTRIES );
-#endif
-        }
     }
 
     /* set currently unused space to min. capacity */
@@ -2050,15 +1787,10 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
     {
         system->max_hbonds[i] = MIN_HBONDS;
     }
-    for ( i = system->N; i < system->local_cap; ++i )
-    {
-        system->max_cm_entries[i] = MIN_CM_ENTRIES;
-    }
 
     /* reductions to get totals */
     system->total_bonds = 0;
     system->total_hbonds = 0;
-    system->total_cm_entries = 0;
     system->total_thbodies = 0;
 
     for ( i = 0; i < system->total_cap; ++i )
@@ -2069,10 +1801,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
     {
         system->total_hbonds += system->max_hbonds[i];
     }
-    for ( i = 0; i < system->local_cap; ++i )
-    {
-        system->total_cm_entries += system->max_cm_entries[i];
-    }
     if ( far_nbr_list->format == HALF_LIST )
     {
         for ( i = 0; i < system->total_cap; ++i )
@@ -2090,7 +1818,6 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
 
     system->total_bonds = MAX( system->total_bonds, MIN_CAP * MIN_BONDS );
     system->total_hbonds = MAX( system->total_hbonds, MIN_CAP * MIN_HBONDS );
-    system->total_cm_entries = MAX( system->total_cm_entries, MIN_CAP * MIN_CM_ENTRIES );
     system->total_thbodies = MAX( system->total_thbodies * SAFE_ZONE, MIN_3BODIES );
 
     /* duplicate info in atom structs in case of
@@ -2103,11 +1830,431 @@ void Estimate_Storages( reax_system * const system, control_params * const contr
     {
         system->my_atoms[i].num_hbonds = system->hbonds[i];
     }
+}
+
+
+/* this version of Compute_Total_Force computes forces from
+ * coefficients accumulated by all interaction functions.
+ * Saves enormous time & space! */
+static void Compute_Total_Force( reax_system * const system,
+        control_params * const control,
+        simulation_data * const data, storage * const workspace, reax_list ** const lists,
+        mpi_datatypes * const mpi_data )
+{
+    int i, pj;
+    reax_list * const bond_list = lists[BONDS];
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        for ( pj = Start_Index(i, bond_list); pj < End_Index(i, bond_list); ++pj )
+        {
+            if ( i < bond_list->bond_list[pj].nbr )
+            {
+                if ( control->virial == 0 )
+                {
+                    Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
+                }
+                else
+                {
+                    Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
+                }
+            }
+        }
+    }
+
+#if defined(PURE_REAX)
+    /* now all forces are computed to their partially-final values
+     * based on the neighbors information each processor has had.
+     * final values of force on each atom needs to be computed by adding up
+     * all partially-final pieces */
+    Coll_FS( system, mpi_data, workspace->f, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    for ( i = 0; i < system->n; ++i )
+    {
+        rvec_Copy( system->my_atoms[i].f, workspace->f[i] );
+    }
+
+#if defined(TEST_FORCES)
+    Coll_FS( system, mpi_data, workspace->f_ele, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_vdw, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_be, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_lp, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_ov, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_un, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_ang, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_coa, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_pen, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_hb, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_tor, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_con, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+#endif
+
+#endif
+}
+
+
+static int Init_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int i, renbr, ret;
+    static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE;
+#if defined(LOG_PERFORMANCE)
+    double time;
+    
+    time = Get_Time( );
+#endif
+
+    renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE;
+
+    if ( renbr == FALSE && dist_done == FALSE )
+    {
+        Init_Distance( system, control, data, workspace, lists, out_control );
+
+        dist_done = TRUE;
+    }
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.init_dist );
+#endif
+
+    if ( cm_done == FALSE )
+    {
+        Init_Matrix_Row_Indices( &workspace->H, system->max_cm_entries );
+
+#if defined(NEUTRAL_TERRITORY)
+        if ( workspace->H.format == SYM_HALF_MATRIX )
+        {
+            Init_CM_Half_NT( system, control, data, workspace, lists, out_control );
+        }
+        else
+        {
+            Init_CM_Full_NT( system, control, data, workspace, lists, out_control );
+        }
+#else
+        if ( workspace->H.format == SYM_HALF_MATRIX )
+        {
+            Init_CM_Half_FS( system, control, data, workspace, lists, out_control );
+        }
+        else
+        {
+            Init_CM_Full_FS( system, control, data, workspace, lists, out_control );
+        }
+#endif
+    }
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.init_cm );
+#endif
+
+    if ( bonds_done == FALSE )
+    {
+        for ( i = 0; i < system->total_cap; ++i )
+        {
+            workspace->total_bond_order[i] = 0.0;
+        }
+        for ( i = 0; i < system->total_cap; ++i )
+        {
+            rvec_MakeZero( workspace->dDeltap_self[i] );
+        }
+
+        Init_List_Indices( lists[BONDS], system->max_bonds );
+        Init_List_Indices( lists[HBONDS], system->max_hbonds );
+
+        if ( lists[FAR_NBRS]->format == HALF_LIST )
+        {
+            Init_Bond_Half( system, control, data, workspace, lists, out_control );
+        }
+        else
+        {
+            Init_Bond_Full( system, control, data, workspace, lists, out_control );
+        }
+    }
 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "[INFO] p%d @ estimate storages: total_cm_entries = %d, total_thbodies = %d\n",
-            system->my_rank, system->total_cm_entries, system->total_thbodies );
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.init_bond );
 #endif
+
+    ret = (workspace->realloc.cm == FALSE
+            && workspace->realloc.bonds == FALSE
+            && workspace->realloc.hbonds == FALSE
+            ? SUCCESS : FAILURE);
+
+    if ( workspace->realloc.cm == FALSE )
+    {
+        cm_done = TRUE;
+    }
+    if ( workspace->realloc.bonds == FALSE && workspace->realloc.hbonds == FALSE )
+    {
+        bonds_done = TRUE;
+    }
+
+    if ( ret == SUCCESS )
+    {
+        dist_done = FALSE;
+        cm_done = FALSE;
+        bonds_done = FALSE;
+    }
+
+    return ret;
+}
+
+
+static int Init_Forces_No_Charges( reax_system * const system, control_params * const control,
+        simulation_data * const data, storage * const workspace, reax_list ** const lists,
+        output_controls * const out_control )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int btop_i;
+    int ihb, jhb, ihb_top;
+    int local, flag, renbr;
+    real cutoff;
+    reax_list *far_nbr_list, *bond_list, *hbond_list;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+    int jhb_top;
+    int start_j, end_j;
+    int btop_j;
+
+    far_nbr_list = lists[FAR_NBRS];
+    bond_list = lists[BONDS];
+    hbond_list = lists[HBONDS];
+
+    for ( i = 0; i < system->n; ++i )
+    {
+        workspace->bond_mark[i] = 0;
+    }
+    for ( i = system->n; i < system->N; ++i )
+    {
+        /* put ghost atoms to an infinite distance */
+        workspace->bond_mark[i] = 1000;
+    }
+
+    renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE;
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, far_nbr_list );
+        end_i = End_Index( i, far_nbr_list );
+        if ( far_nbr_list->format == HALF_LIST )
+        {
+            /* start at end because other atoms
+             * can add to this atom's list (half-list) */
+            btop_i = End_Index( i, bond_list );
+        }
+        else if ( far_nbr_list->format == FULL_LIST )
+        {
+            btop_i = Start_Index( i, bond_list );
+        }
+        else
+        {
+            btop_i = 0;
+        }
+        sbp_i = &system->reax_param.sbp[type_i];
+        ihb = NON_H_BONDING_ATOM;
+        ihb_top = -1;
+
+        if ( i < system->n )
+        {
+            local = TRUE;
+            cutoff = MAX( control->hbond_cut, control->bond_cut );
+        }
+        else
+        {
+            local = FALSE;
+            cutoff = control->bond_cut;
+        }
+
+        if ( local == TRUE && control->hbond_cut > 0.0 )
+        {
+            ihb = sbp_i->p_hbond;
+
+            if ( ihb == H_ATOM )
+            {
+                if ( far_nbr_list->format == HALF_LIST )
+                {
+                    /* start at end because other atoms
+                     * can add to this atom's list (half-list) */
+                    ihb_top = End_Index( atom_i->Hindex, hbond_list );
+                }
+                else if ( far_nbr_list->format == FULL_LIST )
+                {
+                    ihb_top = Start_Index( atom_i->Hindex, hbond_list );
+                }
+            }
+            else
+            {
+                ihb_top = -1;
+            }
+        }
+
+        /* update i-j distance - check if j is within cutoff */
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list->far_nbr_list.nbr[pj];
+            atom_j = &system->my_atoms[j];
+
+            if ( renbr == TRUE )
+            {
+                if ( far_nbr_list->far_nbr_list.d[pj] <= cutoff )
+                {
+                    flag = TRUE;
+                }
+                else
+                {
+                    flag = FALSE;
+                }
+            }
+            else
+            {
+                far_nbr_list->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
+                far_nbr_list->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
+                far_nbr_list->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
+                far_nbr_list->far_nbr_list.d[pj] = rvec_Norm_Sqr( far_nbr_list->far_nbr_list.dvec[pj] );
+
+                if ( far_nbr_list->far_nbr_list.d[pj] <= SQR(cutoff) )
+                {
+                    far_nbr_list->far_nbr_list.d[pj] = SQRT( far_nbr_list->far_nbr_list.d[pj] );
+                    flag = TRUE;
+                }
+                else
+                {
+                    flag = FALSE;
+                }
+            }
+
+            if ( flag == TRUE )
+            {
+                type_j = atom_j->type;
+                sbp_j = &system->reax_param.sbp[type_j];
+                twbp = &system->reax_param.tbp[
+                    index_tbp(type_i, type_j, system->reax_param.num_atom_types) ];
+
+                if ( local == TRUE )
+                {
+                    /* hydrogen bond lists */
+                    if ( control->hbond_cut > 0.0
+                            && (ihb == H_ATOM || ihb == H_BONDING_ATOM)
+                            && far_nbr_list->far_nbr_list.d[pj] <= control->hbond_cut )
+                    {
+                        jhb = sbp_j->p_hbond;
+
+                        if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
+                        {
+                            hbond_list->hbond_list[ihb_top].nbr = j;
+                            hbond_list->hbond_list[ihb_top].scl = 1;
+                            hbond_list->hbond_list[ihb_top].ptr = pj;
+                            ++ihb_top;
+                        }
+                        /* only add to list for local j (far nbrs is half-list) */
+                        else if ( far_nbr_list->format == HALF_LIST
+                                && ihb == H_BONDING_ATOM && jhb == H_ATOM )
+                        {
+                            jhb_top = End_Index( atom_j->Hindex, hbond_list );
+                            hbond_list->hbond_list[jhb_top].nbr = i;
+                            hbond_list->hbond_list[jhb_top].scl = -1;
+                            hbond_list->hbond_list[jhb_top].ptr = pj;
+                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbond_list );
+                        }
+                    }
+                }
+
+                /* uncorrected bond orders */
+                if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
+                    far_nbr_list->far_nbr_list.d[pj] <= control->bond_cut
+                    && BOp( workspace, bond_list, control->bo_cut,
+                         i, btop_i, far_nbr_list->far_nbr_list.nbr[pj],
+                         &far_nbr_list->far_nbr_list.rel_box[pj], far_nbr_list->far_nbr_list.d[pj],
+                         &far_nbr_list->far_nbr_list.dvec[pj], far_nbr_list->format,
+                         sbp_i, sbp_j, twbp ) == TRUE )
+                {
+                    ++btop_i;
+
+                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+                    {
+                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+                    }
+                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
+                    {
+                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+                    }
+                }
+            }
+        }
+
+        Set_End_Index( i, btop_i, bond_list );
+
+        if ( local == TRUE && ihb == H_ATOM )
+        {
+            Set_End_Index( atom_i->Hindex, ihb_top, hbond_list );
+        }
+    }
+
+    if ( far_nbr_list->format == FULL_LIST )
+    {
+        /* set sym_index for bonds list (far_nbrs full list) */
+        for ( i = 0; i < system->N; ++i )
+        {
+            start_i = Start_Index( i, bond_list );
+            end_i = End_Index( i, bond_list );
+
+            for ( btop_i = start_i; btop_i < end_i; ++btop_i )
+            {
+                j = bond_list->bond_list[btop_i].nbr;
+                start_j = Start_Index( j, bond_list );
+                end_j = End_Index( j, bond_list );
+
+                for ( btop_j = start_j; btop_j < end_j; ++btop_j )
+                {
+                    if ( bond_list->bond_list[btop_j].nbr == i )
+                    {
+                        bond_list->bond_list[btop_i].sym_index = btop_j;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    /* reallocation checks */
+    for ( i = 0; i < system->N; ++i )
+    {
+        if ( Num_Entries( i, bond_list ) > system->max_bonds[i] )
+        {
+            workspace->realloc.bonds = TRUE;
+        }
+
+        if ( i < system->n
+                && system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == H_ATOM
+                && Num_Entries( system->my_atoms[i].Hindex, hbond_list )
+                > system->max_hbonds[system->my_atoms[i].Hindex] )
+        {
+            workspace->realloc.hbonds = TRUE;
+        }
+    }
+
+    return (workspace->realloc.bonds == TRUE 
+            || workspace->realloc.hbonds == TRUE) ? FAILURE : SUCCESS;
+}
+
+
+void Estimate_Storages( reax_system * const system, control_params * const control,
+        reax_list ** const lists, storage *workspace, int realloc_cm,
+        int realloc_bonds, int * const matrix_dim, int cm_format )
+{
+    if ( realloc_cm == TRUE )
+    {
+        Estimate_Storages_CM( system, control, lists, matrix_dim, cm_format );
+    }
+
+    if ( realloc_bonds == TRUE )
+    {
+        Estimate_Storages_Bonds( system, control, lists );
+    }
 }
 
 
@@ -2145,7 +2292,11 @@ int Compute_Forces( reax_system * const system, control_params * const control,
 
     if ( ret != SUCCESS )
     {
-        Estimate_Storages( system, control, lists, &matrix_dim, workspace->H.format );
+        Estimate_Storages( system, control, lists, workspace,
+                workspace->realloc.cm,
+                (workspace->realloc.bonds == TRUE
+                 || workspace->realloc.hbonds == TRUE ? TRUE : FALSE),
+                &matrix_dim, workspace->H.format );
     }
 
 #if defined(LOG_PERFORMANCE)
diff --git a/PG-PuReMD/src/forces.h b/PG-PuReMD/src/forces.h
index 9e0c9a89..52da90bc 100644
--- a/PG-PuReMD/src/forces.h
+++ b/PG-PuReMD/src/forces.h
@@ -35,7 +35,7 @@ int Compute_Forces( reax_system * const, control_params * const, simulation_data
         storage * const, reax_list ** const, output_controls * const, mpi_datatypes * const );
 
 void Estimate_Storages( reax_system * const, control_params * const,
-        reax_list ** const, int * const, int );
+        reax_list ** const, storage *, int, int, int * const, int );
 
 #ifdef __cplusplus
 }
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 6ccc3bf9..1ea4b700 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -671,8 +671,9 @@ void Init_Lists( reax_system * const system, control_params * const control,
         fprintf( stderr, "[ERROR] p%d: failed to generate neighbor lists. Terminating...\n", system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
-    
-    Estimate_Storages( system, control, lists, &matrix_dim, cm_format );
+
+    Estimate_Storages( system, control, lists, workspace, TRUE, TRUE,
+            &matrix_dim, cm_format );
     
 #if defined(NEUTRAL_TERRITORY)
     Allocate_Matrix( &workspace->H, matrix_dim, system->local_cap, system->total_cm_entries, cm_format );
diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c
index 915a9efe..b4dac553 100644
--- a/PG-PuReMD/src/reset_tools.c
+++ b/PG-PuReMD/src/reset_tools.c
@@ -169,13 +169,22 @@ void Reset_Test_Forces( reax_system * const system, storage * const workspace )
 
 void Reset_Workspace( reax_system * const system, storage * const workspace )
 {
-    memset( workspace->total_bond_order, 0, sizeof(real) * system->total_cap );
-    memset( workspace->dDeltap_self, 0, sizeof(rvec) * system->total_cap );
-    memset( workspace->CdDelta, 0, sizeof(real) * system->total_cap );
-    memset( workspace->f, 0, sizeof(rvec) * system->total_cap );
+    int i;
+
+    for ( i = 0; i < system->total_cap; ++i )
+    {
+        workspace->CdDelta[i] = 0.0;
+    }
+    for ( i = 0; i < system->total_cap; ++i )
+    {
+        rvec_MakeZero( workspace->f[i] );
+    }
 
 #ifdef TEST_FORCES
-    memset( workspace->dDelta, 0, sizeof(rvec) * system->total_cap );
+    for ( i = 0; i < system->total_cap; ++i )
+    {
+        rvec_MakeZero( workspace->dDelta[i] );
+    }
     Reset_Test_Forces( system, workspace );
 #endif
 }
@@ -211,37 +220,6 @@ void Reset_Out_Buffers( mpi_out_data * const out_buf, int n )
 }
 
 
-void Reset_Lists( reax_system * const system, control_params * const control,
-        storage * const workspace, reax_list ** const lists )
-{
-    int i;
-    reax_list * const bond_list = lists[BONDS];
-    reax_list * const hbond_list = lists[HBONDS];
-
-    if ( system->N > 0 )
-    {
-        for ( i = 0; i < bond_list->n; ++i )
-        {
-            Set_End_Index( i, Start_Index( i, bond_list ), bond_list );
-        }
-
-        if ( control->hbond_cut > 0.0 && system->numH > 0 )
-        {
-            for ( i = 0; i < hbond_list->n; ++i )
-            {
-                /* do not use Hindex, unconditionally reset end indices */
-                Set_End_Index( i, Start_Index( i, hbond_list ), hbond_list );
-            }
-        }
-
-        for ( i = 0; i < workspace->H.n_max; ++i )
-        {
-            workspace->H.end[i] = workspace->H.start[i];
-        }
-    }
-}
-
-
 void Reset( reax_system * const system, control_params * const control,
         simulation_data * const data, storage * const workspace,
         reax_list ** const lists )
@@ -256,6 +234,4 @@ void Reset( reax_system * const system, control_params * const control,
     }
 
     Reset_Workspace( system, workspace );
-
-    Reset_Lists( system, control, workspace, lists );
 }
diff --git a/PG-PuReMD/src/reset_tools.h b/PG-PuReMD/src/reset_tools.h
index f47ce364..9c78ca61 100644
--- a/PG-PuReMD/src/reset_tools.h
+++ b/PG-PuReMD/src/reset_tools.h
@@ -37,9 +37,6 @@ void Reset_Timing( reax_timing * const );
 
 void Reset_Workspace( reax_system * const, storage * const );
 
-void Reset_Lists( reax_system * const, control_params * const,
-        storage * const, reax_list** const );
-
 void Reset_Grid( grid * const );
 
 void Reset_Out_Buffers( mpi_out_data * const, int );
-- 
GitLab