From c24310e8d3f6be1d17d93752bf8764fb61c18882 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Thu, 24 Aug 2017 17:01:08 -0400
Subject: [PATCH] PG-PuReMD: complete memory changes. Remove the max registers
 per CUDA function build option. Other misc. changes.

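The device reallocation routines now preserve existing buffer contents:
dev_realloc_system takes the old total capacity and stages each per-atom
array through the device scratch buffer across the free/malloc cycle.
Per-atom storage estimation (k_estimate_storages) accumulates counts in
registers and unconditionally records counts and padded capacities;
reallocation checks now happen in k_init_forces, where actual usage is
known. The three-body list tracks a separate index capacity
(total_thbodies_indices) and uses a thbody flag in place of the old
num_3body counter. New files: cuda_random.cu/.h and cuda_vector.h.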
---
 PG-PuReMD/Makefile.am                     |   8 +-
 PG-PuReMD/src/comm_tools.c                |   8 +-
 PG-PuReMD/src/control.c                   |   1 +
 PG-PuReMD/src/cuda/cuda_allocate.cu       | 116 ++--
 PG-PuReMD/src/cuda/cuda_bond_orders.cu    |  56 +-
 PG-PuReMD/src/cuda/cuda_bonds.cu          |  14 +-
 PG-PuReMD/src/cuda/cuda_charges.cu        |   3 +-
 PG-PuReMD/src/cuda/cuda_forces.cu         | 745 ++++++++++++++--------
 PG-PuReMD/src/cuda/cuda_forces.h          |  15 +-
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu |  71 +--
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h  |  20 +-
 PG-PuReMD/src/cuda/cuda_init_md.cu        |  46 +-
 PG-PuReMD/src/cuda/cuda_integrate.cu      | 165 +++--
 PG-PuReMD/src/cuda/cuda_lin_alg.cu        |  57 +-
 PG-PuReMD/src/cuda/cuda_multi_body.cu     |  46 +-
 PG-PuReMD/src/cuda/cuda_neighbors.cu      | 316 ++-------
 PG-PuReMD/src/cuda/cuda_neighbors.h       |  12 +-
 PG-PuReMD/src/cuda/cuda_random.cu         |  65 ++
 PG-PuReMD/src/cuda/cuda_random.h          |  44 ++
 PG-PuReMD/src/cuda/cuda_system_props.cu   |  74 ++-
 PG-PuReMD/src/cuda/cuda_system_props.h    |   2 +
 PG-PuReMD/src/cuda/cuda_torsion_angles.cu |  25 +-
 PG-PuReMD/src/cuda/cuda_utils.cu          |  24 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu |  32 +-
 PG-PuReMD/src/cuda/cuda_vector.h          |  41 ++
 PG-PuReMD/src/ffield.c                    |  46 +-
 PG-PuReMD/src/ffield.h                    |   2 +-
 PG-PuReMD/src/init_md.c                   |   2 +-
 PG-PuReMD/src/io_tools.c                  |  12 +-
 PG-PuReMD/src/parallelreax.c              |   6 +-
 PG-PuReMD/src/random.c                    |  15 +-
 PG-PuReMD/src/random.h                    |   1 +
 PG-PuReMD/src/reax_types.h                |  30 +-
 PG-PuReMD/src/traj.c                      |  54 +-
 PG-PuReMD/src/vector.h                    |  10 +-
 PuReMD/src/basic_comm.c                   |   2 +-
 PuReMD/src/control.c                      |   6 +-
 PuReMD/src/ffield.c                       |  12 +-
 PuReMD/src/geo_tools.c                    |   4 +-
 PuReMD/src/grid.c                         |   2 +-
 PuReMD/src/init_md.c                      |   8 +-
 PuReMD/src/lookup.c                       |  12 +-
 PuReMD/src/parallelreax.c                 |  14 +-
 PuReMD/src/qEq.c                          |   2 +-
 PuReMD/src/restart.c                      |  12 +-
 PuReMD/src/tool_box.c                     |   2 +-
 PuReMD/src/traj.c                         |   6 +-
 cuda.am                                   |  10 +-
 48 files changed, 1189 insertions(+), 1087 deletions(-)
 create mode 100644 PG-PuReMD/src/cuda/cuda_random.cu
 create mode 100644 PG-PuReMD/src/cuda/cuda_random.h
 create mode 100644 PG-PuReMD/src/cuda/cuda_vector.h

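Notes:

The index lists (far neighbors, bonds, hydrogen bonds, charge matrix) are
now (re)initialized from the per-atom capacities via an exclusive prefix
sum followed by an end-index kernel. For example, padded capacities
[4, 2, 6] scan to start indices [0, 4, 6]; with actual counts [2, 0, 3],
k_init_end_index then yields end indices [2, 4, 9].

The preserving reallocation in dev_realloc_system applies one pattern to
every per-atom array. A minimal sketch, assuming the codebase's
cuda_malloc/cuda_free/copy_device wrappers and the global device scratch
buffer (the helper name itself is hypothetical):

    /* grow a device int array to new_cap entries, keeping the first
     * old_cap entries (staged through the device scratch buffer) */
    static void grow_device_int_array( int **arr, int old_cap, int new_cap )
    {
        int *temp = (int *) scratch;

        copy_device( temp, *arr, old_cap * sizeof(int), "grow::temp" );
        cuda_free( *arr, "grow::arr" );
        cuda_malloc( (void **) arr, new_cap * sizeof(int), TRUE, "grow::arr" );
        copy_device( *arr, temp, old_cap * sizeof(int), "grow::temp" );
    }
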
diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index 149e16de..d16fd249 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -35,8 +35,8 @@ include_HEADERS = src/reax_types.h src/index_utils.h \
 
 if USE_CUDA
 bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \
-      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu src/cuda/cuda_box.cu \
-      src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_list.cu \
+      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu src/cuda/cuda_box.cu src/cuda/cuda_list.cu \
+      src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_random.cu \
       src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \
       src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \
       src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
@@ -45,8 +45,8 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu
       src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
 include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
       src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
-      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h \
-      src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_list.h \
+      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \
+      src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_random.h src/cuda/cuda_vector.h \
       src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \
       src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \
       src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \
diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c
index a8d46fcb..9e8978dd 100644
--- a/PG-PuReMD/src/comm_tools.c
+++ b/PG-PuReMD/src/comm_tools.c
@@ -763,8 +763,8 @@ void Comm_Atoms( reax_system *system, control_params *control,
         Reorder_My_Atoms( system, workspace );
 
 #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d updated local atoms, n=%d\n",
-                 system->my_rank, system->n );
+        fprintf( stderr, "p%d, step %d: updated local atoms, n=%d\n",
+                 system->my_rank, data->step, system->n );
         MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
@@ -777,8 +777,8 @@ void Comm_Atoms( reax_system *system, control_params *control,
                 Sort_Boundary_Atoms, Unpack_Exchange_Message, TRUE );
 
 #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: exchanged boundary atoms, N=%d\n",
-                 system->my_rank, system->N );
+        fprintf( stderr, "p%d, step %d: exchanged boundary atoms, N=%d\n",
+                 system->my_rank, data->step, system->N );
         for ( i = 0; i < MAX_NBRS; ++i )
         {
             fprintf( stderr, "p%d: nbr%d(p%d) str=%d cnt=%d end=%d\n",
diff --git a/PG-PuReMD/src/control.c b/PG-PuReMD/src/control.c
index 7f7060c1..268b6a17 100644
--- a/PG-PuReMD/src/control.c
+++ b/PG-PuReMD/src/control.c
@@ -58,6 +58,7 @@ char Read_Control_File( char *control_file, control_params* control,
     control->geo_format = 1;
     control->gpus_per_node = 1;
 
+    control->random_vel = 0;
     control->restart          = 0;
     out_control->restart_format = WRITE_BINARY;
     out_control->restart_freq = 0;
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index c81938c5..5df47e6f 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -2,6 +2,7 @@
 #include "cuda_allocate.h"
 
 #include "cuda_allocate.h"
+#include "cuda_forces.h"
 #include "cuda_list.h"
 #include "cuda_neighbors.h"
 #include "cuda_utils.h"
@@ -42,10 +43,10 @@ CUDA_GLOBAL void Init_Nbrs( ivec *nbrs, int N )
 void dev_alloc_grid( reax_system *system )
 {
     int total;
-    grid_cell local_cell;
+//    grid_cell local_cell;
     grid *host = &system->my_grid;
     grid *device = &system->d_my_grid;
-    ivec *nbrs_x = (ivec *) scratch;
+//    ivec *nbrs_x = (ivec *) scratch;
 
     total = host->ncells[0] * host->ncells[1] * host->ncells[2];
     ivec_Copy( device->ncells, host->ncells );
@@ -249,80 +250,85 @@ void dev_alloc_system( reax_system *system )
 }
 
 
-void dev_realloc_system( reax_system *system, int local_cap, int total_cap, char *msg )
+void dev_realloc_system( reax_system *system, int old_total_cap, int total_cap, char *msg )
 {
     int *temp;
+    reax_atom *temp_atom;
 
     temp = (int *) scratch;
+    temp_atom = (reax_atom*) scratch;
 
-    /* free the existing storage for atoms, leave other info allocated */
+    /* grow the storage for atoms, preserving the old contents via scratch */
+    copy_device( temp_atom, system->d_my_atoms, old_total_cap * sizeof(reax_atom),
+            "dev_realloc_system::temp_atom" );
     cuda_free( system->d_my_atoms, "system::d_my_atoms" );
     cuda_malloc( (void **) &system->d_my_atoms, sizeof(reax_atom) * total_cap, 
             TRUE, "system::d_my_atoms" );
+    copy_device( system->d_my_atoms, temp_atom, old_total_cap * sizeof(reax_atom),
+            "dev_realloc_system::temp_atom" );
 
-    //TODO: record old total_cap before increase, use here
-    copy_device( temp, system->d_far_nbrs, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_far_nbrs, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_far_nbrs, "system::d_far_nbrs" );
     cuda_malloc( (void **) &system->d_far_nbrs,
             system->total_cap * sizeof(int), TRUE, "system::d_far_nbrs" );
-    copy_device( system->d_far_nbrs, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_far_nbrs, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_max_far_nbrs, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_max_far_nbrs, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_max_far_nbrs, "system::d_max_far_nbrs" );
     cuda_malloc( (void **) &system->d_max_far_nbrs,
             system->total_cap * sizeof(int), TRUE, "system::d_max_far_nbrs" );
-    copy_device( system->d_max_far_nbrs, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_max_far_nbrs, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_bonds, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_bonds, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_bonds, "system::d_bonds" );
     cuda_malloc( (void **) &system->d_bonds,
             system->total_cap * sizeof(int), TRUE, "system::d_bonds" );
-    copy_device( system->d_bonds, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_bonds, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_max_bonds, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_max_bonds, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_max_bonds, "system::d_max_bonds" );
     cuda_malloc( (void **) &system->d_max_bonds,
             system->total_cap * sizeof(int), TRUE, "system::d_max_bonds" );
-    copy_device( system->d_max_bonds, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_max_bonds, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_hbonds, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_hbonds, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_hbonds, "system::d_hbonds" );
     cuda_malloc( (void **) &system->d_hbonds,
             system->total_cap * sizeof(int), TRUE, "system::d_hbonds" );
-    copy_device( system->d_hbonds, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_hbonds, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_max_hbonds, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_max_hbonds, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_max_hbonds, "system::d_max_hbonds" );
     cuda_malloc( (void **) &system->d_max_hbonds,
             system->total_cap * sizeof(int), TRUE, "system::d_max_hbonds" );
-    copy_device( system->d_max_hbonds, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_max_hbonds, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_cm_entries, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_cm_entries, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_cm_entries, "system::d_cm_entries" );
     cuda_malloc( (void **) &system->d_cm_entries,
             system->total_cap * sizeof(int), TRUE, "system::d_cm_entries" );
-    copy_device( system->d_cm_entries, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_cm_entries, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 
-    copy_device( temp, system->d_max_cm_entries, system->total_cap * sizeof(int),
+    copy_device( temp, system->d_max_cm_entries, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
     cuda_free( system->d_max_cm_entries, "system::d_max_cm_entries" );
     cuda_malloc( (void **) &system->d_max_cm_entries,
             system->total_cap * sizeof(int), TRUE, "system::d_max_cm_entries" );
-    copy_device( system->d_max_cm_entries, temp, system->total_cap * sizeof(int),
+    copy_device( system->d_max_cm_entries, temp, old_total_cap * sizeof(int),
             "dev_realloc_system::temp" );
 }
 
@@ -336,14 +342,12 @@ void dev_alloc_simulation_data( simulation_data *data )
 void dev_alloc_workspace( reax_system *system, control_params *control, 
         storage *workspace, int local_cap, int total_cap, char *msg )
 {
-    int i, total_real, total_rvec, local_int, local_real, local_rvec;
+    int total_real, total_rvec, local_rvec;
 
     workspace->allocated = TRUE;
 
     total_real = total_cap * sizeof(real);
     total_rvec = total_cap * sizeof(rvec);
-    local_int = local_cap * sizeof(int);
-    local_real = local_cap * sizeof(real);
     local_rvec = local_cap * sizeof(rvec);
 
     /* communication storage */  
@@ -588,8 +592,8 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
         mpi_datatypes *mpi_data )
 {
     int i, j, k, p;
-    int nflag, Nflag, Hflag, mpi_flag, total_send;
-    int renbr, *indices;
+    int nflag, Nflag, old_total_cap, mpi_flag, total_send;
+    int renbr;
     reallocate_data *realloc;
     reax_list *far_nbrs;
     sparse_matrix *H;
@@ -601,25 +605,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
     realloc = &(dev_workspace->realloc);
     g = &(system->my_grid);
     H = &dev_workspace->H;
-    indices = (int *) host_scratch;
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d@reallocate: n: %d, N: %d, numH: %d\n",
-             system->my_rank, system->n, system->N, system->numH );
-    fprintf( stderr, "p%d@reallocate: local_cap: %d, total_cap: %d, Hcap: %d\n",
-             system->my_rank, system->local_cap, system->total_cap,
-             system->Hcap);
-    fprintf( stderr, "p%d: realloc.far_nbrs: %d\n",
-             system->my_rank, realloc->far_nbrs );
-    fprintf( stderr, "p%d: realloc.H: %d, realloc.Htop: %d\n",
-             system->my_rank, realloc->H, realloc->Htop );
-    fprintf( stderr, "p%d: realloc.Hbonds: %d, realloc.num_hbonds: %d\n",
-             system->my_rank, realloc->hbonds, realloc->num_hbonds );
-    fprintf( stderr, "p%d: realloc.bonds: %d, num_bonds: %d\n",
-             system->my_rank, realloc->bonds, realloc->num_bonds );
-    fprintf( stderr, "p%d: realloc.num_3body: %d\n",
-             system->my_rank, realloc->num_3body );
-#endif
 
     // IMPORTANT: LOOSE ZONES CHECKS ARE DISABLED FOR NOW BY &&'ing with 0!!!
     nflag = FALSE;
@@ -635,6 +620,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
             (0 && system->N <= LOOSE_ZONE * system->total_cap) )
     {
         Nflag = TRUE;
+        old_total_cap = system->total_cap;
         system->total_cap = (int)(system->N * SAFE_ZONE);
     }
 
@@ -649,7 +635,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
 #endif
 
         /* system */
-        dev_realloc_system( system, system->local_cap, system->total_cap, msg );
+        dev_realloc_system( system, old_total_cap, system->total_cap, msg );
 
         /* workspace */
         dev_dealloc_workspace( control, workspace );
@@ -659,24 +645,23 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
 
     /* far neighbors */
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
-    if ( renbr || realloc->far_nbrs == TRUE )
+    if ( renbr && (Nflag == TRUE || realloc->far_nbrs == TRUE) )
     {
         far_nbrs = *dev_lists + FAR_NBRS;
 
-        if ( Nflag == TRUE || realloc->far_nbrs == TRUE )
-        {
 #if defined(DEBUG_FOCUS)
-            fprintf( stderr, "p%d: reallocating far_nbrs: far_nbrs=%d, space=%dMB\n",
-                     system->my_rank, system->total_far_nbrs,
-                     (int)(system->total_far_nbrs * sizeof(far_neighbor_data) /
-                           (1024.0 * 1024.0)) );
-            fprintf( stderr, "p:%d - *** Reallocating Far Nbrs *** \n", system->my_rank );
+        fprintf( stderr, "p%d: reallocating far_nbrs: far_nbrs=%d, space=%dMB\n",
+                 system->my_rank, system->total_far_nbrs,
+                 (int)(system->total_far_nbrs * sizeof(far_neighbor_data) /
+                       (1024.0 * 1024.0)) );
+        fprintf( stderr, "p:%d - *** Reallocating Far Nbrs *** \n", system->my_rank );
 #endif
 
-            Cuda_Reallocate_Neighbor_List( far_nbrs, system->total_cap, system->total_far_nbrs );
-            Cuda_Init_Neighbor_Indices( system );
-            realloc->far_nbrs = FALSE;
-        }
+        Cuda_Reallocate_Neighbor_List( far_nbrs, system->total_cap, system->total_far_nbrs );
+
+        Cuda_Init_Neighbor_Indices( system );
+
+        realloc->far_nbrs = FALSE;
     }
 
     /* charge matrix */
@@ -714,7 +699,9 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
 #endif
 
             Cuda_Reallocate_HBonds_List( (*dev_lists) + HBONDS, system->total_cap, system->total_hbonds );
+
             Cuda_Init_HBond_Indices( system );
+
             realloc->hbonds = FALSE;
         }
     }
@@ -729,23 +716,26 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
 #endif
 
         Cuda_Reallocate_Bonds_List( (*dev_lists) + BONDS, system->total_cap, system->total_bonds );
+
         Cuda_Init_Bond_Indices( system );
+
         realloc->bonds = FALSE;
     }
 
     /* 3-body list */
-    if ( Nflag == TRUE || realloc->num_3body > 0 )
+    if ( Nflag == TRUE || realloc->thbody == TRUE )
     {
 #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: reallocating 3body list: num_3body=%d, space=%dMB\n",
-                system->my_rank, realloc->num_3body,
-                (int)(realloc->num_3body * sizeof(three_body_interaction_data) /
+        fprintf( stderr, "p%d: reallocating thbody list: num_thbody=%d, space=%dMB\n",
+                system->my_rank, system->total_thbodies,
+                (int)(system->total_thbodies * sizeof(three_body_interaction_data) /
                 (1024*1024)) );
 #endif
 
         Cuda_Reallocate_Thbodies_List( (*dev_lists) + THREE_BODIES,
-                (*dev_lists + BONDS)->num_intrs, system->total_thbodies );
-        realloc->num_3body = -1;
+                system->total_thbodies_indices, system->total_thbodies );
+
+        realloc->thbody = FALSE;
     }
 
     /* grid */
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index bb478a3a..f16ddcbc 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -14,6 +14,7 @@ CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms,
 {
     int i, type_i;
     single_body_parameters *sbp_i;
+    storage *workspace;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -22,11 +23,11 @@ CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms,
         return;
     }
 
-    storage *workspace = & (p_workspace);
+    workspace = &( p_workspace );
 
     /* Calculate Deltaprime, Deltaprime_boc values */
     type_i = my_atoms[i].type;
-    sbp_i = &(sbp[type_i]);
+    sbp_i = &( sbp[type_i] );
     workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
     workspace->Deltap_boc[i] = 
         workspace->total_bond_order[i] - sbp_i->valency_val;
@@ -40,18 +41,20 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp,
         int num_atom_types, int N )
 {
     int i, j, pj, type_i, type_j;
-    int start_i, end_i, sym_index, num_bonds;
+    int start_i, end_i;
+//    int sym_index;
     real val_i, Deltap_i, Deltap_boc_i;
     real val_j, Deltap_j, Deltap_boc_j;
     real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
     real exp_p1i, exp_p2i, exp_p1j, exp_p2j;
     real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-    real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji
+    real Cf45_ij, Cf45_ji;
     real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-    real explp1, p_boc1, p_boc2;
-    single_body_parameters *sbp_i, *sbp_j;
+    real p_boc1, p_boc2;
+    single_body_parameters *sbp_i;
     two_body_parameters *twbp;
-    bond_order_data *bo_ij, *bo_ji;
+    bond_order_data *bo_ij;
+//    bond_order_data *bo_ji;
     storage *workspace;
     reax_list *bonds;
 
@@ -64,25 +67,9 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp,
 
     workspace = &(p_workspace);
     bonds = &(p_bonds);
-    num_bonds = 0; 
     p_boc1 = gp.l[0];
     p_boc2 = gp.l[1];
 
-    /* Calculate Deltaprime, Deltaprime_boc values */
-    /*
-    //for( i = 0; i < system->N; ++i ) {
-    type_i = my_atoms[i].type;
-    sbp_i = &(sbp[type_i]);
-    workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-    workspace->Deltap_boc[i] = 
-    workspace->total_bond_order[i] - sbp_i->valency_val;
-
-    //fprintf( stdout, "%d(%d) %24.15f\n", 
-    //     i, workspace->bond_mark[i], workspace->total_bond_order[i] );
-    workspace->total_bond_order[i] = 0; 
-    //}
-     */
-
     /* Corrected Bond Order calculations */
     //for( i = 0; i < system->N; ++i ) {
     type_i = my_atoms[i].type;
@@ -103,10 +90,6 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp,
         bo_ij = &( bonds->select.bond_list[pj].bo_data );
         // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
 
-        //TODO
-        //TODO
-        //TODO
-        //TODO
         //TODO
         //if( i < j || workspace->bond_mark[j] > 3 ) {
         if( i < j )
@@ -231,12 +214,6 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp,
                     f4f5 = f4 * f5;
 
                     /* Bond Order pages 8-9, derivative of f4 and f5 */
-                    /*temp = twbp->p_boc5 - 
-                      twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-                      u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-                      u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-                      Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-                      Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
                     Cf45_ij = -f4 * exp_f4;
                     Cf45_ji = -f5 * exp_f5;
                 }
@@ -390,7 +367,8 @@ CUDA_GLOBAL void Cuda_Update_Uncorrected_BO( storage p_workspace,
             bo_ij->BO_pi = bo_ji->BO_pi;
             bo_ij->BO_pi2 = bo_ji->BO_pi2;
 
-            workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
+            // now keeps total_BO
+            workspace->total_bond_order[i] += bo_ij->BO;
         }
     }
 }
@@ -401,9 +379,8 @@ CUDA_GLOBAL void Cuda_Update_Workspace_After_BO( reax_atom *my_atoms,
         storage p_workspace, int N )
 {
     int j, type_j;
-    real explp1;
-    real p_lp1;
-    single_body_parameters *sbp_i, *sbp_j;
+    real explp1, p_lp1;
+    single_body_parameters *sbp_j;
     storage *workspace;
 
     j = blockIdx.x * blockDim.x + threadIdx.x;
@@ -465,7 +442,6 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj,
     rvec temp, ext_press;
     ivec rel_box;
     int pk, k, j;
-    rvec tf_f;
 
     /* Initializations */
     nbr_j = &(bonds->select.bond_list[pj]);
@@ -635,7 +611,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
     bond_data *nbr_j, *nbr_k;
     bond_order_data *bo_ij, *bo_ji;
     dbond_coefficients coef;
-    int pk, k, j;
+    int pk, j;
     rvec tf_f;
 
     rvec_MakeZero( tf_f );
@@ -680,7 +656,6 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
         for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk )
         {
             nbr_k = &(bonds->select.bond_list[pk]);
-            k = nbr_k->nbr;
             rvec_MakeZero( tf_f );
 
             /*2nd,dBO*/
@@ -725,7 +700,6 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
         for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk )
         {
             nbr_k = &(bonds->select.bond_list[pk]);
-            k = nbr_k->nbr;
             rvec_MakeZero( tf_f );
 
             /*3rd, dBO*/
diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
index e3592630..a9c3ce30 100644
--- a/PG-PuReMD/src/cuda/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -31,7 +31,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
         storage p_workspace, reax_list p_bonds, int n, int num_atom_types, 
         real *e_bond )
 {
-    int i, j, pj, natoms;
+    int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
     real ebond, pow_BOs_be2, exp_be12, CEbo;
@@ -59,7 +59,6 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
     gp10 = gp.l[10];
     gp37 = (int) gp.l[37];
 
-    //for( i = 0; i < natoms; ++i ) {
     start_i = Dev_Start_Index(i, bonds);
     end_i = Dev_End_Index(i, bonds);
 
@@ -85,10 +84,10 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                 ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
 
             /* calculate the Bond Energy */
-            e_bond[ i ] += ebond = 
-                -twbp->De_s * bo_ij->BO_s * exp_be12 
+            ebond = -twbp->De_s * bo_ij->BO_s * exp_be12 
                 -twbp->De_p * bo_ij->BO_pi 
                 -twbp->De_pp * bo_ij->BO_pi2;
+            e_bond[ i ] += ebond;
 
             /* calculate derivatives of Bond Orders */
             bo_ij->Cdbo += CEbo;
@@ -102,12 +101,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                     system->my_atoms[j].orig_id, 
                     bo_ij->BO, ebond, data->my_en.e_bond );
 #endif
+
 #ifdef TEST_FORCES
             Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
             Add_dBOpinpi2( system, lists, i, pj, 
                     -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
                     workspace->f_be, workspace->f_be );
 #endif
+
             /* Stabilisation terminal triple bond */
             if ( bo_ij->BO >= 1.00 )
             {
@@ -122,7 +123,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                     hulpov = 1.0 / (1.0 + 25.0 * exphuov);
 
                     estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-                    e_bond [i] += estriph;
+                    e_bond[i] += estriph;
 
                     decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
                         ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
@@ -134,12 +135,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
                     bo_ij->Cdbo += decobdbo;
                     workspace->CdDelta[i] += decobdboua;
                     workspace->CdDelta[j] += decobdboub;
+
 #ifdef TEST_ENERGY
                     //fprintf( out_control->ebond, 
                     //  "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
                     //  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
                     //  estriph, decobdbo, decobdboua, decobdboub );
 #endif
+
 #ifdef TEST_FORCES
                     Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
                     Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
@@ -149,5 +152,4 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
             }
         }
     }
-    //  }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 72018b1d..9ea97653 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -210,10 +210,9 @@ void cuda_charges_updateq( reax_system *system, real *q )
 void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
         mpi_datatypes *mpi_data )
 {
-    int i, scale;
+    int scale;
     real u;//, s_sum, t_sum;
     rvec2 my_sum, all_sum;
-    reax_atom *atom;
     real *q;
 
     my_sum[0] = 0.0;
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 71224dd0..4960038c 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -30,19 +30,187 @@ CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
 }
 
 
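+/* Compute per-atom end indices for a list as start index plus interaction
+ * count; assumes the start indices were filled beforehand (here, by an
+ * exclusive prefix sum over the per-atom capacities) */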
+CUDA_GLOBAL void k_init_end_index( int *intr_cnt, int *indices, int *end_indices, int N )
+{
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    end_indices[i] = indices[i] + intr_cnt[i];
+}
+
+
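+/* Assign each atom its hydrogen bond list index (Hindex) */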
+CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N )
+{
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    my_atoms[i].Hindex = i;
+}
+
+
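+/* Set per-atom start/end indices into the hydrogen bond list and record
+ * per-atom hydrogen bond counts; max_hbonds holds the exclusive prefix
+ * sum of the per-atom capacities (computed by the caller below), i.e.,
+ * each atom's start offset into the list */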
+CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp,
+        int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N )
+{
+    int i, hindex, my_hbonds;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    hindex = atoms[i].Hindex;
+
+    if ( sbp[ atoms[i].type ].p_hbond == H_ATOM || 
+            sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM )
+    {
+        my_hbonds = hbonds[i];
+        indices[hindex] = max_hbonds[i];
+        end_indices[hindex] = indices[hindex] + hbonds[i];
+    }
+    else
+    {
+        my_hbonds = 0;
+        indices[hindex] = 0;
+        end_indices[hindex] = 0;
+    }
+    atoms[i].num_hbonds = my_hbonds;
+}
+
+
+/* Initialize indices for far neighbors list post reallocation
+ *
+ * system: atomic system info. */
+void Cuda_Init_Neighbor_Indices( reax_system *system )
+{
+    int blocks;
+    reax_list *far_nbrs = *dev_lists + FAR_NBRS;
+
+    /* init indices */
+    Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbrs->index, system->total_cap );
+
+    /* init end_indices */
+    blocks = system->N / DEF_BLOCK_SIZE + 
+        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_far_nbrs, far_nbrs->index, far_nbrs->end_index, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Initialize indices for hydrogen bonds list post reallocation
+ *
+ * system: atomic system info. */
+void Cuda_Init_HBond_Indices( reax_system *system )
+{
+    int blocks;
+    int *temp;
+    reax_list *hbonds = *dev_lists + HBONDS;
+
+    temp = (int *) scratch;
+
+    /* init Hindices */
+    blocks = system->N / DEF_BLOCK_SIZE + 
+        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_setup_hindex <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    /* init indices and end_indices */
+    Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap );
+
+    k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, 
+          hbonds->index, hbonds->end_index, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Initialize indices for bonds list post reallocation
+ *
+ * system: atomic system info. */
+void Cuda_Init_Bond_Indices( reax_system *system )
+{
+    int blocks;
+    reax_list *bonds = *dev_lists + BONDS;
+
+    /* init indices */
+    Cuda_Scan_Excl_Sum( system->d_max_bonds, bonds->index, system->total_cap );
+
+    /* init end_indices */
+    blocks = system->N / DEF_BLOCK_SIZE + 
+        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_bonds, bonds->index, bonds->end_index, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Initialize indices for charge matrix post reallocation
+ *
+ * system: atomic system info.
+ * H: charge matrix */
+void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
+{
+    int blocks;
+
+    /* init indices */
+    Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->N );
+
+    /* init end_indices */
+    blocks = system->N / DEF_BLOCK_SIZE + 
+        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_cm_entries, H->start, H->end, system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+/* Initialize indices for three body list post reallocation
+ *
+ * indices: list indices
+ * entries: num. of entries in list */
+void Cuda_Init_Three_Body_Indices( int *indices, int entries )
+{
+    reax_list *thbody = *dev_lists + THREE_BODIES;
+
+    Cuda_Scan_Excl_Sum( indices, thbody->index, entries );
+}
+
+
 CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, 
         single_body_parameters *sbp, two_body_parameters *tbp,
         control_params *control, reax_list far_nbrs, 
-        int num_atom_types, int n, int N, int Hcap, int total_cap,
-        int *cm_entries, int *max_cm_entries, int *realloc_cm_entries,
-        int *bonds, int *max_bonds, int *realloc_bonds,
-        int *hbonds, int *max_hbonds, int *realloc_hbonds )
+        int num_atom_types, int n, int N, int total_cap,
+        int *cm_entries, int *max_cm_entries,
+        int *bonds, int *max_bonds,
+        int *hbonds, int *max_hbonds )
 {
     int i, j, pj; 
     int start_i, end_i;
     int type_i, type_j;
     int ihb, jhb;
     int local;
+    int my_bonds, my_hbonds, my_cm_entries;
     real cutoff;
     real r_ij, r2; 
     real C12, C34, C56;
@@ -59,6 +227,10 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
         return;
     }
 
+    my_bonds = 0;
+    my_hbonds = 0;
+    my_cm_entries = 0;
+
     if ( i < N )
     {
         atom_i = &(my_atoms[i]);
@@ -71,7 +243,7 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
         { 
             local = TRUE;
             cutoff = control->nonb_cut;
-            ++cm_entries[i];
+            ++my_cm_entries;
 //            ihb = sbp_i->p_hbond;
         }   
         else
@@ -100,18 +272,18 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
                 {
                     if ( i < j && (j < n || atom_i->orig_id < atom_j->orig_id) )
                     {
-                        ++cm_entries[i];
+                        ++my_cm_entries;
                     }
                     else if ( i > j && (j < n || atom_j->orig_id > atom_i->orig_id) )
                     {
-                        ++cm_entries[i];
+                        ++my_cm_entries;
                     }
                 }
                 else
                 {
                     if ( i > j && j < n && atom_j->orig_id < atom_i->orig_id )
                     {
-                        ++cm_entries[i];
+                        ++my_cm_entries;
                     }
                 }
 
@@ -120,7 +292,7 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
                 if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut 
                         && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n )
                 {
-                    ++hbonds[i];
+                    ++my_hbonds;
                 }
 
 //                if ( i >= n )
@@ -148,13 +320,13 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
                          * atom j: H bonding atom */
                         if( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                         {
-                            ++hbonds[i];
+                            ++my_hbonds;
                         }
                         /* atom i: H bonding atom, native
                          * atom j: H atom, native */
                         else if( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
                         {
-                            ++hbonds[i];
+                            ++my_hbonds;
                         }
                     }
                 }
@@ -199,158 +371,90 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
 
                     if ( BO >= control->bo_cut )
                     {
-                        ++bonds[i];
-//                        atomicAdd( bonds + j, 1 );
+                        ++my_bonds;
                     }
                 }
             }
         }
     }
-    else
-    {
-        bonds[i] = MIN_BONDS;
-        hbonds[i] = 0;
-    }
 
-    if ( bonds[i] > max_bonds[i] )
-    {
-        max_bonds[i] = MAX( (int)(bonds[i] * 2), MIN_BONDS );
-        *realloc_bonds = TRUE;
-    }
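+    /* record per-atom counts and padded capacities; reallocation is
+     * detected against these capacities later in k_init_forces */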
+    bonds[i] = my_bonds;
+    max_bonds[i] = MAX( (int)(my_bonds * 2), MIN_BONDS );
 
-    if ( hbonds[i] > max_hbonds[i] )
-    {
-        max_hbonds[i] = MAX( (int)(hbonds[i] * SAFE_ZONE), MIN_HBONDS );
-        *realloc_hbonds = TRUE;
-    }
-
-    if ( cm_entries[i] > max_cm_entries[i] )
-    {
-        max_cm_entries[i] = MAX( (int)(cm_entries[i] * SAFE_ZONE), MIN_CM_ENTRIES );
-        *realloc_cm_entries = TRUE;
-    }
-}
-
-
-/* Copy num. of bonds/hbonds per atom into atom structs (used for MPI messaging)
- *
- * my_atoms: atom structs
- * N: num. of atoms (native + ghost)
- * hbonds: num. of hydrogen bonds per atom
- * bonds: num. of bonds per atom */
-CUDA_GLOBAL void k_init_system_atoms( reax_atom *my_atoms, int N, 
-        int *hbonds, int *bonds )
-{
-    int i;
-    
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= N )
-    {
-        return;
-    }
+    hbonds[i] = my_hbonds;
+    max_hbonds[i] = MAX( (int)(my_hbonds * SAFE_ZONE), MIN_HBONDS );
 
-    my_atoms[i].num_bonds = bonds[i];
-    my_atoms[i].num_hbonds = hbonds[i];
+    cm_entries[i] = my_cm_entries;
+    max_cm_entries[i] = MAX( (int)(my_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
 }
 
 
-int Cuda_Estimate_Storages( reax_system *system, control_params *control, 
-        reax_list **lists, sparse_matrix *H, int step )
+void Cuda_Estimate_Storages( reax_system *system, control_params *control, 
+        reax_list **lists, int realloc_bonds, int realloc_hbonds, int realloc_cm,
+        int step )
 {
-    int i, ret, ret_bonds, ret_hbonds, ret_cm;
-    int blocks = 0;
-
-    ret = SUCCESS;
+    int blocks;
 
-    /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment
-     * to the provided literal */
-    cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), 
-            "Cuda_Estimate_Storages::d_realloc_bonds" );
-    cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), 
-            "Cuda_Estimate_Storages::d_realloc_hbonds" );
-    cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
-            "Cuda_Estimate_Storages::d_realloc_cm_entries" );
-    cuda_memset( system->d_bonds, 0, system->total_cap * sizeof(int), 
-            "Cuda_Estimate_Storages::d_bonds" );
-    cuda_memset( system->d_hbonds, 0, system->total_cap * sizeof(int), 
-            "Cuda_Estimate_Storages::d_hbonds" );
-    cuda_memset( system->d_cm_entries, 0, system->total_cap * sizeof(int), 
-            "Cuda_Estimate_Storages::d_cm_entries" );
- 
-    blocks = (int)CEIL( (real)system->total_cap / ST_BLOCK_SIZE );
+    blocks = system->total_cap / ST_BLOCK_SIZE + 
+        ((system->total_cap % ST_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_estimate_storages <<< blocks, ST_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
           (control_params *)control->d_control_params,
           *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types,
-          system->n, system->N, system->Hcap, system->total_cap,
-          system->d_cm_entries, system->d_max_cm_entries, system->d_realloc_cm_entries,
-          system->d_bonds, system->d_max_bonds, system->d_realloc_bonds,
-          system->d_hbonds, system->d_max_hbonds, system->d_realloc_hbonds );
+          system->n, system->N, system->total_cap,
+          system->d_cm_entries, system->d_max_cm_entries,
+          system->d_bonds, system->d_max_bonds,
+          system->d_hbonds, system->d_max_hbonds );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    /* check reallocation flags on device */
-    copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_realloc_bonds" );
-    copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_realloc_hbonds" );
-    copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_realloc_cm_entries" );
-
-    if ( ret_bonds == TRUE )
+    if ( realloc_bonds == TRUE )
     {
         Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds,
                 system->total_cap );
-
         copy_host_device( &(system->total_bonds), system->d_total_bonds, sizeof(int), 
                 cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" );
-
-        if ( step > 0 )
-        {
-            dev_workspace->realloc.bonds = TRUE;
-        }
-        ret = FAILURE;
     }
 
-    if ( system->numH > 0 && control->hbond_cut > 0.0 && ret_hbonds == TRUE )
+    if ( system->numH > 0 && control->hbond_cut > 0.0 )
     {
-        Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds,
-                system->total_cap );
-
-        copy_host_device( &(system->total_hbonds), system->d_total_hbonds, sizeof(int), 
-                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
-
-        if ( step > 0 )
+        if ( realloc_hbonds == TRUE )
         {
-            dev_workspace->realloc.hbonds = TRUE;
+            Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds,
+                    system->total_cap );
+            copy_host_device( &(system->total_hbonds), system->d_total_hbonds, sizeof(int), 
+                    cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
         }
-        ret = FAILURE;
     }
     else
     {
-        /* if number of hydrogen atoms is 0, disable hydrogen bond functionality */
-        if ( system->numH == 0 && step == 0 )
+        if ( step == 0 )
         {
-            fprintf( stderr, "[WARNING] DISABLING HYDROGEN BONDS\n" );
+#if defined(DEBUG)
+            if ( system->numH == 0 )
+            {
+                fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: NO HYDROGEN ATOMS FOUND\n" );
+            }
+
+            if ( control->hbond_cut <= 0.0 )
+            {
+                fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: HYDROGEN BOND CUTOFF IS NON-POSITIVE\n" );
+            }
+#endif
+
             control->hbond_cut = 0.0;
             k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params );
         }
     }
 
-    if ( ret_cm == TRUE )
+    if ( realloc_cm == TRUE )
     {
         Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, system->total_cap );
-
         copy_host_device( &(system->total_cm_entries), system->d_total_cm_entries, sizeof(int),
-                cudaMemcpyDeviceToHost, "d_total_cm_entries" );
-
-        if ( step > 0 )
-        {
-            dev_workspace->realloc.cm = TRUE;
-        }
-        ret = FAILURE;
+                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
     }
 
 #if defined(DEBUG)
@@ -359,13 +463,6 @@ int Cuda_Estimate_Storages( reax_system *system, control_params *control,
     fprintf( stderr, " TOTAL DEVICE HBOND COUNT: %d \n", system->total_hbonds );
     fprintf( stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", system->total_cm_entries );
 #endif
-
-    k_init_system_atoms <<< blocks, ST_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->N, system->d_hbonds, system->d_bonds );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    return ret;
 }
 
 
@@ -375,6 +472,7 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr
     int ret;
 
     ret = SUCCESS;
+
     cuda_memset( thbody, 0, system->total_bonds * sizeof(int), "scratch::thbody" );
 
     Estimate_Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
@@ -390,16 +488,29 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr
 
     if ( step == 0 )
     {
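+        /* initial sizing: pad the estimated three-body interaction count
+         * and size the index array by the current number of bonds */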
+        system->total_thbodies = MAX( (int)(system->total_thbodies * SAFE_ZONE), MIN_3BODIES );
+        system->total_thbodies_indices = system->total_bonds;
+
         /* create Three-body list */
-        Dev_Make_List( system->total_bonds, system->total_thbodies,
+        Dev_Make_List( system->total_thbodies_indices, system->total_thbodies,
                 TYP_THREE_BODY, *dev_lists + THREE_BODIES );
     }
 
     if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs ||
-            (*dev_lists + THREE_BODIES)->n < system->total_bonds )
+            system->total_bonds > (*dev_lists + THREE_BODIES)->n )
     {
-        system->total_thbodies = (*dev_lists + THREE_BODIES)->num_intrs * SAFE_ZONE;
-        dev_workspace->realloc.num_3body = system->total_thbodies;
+        if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs )
+        {
+            system->total_thbodies = MAX( (int)((*dev_lists + THREE_BODIES)->num_intrs * SAFE_ZONE),
+                    system->total_thbodies );
+        }
+        if ( system->total_bonds > (*dev_lists + THREE_BODIES)->n )
+        {
+            system->total_thbodies_indices = MAX( (int)((*dev_lists + THREE_BODIES)->n * SAFE_ZONE),
+                    system->total_bonds );
+        }
+
+        dev_workspace->realloc.thbody = TRUE;
         ret = FAILURE;
     }
 
@@ -491,14 +602,17 @@ CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters
 
 CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp, 
         two_body_parameters *tbp, storage workspace, control_params *control, 
-        reax_list far_nbrs, reax_list bonds, reax_list hbonds, 
-        LR_lookup_table *t_LR, int n, int N, int num_atom_types, int renbr )
+        reax_list far_nbrs_list, reax_list bonds_list, reax_list hbonds_list, 
+        LR_lookup_table *t_LR, int n, int N, int num_atom_types, int renbr,
+        int *cm_entries, int *max_cm_entries, int *realloc_cm_entries,
+        int *bonds, int *max_bonds, int *realloc_bonds,
+        int *hbonds, int *max_hbonds, int *realloc_hbonds )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int Htop;
-    int btop_i, ihb, jhb, ihb_top;
+    int Htop, btop_i, ihb, jhb, ihb_top;
+    int my_bonds, my_hbonds, my_cm_entries;
     int local, flag, flag2, flag3;
     real r_ij, cutoff;
     single_body_parameters *sbp_i, *sbp_j;
@@ -519,9 +633,9 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
 
     atom_i = &(my_atoms[i]);
     type_i = atom_i->type;
-    start_i = Dev_Start_Index( i, &far_nbrs );
-    end_i = Dev_End_Index( i, &far_nbrs );
-    btop_i = Dev_Start_Index( i, &bonds );
+    start_i = Dev_Start_Index( i, &far_nbrs_list );
+    end_i = Dev_End_Index( i, &far_nbrs_list );
+    btop_i = Dev_Start_Index( i, &bonds_list );
     sbp_i = &(sbp[type_i]);
 
     if ( i < n )
@@ -557,7 +671,7 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
 
         if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
         {
-            ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds );
+            ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds_list );
         }
         else
         {
@@ -568,7 +682,7 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
     /* update i-j distance - check if j is within cutoff */
     for ( pj = start_i; pj < end_i; ++pj )
     {
-        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        nbr_pj = &( far_nbrs_list.select.far_nbr_list[pj] );
         j = nbr_pj->nbr;
         atom_j = &(my_atoms[j]);
 
@@ -641,13 +755,13 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
             if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut
                     && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n ) 
             {
-                hbonds.select.hbond_list[ihb_top].nbr = j;
-                hbonds.select.hbond_list[ihb_top].scl = -1;
-                hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                hbonds_list.select.hbond_list[ihb_top].nbr = j;
+                hbonds_list.select.hbond_list[ihb_top].scl = -1;
+                hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj;
 
                 //CUDA SPECIFIC
-                hbonds.select.hbond_list[ihb_top].sym_index = -1;
-                rvec_MakeZero( hbonds.select.hbond_list[ihb_top].hb_f );
+                hbonds_list.select.hbond_list[ihb_top].sym_index = -1;
+                rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f );
 
                 ++ihb_top;
             }
@@ -727,21 +841,21 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
                      * atom j: H bonding atom */
                     if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                     {
-                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds_list.select.hbond_list[ihb_top].nbr = j;
 
                         if ( i < j )
                         {
-                            hbonds.select.hbond_list[ihb_top].scl = 1;
+                            hbonds_list.select.hbond_list[ihb_top].scl = 1;
                         }
                         else
                         {
-                            hbonds.select.hbond_list[ihb_top].scl = -1;
+                            hbonds_list.select.hbond_list[ihb_top].scl = -1;
                         }
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj;
 
                         //CUDA SPECIFIC
-                        hbonds.select.hbond_list[ihb_top].sym_index = -1;
-                        rvec_MakeZero( hbonds.select.hbond_list[ihb_top].hb_f );
+                        hbonds_list.select.hbond_list[ihb_top].sym_index = -1;
+                        rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f );
 
                         ++ihb_top;
                     }
@@ -750,13 +864,13 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
                     else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
                     {
                         //jhb_top = End_Index( atom_j->Hindex, hbonds );
-                        hbonds.select.hbond_list[ihb_top].nbr = j;
-                        hbonds.select.hbond_list[ihb_top].scl = -1;
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        hbonds_list.select.hbond_list[ihb_top].nbr = j;
+                        hbonds_list.select.hbond_list[ihb_top].scl = -1;
+                        hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj;
 
                         //CUDA SPECIFIC
-                        hbonds.select.hbond_list[ihb_top].sym_index = -1;
-                        rvec_MakeZero( hbonds.select.hbond_list[ihb_top].hb_f );
+                        hbonds_list.select.hbond_list[ihb_top].sym_index = -1;
+                        rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f );
 
                         ++ihb_top;
                     }
@@ -765,11 +879,10 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
 
             /* uncorrected bond orders */
             if ( nbr_pj->d <= control->bond_cut &&
-                    Dev_BOp( bonds, control->bo_cut, i, btop_i, nbr_pj,
+                    Dev_BOp( bonds_list, control->bo_cut, i, btop_i, nbr_pj,
                         sbp_i, sbp_j, twbp, workspace.dDeltap_self,
                         workspace.total_bond_order ) == TRUE )
             {
-                //num_bonds += 2;
                 ++btop_i;
 
                 /* TODO: Need to do later... since i and j are parallel */
@@ -785,15 +898,40 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
         }
     }
 
-    Dev_Set_End_Index( i, btop_i, &bonds );
+    Dev_Set_End_Index( i, btop_i, &bonds_list );
     H->end[i] = Htop;
 //    if( local == TRUE )
 //    {
         if ( control->hbond_cut > 0.0 && ihb_top > 0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) )
         {
-            Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds );
+            Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds_list );
         }
 //    }
+
+    my_bonds = btop_i - Dev_Start_Index( i, &bonds_list );
+    my_hbonds = ihb_top - Dev_Start_Index( atom_i->Hindex, &hbonds_list );
+    my_cm_entries = Htop - H->start[i];
+
+    /* copy (h)bond info to atom structure
+     * (needed for atom ownership transfer via MPI) */
+    my_atoms[i].num_bonds = my_bonds;
+    my_atoms[i].num_hbonds = my_hbonds;
+
+    /* reallocation checks */
+    if ( my_bonds > max_bonds[i] )
+    {
+        *realloc_bonds = TRUE;
+    }
+
+    if ( my_hbonds > max_hbonds[i] )
+    {
+        *realloc_hbonds = TRUE;
+    }
+
+    if ( my_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
 }
 
 
@@ -859,17 +997,21 @@ CUDA_GLOBAL void New_fix_sym_hbond_indices( reax_atom *my_atoms, reax_list hbond
     int nbr, nbrstart, nbrend;
     int start, end;
     hbond_data *ihbond, *jhbond;
-    int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warp_id = thread_id / __THREADS_PER_ATOM__;
-    int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
-    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
+    int __THREADS_PER_ATOM__;
+    int thread_id;
+    int warp_id;
+    int lane_id;
+
+    __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    warp_id = thread_id / __THREADS_PER_ATOM__;
 
     if ( warp_id > N )
     {
         return;
     }
 
+    lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
     i = warp_id;
     start = Dev_Start_Index( my_atoms[i].Hindex, &hbonds );
     end = Dev_End_Index( my_atoms[i].Hindex, &hbonds );
@@ -932,38 +1074,6 @@ CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n )
 }
 
 
-int Cuda_Validate_Lists( reax_system *system, storage *workspace,
-        reax_list **lists, control_params *control, 
-        int step, int numH )
-{
-    int blocks;
-    int ret;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    ret = SUCCESS;
-
-    k_update_bonds <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, *(*lists + BONDS), system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    if ( control->hbond_cut > 0.0 && system->numH > 0 )
-    {
-        k_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
-            ( system->d_my_atoms, *(*lists + HBONDS), system->n );
-        cudaThreadSynchronize( );
-        cudaCheckError( );
-    }
-
-    /* 3bodies list: since a more accurate estimate of the num.
-     * of three body interactions requires that bond orders have
-     * been computed, delay validation until for computation */
-
-    return ret;
-}
-
-
 CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
 {
     int i; 
@@ -997,23 +1107,69 @@ static void Print_Forces( reax_system *system )
 }
 
 
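+/* Print each local atom's hydrogen bond list entries and partial
+ * hydrogen bond forces (debugging aid) */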
+CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list p_hbonds, int n, int rank, int step )
+{
+    int i, k, pj, start, end; 
+    reax_list *hbonds;
+    hbond_data *hbond_list, *hbond_jk;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    hbonds = &( p_hbonds );
+    start = Dev_Start_Index( my_atoms[i].Hindex, hbonds );
+    end = Dev_End_Index( my_atoms[i].Hindex, hbonds );
+    hbond_list = hbonds->select.hbond_list;
+
+    for ( pj = start; pj < end; ++pj )
+    {
+        k = hbond_list[pj].nbr;
+        hbond_jk = &( hbond_list[pj] );
+
+        printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n",
+                rank, step, my_atoms[i].Hindex, k,
+                hbond_jk->hb_f[0],
+                hbond_jk->hb_f[1],
+                hbond_jk->hb_f[2] );
+    }
+}
+
+
+static void Print_HBonds( reax_system *system, int step )
+{
+    int blocks;
+
+    blocks = (system->n) / DEF_BLOCK_SIZE + 
+        (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
+
+    k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, *(*dev_lists + HBONDS), system->n, system->my_rank, step );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
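
/* The block-count arithmetic above is integer ceiling division,
 * ceil(n / DEF_BLOCK_SIZE), so that every atom is covered by a full block.
 * A hypothetical helper (not present in the code base) that states the
 * intent: */
static inline int calc_blocks( int n, int block_size )
{
    /* number of blocks of size block_size needed to cover n threads */
    return n / block_size + (n % block_size == 0 ? 0 : 1);
}
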
+
+
 CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, 
         reax_list bonds, real *total_bond_order, int N )
 {
-    int i, j, pj; 
+    int i, pj; 
+//    int j; 
     int start_i, end_i;
-    int type_i, type_j;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
+//    far_neighbor_data *nbr_pj;
+//    reax_atom *atom_i, *atom_j;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (i >= N)
+    if ( i >= N )
     {
         return;
     }
 
-    atom_i = &(my_atoms[i]);
+//    atom_i = &(my_atoms[i]);
     start_i = Dev_Start_Index(i, &far_nbrs);
     end_i = Dev_End_Index(i, &far_nbrs);
 
@@ -1068,51 +1224,75 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         reax_list **lists, output_controls *out_control ) 
 {
-    int i, ret, Htop;
+    int ret, ret_bonds, ret_hbonds, ret_cm;
     int blocks, hblocks;
 
-    ret = Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
-
-    if ( ret == SUCCESS )
-    {
-//        /* init the workspace (bond_mark) */
-//        cuda_memset( dev_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" );
+    /* init the workspace (bond_mark) */
+//    cuda_memset( dev_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" );
 //
-//        blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
-//           (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-//        k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
-//           ( system->n, (system->N - system->n), dev_workspace->bond_mark );
-//        cudaThreadSynchronize( );
-//        cudaCheckError( );
+//    blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
+//       (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+//    k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
+//       ( system->n, (system->N - system->n), dev_workspace->bond_mark );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+
+    /* main kernel */
+    blocks = (system->N) / DEF_BLOCK_SIZE + 
+        (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
+
+//    k_init_bond_orders <<< blocks, DEF_BLOCK_SIZE >>>
+//        ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS),
+//          dev_workspace->total_bond_order, system->N );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+//
+//    k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>>
+//        ( system->d_my_atoms, system->reax_param.d_sbp,
+//          (control_params *)control->d_control_params,
+//          *(*dev_lists + HBONDS), system->N );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+
+    /* reset reallocation flags on device */
+    cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_bonds" );
+    cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_hbonds" );
+    cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
+            "Cuda_Init_Forces::d_realloc_cm_entries" );
 
-        /* main kernel */
-        blocks = (system->N) / DEF_BLOCK_SIZE + 
-            (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
+    k_init_forces <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp,
+          system->reax_param.d_tbp, *dev_workspace,
+          (control_params *)control->d_control_params,
+          *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS),
+          *(*dev_lists + HBONDS), d_LR, system->n,
+          system->N, system->reax_param.num_atom_types,
+          (((data->step-data->prev_steps) % control->reneighbor) == 0),
+          system->d_cm_entries, system->d_max_cm_entries, system->d_realloc_cm_entries,
+          system->d_bonds, system->d_max_bonds, system->d_realloc_bonds,
+          system->d_hbonds, system->d_max_hbonds, system->d_realloc_hbonds );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-//        k_init_bond_orders <<< blocks, DEF_BLOCK_SIZE >>>
-//            ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS),
-//              dev_workspace->total_bond_order, system->N );
-//        cudaThreadSynchronize( );
-//        cudaCheckError( );
+    /* check reallocation flags on device */
+    copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
+    copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
+    copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );
 
-//        k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>>
-//            ( system->d_my_atoms, system->reax_param.d_sbp,
-//              (control_params *)control->d_control_params,
-//              *(*dev_lists + HBONDS), system->N );
-//        cudaThreadSynchronize( );
-//        cudaCheckError( );
+    ret = (ret_bonds == FALSE && ret_hbonds == FALSE && ret_cm == FALSE) ? SUCCESS : FAILURE;
 
-        k_init_forces <<< blocks, DEF_BLOCK_SIZE >>>
-            ( system->d_my_atoms, system->reax_param.d_sbp,
-              system->reax_param.d_tbp, *dev_workspace,
-              (control_params *)control->d_control_params,
-              *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS),
-              *(*dev_lists + HBONDS), d_LR, system->n,
-              system->N, system->reax_param.num_atom_types,
-              (((data->step-data->prev_steps) % control->reneighbor) == 0) );
-        cudaThreadSynchronize( );
-        cudaCheckError( );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "[INFO] p%d, step %d: ret = %d, ret_bonds = %d, ret_hbonds = %d, ret_cm = %d\n",
+            system->my_rank, data->step, ret, ret_bonds, ret_hbonds, ret_cm );
+#endif
 
+    if ( ret == SUCCESS )
+    {
         /* fix sym_index and dbond_index */
         New_fix_sym_dbond_indices <<< blocks, BLOCK_SIZE >>> 
             ( *(*dev_lists + BONDS), system->N );
@@ -1137,10 +1317,15 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 //            ( *(*dev_lists + BONDS), *dev_workspace, system->N );
 //        cudaThreadSynchronize( );
 //        cudaCheckError( );
+    }
+    else
+    {
+        Cuda_Estimate_Storages( system, control, dev_lists,
+               ret_bonds, ret_hbonds, ret_cm, data->step );
 
-        /* validate lists */
-        ret = Cuda_Validate_Lists( system, workspace, dev_lists, control,
-                data->step, system->numH );
+        dev_workspace->realloc.bonds = ret_bonds;
+        dev_workspace->realloc.hbonds = ret_hbonds;
+        dev_workspace->realloc.cm = ret_cm;
     }
 
     return ret;
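
/* Cuda_Init_Forces now fails soft: when any capacity is exceeded, it returns
 * FAILURE only after Cuda_Estimate_Storages has recorded updated sizes and
 * the workspace realloc flags are set. A sketch of the intended caller-side
 * handling (illustrative; the actual retry path runs through the
 * integrators): */
retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );

if ( retVal == FAILURE )
{
    /* grow bond / hbond / charge matrix storage per the new estimates,
     * then redo this step from the top */
    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
}
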
@@ -1160,12 +1345,14 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, 
         reax_list **lists, output_controls *out_control )
 {
-    int i, hbs, hnbrs_blocks, update_energy, ret;
+    int hbs, hnbrs_blocks, update_energy, ret;
     int *thbody;
     static int compute_bonded_part1 = FALSE;
-    real t_start, t_elapsed;
     real *spad = (real *) scratch;
     rvec *rvec_spad;
+#if defined(DEBUG)
+    real t_start, t_elapsed;
+#endif
 
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
@@ -1174,9 +1361,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     if ( compute_bonded_part1 == FALSE )
     {
         /* 1. Bond Order Interactions */
+#if defined(DEBUG)
         t_start = Get_Time( );
 
-#if defined(DEBUG)
         fprintf( stderr, " Begin Bonded Forces ... %d x %d\n",
                 BLOCKS_N, BLOCK_SIZE );
 #endif
@@ -1206,16 +1393,19 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaThreadSynchronize( );
         cudaCheckError( );
 
+#if defined(DEBUG)
         t_elapsed = Get_Timing_Info( t_start );
 
-#if defined(DEBUG)
         fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n",
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
 #endif
 
         /* 2. Bond Energy Interactions */
+#if defined(DEBUG)
         t_start = Get_Time( );
+#endif
+
         cuda_memset( spad, 0, system->N * (2 * sizeof(real)) , "scratch" );
 
         Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof(real)* BLOCK_SIZE >>>
@@ -1232,16 +1422,19 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                     system->n );
         }
 
+#if defined(DEBUG)
         t_elapsed = Get_Timing_Info( t_start );
 
-#if defined(DEBUG)
         fprintf( stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n",
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, "Cuda_Bond_Energy Done... \n" );
 #endif
 
         /* 3. Atom Energy Interactions */
+#if defined(DEBUG)
         t_start = Get_Time( );
+#endif
+
         cuda_memset( spad, 0, ( 6 * sizeof(real) * system->n ), "scratch" );
 
         Cuda_Atom_Energy <<< BLOCKS, BLOCK_SIZE >>>
@@ -1282,9 +1475,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                     system->n );
         }
 
+#if defined(DEBUG)
         t_elapsed = Get_Timing_Info( t_start );
 
-#if defined(DEBUG)
         fprintf( stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n",
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, "test_LonePair_postprocess Done... \n");
@@ -1294,7 +1487,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     }
 
     /* 4. Valence Angles Interactions */
+#if defined(DEBUG)
     t_start = Get_Time( );
+#endif
 
     thbody = (int *) scratch;
     ret = Cuda_Estimate_Storage_Three_Body( system, control, data->step,
@@ -1310,9 +1505,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 
     if ( ret == SUCCESS )
     {
-        Cuda_Init_Three_Body_Indices( thbody, system->total_bonds );
+        Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices );
 
         cuda_memset( spad, 0, 6 * sizeof(real) * system->N + sizeof(rvec) * system->N * 2, "scratch" );
+
         Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
             ( system->d_my_atoms, system->reax_param.d_gp, 
               system->reax_param.d_sbp, system->reax_param.d_thbp, 
@@ -1367,19 +1563,22 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaThreadSynchronize( );
         cudaCheckError( );
 
+#if defined(DEBUG)
         t_elapsed = Get_Timing_Info( t_start );
 
-#if defined(DEBUG)
         fprintf( stderr, "Three_Body_Interactions ...  Timing %lf \n",
                 t_elapsed );
         fprintf( stderr, "Three_Body_Interactions Done... \n" );
 #endif
 
         /* 5. Torsion Angles Interactions */
+#if defined(DEBUG)
         t_start = Get_Time( );
-        cuda_memset( spad, 0,
-                4 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2,
+#endif
+
+        cuda_memset( spad, 0, 4 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2,
                 "scratch" );
+
         Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>>
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
               (control_params *) control->d_control_params, *(*dev_lists + BONDS),
@@ -1426,9 +1625,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaThreadSynchronize( );
         cudaCheckError( );
 
+#if defined(DEBUG)
         t_elapsed = Get_Timing_Info( t_start );
 
-#if defined(DEBUG)
         fprintf( stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n",
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, " Four_Body_ Done... \n");
@@ -1437,7 +1636,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         /* 6. Hydrogen Bonds Interactions */
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
         {
+#if defined(DEBUG)
             t_start = Get_Time( );
+#endif
+
             cuda_memset( spad, 0,
                     2 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" );
 
@@ -1452,10 +1654,15 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                       (control_params *) control->d_control_params,
                       *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + HBONDS),
                       system->n, system->reax_param.num_atom_types,
-                      spad, (rvec *) (spad + 2 * system->n) );
+                      spad, (rvec *) (spad + 2 * system->n), system->my_rank, data->step );
             cudaThreadSynchronize( );
             cudaCheckError( );
 
+//            if ( data->step == 10 )
+//            {
+//                Print_HBonds( system, data->step );
+//            }
+
             /* reduction for E_HB */
             if ( update_energy == TRUE )
             {
@@ -1498,9 +1705,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             cudaThreadSynchronize( );
             cudaCheckError( );
 
+#if defined(DEBUG)
             t_elapsed = Get_Timing_Info( t_start );
 
-#if defined(DEBUG)
             fprintf( stderr,
                     "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n",
                     cudaGetLastError( ), t_elapsed );
@@ -1564,6 +1771,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int charge_flag, retVal;
+    static int init_forces_done = FALSE;
 
 #if defined(LOG_PERFORMANCE)
     real t_start = 0;
@@ -1587,13 +1795,21 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         charge_flag = FALSE;
     }
 
-    if ( charge_flag == TRUE )
-    {
-        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
-    }
-    else
+    retVal = SUCCESS;
+
+    if ( init_forces_done == FALSE )
     {
-        retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
+        if ( charge_flag == TRUE )
+        {
+            retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
+        }
+        else
+        {
+            retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
+        }
+
+        if ( retVal == SUCCESS )
+        {
+            init_forces_done = TRUE;
+        }
     }
 
     if ( retVal == SUCCESS )
@@ -1647,6 +1863,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
         MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
 #endif //PURE_REAX
 
         /********* nonbonded interactions ************/
@@ -1660,6 +1877,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
             Update_Timing_Info( &t_start, &(data->timing.nonb) );
         }
 #endif
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
                 system->my_rank, data->step );
@@ -1676,6 +1894,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
             Update_Timing_Info( &t_start, &(data->timing.bonded) );
         }
 #endif
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d @ step%d: total forces computed\n",
                 system->my_rank, data->step );
@@ -1683,6 +1902,8 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         MPI_Barrier( MPI_COMM_WORLD );
 
 #endif
+
+        init_forces_done = FALSE;
     }
 
     return retVal;
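
/* The static flags introduced above (init_forces_done here, and
 * verlet_part1_done / far_nbrs_done in the integrators) all implement the
 * same re-entry idiom: a phase that already succeeded is skipped when the
 * step is retried after a reallocation. A minimal self-contained sketch,
 * with hypothetical do_phase1/do_phase2 helpers: */
int compute_step( void )
{
    static int phase1_done = FALSE;
    int ret;

    ret = SUCCESS;

    if ( phase1_done == FALSE )
    {
        ret = do_phase1( );

        if ( ret == SUCCESS )
        {
            phase1_done = TRUE;
        }
    }

    if ( ret == SUCCESS )
    {
        ret = do_phase2( );
    }

    if ( ret == SUCCESS )
    {
        /* full step done: reset so the next step redoes phase 1 */
        phase1_done = FALSE;
    }

    return ret;
}
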
diff --git a/PG-PuReMD/src/cuda/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h
index b618fa29..94d3b73f 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.h
+++ b/PG-PuReMD/src/cuda/cuda_forces.h
@@ -10,8 +10,16 @@ extern "C" {
 #endif
 
 
-int Cuda_Estimate_Storages( reax_system *, control_params *, reax_list **,
-        sparse_matrix *, int );
+void Cuda_Init_HBond_Indices( reax_system * );
+
+void Cuda_Init_Bond_Indices( reax_system * );
+
+void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * );
+
+void Cuda_Init_Three_Body_Indices( int *, int );
+
+void Cuda_Estimate_Storages( reax_system *, control_params *, reax_list **,
+        int, int, int, int );
 
 int Cuda_Estimate_Storage_Three_Body( reax_system *, control_params *,
         int, reax_list **, int *, int * );
@@ -22,9 +30,6 @@ int Cuda_Init_Forces( reax_system *, control_params *, simulation_data *,
 int Cuda_Init_Forces_No_Charges( reax_system *, control_params *, simulation_data *,
         storage *, reax_list **, output_controls * );
 
-int Cuda_Validate_Lists( reax_system *, storage *, reax_list **, control_params *,
-        int, int, int, int );
-
 int Cuda_Compute_Bonded_Forces( reax_system *, control_params *, simulation_data *,
         storage *, reax_list **, output_controls * );
 
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index 18cdbb57..9043e424 100644
--- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -33,7 +33,7 @@
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameters *sbp, 
         hbond_parameters *d_hbp, global_parameters gp, control_params *control, 
         storage p_workspace, reax_list p_bonds, reax_list p_hbonds, int n, 
-        int num_atom_types, real *data_e_hb, rvec *data_ext_press )
+        int num_atom_types, real *data_e_hb, rvec *data_ext_press, int rank, int step )
 {
     int i, j, k, pi, pk;
     int type_i, type_j, type_k;
@@ -41,7 +41,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
     int hblist[MAX_BONDS];
     int itr, top;
     ivec rel_jk;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, force, ext_press;
@@ -53,9 +53,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
     bond_data *bond_list;
     hbond_data *hbond_list, *hbond_jk;
     storage *workspace;
-#if defined(DEBUG)
-    int num_hb_intrs;
-#endif
 
     j = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -69,15 +66,12 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
     hbonds = &( p_hbonds );
     hbond_list = hbonds->select.hbond_list;
     workspace = &( p_workspace );
-#if defined(DEBUG)
-    num_hb_intrs = 0;
-#endif
 
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
      * here j is H atom and there has to be some bond between i and j.
      * Hydrogen bond is between j and k.
      * so in this function i->X, j->H, k->Z when we map 
-     * variables onto the ones in the handout.*/
+     * variables onto the ones in the handout. */
     //for( j = 0; j < system->n; ++j )
     if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM )
     {
@@ -89,14 +83,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
 
         top = 0;
         /* search bonded atoms to atom j (i.e., hydrogen atom) for potential hydrogen bonding */
-        for( pi = start_j; pi < end_j; ++pi )
+        for ( pi = start_j; pi < end_j; ++pi )
         {
             pbond_ij = &( bond_list[pi] );
             i = pbond_ij->nbr;
             bo_ij = &(pbond_ij->bo_data);
             type_i = my_atoms[i].type;
 
-            if( sbp[type_i].p_hbond == H_BONDING_ATOM && 
+            if ( sbp[type_i].p_hbond == H_BONDING_ATOM && 
                     bo_ij->BO >= HB_THRESHOLD )
             {
                 hblist[top++] = pi;
@@ -130,12 +124,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
                 {
                     bo_ij = &(pbond_ij->bo_data);
                     type_i = my_atoms[i].type;
-                    r_ij = pbond_ij->d;         
-                    hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
-
-#if defined(DEBUG)
-                    ++num_hb_intrs;
-#endif
+                    hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
 
                     Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &theta, &cos_theta );
@@ -144,15 +133,30 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
                             &dcos_theta_di, &dcos_theta_dj, 
                             &dcos_theta_dk );
 
+//                    if ( j == 0 && k == 36 && step == 10 && rank == 0 )
+//                    {
+//                        printf( "[0] p%05d, top = %d, itr = %d, MAX_BONDS = %d\n", rank, top, itr, MAX_BONDS );
+//                        printf( "[1] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k, theta, cos_theta, dcos_theta_di,
+//                                dcos_theta_dj, dcos_theta_dk );
+//                    }
+
                     /* hydrogen bond energy */
                     sin_theta2 = SIN( theta / 2.0 );
-                    sin_xhz4 = SQR(sin_theta2);
+                    sin_xhz4 = SQR( sin_theta2 );
                     sin_xhz4 *= sin_xhz4;
                     cos_xhz1 = ( 1.0 - cos_theta );
                     exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
                     exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
                                 r_jk / hbp->r0_hb - 2.0 ) );
 
+//                    if ( j == 0 && k == 36 && step == 10 && rank == 0 )
+//                    {
+//                        printf( "[2] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k,
+//                                sin_theta2, sin_xhz4, cos_xhz1, exp_hb2, exp_hb3 );
+//                        printf( "[3] p%05d %05d, %05d: %12.5f %12.5f %12.5f\n", rank, j, k,
+//                                hbp->p_hb3, hbp->r0_hb, r_jk );
+//                    }
+
                     e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
                     data_e_hb[j] += e_hb;
 
@@ -161,6 +165,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
                     CEhb3 = -hbp->p_hb3 * 
                         (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
 
+//                    if ( j == 0 && k == 36 && step == 10 && rank == 0 )
+//                        printf( "[4] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k,
+//                                e_hb, data_e_hb[j], CEhb1, CEhb2, CEhb3 );
+
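
/* For reference, the hydrogen bond term assembled above is
 *
 *     E_HB = p_hb1 * (1 - exp(-p_hb2 * BO_ij))
 *                  * exp(-p_hb3 * (r0_hb / r_jk + r_jk / r0_hb - 2))
 *                  * sin^4(theta / 2)
 *
 * where theta is the X-H...Z angle and BO_ij is the bond order of the
 * covalent X-H bond. CEhb1, CEhb2, and CEhb3 hold its derivatives with
 * respect to BO_ij, cos(theta), and r_jk, which the code below turns into
 * bond order derivative and force contributions on atoms i, j, and k. */
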
                     /*fprintf( stdout, 
                       "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
                       system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
@@ -173,7 +181,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
 
                     if ( control->virial == 0 )
                     {
-                        // dcos terms
+                        /* dcos terms */
                         //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
                         //atomic_rvecScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di );
                         rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
@@ -184,7 +192,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
                         //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
                         rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
 
-                        // dr terms
+                        /* dr terms */
                         rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
 
                         //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
@@ -209,7 +217,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter
                         rvec_Add( hbond_jk->hb_f, force );
                         rvec_iMultiply( ext_press, rel_jk, force );
                         rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
-                        // dr terms
+                        /* dr terms */
                         rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
 
                         rvec_Scale( force, CEhb3 / r_jk, dvec_jk );
@@ -271,7 +279,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame
     real *sh_hb = t_hb;
     real *sh_cdbo = t_hb + blockDim.x;
     rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
-    rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
+    rvec *sh_hf = (rvec *)(sh_atomf + blockDim.x);
 #endif
     int __THREADS_PER_ATOM__, thread_id, group_id, lane_id; 
     int i, j, k, pi, pk;
@@ -282,7 +290,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame
     int itr, top;
     int loopcount, count;
     ivec rel_jk;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
     rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
     rvec dvec_jk, force, ext_press;
@@ -294,9 +302,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame
     bond_data *bond_list;
     hbond_data *hbond_list, *hbond_jk;
     storage *workspace;
-#if defined(DEBUG)
-    int num_hb_intrs;
-#endif
 
     __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM;
     thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -314,9 +319,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame
     hbonds = &( p_hbonds );
     hbond_list = hbonds->select.hbond_list;
     j = group_id;
-#if defined(DEBUG)
-    num_hb_intrs = 0;
-#endif
 
     /* loops below discover the Hydrogen bonds between i-j-k triplets.
        here j is H atom and there has to be some bond between i and j.
@@ -404,13 +406,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame
                 {
                     bo_ij = &(pbond_ij->bo_data);
                     type_i = my_atoms[i].type;
-                    r_ij = pbond_ij->d;         
                     hbp = &(d_hbp[ index_hbp(type_i,type_j,type_k,num_atom_types) ]);
 
-#if defined(DEBUG)
-                    ++num_hb_intrs;
-#endif
-
                     Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &theta, &cos_theta );
                     /* the derivative of cos(theta) */
@@ -645,7 +642,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms,
 #else
     extern __shared__ rvec __f[];
 #endif
-    int i, pj,j;
+    int i, pj;
     int start, end;
     storage *workspace;
     hbond_data *nbr_pj, *sym_index_nbr;
@@ -667,7 +664,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms,
     while ( pj < end )
     {
         nbr_pj = &( hbonds->select.hbond_list[pj] );
-        j = nbr_pj->nbr;
 
         sym_index_nbr = &(hbonds->select.hbond_list[ nbr_pj->sym_index ]);
 
@@ -743,7 +739,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms,
 #else
     extern __shared__ rvec __f[];
 #endif
-    int i, pj,j;
+    int i, pj;
     int start, end;
     storage *workspace;
     hbond_data *nbr_pj, *sym_index_nbr;
@@ -778,7 +774,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms,
     while ( pj < end )
     {
         nbr_pj = &(hbonds->select.hbond_list[pj]);
-        j = nbr_pj->nbr;
 
         sym_index_nbr = &(hbonds->select.hbond_list[ nbr_pj->sym_index ]);
 #if defined(__SM_35__)
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
index 606196b4..aa09d2f7 100644
--- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
@@ -25,24 +25,24 @@
 #include "../reax_types.h"
 
 
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *,
-        storage, reax_list );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *,
-        storage, reax_list, int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *,
-        storage, reax_list, int );
-
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
         single_body_parameters *, hbond_parameters *,
         global_parameters, control_params *, storage ,
-        reax_list, reax_list, int, int, real *, rvec * );
+        reax_list, reax_list, int, int, real *, rvec *, int, int );
 
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
         single_body_parameters *, hbond_parameters *,
         global_parameters , control_params *, storage,
         reax_list, reax_list, int, int, real *, rvec * );
 
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *,
+        storage, reax_list );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *,
+        storage, reax_list, int );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index 3162e732..b115f287 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -56,8 +56,7 @@ int Cuda_Init_System( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         mpi_datatypes *mpi_data, char *msg )
 {
-    int i, ret;
-    reax_atom *atom;
+    int i;
     int nrecv[MAX_NBRS];
 
     Setup_New_Grid( system, control, MPI_COMM_WORLD );
@@ -111,7 +110,7 @@ int Cuda_Init_System( reax_system *system, control_params *control,
     /* initialize velocities so that desired init T can be attained */
     if ( !control->restart || (control->restart && control->random_vel) )
     {
-        Generate_Initial_Velocities( system, control->T_init );
+        Cuda_Generate_Initial_Velocities( system, control->T_init );
     }
 
     Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
@@ -228,15 +227,11 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control,
 }
 
 
-int Cuda_Init_Lists( reax_system *system, control_params *control,
+void Cuda_Init_Lists( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, reax_list **lists,
         mpi_datatypes *mpi_data, char *msg )
 {
-    int ret;
-    int Htop;
-   
-    /* ignore returned error, as system->d_max_far_nbrs was not valid */
-    ret = Cuda_Estimate_Neighbors( system, data->step );
+    Cuda_Estimate_Neighbors( system );
 
     Dev_Make_List( system->total_cap, system->total_far_nbrs,
             TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS );
@@ -253,27 +248,21 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
     Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
 
     /* estimate storage for bonds, hbonds, and sparse matrix */
-    Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
+    Cuda_Estimate_Storages( system, control, dev_lists,
+            TRUE, TRUE, TRUE, data->step );
 
     dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
     Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
 
-    //MATRIX CHANGES
-    //workspace->L = NULL;
-    //workspace->U = NULL;
-
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n",
-            system->my_rank, system->total_cm_entries, dev_workspace->H.m );
-    fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
-            system->my_rank, Htop,
-            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, space=%dMB\n",
+            system->my_rank, system->total_cm_entries,
+            (int)(system->total_cm_entries * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
 #endif
 
     if ( control->hbond_cut > 0.0 &&  system->numH > 0 )
     {
         Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
-
         Cuda_Init_HBond_Indices( system );
 
 #if defined(DEBUG_FOCUS)
@@ -285,7 +274,6 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
 
     /* bonds list */
     Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
-
     Cuda_Init_Bond_Indices( system );
 
 #if defined(DEBUG_FOCUS)
@@ -295,10 +283,8 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
 #endif
 
     /* 3bodies list: since a more accurate estimate of the num.
-     * of three body interactions requires that bond orders have
+     * of three body interactions requires that bond orders have
      * been computed, delay estimation until force computation */
-
-    return SUCCESS;
 }
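
/* List construction above settles into one estimate / allocate /
 * init-indices rhythm per device structure: Cuda_Estimate_Storages fills
 * per-atom and total max counts (its TRUE, TRUE, TRUE arguments request
 * fresh bond, hbond, and charge matrix estimates in one pass), each list is
 * then sized to its total, and a Cuda_Init_*_Indices call lays down per-atom
 * start/end offsets so each kernel thread can write its atom's entries
 * independently. Sketched for a placeholder list L (total_L, TYP_L, and
 * Cuda_Init_L_Indices are illustrative names): */
Cuda_Estimate_Storages( system, control, dev_lists, TRUE, TRUE, TRUE,
        data->step );
Dev_Make_List( system->total_cap, total_L, TYP_L, *dev_lists + L );
Cuda_Init_L_Indices( system );
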
 
 
@@ -308,7 +294,6 @@ void Cuda_Initialize( reax_system *system, control_params *control,
         mpi_datatypes *mpi_data )
 {
     char msg[MAX_STR];
-    real t_start, t_end;
 
     /* HOST/DEVICE SCRATCH */
     Cuda_Init_ScratchArea( );
@@ -348,20 +333,11 @@ void Cuda_Initialize( reax_system *system, control_params *control,
     fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
 #endif
 
-    //Sync the taper here from host to device.
-
     /* CONTROL */
     dev_alloc_control( control );
 
     /* LISTS */
-    if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
-            FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
+    Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg );
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index 77203263..e1de784a 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -284,7 +284,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int steps, renbr, ret;
-    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE;
     real dt;
 #if defined(DEBUG)
     real t_over_start, t_over_elapsed;
@@ -300,11 +300,10 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control,
     renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
     ret = SUCCESS;
 
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
     if ( verlet_part1_done == FALSE )
     {
         update_velocity_part1( system, dt );
+
         verlet_part1_done = TRUE;
 
 #if defined(DEBUG_FOCUS)
@@ -321,43 +320,38 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control,
         Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
         Sync_Atoms( system );
 
-        /* synch the Grid to the Device here */
+        /* sync grid to device */
         Sync_Grid( &system->my_grid, &system->d_my_grid );
 
         init_blocks( system );
-
-#if defined(__CUDA_DEBUG_LOG__)
-        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
-                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
-#endif
     }
 
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
     Cuda_Reset( system, control, data, workspace, lists );
 
-    if ( renbr )
+    if ( renbr && far_nbrs_done == FALSE )
     {
 #if defined(DEBUG)
         t_over_start  = Get_Time( );
 #endif
 
-        if ( estimate_nbrs_done == 0 )
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
+        if ( ret != SUCCESS )
         {
-            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
-            ret = Cuda_Estimate_Neighbors( system, data->step );
-            estimate_nbrs_done = 1;
+            Cuda_Estimate_Neighbors( system );
         }
-
-        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        if ( ret == SUCCESS )
         {
-            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-            estimate_nbrs_done = 2;
+            far_nbrs_done = TRUE;
+        }
     
 #if defined(DEBUG)
-            t_over_elapsed = Get_Timing_Info( t_over_start );
-            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
-                    system->my_rank, data->step, t_over_elapsed );
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
 #endif
-        }
     }
 
     if ( ret == SUCCESS )
@@ -371,7 +365,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control,
         update_velocity_part2( system, dt );
 
         verlet_part1_done = FALSE;
-        estimate_nbrs_done = 0;
+        far_nbrs_done = FALSE;
     }
     
 #if defined(DEBUG_FOCUS)
@@ -389,7 +383,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
 {
     int itr, steps, renbr, ret;
     real *d_my_ekin, *d_total_my_ekin;
-    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE;
     real dt, dt_sqr;
     real my_ekin, new_ekin;
     real G_xi_new, v_xi_new, v_xi_old;
@@ -406,8 +400,6 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
     steps = data->step - data->prev_steps;
     renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
 
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
     if ( verlet_part1_done == FALSE )
     {
         nhNVT_update_velocity_part1( system, dt );
@@ -431,46 +423,44 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
         Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
         Sync_Atoms( system );
 
-        /* synch the Grid to the Device here */
+        /* sync grid to device */
         Sync_Grid( &system->my_grid, &system->d_my_grid );
 
         init_blocks( system );
     }
 
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
     Cuda_Reset( system, control, data, workspace, lists );
 
-    if ( renbr )
+    if ( renbr && far_nbrs_done == FALSE )
     {
 #if defined(DEBUG)
         t_over_start  = Get_Time( );
 #endif
 
-        if ( estimate_nbrs_done == 0 )
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
+        if ( ret != SUCCESS )
         {
-            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
-            ret = Cuda_Estimate_Neighbors( system, data->step );
-            fprintf( stderr, "Cuda_Estimate_Neighbors (%d)\n", ret );
-            estimate_nbrs_done = 1;
+            Cuda_Estimate_Neighbors( system );
         }
-
-        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        if ( ret == SUCCESS )
         {
-            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-            estimate_nbrs_done = 2;
-    
+            far_nbrs_done = TRUE;
+        }
+
 #if defined(DEBUG)
-            t_over_elapsed = Get_Timing_Info( t_over_start );
-            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
-                    system->my_rank, data->step, t_over_elapsed );
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
 #endif
-        }
     }
 
     if ( ret == SUCCESS )
     {
         ret = Cuda_Compute_Forces( system, control, data, workspace,
                 lists, out_control, mpi_data );
-        fprintf( stderr, "Cuda_Compute_Forces (%d)\n", ret );
     }
 
     if ( ret == SUCCESS )
@@ -514,7 +504,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
                 "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_my_ekin" );
 
         verlet_part1_done = FALSE;
-        estimate_nbrs_done = 0;
+        far_nbrs_done = FALSE;
     }
     
 #if defined(DEBUG_FOCUS)
@@ -534,7 +524,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int steps, renbr, ret;
-    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE;
     real dt, lambda;
 #if defined(DEBUG)
     real t_over_start, t_over_elapsed;
@@ -550,12 +540,11 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
     renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
     ret = SUCCESS;
 
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
     if ( verlet_part1_done == FALSE )
     {
         /* velocity verlet, 1st part */
         update_velocity_part1( system, dt );
+
         verlet_part1_done = TRUE;
 
 #if defined(DEBUG_FOCUS)
@@ -563,6 +552,8 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
         MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
+        Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
         if ( renbr )
         {
             Update_Grid( system, control, mpi_data->world );
@@ -576,39 +567,38 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
         Sync_Grid( &system->my_grid, &system->d_my_grid );
 
         init_blocks( system );
-
-#if defined(__CUDA_DEBUG_LOG__)
-        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
-                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
-#endif
+
+        Cuda_Reset( system, control, data, workspace, lists );
     }
+    else
+    {
+        Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
     
-    Cuda_Reset( system, control, data, workspace, lists );
+        Cuda_Reset( system, control, data, workspace, lists );
+    }
 
-    if ( renbr )
+    if ( renbr && far_nbrs_done == FALSE )
     {
 #if defined(DEBUG)
-        t_over_start  = Get_Time ();
+        t_over_start  = Get_Time( );
 #endif
 
-        if ( estimate_nbrs_done == 0 )
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
+        if ( ret != SUCCESS )
         {
-            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
-            ret = Cuda_Estimate_Neighbors( system, data->step );
-            estimate_nbrs_done = 1;
+            Cuda_Estimate_Neighbors( system );
         }
-
-        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        if ( ret == SUCCESS )
         {
-            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-            estimate_nbrs_done = 2;
-    
+            far_nbrs_done = TRUE;
+        }
+
 #if defined(DEBUG)
-            t_over_elapsed  = Get_Timing_Info( t_over_start );
-            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
-                    system->my_rank, data->step, t_over_elapsed );
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
 #endif
-        }
     }
 
     if ( ret == SUCCESS )
@@ -653,7 +643,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
 #endif
 
         verlet_part1_done = FALSE;
-        estimate_nbrs_done = 0;
+        far_nbrs_done = FALSE;
     }
 
     return ret;
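
/* All four integrators share this re-entrant neighbor list protocol;
 * repeated here with orientation comments (same logic as above): */
if ( renbr && far_nbrs_done == FALSE )
{
    /* optimistic attempt with the current far neighbor list capacity */
    ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );

    if ( ret != SUCCESS )
    {
        /* overflow: recount per-atom far neighbors so Cuda_ReAllocate
         * can grow the list when the step is retried */
        Cuda_Estimate_Neighbors( system );
    }

    if ( ret == SUCCESS )
    {
        /* list built; skip regeneration if a later phase forces a retry */
        far_nbrs_done = TRUE;
    }
}
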
@@ -668,7 +658,7 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int steps, renbr, ret;
-    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    static int verlet_part1_done = FALSE, far_nbrs_done = FALSE;
     real dt;
 #if defined(DEBUG)
     real t_over_start, t_over_elapsed;
@@ -684,11 +674,10 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
     renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
     ret = SUCCESS;
 
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
     if ( verlet_part1_done == FALSE )
     {
         update_velocity_part1( system, dt );
+
         verlet_part1_done = TRUE;
 
 #if defined(DEBUG_FOCUS)
@@ -711,32 +700,32 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
         init_blocks( system );
     }
 
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
     Cuda_Reset( system, control, data, workspace, lists );
 
-    if ( renbr )
+    if ( renbr && far_nbrs_done == FALSE )
     {
 #if defined(DEBUG)
         t_over_start  = Get_Time( );
 #endif
 
-        if ( estimate_nbrs_done == 0 )
+        ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
+        if ( ret != SUCCESS )
         {
-            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
-            ret = Cuda_Estimate_Neighbors( system, data->step );
-            estimate_nbrs_done = 1;
+            Cuda_Estimate_Neighbors( system );
         }
-
-        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        if ( ret == SUCCESS )
         {
-            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-            estimate_nbrs_done = 2;
+            far_nbrs_done = TRUE;
+        }
     
 #if defined(DEBUG)
-            t_over_elapsed = Get_Timing_Info( t_over_start );
-            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
-                    system->my_rank, data->step, t_over_elapsed );
+        t_over_elapsed = Get_Timing_Info( t_over_start );
+        fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                system->my_rank, data->step, t_over_elapsed );
 #endif
-        }
     }
 
     if ( ret == SUCCESS )
@@ -749,12 +738,12 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con
     {
         update_velocity_part2( system, dt );
 
-        verlet_part1_done = FALSE;
-        estimate_nbrs_done = 0;
-
         Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
         Cuda_Compute_Pressure( system, control, data, mpi_data );
         Cuda_Scale_Box( system, control, data, mpi_data );
+
+        verlet_part1_done = FALSE;
+        far_nbrs_done = FALSE;
     }
     
 #if defined(DEBUG_FOCUS)
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
index a0db127b..d47a0856 100644
--- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -570,18 +570,18 @@ void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
 
 void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
 {
-    int blocks;
+//    int blocks;
 
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+//    blocks = (n / DEF_BLOCK_SIZE) + 
+//        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
     cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );
 
-    //One thread per row implementation
-    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
+    /* one thread per row implementation */
+//    k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+//        ( *H, a, b, n );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
 
     //One warp per row implementation
 #if defined(__SM_35__)
@@ -628,16 +628,16 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
         simulation_data *data )
 {
-    int  i, j, n, N, matvecs, scale;
+    int i, n, matvecs, scale;
+//    int j, N;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
     rvec2 sig_old, sig_new;
     MPI_Comm comm;
     rvec2 *spad = (rvec2 *) host_scratch;
-    int a;
 
     n = system->n;
-    N = system->N;
+//    N = system->N;
     comm = mpi_data->world;
     matvecs = 0;
     scale = sizeof(rvec2) / sizeof(void);
@@ -679,7 +679,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     //originally we were using only H->n which was system->n (init_md.c)
     //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
     
-    Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
+    Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap );
 
 //  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
 
@@ -693,7 +693,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     
     copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
             cudaMemcpyDeviceToHost, "CG:q2:get" );
-    Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+    Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
     copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
             cudaMemcpyHostToDevice,"CG:q2:put" );
 
@@ -730,8 +730,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
 //#endif
 
-    my_sum[0] = my_sum[1] = 0;
-    Cuda_Norm (b, n, my_sum);
+    my_sum[0] = 0;
+    my_sum[1] = 0;
+    Cuda_Norm( b, n, my_sum );
 
 //  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
 
@@ -750,8 +751,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
 //#endif
 
-    my_dot[0] = my_dot[1] = 0;
-    Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
+    my_dot[0] = 0;
+    my_dot[1] = 0;
+    Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n );
 
 // fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
     
@@ -841,7 +843,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
-        my_dot[0] = my_dot[1] = 0;
+        my_dot[0] = 0;
+        my_dot[1] = 0;
 
 //#ifdef __CUDA_DEBUG__
 //    for( j = 0; j < system->n; ++j ) {
@@ -861,7 +864,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
 //#endif
 
-        my_dot[0] = my_dot[1] = 0;
+        my_dot[0] = 0;
+        my_dot[1] = 0;
         Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
 
         //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
@@ -971,9 +975,10 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
         *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
-    int  i, j, scale;
+    int i, scale;
+//    int j;
     real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
+    real sig_old, sig_new;
     real *spad = (real *) host_scratch;
 
     scale = sizeof(real) / sizeof(void);
@@ -1027,8 +1032,6 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
     sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
             mpi_data->world );
 
-    sig0 = sig_new;
-
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
@@ -1036,7 +1039,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
     }
 #endif
 
-    for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
+    for ( i = 1; i < 1000 && SQRT(sig_new) / b_norm > tol; ++i )
     {
         //MVAPICH2
         copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
@@ -1103,11 +1106,5 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
 #endif
     }
 
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
-    }
-
     return i;
 }
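
/* For orientation, the loop above performs one standard preconditioned
 * conjugate gradient iteration per pass; in host-side pseudo-C (the
 * Vector_* names are illustrative; q = H d from the matvec, p = M^-1 r
 * from the preconditioner):
 *
 *     tmp = Parallel_Dot( d, q, system->n, mpi_data->world ); // d^T H d
 *     alpha = sig_new / tmp;
 *     Vector_Add( x, alpha, d, system->n );        // x <- x + alpha d
 *     Vector_Add( r, -alpha, q, system->n );       // r <- r - alpha H d
 *     // p <- M^-1 r  (Jacobi: divide by the matrix diagonal)
 *     sig_old = sig_new;
 *     sig_new = Parallel_Dot( r, p, system->n, mpi_data->world );
 *     beta = sig_new / sig_old;
 *     Vector_Sum( d, 1.0, p, beta, d, system->n ); // d <- p + beta d
 *
 * The iteration stops once SQRT(sig_new) / b_norm <= tol or the new
 * 1000-iteration cap is reached; with the explicit failure message removed,
 * callers can still detect non-convergence from the returned count. */
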
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
index cb741571..9a328061 100644
--- a/PG-PuReMD/src/cuda/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -41,9 +41,9 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
     real exp_ovun2n, exp_ovun6, exp_ovun8;
     real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
     real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-    real p_lp1, p_lp2, p_lp3;
+    real p_lp2, p_lp3;
     real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-    single_body_parameters *sbp_i, *sbp_j;
+    single_body_parameters *sbp_i;
     two_body_parameters *twbp;
     bond_data *pbond;
     bond_order_data *bo_ij; 
@@ -59,7 +59,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
     storage *workspace = &( p_workspace );
 
     /* Initialize parameters */
-    p_lp1 = gp.l[15];
     p_lp3 = gp.l[5];
     p_ovun3 = gp.l[32];
     p_ovun4 = gp.l[31];
@@ -78,8 +77,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
     inv_expvd2 = 1. / (1. + expvd2 );
 
     /* calculate the energy */
-    data_elp [i] += e_lp = 
-        p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+    e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+    data_elp[i] += e_lp;
 
     dElp = p_lp2 * inv_expvd2 + 
         75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
@@ -117,14 +116,16 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
                     twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]);
                     bo_ij = &( bonds->select.bond_list[pj].bo_data );
                     Di = workspace->Delta[i];
-                    vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+                    vov3 = bo_ij->BO - Di - 0.040 * POW(Di, 4.);
 
                     if ( vov3 > 3. )
                     {
-                        data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0);
+                        e_lph = p_lp3 * SQR( vov3 - 3.0 );
+                        data_elp[i] += e_lph;
 
-                        deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-                        deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+                        deahu2dbo = 2. * p_lp3 * (vov3 - 3.);
+                        deahu2dsbo = 2. * p_lp3 * (vov3 - 3.) *
+                            (-1. - 0.16 * POW(Di, 3.));
 
                         bo_ij->Cdbo += deahu2dbo;
                         workspace->CdDelta[i] += deahu2dsbo;
@@ -167,18 +168,11 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
         j = bonds->select.bond_list[pj].nbr;
         type_j = my_atoms[j].type;
         bo_ij = &(bonds->select.bond_list[pj].bo_data);
-        sbp_j = &(sbp[ type_j ]);
         twbp = &(tbp[ index_tbp(type_i, type_j, num_atom_types )]);
 
         sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
         sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
             ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-        /*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n",
-          i+1, j+1,      
-          dfvl * workspace->Delta_lp_temp[j], 
-          sbp_j->nlp_opt,
-          workspace->nlp_temp[j] );*/
     }
 
     exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
@@ -192,7 +186,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
     DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8);
     CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-    data_eov [i] += e_ov = sum_ovun1 * CEover1;
+    e_ov = sum_ovun1 * CEover1;
+    data_eov[i] += e_ov;
 
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
         (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
@@ -212,8 +207,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
     inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
     inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-    data_eun [i] += e_un =
-        -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    data_eun[i] += e_un;
 
     CEunder1 = inv_exp_ovun2n * 
         ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
@@ -330,8 +325,9 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp,
 CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list p_bonds, 
         storage p_workspace, int n )
 {
-    int i,pj;
-    bond_data *pbond, *sbond;
+    int i, pj;
+    bond_data *sbond;
+//    bond_data *pbond;
     bond_data *sym_index_bond;
     reax_list *bonds;
     storage *workspace;
@@ -348,11 +344,9 @@ CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list p_bonds,
 
     for ( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
     {
-        /*
-           pbond = &(bonds->select.bond_list[pj]);
-           dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
-           workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta;
-         */
+//        pbond = &(bonds->select.bond_list[pj]);
+//        dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
+//        workspace->CdDelta[i] += dbond_index_bond->ae_CdDelta;
 
         sbond = &(bonds->select.bond_list[pj]);
         sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); 
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index f9c127d6..1ba30fa0 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -49,13 +49,14 @@ CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x )
 
 /* Generate far neighbor lists by scanning the atoms list and applying cutoffs */
 CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, 
-        simulation_box my_ext_box, grid g, reax_list far_nbrs, int n, int N )
+        simulation_box my_ext_box, grid g, reax_list far_nbrs_list, int n, int N,
+        int *far_nbrs, int *max_far_nbrs, int *realloc_far_nbrs )
 {
-    int i, j, k, l, m, itr, num_far;
+    int i, j, k, l, m, itr, num_far, my_num_far;
     real d, cutoff;
     ivec c, nbrs_x;
     rvec dvec;
-    far_neighbor_data *nbr_data;//, *my_start;
+    far_neighbor_data *nbr_data;
     reax_atom *atom1, *atom2;
 
     l = blockIdx.x * blockDim.x  + threadIdx.x;
@@ -65,8 +66,8 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
         return;
     }
 
-    atom1 = &(my_atoms[l]);
-    num_far = Dev_Start_Index( l, &far_nbrs );
+    atom1 = &( my_atoms[l] );
+    num_far = Dev_Start_Index( l, &far_nbrs_list );
 
     /* get the coordinates of the atom and compute the grid cell */
     if ( l < n )
@@ -132,7 +133,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
                     if ( d <= cutoff )
                     { 
                         /* commit far neighbor to list */
-                        nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
+                        nbr_data = &(far_nbrs_list.select.far_nbr_list[num_far]);
                         nbr_data->nbr = m;
                         nbr_data->d = SQRT( d );
                         rvec_Copy( nbr_data->dvec, dvec );
@@ -176,7 +177,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
                     if ( d <= cutoff )
                     {
                         /* commit far neighbor to list */
-                        nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
+                        nbr_data = &(far_nbrs_list.select.far_nbr_list[num_far]);
                         nbr_data->nbr = m;
                         nbr_data->d = SQRT( d );
                         rvec_Copy( nbr_data->dvec, dvec );
@@ -193,7 +194,14 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
         ++itr;
     }   
 
-    Dev_Set_End_Index( l, num_far, &far_nbrs );
+    Dev_Set_End_Index( l, num_far, &far_nbrs_list );
+
+    /* reallocation check: re-validate the number of far neighbors this
+     * atom just committed against its estimated per-atom capacity */
+    my_num_far = num_far - Dev_Start_Index( l, &far_nbrs_list );
+    if ( my_num_far > max_far_nbrs[l] )
+    {
+        *realloc_far_nbrs = TRUE;
+    }
 }
 
 
@@ -421,10 +429,10 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms,
 }
 
 
-void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, 
+int Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, 
         storage *workspace, reax_list **lists )
 {
-    int i, blocks;
+    int blocks, ret, ret_far_nbr;
 #if defined(LOG_PERFORMANCE)
     real t_start = 0, t_elapsed = 0;
 
@@ -434,12 +442,20 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
     }
 #endif
 
+    /* reset reallocation flag on device */
+    /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment
+     * to the provided literal */
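+    /* (safe here only because FALSE == 0, so zero-filling every byte
+     * still yields a valid int value) */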
+    cuda_memset( system->d_realloc_far_nbrs, FALSE, sizeof(int), 
+            "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
+
     /* one thread per atom implementation */
     blocks = (system->N / NBRS_BLOCK_SIZE) +
         ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+
     k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-          *(*dev_lists + FAR_NBRS), system->n, system->N );
+          *(*dev_lists + FAR_NBRS), system->n, system->N,
+          system->d_far_nbrs, system->d_max_far_nbrs, system->d_realloc_far_nbrs );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
@@ -454,6 +470,13 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
 //    cudaThreadSynchronize( );
 //    cudaCheckError( );
 
+    /* check reallocation flag on device */
+    copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
+
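+    /* map the device-side flag to the host return convention: SUCCESS
+     * lets the caller proceed, FAILURE triggers list reallocation and a
+     * retry of this step */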
+    ret = (ret_far_nbr == FALSE) ? SUCCESS : FAILURE;
+    dev_workspace->realloc.far_nbrs = ret_far_nbr;
+
 #if defined(LOG_PERFORMANCE)
     if( system->my_rank == MASTER_NODE )
     {
@@ -467,19 +490,20 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
             system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
+    return ret;
 }
 
 
 /* Estimate the number of far neighbors per atom (GPU) */
 CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms, 
         simulation_box my_ext_box, grid g, int n, int N, int total_cap,
-        int *far_nbrs, int *max_far_nbrs, int *realloc_far_nbrs )
+        int *far_nbrs, int *max_far_nbrs )
 {
     int i, j, k, l, m, itr, num_far;
     real d, cutoff;
     ivec c, nbrs_x;
     rvec dvec;
-    far_neighbor_data *nbr_data;
     reax_atom *atom1, *atom2;
 
     l = blockIdx.x * blockDim.x  + threadIdx.x;
@@ -604,13 +628,8 @@ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms,
         num_far = MIN_NBRS;
     }
 
-    if ( num_far > max_far_nbrs[l] )
-    {
-        max_far_nbrs[l] = MAX( (int)(num_far * SAFE_ZONE), MIN_NBRS );
-        *realloc_far_nbrs = TRUE;
-    }
-
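+    /* record this atom's estimate plus a padded capacity (SAFE_ZONE
+     * headroom, at least MIN_NBRS) later used to size the far neighbors list */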
     far_nbrs[l] = num_far;
+    max_far_nbrs[l] = MAX( (int)(num_far * SAFE_ZONE), MIN_NBRS );
 }
 
 
@@ -619,17 +638,9 @@ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms,
  * system: atomic system info
- * returns: SUCCESS if reallocation of the far neighbors list is necessary
- *  based on current per-atom far neighbor limits, FAILURE otherwise */
+ * side effects: updates the per-atom far neighbor estimates and padded
+ *  capacities on the device and the total far neighbor count on the host */
-int Cuda_Estimate_Neighbors( reax_system *system, int step )
+void Cuda_Estimate_Neighbors( reax_system *system )
 {
-    int blocks, ret, ret_far_nbr;
-    reax_list *far_nbrs;
-
-    ret = SUCCESS;
-
-    /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment
-     * to the provided literal */
-    cuda_memset( system->d_realloc_far_nbrs, FALSE, sizeof(int), 
-            "Cuda_Estimate_Neighbors::d_realloc_far_nbrs" );
+    int blocks;
 
     blocks = system->total_cap / DEF_BLOCK_SIZE + 
         ((system->total_cap % DEF_BLOCK_SIZE == 0) ? 0 : 1);
@@ -637,249 +648,12 @@ int Cuda_Estimate_Neighbors( reax_system *system, int step )
     k_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
           system->n, system->N, system->total_cap,
-          system->d_far_nbrs, system->d_max_far_nbrs, system->d_realloc_far_nbrs );
+          system->d_far_nbrs, system->d_max_far_nbrs );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    /* check reallocation flag on device */
-    copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_realloc_far_nbrs" );
-
-    if ( ret_far_nbr == TRUE )
-    {
-        Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
-                system->total_cap );
-
-        copy_host_device( &(system->total_far_nbrs), system->d_total_far_nbrs, sizeof(int), 
-                cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" );
-
-        if ( step > 0 )
-        {
-            dev_workspace->realloc.far_nbrs = TRUE;
-        }
-        ret = FAILURE;
-    }
-
-    return ret;
-}
-
-
-CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indices, int N )
-{
-    int i;
-
-    i = blockIdx.x * blockDim.x  + threadIdx.x;
-
-    if ( i >= N )
-    {
-        return;
-    }
-
-    end_indices[i] = indices[i] + intr_cnt[i];
-}
-
-
-CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N )
-{
-    int i;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= N )
-    {
-        return;
-    }
-
-    my_atoms[i].Hindex = i;
-}
-
-
-CUDA_GLOBAL void k_setup_hindex_part1( reax_atom *my_atoms, int * hindex, int n )
-{
-    int i;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    hindex[i] = my_atoms[i].Hindex;
-}
-
-
-CUDA_GLOBAL void k_setup_hindex_part2( reax_atom *my_atoms, int * hindex, int n )
-{
-    int i;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    if ( hindex[i + 1] - hindex[i] > 0 )
-    {
-        my_atoms[i].Hindex = hindex[i];
-    }
-    else
-    {
-        my_atoms[i].Hindex = -1;
-    }
-}
-
-
-CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp,
-        int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N )
-{
-    int i, hindex, my_hbonds;
-
-    i = blockIdx.x * blockDim.x  + threadIdx.x;
-
-    if ( i >= N )
-    {
-        return;
-    }
-
-    hindex = atoms[i].Hindex;
-
-    if ( sbp[ atoms[i].type ].p_hbond == H_ATOM || 
-            sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM )
-    {
-        my_hbonds = hbonds[i];
-        indices[hindex] = max_hbonds[i];
-        end_indices[hindex] = indices[hindex] + hbonds[i];
-    }
-    else
-    {
-        my_hbonds = 0;
-        indices[hindex] = 0;
-        end_indices[hindex] = 0;
-    }
-    atoms[i].num_hbonds = my_hbonds;
-
-//    hindex = atoms[i].Hindex;
-//
-//    if ( hindex >= 0 )
-//    {
-//        my_hbonds = hbonds[i];
-//        indices[hindex] = max_hbonds[i];
-//        end_indices[hindex] = indices[hindex] + hbonds[i];
-//    }
-//    else
-//    {
-//        my_hbonds = 0;
-//    }
-//    atoms[i].num_hbonds = my_hbonds;
-}
-
-
-/* Initialize indices for far neighbors list post reallocation
- *
- * system: atomic system info. */
-void Cuda_Init_Neighbor_Indices( reax_system *system )
-{
-    int blocks;
-    reax_list *far_nbrs = *dev_lists + FAR_NBRS;
-
-    /* init indices */
-    Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbrs->index, system->total_cap );
-
-    /* init end_indices */
-    blocks = system->N / DEF_BLOCK_SIZE + 
-        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_far_nbrs, far_nbrs->index, far_nbrs->end_index, system->N );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Init_HBond_Indices( reax_system *system )
-{
-    int blocks;
-    int *temp;
-    reax_list *hbonds = *dev_lists + HBONDS;
-
-    temp = (int *) scratch;
-//    cuda_memset( temp, 0, 2 * (system->N + 1) * sizeof(int), 
-//            "Cuda_Init_HBond_Indices::temp" );
-
-    /* init Hindices */
-    blocks = system->N / DEF_BLOCK_SIZE + 
-        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_setup_hindex <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->N );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-//    blocks = system->n / DEF_BLOCK_SIZE + 
-//        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-//
-//    k_setup_hindex_part1 <<< blocks, DEF_BLOCK_SIZE >>>
-//        ( system->d_my_atoms, temp, system->n );
-//    cudaThreadSynchronize( );
-//    cudaCheckError( );
-//
-//    Cuda_Scan_Excl_Sum( temp, temp + system->n + 1, system->n + 1 );
-//
-//    k_setup_hindex_part2 <<< blocks, DEF_BLOCK_SIZE >>>
-//        ( system->d_my_atoms, temp + system->n + 1, system->n );
-//    cudaThreadSynchronize( );
-//    cudaCheckError( );
-
-    /* init indices and end_indices */
-    Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap );
-
-    k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, 
-          hbonds->index, hbonds->end_index, system->N );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Init_Bond_Indices( reax_system *system )
-{
-    int blocks;
-    reax_list *bonds = *dev_lists + BONDS;
-
-    /* init indices */
-    Cuda_Scan_Excl_Sum( system->d_max_bonds, bonds->index, system->total_cap );
-
-    /* init end_indices */
-    blocks = system->N / DEF_BLOCK_SIZE + 
-        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_bonds, bonds->index, bonds->end_index, system->N );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
-{
-    int blocks;
-
-    /* init indices */
-    Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->N );
-
-    /* init end_indices */
-    blocks = system->N / DEF_BLOCK_SIZE + 
-        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_cm_entries, H->start, H->end, system->N );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Init_Three_Body_Indices( int *indices, int entries )
-{
-    reax_list *thbody = *dev_lists + THREE_BODIES;
-
-    Cuda_Scan_Excl_Sum( indices, thbody->index, entries );
+    Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
+            system->total_cap );
+    copy_host_device( &(system->total_far_nbrs), system->d_total_far_nbrs, sizeof(int), 
+            cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h
index 4d4a9c4e..d7bfc4b9 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.h
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.h
@@ -9,20 +9,12 @@
 extern "C" {
 #endif
 
-void Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** );
+int Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** );
 
-int Cuda_Estimate_Neighbors( reax_system *, int );
+void Cuda_Estimate_Neighbors( reax_system * );
 
 void Cuda_Init_Neighbor_Indices( reax_system * );
 
-void Cuda_Init_HBond_Indices( reax_system * );
-
-void Cuda_Init_Bond_Indices( reax_system * );
-
-void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * );
-
-void Cuda_Init_Three_Body_Indices( int *, int );
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_random.cu b/PG-PuReMD/src/cuda/cuda_random.cu
new file mode 100644
index 00000000..d0de3700
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_random.cu
@@ -0,0 +1,65 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_random.h"
+
+
+/* The system random number generator uses the linear congruence method
+   with a large period to generate pseudo-random numbers. Cuda_Random
+   returns such a number appropriately scaled so that
+   0 <= Cuda_Random(range) < range */
+CUDA_DEVICE double Cuda_Random( double range )
+{
+    //TODO: use cuRAND
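+//    a possible cuRAND-based sketch (assumes a per-thread curandState_t
+//    initialized elsewhere and passed in as a hypothetical parameter):
+//    return curand_uniform_double( &state ) * range;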
+//    return (random( ) * range) / 2147483647L;
+    return 0.0;
+}
+
+
+/* This function seeds the system pseudo random number generator with
+   the current time. Use this function once in the beginning to
+   initialize the system */
+void Cuda_Randomize( )
+{
+    //TODO: use cuRAND
+//    curandState_t state;
+//
+//    curand_init( time(NULL), 0, 0, &state );
+}
+
+
+/* Cuda_GRandom returns a random number drawn from a Gaussian
+   distribution with the given mean and standard deviation "sigma" */
+CUDA_DEVICE double Cuda_GRandom( double mean, double sigma )
+{
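+    /* Marsaglia polar method; note that while Cuda_Random( ) is stubbed
+     * to return 0.0, rsq always evaluates to 2.0 and this rejection loop
+     * would never terminate, so this is usable only once cuRAND lands */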
+    double v1 = Cuda_Random(2.0) - 1.0;
+    double v2 = Cuda_Random(2.0) - 1.0;
+    double rsq = v1 * v1 + v2 * v2;
+
+    while (rsq >= 1.0 || rsq == 0.0)
+    {
+        v1 = Cuda_Random(2.0) - 1.0;
+        v2 = Cuda_Random(2.0) - 1.0;
+        rsq = v1 * v1 + v2 * v2;
+    }
+
+    return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq);
+}
diff --git a/PG-PuReMD/src/cuda/cuda_random.h b/PG-PuReMD/src/cuda/cuda_random.h
new file mode 100644
index 00000000..388359c7
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_random.h
@@ -0,0 +1,44 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_RANDOM_H_
+#define __CUDA_RANDOM_H_
+
+#include "../reax_types.h"
+
+
+/* The system random number generator uses the linear congruence method
+   with a large period to generate pseudo-random numbers. Cuda_Random
+   returns such a number appropriately scaled so that
+   0 <= Cuda_Random(range) < range */
+CUDA_DEVICE double Cuda_Random( double );
+
+/* This function seeds the system pseudo random number generator with
+   the current time. Use this function once in the beginning to
+   initialize the system */
+void Cuda_Randomize( );
+
+/* Cuda_GRandom returns a random number drawn from a Gaussian
+   distribution with the given mean and standard deviation "sigma" */
+CUDA_DEVICE double Cuda_GRandom( double, double );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index a7ece36c..f67bdaf3 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -3,8 +3,10 @@
 
 #include "cuda_copy.h"
 #include "cuda_utils.h"
+#include "cuda_random.h"
 #include "cuda_reduction.h"
 #include "cuda_shuffle.h"
+#include "cuda_vector.h"
 
 #include "../tool_box.h"
 #include "../vector.h"
@@ -27,32 +29,35 @@ CUDA_GLOBAL void center_of_mass_blocks( single_body_parameters *sbp, reax_atom *
     rvec tmp;
     real m;
 
-    rvec_MakeZero (xcm [threadIdx.x]);
-    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
-    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
-    rvec_MakeZero (tmp);
+    rvec_MakeZero(xcm[threadIdx.x]);
+    rvec_MakeZero(vcm[vcm_id + threadIdx.x]);
+    rvec_MakeZero(amcm[amcm_id + threadIdx.x]);
+    rvec_MakeZero(tmp);
 
-    if (i < n){
+    if ( i < n )
+    {
         m = sbp [ atoms[i].type ].mass;
         rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
         rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
         rvec_Cross (tmp, atoms[i].x, atoms [i].v);
         rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
     }
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+    __syncthreads( );
 
-        if ((threadIdx.x < offset)) {
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 )
+    { 
+        if ((threadIdx.x < offset))
+        {
             index = threadIdx.x + offset;
             rvec_Add (xcm [threadIdx.x], xcm[index]);
             rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
             rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
         } 
-        __syncthreads ();
+        __syncthreads( );
     }
 
-    if ((threadIdx.x == 0)){
+    if ((threadIdx.x == 0))
+    {
         rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
         rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
         rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
@@ -883,6 +888,53 @@ extern "C" void dev_sync_simulation_data( simulation_data *data )
 }
 
 
+CUDA_GLOBAL void k_generate_initial_velocities( single_body_parameters *sbp, reax_atom *my_atoms, 
+        real T, int n )
+{
+    int i;
+    real m, scale, norm;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    if ( T <= 0.1 )
+    {
+        rvec_MakeZero( my_atoms[i].v );
+    }
+    else
+    {
+        cuda_rvec_Random( my_atoms[i].v );
+
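+        /* rescale the random direction so the atom's kinetic energy
+         * matches equipartition at temperature T: m * |v|^2 = 3 * K_B * T */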
+        norm = rvec_Norm_Sqr( my_atoms[i].v );
+        m = sbp[ my_atoms[i].type ].mass;
+        scale = SQRT( m * norm / (3.0 * K_B * T) );
+
+        rvec_Scale( my_atoms[i].v, 1. / scale, my_atoms[i].v );
+    }
+}
+
+
+void Cuda_Generate_Initial_Velocities( reax_system *system, real T )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    if ( T > 0.1 )
+    {
+        Cuda_Randomize( );
+    }
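+    /* note: Cuda_Randomize( ) is currently a stub pending cuRAND
+     * integration (see cuda_random.cu) */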
+
+    k_generate_initial_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, T, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
 void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
         MPI_Comm comm )
 {
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h
index ddf8ed4d..9877f6dd 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.h
+++ b/PG-PuReMD/src/cuda/cuda_system_props.h
@@ -23,6 +23,8 @@ void dev_sync_simulation_data( simulation_data * );
 
 void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
 
+void Cuda_Generate_Initial_Velocities( reax_system *, real );
+
 void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
 
 void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index 47c087d2..21cb33d6 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -165,9 +165,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
         reax_list p_thb_intrs, storage p_workspace, int n, int num_atom_types, 
         real *data_e_tor, real *data_e_con, rvec *data_ext_press )
 {
-    int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms;
+    int i, j, k, l, pi, pj, pk, pl, pij, plk;
     int type_i, type_j, type_k, type_l;
-    int start_j, end_j, start_k, end_k;
+    int start_j, end_j;
     int start_pj, end_pj, start_pk, end_pk;
     int num_frb_intrs = 0;
 
@@ -217,9 +217,6 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
     // sprintf( fname, "tor%d.out", system->my_rank );
     // ftor = fopen( fname, "w" );
 
-    //natoms = system->n;
-
-    //for( j = 0; j < natoms; ++j ) {
     type_j = my_atoms[j].type;
     Delta_j = workspace->Delta_boc[j];
     start_j = Dev_Start_Index(j, bonds);
@@ -236,10 +233,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
            where j is the central atom. Otherwise there is no point in
            trying to form a 4-body interaction out of this neighborhood */
         if ( my_atoms[j].orig_id < my_atoms[k].orig_id && 
-                bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) )
+                bo_jk->BO > control->thb_cut && Dev_Num_Entries(pk, thb_intrs) )
         {
-            start_k = Dev_Start_Index(k, bonds);
-            end_k = Dev_End_Index(k, bonds);               
             pj = pbond_jk->sym_index; // pj points to j on k's list
 
             /* do the same check as above: 
@@ -272,7 +267,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
                     bo_ij = &( pbond_ij->bo_data );
 
 
-                    if ( bo_ij->BO > control->thb_cut/*0*/ )
+                    if ( bo_ij->BO > control->thb_cut )
                     {
                         i = p_ijk->thb;
                         type_i = my_atoms[i].type;
@@ -312,8 +307,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
                             fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
 
                             if ( i != l && fbh->cnt && 
-                                    bo_kl->BO > control->thb_cut/*0*/ &&
-                                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ )
+                                    bo_kl->BO > control->thb_cut &&
+                                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut )
                             {
                                 ++num_frb_intrs;
                                 r_kl = pbond_kl->d;
@@ -340,7 +335,6 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
                                         -1., my_atoms[l].x );
                                 r_li = rvec_Norm( dvec_li );                 
 
-
                                 /* omega and its derivative */
                                 omega = Calculate_Omega( pbond_ij->dvec, r_ij,
                                         pbond_jk->dvec, r_jk, pbond_kl->dvec, r_kl,
@@ -365,7 +359,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
                                         fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
                                         fbp->V3 * (1.0 + cos3omega) );
 
-                                data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                data_e_tor[j] += e_tor;
 
                                 dfn11 = (-p_tor3 * exp_tor3_DjDk +
                                         (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
@@ -398,9 +393,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp,
 
                                 /* 4-body conjugation energy */
                                 fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                data_e_con [j] += e_con =
-                                    fbp->p_cot1 * fn12 * 
+                                e_con = fbp->p_cot1 * fn12 * 
                                     (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+                                data_e_con[j] += e_con;
 
                                 Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
                                     (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
diff --git a/PG-PuReMD/src/cuda/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu
index 7e1757bc..22ac8de6 100644
--- a/PG-PuReMD/src/cuda/cuda_utils.cu
+++ b/PG-PuReMD/src/cuda/cuda_utils.cu
@@ -11,7 +11,7 @@ extern "C" void cuda_malloc( void **ptr, size_t size, int mem_set, const char *m
     if ( retVal != cudaSuccess )
     {
         fprintf( stderr, "[ERROR] failed to allocate memory on device for resouce %s\n", msg );
-        fprintf( stderr, "CUDA API error code: %d, requested memory size (in bytes): %lu\n", 
+        fprintf( stderr, "    [INFO] CUDA API error code: %d, requested memory size (in bytes): %lu\n", 
                 retVal, size );
         exit( INSUFFICIENT_MEMORY );
     }  
@@ -23,7 +23,7 @@ extern "C" void cuda_malloc( void **ptr, size_t size, int mem_set, const char *m
         if( retVal != cudaSuccess )
         {
             fprintf( stderr, "[ERROR] failed to memset memory on device for resource %s\n", msg );
-            fprintf( stderr, "CUDA API error code: %d, requested memory size (in bytes): %lu\n", 
+            fprintf( stderr, "    [INFO] CUDA API error code: %d, requested memory size (in bytes): %lu\n", 
                     retVal, size );
             exit( INSUFFICIENT_MEMORY );
         }
@@ -45,9 +45,10 @@ extern "C" void cuda_free( void *ptr, const char *msg )
 
     if( retVal != cudaSuccess )
     {
-        fprintf( stderr,
-                "[WARNING] failed to release memory on device for resource %s\nCUDA API error code: %d, memory address: %ld\n", 
-                msg, retVal, (long int) ptr );
+        fprintf( stderr, "[WARNING] failed to release memory on device for resource %s\n",
+                msg );
+        fprintf( stderr, "    [INFO] CUDA API error code: %d, memory address: %ld\n", 
+                retVal, (long int) ptr );
         return;
     }  
 }
@@ -61,10 +62,9 @@ extern "C" void cuda_memset( void *ptr, int data, size_t count, const char *msg
 
     if( retVal != cudaSuccess )
     {
-        fprintf( stderr,
-                "[ERROR] failed to memset memory on device for resource %s\nCUDA API error code: %d\n", 
-                msg, retVal );
-        exit( INSUFFICIENT_MEMORY );
+        fprintf( stderr, "[ERROR] failed to memset memory on device for resource %s\n", msg );
+        fprintf( stderr, "    [INFO] CUDA API error code: %d\n", retVal );
+        exit( RUNTIME_ERROR );
     }
 }
 
@@ -86,7 +86,7 @@ extern "C" void copy_host_device( void *host, void *dev, size_t size,
     if( retVal != cudaSuccess )
     {
         fprintf( stderr,
-                "[ERROR] could not copy resource %s from host to device\nCUDA API error code: %d n",
+                "[ERROR] could not copy resource %s from host to device\n    [INFO] CUDA API error code: %d n",
                 msg, retVal );
         exit( INSUFFICIENT_MEMORY );
     }
@@ -102,7 +102,7 @@ extern "C" void copy_device( void *dest, void *src, size_t size, const char *msg
     if( retVal != cudaSuccess )
     {
         fprintf( stderr,
-                "[ERROR] could not copy resource %s from device to device\nCUDA API error code: %d\n",
+                "[ERROR] could not copy resource %s from device to device\n    [INFO] CUDA API error code: %d\n",
                 msg, retVal );
         exit( INSUFFICIENT_MEMORY );
     }
@@ -140,7 +140,7 @@ extern "C" void print_device_mem_usage( )
     if ( retVal != cudaSuccess )
     {
         fprintf( stderr,
-                "[WARNING] could not get message usage info from device\nCUDA API error code: %d\n",
+                "[WARNING] could not get message usage info from device\n    [INFO] CUDA API error code: %d\n",
                 retVal );
         return;
     }
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index 21b8d2c8..35e07fe0 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -37,7 +37,8 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
 {
     int i, j, pi, k, pk, t;
     int type_i, type_j, type_k;
-    int start_j, end_j, start_pk, end_pk;
+    int start_j, end_j;
+//    int start_pk, end_pk;
     int cnt, num_thb_intrs;
     real temp, temp_bo_jt, pBOjt7;
     real p_val1, p_val2, p_val3, p_val4, p_val5;
@@ -54,12 +55,12 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
     real Cf7ij, Cf7jk, Cf8j, Cf9j;
     real f7_ij, f7_jk, f8_Dj, f9_Dj;
     real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-    real r_ij, r_jk;
     real BOA_ij, BOA_jk;
     rvec force, ext_press;
     three_body_header *thbh;
     three_body_parameters *thbp;
-    three_body_interaction_data *p_ijk, *p_kji;
+    three_body_interaction_data *p_ijk;
+//    three_body_interaction_data *p_kji;
     bond_data *pbond_ij, *pbond_jk, *pbond_jt;
     bond_order_data *bo_ij, *bo_jk, *bo_jt;
     reax_list *bonds;
@@ -146,17 +147,14 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
         bo_ij = &(pbond_ij->bo_data);
         BOA_ij = bo_ij->BO - control->thb_cut;
 
-        //if( BOA_ij/*bo_ij->BO*/ > 0.0) {
         if ( BOA_ij > 0.0 &&
                 ( j < n || pbond_ij->nbr < n ) )
         {
             i = pbond_ij->nbr;
-            r_ij = pbond_ij->d;
             type_i = my_atoms[i].type;
 
-            /* first copy 3-body intrs from previously computed ones where i>k.
-               in the second for-loop below,
-               we compute only new 3-body intrs where i < k */
+            /* first copy 3-body intrs from previously computed ones where i > k;
+               in the second for-loop below, compute only new 3-body intrs where i < k */
 
             // The copy loop commented out because strange asynchronous issues started to surface
             // Each kernel now manually generates everything
@@ -234,7 +232,6 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                 if ( j < n && BOA_jk > 0.0 &&
                         bo_ij->BO * bo_jk->BO > SQR(control->thb_cut) )
                 {
-                    r_jk = pbond_jk->d;
                     thbh = &( d_thbh[ index_thbp(type_i, type_j, type_k, num_atom_types) ] );
 
                     for ( cnt = 0; cnt < thbh->cnt; ++cnt )
@@ -274,7 +271,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                             {
                                 expval12theta = p_val1 * (1.0 - expval2theta);
                             }
-                            // To avoid linear Me-H-Me angles (6/6/06)
+                            /* to avoid linear Me-H-Me angles (6/6/06) */
                             else
                             {
                                 expval12theta = p_val1 * -expval2theta;
@@ -340,13 +337,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
 
                             exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
 
-                            /* Similar to above comment regarding if statement */
+                            /* similar to above comment regarding if statement */
                             if ( pk < pi )
                             {
                                 e_coa =
                                     p_coa1 / (1. + exp_coa2) *
-                                    EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
-                                    EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
+                                    EXP( -p_coa3 * SQR(workspace->total_bond_order[i] - BOA_ij) ) *
+                                    EXP( -p_coa3 * SQR(workspace->total_bond_order[k] - BOA_jk) ) *
                                     EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
                                     EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
                                 data_e_coa[j] += e_coa;
@@ -356,9 +353,9 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                             CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
                             CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
                             CEcoa4 = -2 * p_coa3 *
-                                (workspace->total_bond_order[i]-BOA_ij) * e_coa;
+                                (workspace->total_bond_order[i] - BOA_ij) * e_coa;
                             CEcoa5 = -2 * p_coa3 *
-                                (workspace->total_bond_order[k]-BOA_jk) * e_coa;
+                                (workspace->total_bond_order[k] - BOA_jk) * e_coa;
                             /* END COALITION ENERGY */
 
                             /* FORCES */
@@ -562,7 +559,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess( reax_atom *atoms,
 CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms,
         control_params *control, reax_list p_bonds, int n, int N, int *count )
 {
-    int i, j, k, pi, pk;
+    int j, pi, pk;
     int start_j, end_j;
     int num_thb_intrs;
     real BOA_ij, BOA_jk;
@@ -590,12 +587,9 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms,
         bo_ij = &(pbond_ij->bo_data);
         BOA_ij = bo_ij->BO - control->thb_cut;
 
-        //if( BOA_ij/*bo_ij->BO*/ > 0.0) {
         if ( BOA_ij > 0.0 &&
                 ( j < n || pbond_ij->nbr < n ) )
         {
-            i = pbond_ij->nbr;
-
             for ( pk = start_j; pk < end_j; ++pk )
             {
                 if ( pk == pi )
diff --git a/PG-PuReMD/src/cuda/cuda_vector.h b/PG-PuReMD/src/cuda/cuda_vector.h
new file mode 100644
index 00000000..ef526938
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_vector.h
@@ -0,0 +1,41 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_VECTOR_H_
+#define __CUDA_VECTOR_H_
+
+#include "../reax_types.h"
+
+#include "cuda_random.h"
+
+
+CUDA_DEVICE static inline void cuda_rvec_Random( rvec v )
+{
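+    /* stub: yields the zero vector until Cuda_Random( ) is backed by
+     * cuRAND; note the zero norm then makes the velocity rescaling in
+     * k_generate_initial_velocities divide by zero */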
+//    v[0] = Cuda_Random( 2.0 ) - 1.0;
+//    v[1] = Cuda_Random( 2.0 ) - 1.0;
+//    v[2] = Cuda_Random( 2.0 ) - 1.0;
+    v[0] = 0.0;
+    v[1] = 0.0;
+    v[2] = 0.0;
+}
+
+
+#endif
diff --git a/PG-PuReMD/src/ffield.c b/PG-PuReMD/src/ffield.c
index 7252674c..4302e57c 100644
--- a/PG-PuReMD/src/ffield.c
+++ b/PG-PuReMD/src/ffield.c
@@ -31,7 +31,7 @@
 
 
 int Read_Force_Field( char *ffield_file, reax_interaction *reax,
-        control_params *control )
+        reax_system *system, control_params *control )
 {
     FILE *fp;
     char *s;
@@ -45,7 +45,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
     /* open force field file */
     if ( (fp = fopen( ffield_file, "r" ) ) == NULL )
     {
-        fprintf( stderr, "[ERROR] cannot open force filed file! terminating...\n" );
+        fprintf( stderr, "[ERROR] p%d: cannot open force field file! terminating...\n",
+              system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
     }
 
@@ -67,7 +68,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
     n = atoi(tmp[0]);
     if ( n < 1 )
     {
-        fprintf( stderr, "[WARNING] number of globals in ffield file is 0!\n" );
+        fprintf( stderr, "[WARNING] p%d: number of globals in ffield file is 0!\n",
+              system->my_rank );
         return SUCCESS;
     }
 
@@ -146,7 +148,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
         }
 
 #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "Atom Name in the force field : %s \n", reax->sbp[i].name );
+        fprintf( stderr, "p%d: Atom Name in the force field : %s \n",
+                system->my_rank, reax->sbp[i].name );
 #endif
 
         val = atof(tmp[1]);
@@ -233,20 +236,20 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 3 )
                 {
-                    fprintf( stderr, "[WARNING] inconsistent vdWaals-parameters\n"
+                    fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n"
                             "Force field parameters for element %s\n"
                             "indicate inner wall+shielding, but earlier\n"
                             "atoms indicate different vdWaals-method.\n"
                             "This may cause division-by-zero errors.\n"
                             "Keeping vdWaals-setting for earlier atoms.\n",
-                            reax->sbp[i].name );
+                            system->my_rank, reax->sbp[i].name );
                 }
                 else
                 {
                     reax->gp.vdw_type = 3;
 #if defined(DEBUG)
-                    fprintf( stderr, "vdWaals type for element %s: Shielding+inner-wall",
-                             reax->sbp[i].name );
+                    fprintf( stderr, "p%d: vdWaals type for element %s: Shielding+inner-wall",
+                            system->my_rank, reax->sbp[i].name );
 #endif
                 }
             }
@@ -255,7 +258,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 2 )
                 {
-                    fprintf( stderr, "[WARNING] inconsistent vdWaals-parameters\n" );
+                    fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n",
+                            system->my_rank );
                     fprintf( stderr, "    [INFO] Force field parameters for element %s\n", reax->sbp[i].name );
                     fprintf( stderr, "    [INFO] indicate inner wall without shielding, but earlier\n" );
                     fprintf( stderr, "    [INFO] atoms indicate different vdWaals-method.\n" );
@@ -266,8 +270,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
                 {
                     reax->gp.vdw_type = 2;
 #if defined(DEBUG)
-                    fprintf( stderr, "vdWaals type for element%s: No Shielding,inner-wall",
-                            reax->sbp[i].name );
+                    fprintf( stderr, "p%d: vdWaals type for element%s: No Shielding,inner-wall",
+                            system->my_rank, reax->sbp[i].name );
 #endif
                 }
             }
@@ -279,34 +283,34 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
             if ( reax->sbp[i].gamma_w > 0.5 )
             {
                 if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 1 )
-                    fprintf( stderr, "[WARNING] inconsistent vdWaals-parameters\n" \
+                    fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n" \
                             "    [INFO] Force field parameters for element %s\n"        \
                             "    [INFO] indicate  shielding without inner wall, but earlier\n" \
                             "    [INFO] atoms indicate different vdWaals-method.\n"     \
                             "    [INFO] This may cause division-by-zero errors.\n"      \
                             "    [INFO] Keeping vdWaals-setting for earlier atoms.\n",
-                            reax->sbp[i].name );
+                            system->my_rank, reax->sbp[i].name );
                 else
                 {
                     reax->gp.vdw_type = 1;
 #if defined(DEBUG)
-                    fprintf( stderr, "vdWaals type for element%s: Shielding,no inner-wall",
-                            reax->sbp[i].name );
+                    fprintf( stderr, "p%d, vdWaals type for element%s: Shielding,no inner-wall",
+                            system->my_rank, reax->sbp[i].name );
 #endif
                 }
             }
             else
             {
-                fprintf( stderr, "[ERROR] inconsistent vdWaals-parameters\n"\
+                fprintf( stderr, "[ERROR] p%d: inconsistent vdWaals-parameters\n" \
                          "    [INFO] No shielding or inner-wall set for element %s\n",
-                         reax->sbp[i].name );
+                         system->my_rank, reax->sbp[i].name );
                 MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
             }
         }
     }
 
 #if defined(DEBUG)
-    fprintf( stderr, "vdWaals type: %d\n", reax->gp.vdw_type );
+    fprintf( stderr, "p%d: vdWaals type: %d\n", system->my_rank, reax->gp.vdw_type );
 #endif
 
     /* Equate vval3 to valf for first-row elements (25/10/2004) */
@@ -315,8 +319,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
         if ( reax->sbp[i].mass < 21 &&
                 reax->sbp[i].valency_val != reax->sbp[i].valency_boc )
         {
-            fprintf( stderr, "[WARNING] changed valency_val to valency_boc for atom type %s\n",
-                    reax->sbp[i].name );
+            fprintf( stderr, "[WARNING] p%d: changed valency_val to valency_boc for atom type %s\n",
+                    system->my_rank, reax->sbp[i].name );
             reax->sbp[i].valency_val = reax->sbp[i].valency_boc;
         }
     }
@@ -764,7 +768,7 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax,
     sfree( tor_flag, "READ_FFIELD" );
 
 #if defined(DEBUG_FOCUS)
-    fprintf( stderr, "force field read\n" );
+    fprintf( stderr, "p%d: force field read\n", system->my_rank );
 #endif
 
     return SUCCESS;
diff --git a/PG-PuReMD/src/ffield.h b/PG-PuReMD/src/ffield.h
index 5cca5e9d..902e8572 100644
--- a/PG-PuReMD/src/ffield.h
+++ b/PG-PuReMD/src/ffield.h
@@ -29,7 +29,7 @@
 extern "C" {
 #endif
 
-int Read_Force_Field( char*, reax_interaction*, control_params* );
+int Read_Force_Field( char*, reax_interaction*, reax_system*, control_params* );
 
 #ifdef __cplusplus
 }
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 2e406d1a..3d27ecf6 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -105,7 +105,7 @@ void Generate_Initial_Velocities( reax_system *system, real T )
     }
     else
     {
-        Randomize();
+        Randomize( );
 
         for ( i = 0; i < system->n; i++ )
         {
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index 91783d67..3cce1777 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -634,25 +634,19 @@ void Print_Grid( grid* g, FILE *out )
     fprintf( out, "\t---------------------------------\n" );
 
     fprintf( stderr, "GCELL MARKS:\n" );
-    //SUDHIR
-    //gc_type = g->cells[0][0][0].type;
-    gc_type = g->cells[ index_grid_3d (0, 0, 0, g) ].type;
+    gc_type = g->cells[ index_grid_3d(0, 0, 0, g) ].type;
     ivec_MakeZero( gc_str );
 
     x = y = z = 0;
     for ( x = 0; x < g->ncells[0]; ++x )
         for ( y = 0; y < g->ncells[1]; ++y )
             for ( z = 0; z < g->ncells[2]; ++z )
-                //SUDHIR
-                //if( g->cells[x][y][z].type != gc_type ){
                 if ( g->cells[ index_grid_3d(x, y, z, g) ].type != gc_type )
                 {
                     fprintf( stderr,
                              "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
                              gc_str[0], gc_str[1], gc_str[2], x, y, z,
                              gc_type, gcell_type_text[gc_type] );
-                    //SUDHIR
-                    //gc_type = g->cells[x][y][z].type;
                     gc_type = g->cells[ index_grid_3d(x, y, z, g) ].type;
                     gc_str[0] = x;
                     gc_str[1] = y;
@@ -728,8 +722,6 @@ void Print_Native_GCells( reax_system *system )
         for ( j = g->native_str[1]; j < g->native_end[1]; j++ )
             for ( k = g->native_str[2]; k < g->native_end[2]; k++ )
             {
-                //SUDHIR
-                //gc = &( g->cells[i][j][k] );
                 gc = &( g->cells[ index_grid_3d(i, j, k, g) ] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
@@ -770,8 +762,6 @@ void Print_All_GCells( reax_system *system )
         for ( j = 0; j < g->ncells[1]; j++ )
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gc = &( g->cells[i][j][k] );
                 gc = &( g->cells[ index_grid_3d(i, j, k, g) ] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index cc5f9d3d..9ace1c32 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -73,7 +73,7 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file,
         storage *workspace, output_controls *out_control, mpi_datatypes *mpi_data )
 {
     /* ffield file */
-    Read_Force_Field( ffield_file, &(system->reax_param), control );
+    Read_Force_Field( ffield_file, &(system->reax_param), system, control );
 
     /* control file */
     Read_Control_File( control_file, control, out_control );
@@ -240,7 +240,7 @@ int main( int argc, char* argv[] )
     print_device_mem_usage( );
 #endif
 
-    /* init the blocks sizes for cuda kernels */
+    /* init block sizes for CUDA kernels */
     init_blocks( system );
 
     /* measure total simulation time after input is read */
@@ -397,7 +397,7 @@ int main( int argc, char* argv[] )
 
     if ( retries >= MAX_RETRIES )
     {
-        fprintf( stderr, "Maximum retries reached for this step (%d). Terminating...\n",
+        fprintf( stderr, "[ERROR] Maximum retries reached for this step (%d). Terminating...\n",
               retries );
         MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );
     }
diff --git a/PG-PuReMD/src/random.c b/PG-PuReMD/src/random.c
index ffe55458..f9152830 100644
--- a/PG-PuReMD/src/random.c
+++ b/PG-PuReMD/src/random.c
@@ -19,30 +19,31 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-
 #include "random.h"
 
+
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-double Random(double range)
+double Random( double range )
 {
-    return (random() * range) / 2147483647L;
+    return (random( ) * range) / 2147483647L;
 }
 
+
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
-void Randomize()
+void Randomize( )
 {
-    srandom(time(NULL));
+    srandom( time(NULL) );
 }
 
+
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
-double GRandom(double mean, double sigma)
+double GRandom( double mean, double sigma )
 {
     double v1 = Random(2.0) - 1.0;
     double v2 = Random(2.0) - 1.0;
diff --git a/PG-PuReMD/src/random.h b/PG-PuReMD/src/random.h
index 66a5d59d..d26de8f0 100644
--- a/PG-PuReMD/src/random.h
+++ b/PG-PuReMD/src/random.h
@@ -48,4 +48,5 @@ double GRandom( double, double );
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index c0bac402..a6d16f8e 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -198,12 +198,12 @@
 #endif
 
 /**************** RESOURCE CONSTANTS **********************/
-/* 20 MB */
-#define HOST_SCRATCH_SIZE               (1024 * 1024 * 20)
+/* 500 MB */
+#define HOST_SCRATCH_SIZE               (1024 * 1024 * 500)
 #ifdef HAVE_CUDA
-/* 20 MB */
-#define DEVICE_SCRATCH_SIZE             (1024 * 1024 * 20)
-/* 20 MB */
+/* 500 MB */
+#define DEVICE_SCRATCH_SIZE             (1024 * 1024 * 500)
 #define RES_SCRATCH                     0x90
 
 /* BLOCK SIZES for kernels */
@@ -433,21 +433,21 @@ typedef real rvec4[4];
 /* header used in restart file */
 typedef struct
 {
-    /**/
+    /* current simulation time step */
     int step;
-    /**/
+    /* total num. atoms in simulation */
     int bigN;
-    /**/
+    /* thermostat temperature */
     real T;
-    /**/
+    /* thermostat ??? */
     real xi;
-    /**/
+    /* thermostat ??? */
     real v_xi;
-    /**/
+    /* thermostat ??? */
     real v_xi_old;
-    /**/
+    /* thermostat ??? */
     real G_xi;
-    /**/
+    /* ??? */
     rtensor box;
 } restart_header;
 
@@ -1238,6 +1238,8 @@ typedef struct
     /* TRUE if charge matrix requires reallocation, FALSE otherwise (GPU) */
     int *d_realloc_cm_entries;
 
+    /* total num. three body list indices */
+    int total_thbodies_indices;
     /* total num. three body interactions */
     int total_thbodies;
     /* total num. three body interactions (GPU) */
@@ -1802,6 +1804,8 @@ typedef struct
     int num_bonds;
     /* TRUE if three body list needs
      * to be reallocated, FALSE otherwise */
+    int thbody;
+    /**/
     int num_3body;
     /**/
     int gcell_atoms;
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index 331e4430..af27e1cb 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -350,11 +350,11 @@ int Write_Header( reax_system *system, control_params *control,
     {
         out_control->trj_offset = 0;
         Set_My_Trajectory_View( out_control->trj,
-                                out_control->trj_offset, mpi_data->header_line,
-                                mpi_data->world, system->my_rank,
-                                my_hdr_lines, num_hdr_lines );
+                out_control->trj_offset, mpi_data->header_line,
+                mpi_data->world, system->my_rank,
+                my_hdr_lines, num_hdr_lines );
         MPI_File_write_all( out_control->trj, out_control->buffer,
-                            num_hdr_lines, mpi_data->header_line, &status );
+                num_hdr_lines, mpi_data->header_line, &status );
         out_control->trj_offset = (num_hdr_lines) * HEADER_LINE_LEN;
     }
     else
@@ -411,10 +411,10 @@ int Write_Init_Desc( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->init_desc_line, mpi_data->world,
-                                me, system->n, system->bigN );
+                mpi_data->init_desc_line, mpi_data->world,
+                me, system->n, system->bigN );
         MPI_File_write( out_control->trj, out_control->buffer, system->n,
-                        mpi_data->init_desc_line, &status );
+                mpi_data->init_desc_line, &status );
         out_control->trj_offset += system->bigN * INIT_DESC_LEN;
     }
     else
@@ -446,12 +446,12 @@ int Init_Traj( reax_system *system, control_params *control,
         output_controls *out_control, mpi_datatypes *mpi_data, char *msg )
 {
     char fname[MAX_STR];
-    int  atom_line_len[ NR_OPT_ATOM ] = { 0, 0, 0, 0,
-                                          ATOM_BASIC_LEN, ATOM_wV_LEN,
-                                          ATOM_wF_LEN, ATOM_FULL_LEN
-                                        };
-    int  bond_line_len[ NR_OPT_BOND ] = { 0, BOND_BASIC_LEN, BOND_FULL_LEN };
-    int  angle_line_len[ NR_OPT_ANGLE ] = { 0, ANGLE_BASIC_LEN };
+    int atom_line_len[ NR_OPT_ATOM ] = { 0, 0, 0, 0,
+        ATOM_BASIC_LEN, ATOM_wV_LEN,
+        ATOM_wF_LEN, ATOM_FULL_LEN
+    };
+    int bond_line_len[ NR_OPT_BOND ] = { 0, BOND_BASIC_LEN, BOND_FULL_LEN };
+    int angle_line_len[ NR_OPT_ANGLE ] = { 0, ANGLE_BASIC_LEN };
 
     /* generate trajectory name */
     sprintf( fname, "%s.trj", control->sim_name );
@@ -531,11 +531,15 @@ int Init_Traj( reax_system *system, control_params *control,
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: initiated trajectory\n", system->my_rank );
 #endif
+
     Write_Header( system, control, out_control, mpi_data );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: header written\n", system->my_rank );
 #endif
+
     Write_Init_Desc( system, control, out_control, mpi_data );
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: atom descriptions written\n", system->my_rank );
 #endif
@@ -669,11 +673,11 @@ int Write_Frame_Header( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->header_line, mpi_data->world,
-                                me, my_frm_hdr_lines, num_frm_hdr_lines );
+                mpi_data->header_line, mpi_data->world,
+                me, my_frm_hdr_lines, num_frm_hdr_lines );
 
         MPI_File_write_all(out_control->trj, out_control->buffer, my_frm_hdr_lines,
-                           mpi_data->header_line, &status);
+                mpi_data->header_line, &status);
         out_control->trj_offset += (num_frm_hdr_lines) * HEADER_LINE_LEN;
     }
     else
@@ -756,10 +760,10 @@ int Write_Atoms( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->atom_line, mpi_data->world,
-                                me, system->n, system->bigN );
+                mpi_data->atom_line, mpi_data->world,
+                me, system->n, system->bigN );
         MPI_File_write( out_control->trj, out_control->buffer, system->n,
-                        mpi_data->atom_line, &status );
+                mpi_data->atom_line, &status );
         out_control->trj_offset += (system->bigN) * out_control->atom_line_len;
     }
     else
@@ -869,10 +873,10 @@ int Write_Bonds( reax_system *system, control_params *control, reax_list *bonds,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->bond_line, mpi_data->world,
-                                me, my_bonds, num_bonds );
+                mpi_data->bond_line, mpi_data->world,
+                me, my_bonds, num_bonds );
         MPI_File_write( out_control->trj, out_control->buffer, my_bonds,
-                        mpi_data->bond_line, &status );
+                mpi_data->bond_line, &status );
         out_control->trj_offset += num_bonds * line_len;
     }
     else
@@ -990,10 +994,10 @@ int Write_Angles( reax_system *system, control_params *control,
     if ( out_control->traj_method == MPI_TRAJ )
     {
         Set_My_Trajectory_View( out_control->trj, out_control->trj_offset,
-                                mpi_data->angle_line, mpi_data->world,
-                                me, my_angles, num_angles );
+                mpi_data->angle_line, mpi_data->world,
+                me, my_angles, num_angles );
         MPI_File_write( out_control->trj, out_control->buffer, my_angles,
-                        mpi_data->angle_line, &status );
+                mpi_data->angle_line, &status );
         out_control->trj_offset += num_angles * line_len;
     }
     else
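
The re-indented MPI_TRAJ branches all follow the same MPI-IO idiom: set a per-rank file view at the running trajectory offset, write (collectively for shared headers, independently for per-atom records), then advance the offset by the global contribution. A minimal sketch of the collective case with stock MPI-IO calls (Set_My_Trajectory_View wraps a view computation like this; the exact displacements are PuReMD-specific):

    #include <mpi.h>

    /* write `count` fixed-size records per rank into a shared file;
     * disp is this rank's byte displacement past the running offset */
    static void write_records( MPI_File fh, MPI_Offset base,
            MPI_Offset disp, const char *buf, int count,
            MPI_Datatype line_type )
    {
        MPI_Status status;

        /* each rank views the file starting at its own region */
        MPI_File_set_view( fh, base + disp, line_type, line_type,
                "native", MPI_INFO_NULL );

        /* collective write: all ranks participate, even with count 0 */
        MPI_File_write_all( fh, (void *) buf, count, line_type,
                &status );
    }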
diff --git a/PG-PuReMD/src/vector.h b/PG-PuReMD/src/vector.h
index 14250909..1cfd7950 100644
--- a/PG-PuReMD/src/vector.h
+++ b/PG-PuReMD/src/vector.h
@@ -23,13 +23,13 @@
 #define __VECTOR_H_
 
 #include "reax_types.h"
+
 #include "random.h"
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 #if defined(LAMMPS_REAX) || defined(PURE_REAX)
 CUDA_HOST_DEVICE static inline int Vector_isZero( real* v, int k )
 {
@@ -285,11 +285,11 @@ CUDA_HOST_DEVICE static inline void rvec_MakeZero( rvec v )
 
 
 #if defined(PURE_REAX)
-CUDA_HOST_DEVICE static inline void rvec_Random( rvec v )
+static inline void rvec_Random( rvec v )
 {
-    v[0] = Random(2.0) - 1.0;
-    v[1] = Random(2.0) - 1.0;
-    v[2] = Random(2.0) - 1.0;
+    v[0] = Random( 2.0 ) - 1.0;
+    v[1] = Random( 2.0 ) - 1.0;
+    v[2] = Random( 2.0 ) - 1.0;
 }
 #endif
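
Dropping CUDA_HOST_DEVICE from rvec_Random is consistent with its dependence on Random(), which sits on the host C library generator and is not callable from device code; device-side sampling presumably goes through the new cuda_random unit instead. A host-only usage sketch (rvec_random_fill is illustrative, not a PuReMD routine):

    typedef double real;   /* matches PuReMD's real in double builds */
    typedef real rvec[3];

    double Random( double range );   /* host RNG from random.c */

    /* fill n vectors with components uniform in [-1, 1), host side */
    static void rvec_random_fill( rvec *v, int n )
    {
        int i, d;

        for ( i = 0; i < n; ++i )
        {
            for ( d = 0; d < 3; ++d )
            {
                v[i][d] = Random( 2.0 ) - 1.0;
            }
        }
    }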
 
diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c
index 55f2867e..96c83976 100644
--- a/PuReMD/src/basic_comm.c
+++ b/PuReMD/src/basic_comm.c
@@ -299,7 +299,7 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
                  workspace->id_all, workspace->rcounts, workspace->displs,
                  MPI_INT, MASTER_NODE, mpi_data->world );
 
-    free( id_list );
+    sfree( id_list, "id_list" );
 
 #if defined(DEBUG)
     if ( system->my_rank == MASTER_NODE )
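
The remaining PuReMD hunks convert bare free() calls to the tagged sfree() wrapper so deallocation diagnostics can name the buffer involved. A minimal sketch of such a wrapper, assuming the two-argument signature used throughout this patch (the real implementation lives in tool_box.c and may differ in detail):

    #include <stdio.h>
    #include <stdlib.h>

    /* tagged deallocation: `name` identifies the buffer in messages */
    void sfree( void *ptr, const char *name )
    {
        if ( ptr == NULL )
        {
            fprintf( stderr,
                    "WARNING: trying to free NULL pointer (%s)\n",
                    name );
            return;
        }

        free( ptr );
    }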
diff --git a/PuReMD/src/control.c b/PuReMD/src/control.c
index 7ef81836..07b46d3a 100644
--- a/PuReMD/src/control.c
+++ b/PuReMD/src/control.c
@@ -439,9 +439,9 @@ char Read_Control_File( char *control_file, control_params* control,
 
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+        sfree( tmp[i], "tmp[i]" );
+    sfree( tmp, "tmp" );
+    sfree( s, "s" );
 
     // fprintf( stderr,"%d %d %10.5f %d %10.5f %10.5f\n",
     //   control->ensemble, control->nsteps, control->dt,
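
The cleanup above mirrors the allocation pattern at the top of Read_Control_File: one line buffer plus MAX_TOKENS token slots, each released individually. A hedged sketch of that allocate/tokenize/release pairing (buffer sizes are illustrative, and token_buffers_demo is not a PuReMD routine):

    #include <stdlib.h>

    #define MAX_TOKENS 1024   /* illustrative; real value elsewhere */
    #define MAX_LINE 1024     /* illustrative */

    void sfree( void *, const char * );  /* tagged free, as sketched */

    void token_buffers_demo( void )
    {
        char *s, **tmp;
        int i;

        /* allocate the line buffer and per-token storage */
        s = (char *) malloc( sizeof(char) * MAX_LINE );
        tmp = (char **) malloc( sizeof(char *) * MAX_TOKENS );
        for ( i = 0; i < MAX_TOKENS; i++ )
        {
            tmp[i] = (char *) malloc( sizeof(char) * MAX_LINE );
        }

        /* ... read lines and tokenize into tmp[] ... */

        /* release with tags, as in the hunks above */
        for ( i = 0; i < MAX_TOKENS; i++ )
        {
            sfree( tmp[i], "tmp[i]" );
        }
        sfree( tmp, "tmp" );
        sfree( s, "s" );
    }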
diff --git a/PuReMD/src/ffield.c b/PuReMD/src/ffield.c
index 29138f5b..b05216bd 100644
--- a/PuReMD/src/ffield.c
+++ b/PuReMD/src/ffield.c
@@ -766,9 +766,9 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
 
     /* deallocate helper storage */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+        sfree( tmp[i], "tmp[i]" );
+    sfree( tmp, "tmp" );
+    sfree( s, "s" );
 
 
     /* deallocate tor_flag */
@@ -777,12 +777,12 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
         for ( j = 0; j < reax->num_atom_types; j++ )
         {
             for ( k = 0; k < reax->num_atom_types; k++ )
-                free( tor_flag[i][j][k] );
+                sfree( tor_flag[i][j][k], "tor_flag[i][j][k]" );
 
-            free( tor_flag[i][j] );
+            sfree( tor_flag[i][j], "tor_flag[i][j]" );
         }
 
-        free( tor_flag[i] );
+        sfree( tor_flag[i], "tor_flag[i]" );
     }
 
 
diff --git a/PuReMD/src/geo_tools.c b/PuReMD/src/geo_tools.c
index c037840b..dd5115c0 100644
--- a/PuReMD/src/geo_tools.c
+++ b/PuReMD/src/geo_tools.c
@@ -623,8 +623,8 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
     }
     */
 
-    free(buffer);
-    free(line);
+    sfree(buffer, "buffer");
+    sfree(line, "line");
 
     return SUCCESS;
 }
diff --git a/PuReMD/src/grid.c b/PuReMD/src/grid.c
index 0881e00f..0064f220 100644
--- a/PuReMD/src/grid.c
+++ b/PuReMD/src/grid.c
@@ -515,7 +515,7 @@ void Reorder_My_Atoms( reax_system *system, storage *workspace )
     }
 
     /* deallocate old storage */
-    free( system->my_atoms );
+    sfree( system->my_atoms, "system->my_atoms" );
     /* start using clustered storages */
     system->my_atoms = new_atoms;
     system->n = top;
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index 897552ec..b35a4688 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -675,8 +675,8 @@ int  Init_Lists( reax_system *system, control_params *control,
              bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
-    free( hb_top );
-    free( bond_top );
+    sfree( hb_top, "hb_top" );
+    sfree( bond_top, "bond_top" );
 
     return SUCCESS;
 }
@@ -779,8 +779,8 @@ int  Init_Lists( reax_system *system, control_params *control,
              bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
-    free( hb_top );
-    free( bond_top );
+    sfree( hb_top, "hb_top" );
+    sfree( bond_top, "bond_top" );
 
     return SUCCESS;
 }
diff --git a/PuReMD/src/lookup.c b/PuReMD/src/lookup.c
index e821db03..2a38eb23 100644
--- a/PuReMD/src/lookup.c
+++ b/PuReMD/src/lookup.c
@@ -304,12 +304,12 @@ int Init_Lookup_Tables( reax_system *system, control_params *control,
                                           comm );
                 }
 
-    free(h);
-    free(fh);
-    free(fvdw);
-    free(fCEvd);
-    free(fele);
-    free(fCEclmb);
+    sfree(h, "h");
+    sfree(fh, "fh");
+    sfree(fvdw, "fvdw");
+    sfree(fCEvd, "fCEvd");
+    sfree(fele, "fele");
+    sfree(fCEclmb, "fCEclmb");
 
     return 1;
 }
diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c
index 703a434c..6d0955c7 100644
--- a/PuReMD/src/parallelreax.c
+++ b/PuReMD/src/parallelreax.c
@@ -233,13 +233,13 @@ int main( int argc, char* argv[] )
     MPI_Finalize();
 
     /* de-allocate data structures */
-    free( system );
-    free( control );
-    free( data );
-    free( workspace );
-    free( lists );
-    free( out_control );
-    free( mpi_data );
+    sfree( system, "system" );
+    sfree( control, "control" );
+    sfree( data, "data" );
+    sfree( workspace, "workspace" );
+    sfree( lists, "lists" );
+    sfree( out_control, "out_control" );
+    sfree( mpi_data, "mpi_data" );
 
 #if defined(TEST_ENERGY) || defined(TEST_FORCES)
 //  Integrate_Results(control);
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index cba4f83d..ed5c276b 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -355,7 +355,7 @@ void Calculate_Charges( reax_system *system, storage *workspace,
     for ( i = system->n; i < system->N; ++i )
         system->my_atoms[i].q = q[i];
 
-    free(q);
+    sfree(q, "q");
 }
 
 
diff --git a/PuReMD/src/restart.c b/PuReMD/src/restart.c
index 4b9fa853..b687d82a 100644
--- a/PuReMD/src/restart.c
+++ b/PuReMD/src/restart.c
@@ -111,7 +111,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
         fclose( fres );
     }
 
-    free(buffer);
+    sfree(buffer, "buffer");
 }
 
 
@@ -206,8 +206,8 @@ void Write_Restart( reax_system *system, control_params *control,
         fprintf( fres, "%s", buffer );
         fclose( fres );
     }
-    free(buffer);
-    free(line);
+    sfree(buffer, "buffer");
+    sfree(line, "line");
 }
 
 
@@ -467,9 +467,9 @@ void Read_Restart( char *res_file, reax_system *system,
     fclose( fres );
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
-        free( tmp[i] );
-    free( tmp );
-    free( s );
+        sfree( tmp[i], "tmp[i]" );
+    sfree( tmp, "tmp" );
+    sfree( s, "s" );
 
     data->step = data->prev_steps;
     // nsteps is updated based on the number of steps in the previous run
diff --git a/PuReMD/src/tool_box.c b/PuReMD/src/tool_box.c
index 02d9e3d9..a0892870 100644
--- a/PuReMD/src/tool_box.c
+++ b/PuReMD/src/tool_box.c
@@ -48,7 +48,7 @@ int SumScan( int n, int me, int root, MPI_Comm comm )
 
         MPI_Scatter( nbuf, 1, MPI_INT, &my_order, 1, MPI_INT, root, comm );
 
-        free( nbuf );
+        sfree( nbuf, "nbuf" );
     }
     else
     {
diff --git a/PuReMD/src/traj.c b/PuReMD/src/traj.c
index ab321a2e..e455d74b 100644
--- a/PuReMD/src/traj.c
+++ b/PuReMD/src/traj.c
@@ -75,7 +75,7 @@ int Reallocate_Output_Buffer( output_controls *out_control, int req_space,
                               MPI_Comm comm )
 {
     if ( out_control->buffer_len > 0 )
-        free( out_control->buffer );
+        sfree( out_control->buffer, "out_control->buffer" );
 
     out_control->buffer_len = (int)(req_space * SAFE_ZONE);
     out_control->buffer = (char*) malloc(out_control->buffer_len * sizeof(char));
@@ -1122,8 +1122,8 @@ int End_Traj( int my_rank, output_controls *out_control )
         fclose( out_control->strj );
 #endif
 
-    free( out_control->buffer );
-    free( out_control->line );
+    sfree( out_control->buffer, "out_control->buffer" );
+    sfree( out_control->line, "out_control->line" );
 
     return SUCCESS;
 }
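
Reallocate_Output_Buffer, shown above, sizes the replacement buffer as req_space * SAFE_ZONE so that modest growth in frame size does not force a reallocation every step. A minimal sketch of that policy (grow_buffer is illustrative; SAFE_ZONE's actual value is defined in reax_types.h, and the error path here is simplified):

    #include <stdlib.h>

    #define SAFE_ZONE 1.2   /* illustrative over-allocation factor */

    void sfree( void *, const char * );  /* tagged free, as sketched */

    /* grow *buf to hold at least req_space bytes, padded by
     * SAFE_ZONE; returns the new capacity, or 0 on failure */
    static int grow_buffer( char **buf, int cur_len, int req_space )
    {
        int new_len;

        if ( cur_len > 0 )
        {
            sfree( *buf, "out_control->buffer" );
        }

        new_len = (int) (req_space * SAFE_ZONE);
        *buf = (char *) malloc( new_len * sizeof(char) );

        return (*buf == NULL) ? 0 : new_len;
    }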
diff --git a/cuda.am b/cuda.am
index e5e86003..eb61fb31 100644
--- a/cuda.am
+++ b/cuda.am
@@ -7,12 +7,8 @@
 
 AM_V_NVCC = $(AM_V_NVCC_@AM_V@)
 AM_V_NVCC_ = $(AM_V_NVCC_@AM_DEFAULT_V@)
-AM_V_NVCC_0 = @echo "  NVCC" $@;
-
-# these are default values for the maximun register count parameter
-# passed to nvcc compiler (you might need to change it sometimes; all you need
-# is to set it as an environment variable).
-MAX_REG_COUNT ?=48
+AM_V_NVCC_0 = @echo "  NVCC    " $@;
+AM_V_NVCC_1 =
 
 .cu.o:
-	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -maxrregcount=$(MAX_REG_COUNT) -o $@ -c $<
+	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -o $@ -c $<
-- 
GitLab