diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index e85b4eaee10deabf763901c6b67358cde9a8eaf2..c81938c5e34f6a80096c32d42c33db38866dc0e4 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -577,13 +577,8 @@ void Cuda_Reallocate_Bonds_List( reax_list *bonds, size_t n, size_t num_intrs )
 
 void Cuda_Reallocate_Thbodies_List( reax_list *thbodies, size_t n, size_t num_intrs )
 {
-    /* delete three-body list */
     Dev_Delete_List( thbodies );
-//    Delete_List( thbodies );
-
-    /* recreate Three-body list */
     Dev_Make_List( n, num_intrs, TYP_THREE_BODY, thbodies );
-//    Make_List( n, num_intrs, TYP_THREE_BODY, thbodies );
 
 }
 
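Note: here and in the other Cuda_Reallocate_* routines the pattern is destroy-then-rebuild at the new capacity. A minimal sketch of the idiom using the Dev_Delete_List/Dev_Make_List calls visible in this hunk (the capacity guard is a hypothetical addition, not part of this patch):

    /* sketch: rebuild a device list only when the requested capacity grew;
     * hypothetical helper composed from the two calls above */
    static void Dev_Ensure_Capacity( reax_list *list, size_t n,
            size_t num_intrs, int type )
    {
        if ( (size_t) list->num_intrs < num_intrs )
        {
            Dev_Delete_List( list );
            Dev_Make_List( n, num_intrs, type, list );
        }
    }
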
diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
index 1098d197ffdce8c39a51fe4ff5033eacd13beeb4..4912c42a410af43e364ccec4e746f6ce375ef27b 100644
--- a/PG-PuReMD/src/cuda/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -3,6 +3,7 @@
 
 #include "cuda_utils.h"
 
+#include "../list.h"
 #include "../vector.h"
 
 
@@ -133,11 +134,11 @@ void Output_Sync_Lists( reax_list *host, reax_list *device, int type )
     fprintf( stderr, " Trying to copy *%d* list from device to host \n", type );
 #endif
 
-//    if ( host->allocated == TRUE )
-//    {
-//        Delete_List( host );
-//    }
-//    Make_List( device->n, device->num_intrs, type, host );
+    if ( host->allocated == TRUE )
+    {
+        Delete_List( host );
+    }
+    Make_List( device->n, device->num_intrs, type, host );
 
     copy_host_device( host->index, device->index, sizeof(int) * device->n,
             cudaMemcpyDeviceToHost, "Output_Sync_Lists::list->index" );
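Note: with the recreation un-commented, Output_Sync_Lists now rebuilds the host list to the device dimensions before the copies run, so the destination buffers are always large enough. A usage sketch, with list names taken from calls elsewhere in this patch:

    /* sketch: pull the device bonds list back for host-side inspection;
     * afterwards host->index, host->end_index, and the typed interaction
     * array are valid on the host */
    Output_Sync_Lists( *lists + BONDS, *dev_lists + BONDS, TYP_BOND );
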
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 553acbd06d26052379bd6a35817bd51683c4a1b9..71224dd05ced22112cd504fcd0d299bb67762612 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -432,8 +432,8 @@ CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj,
     real val, dif, base;
     LR_lookup_table *t; 
 
-    tmin  = MIN( ti, tj );
-    tmax  = MAX( ti, tj );
+    tmin = MIN( ti, tj );
+    tmax = MAX( ti, tj );
     t = &( t_LR[ index_lr(tmin,tmax, num_atom_types) ] );    
 
     /* cubic spline interpolation */
@@ -452,149 +452,6 @@ CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj,
 }
 
 
-CUDA_GLOBAL void k_estimate_sparse_matrix( reax_atom *my_atoms,
-        control_params *control, reax_list p_far_nbrs, int n,
-        int N, int renbr, int *indices )
-{
-    int i, j, pj;
-    int start_i, end_i;
-    int flag;
-    real cutoff;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-    reax_list *far_nbrs;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= N )
-    {
-        return;
-    }
-
-    far_nbrs = &( p_far_nbrs );
-    atom_i = &(my_atoms[i]);
-    start_i = Dev_Start_Index( i, far_nbrs );
-    end_i = Dev_End_Index( i, far_nbrs );
-    cutoff = control->nonb_cut;
-
-    if ( i < n )
-    {
-        indices[i]++;
-    }
-
-    /* update i-j distance - check if j is within cutoff */
-    for ( pj = start_i; pj < end_i; ++pj )
-    {
-        nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-        j = nbr_pj->nbr;
-        atom_j = &(my_atoms[j]);
-
-        if ( renbr )
-        {
-            if ( nbr_pj->d <= cutoff )
-            {
-                flag = TRUE;
-            }
-            else
-            {
-                flag = FALSE;
-            }
-        }
-        else
-        {
-            nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-            nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-            nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-
-            nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-
-            if ( nbr_pj->d <= SQR(cutoff) )
-            {
-                nbr_pj->d = SQRT(nbr_pj->d);
-                flag = TRUE;
-            }
-            else
-            {
-                flag = FALSE;
-            }
-        }
-
-        if ( flag == TRUE )
-        {
-            /* H matrix entry */
-            if ( i < j && i < n && (j < n || atom_i->orig_id < atom_j->orig_id) )
-            {
-                indices[i]++;
-            }
-            else if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id )
-            {
-                indices[i]++;
-            }
-            else if ( i > j && i < n && (j < n || atom_j->orig_id < atom_i->orig_id) )
-            {
-                indices[i]++;
-            }
-        }
-    }
-}
-
-
-int Cuda_Estimate_Storage_Sparse_Matrix( reax_system *system, control_params *control, 
-        simulation_data *data, reax_list **lists )
-{
-    int blocks, max_sp_entries, total_sp_entries;
-    int *indices = (int *) scratch;
-//    int *h_indices = (int *) host_scratch;
-
-    cuda_memset( indices, 0, sizeof(int) * system->N, "scratch::sp_matrix::indices" );
-
-    blocks = system->N / DEF_BLOCK_SIZE + 
-        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    //TODO
-    k_estimate_sparse_matrix  <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, (control_params *)control->d_control_params, 
-          *(*dev_lists + FAR_NBRS), system->n, system->N,
-          (data->step-data->prev_steps % control->reneighbor) == 0, indices );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    Cuda_Reduction_Max( indices, system->d_max_cm_entries, system->N );
-
-    copy_host_device( &max_sp_entries, system->d_max_cm_entries, sizeof(int),
-            cudaMemcpyDeviceToHost, "d_max_sparse_entries" );
-
-    Cuda_Reduction_Sum( indices, system->d_cm_entries, system->N );
-
-    copy_host_device( &total_sp_entries, system->d_cm_entries, sizeof(int),
-            cudaMemcpyDeviceToHost, "d_cm_sparse_entries" );
-
-//    copy_host_device( h_indices, indices, sizeof(int) * system->N, 
-//            cudaMemcpyDeviceToHost, "sp_matrix:indices" );
-//    max_sp_entries = 0;    
-//    total_sp_entries = 0;    
-//    for (int i = 0; i < system->N; i++)
-//    {
-//        total_sp_entries += h_indices[i];
-//        if (max_sp_entries < h_indices[i])
-//        {
-//            max_sp_entries = h_indices[i];
-//        }
-//    }
-
-#if defined(DBEUG)
-    fprintf( stderr, " TOTAL DEVICE SPARSE ENTRIES: %d \n",
-            total_sp_entries );
-    fprintf( stderr, "p%d: Max sparse entries -> %d \n",
-            system->my_rank, max_sp_entries );
-#endif 
-
-    system->total_cm_entries = max_sp_entries * SAFE_ZONE;
-
-    return SUCCESS;
-}
-
-
 CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters *sbp, 
         control_params *control, reax_list hbonds, int N )
 {
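Note: the deleted estimator launched one thread per atom using the grid-size computation that recurs throughout these files; the surviving Cuda_Estimate_Storages path uses the same idiom. For positive block sizes the rounding-up division can be written more compactly (a sketch, value-equivalent to the expression used in this tree):

    /* sketch: ceiling division for the CUDA grid size; same value as
     * N / DEF_BLOCK_SIZE + ((N % DEF_BLOCK_SIZE == 0) ? 0 : 1) */
    blocks = (system->N + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;
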
@@ -915,7 +772,7 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp
                 //num_bonds += 2;
                 ++btop_i;
 
-                /* Need to do later... since i and j are parallel */
+                /* TODO: defer until later, since i and j are updated in parallel */
 //                if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
 //                {
 //                    workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
@@ -1043,8 +900,6 @@ CUDA_GLOBAL void New_fix_sym_hbond_indices( reax_atom *my_atoms, reax_list hbond
 }
 
 
-////////////////////////
-// HBOND ISSUE
 CUDA_GLOBAL void k_update_bonds( reax_atom *my_atoms, reax_list bonds, int n )
 {
     int i;
@@ -1056,8 +911,6 @@ CUDA_GLOBAL void k_update_bonds( reax_atom *my_atoms, reax_list bonds, int n )
         return;
     }
 
-//    my_atoms[i].num_bonds = 
-//        MAX( Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS );
     my_atoms[i].num_bonds = Dev_Num_Entries( i, &bonds );
 }
 
@@ -1075,13 +928,8 @@ CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n )
     }
 
     Hindex = my_atoms[i].Hindex;
-//    my_atoms[i].num_hbonds = 
-//        MAX( Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS );
     my_atoms[i].num_hbonds = Dev_Num_Entries( Hindex, &hbonds );
 }
-////////////////////////
-////////////////////////
-////////////////////////
 
 
 int Cuda_Validate_Lists( reax_system *system, storage *workspace,
@@ -1089,15 +937,8 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace,
         int step, int numH )
 {
     int blocks;
-    int i, comp, Hindex, ret;
-    int *index, *end_index;
-    int *thbody;
-    reax_list *bonds, *hbonds;
-    reallocate_data *realloc;
-    int max_sp_entries;
-    int total_sp_entries;
+    int ret;
 
-    realloc = &( dev_workspace->realloc );
     blocks = system->n / DEF_BLOCK_SIZE + 
         ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
     ret = SUCCESS;
@@ -1107,9 +948,6 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace,
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    ////////////////////////
-    // HBOND ISSUE
-    //FIX - 4 - Added this check for hydrogen bond issue
     if ( control->hbond_cut > 0.0 && system->numH > 0 )
     {
         k_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
@@ -1118,68 +956,6 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace,
         cudaCheckError( );
     }
 
-    /* validate charge matrix */
-    memset( host_scratch, 0, 2 * system->N * sizeof(int) );
-    index = (int *) host_scratch;
-    end_index = index + system->N;
-    copy_host_device( index, dev_workspace->H.start, system->N * sizeof(int), 
-            cudaMemcpyDeviceToHost, "sparse_matrix:start" );
-    copy_host_device( end_index, dev_workspace->H.end, system->N * sizeof(int), 
-            cudaMemcpyDeviceToHost, "sparse_matrix:end" );
-
-    max_sp_entries = 0;
-    total_sp_entries = 0;
-
-//    for (i = 0; i < system->N; i++ )
-//    {
-//        //if (i < system->N-1)
-//        //    comp = index [i+1];
-//        //else
-//        //    comp = dev_workspace->H.m;
-//
-//        total_sp_entries += end_index[i] - index[i];
-//        if ( end_index[i] - index[i] > system->max_sparse_entries )
-//        {
-//            fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
-//                    step, i, index[i], end_index[i] );
-//            ret = FAILURE;
-//        }
-//        else if ( end_index[i] >= dev_workspace->H.m )
-//        {
-//            //SUDHIR_FIX_SPARSE_MATRIX
-//            //TODO move this carver
-//            fprintf( stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", 
-//                    system->my_rank, step, i, index[i], end_index[i] );
-//            //TODO move this carver
-//            ret = FAILURE;
-//        }
-//        else
-//        {
-//            if ( max_sp_entries <= end_index[i] - index[i] )
-//            {
-//                max_sp_entries = end_index[i] - index[i];
-//            }
-//        }
-//    }
-    //if (max_sp_entries <= end_index[i] - index[i])
-    //    max_sp_entries = end_index[i] - index[i];
-
-    //update the current step max_sp_entries;
-    realloc->Htop = max_sp_entries;
-
-#if defined(DEBUG)
-    fprintf( stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", 
-            system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries );
-#endif
-
-    if ( total_sp_entries >= dev_workspace->H.m )
-    {
-        fprintf( stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", 
-                system->my_rank, step, dev_workspace->H.m, total_sp_entries );
-
-        ret = FAILURE;
-    }
-
     /* three-body list: since a more accurate estimate of the number
      * of three-body interactions requires that bond orders have
      * been computed, validation is deferred until after that computation */
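Note: the removed host loop walked H.start/H.end to find per-row occupancy; the device-side replacement keeps the data on the GPU and reduces instead, as the deleted estimator earlier in this file already did. A sketch of that pattern, reusing the reduction helpers named in this patch:

    /* sketch: per-row charge matrix occupancy, reduced on the device */
    Cuda_Reduction_Max( indices, system->d_max_cm_entries, system->N );
    Cuda_Reduction_Sum( indices, system->d_cm_entries, system->N );
    copy_host_device( &max_sp_entries, system->d_max_cm_entries, sizeof(int),
            cudaMemcpyDeviceToHost, "d_max_cm_entries" );
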
@@ -1207,7 +983,7 @@ CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
 }
 
 
-void Print_Forces( reax_system *system )
+static void Print_Forces( reax_system *system )
 {
     int blocks;
     
@@ -1343,9 +1119,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         cudaThreadSynchronize( );
         cudaCheckError( );
 
-        ///////////////////////
-        ///////////////////////
-        // FIX - 4 - HBOND ISSUE
         if ( control->hbond_cut > 0 && system->numH > 0 )
         {
             /* make hbond_list symmetric */
@@ -1358,7 +1131,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             cudaCheckError( );
         }
 
-//        /* update bond_mark */
+        /* update bond_mark */
 //        k_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
 //        k_bond_mark <<< 1, 1 >>>
 //            ( *(*dev_lists + BONDS), *dev_workspace, system->N );
@@ -1378,7 +1151,7 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         reax_list **lists, output_controls *out_control ) 
 {
-    //TODO: Implement later when you figure out the bond_mark usage.
+    //TODO: implement later once bond_mark usage is worked out
     return FAILURE;
 }
 
@@ -1387,13 +1160,15 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, 
         reax_list **lists, output_controls *out_control )
 {
-    int i, hbs, hnbrs_blocks, ret;
+    int i, hbs, hnbrs_blocks, update_energy, ret;
     int *thbody;
     static int compute_bonded_part1 = FALSE;
     real t_start, t_elapsed;
     real *spad = (real *) scratch;
     rvec *rvec_spad;
 
+    update_energy = (out_control->energy_update_freq > 0
+            && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
     ret = SUCCESS;
 
     if ( compute_bonded_part1 == FALSE )
@@ -1406,7 +1181,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 BLOCKS_N, BLOCK_SIZE );
 #endif
 
-        Cuda_Calculate_BO_init  <<< BLOCKS_N, BLOCK_SIZE >>>
+        Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>>
             ( system->d_my_atoms, system->reax_param.d_sbp, 
               *dev_workspace, system->N );
         cudaThreadSynchronize( );
@@ -1420,14 +1195,14 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaThreadSynchronize( );
         cudaCheckError( );
 
-        Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>>
-            (*dev_workspace, *(*dev_lists + BONDS), system->N);
+        Cuda_Update_Uncorrected_BO <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( *dev_workspace, *(*dev_lists + BONDS), system->N );
         cudaThreadSynchronize( );
         cudaCheckError( );
 
-        Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>>
-            (system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
-             *dev_workspace, system->N);
+        Cuda_Update_Workspace_After_BO <<< BLOCKS_N, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
+             *dev_workspace, system->N );
         cudaThreadSynchronize( );
         cudaCheckError( );
 
@@ -1451,8 +1226,11 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaCheckError( );
 
         /* reduction for E_BE */
-        Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
-                system->n );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
+                    system->n );
+        }
 
         t_elapsed = Get_Timing_Info( t_start );
 
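Note: update_energy applies the same energy_update_freq test that Output_Results uses, so per-term reductions run only on steps whose energies are actually reported. Since the guard now wraps roughly a dozen call sites, a small wrapper would cut the repetition; a hypothetical sketch, not part of this patch:

    /* sketch: reduce an energy term only on reporting steps */
    static void Cuda_Reduction_Sum_Gated( int update_energy, real *src,
            real *dst, int n )
    {
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( src, dst, n );
        }
    }
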
@@ -1466,36 +1244,43 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         t_start = Get_Time( );
         cuda_memset( spad, 0, ( 6 * sizeof(real) * system->n ), "scratch" );
 
-        Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, 
-                system->reax_param.d_sbp, system->reax_param.d_tbp, 
-                *dev_workspace, 
-                *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, 
-                spad, spad + 2 * system->n, spad + 4 * system->n);
+        Cuda_Atom_Energy <<< BLOCKS, BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp,
+              system->reax_param.d_sbp, system->reax_param.d_tbp, *dev_workspace,
+              *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types,
+              spad, spad + 2 * system->n, spad + 4 * system->n );
         cudaThreadSynchronize( );
         cudaCheckError( );
 
-        //CHANGE ORIGINAL
-//        Cuda_Atom_Energy_PostProcess <<<BLOCKS, BLOCK_SIZE >>>
+//        Cuda_Atom_Energy_PostProcess <<< BLOCKS, BLOCK_SIZE >>>
 //            ( *(*dev_lists + BONDS), *dev_workspace, system->n );
-        Cuda_Atom_Energy_PostProcess <<<BLOCKS_N, BLOCK_SIZE >>>
+        Cuda_Atom_Energy_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
             ( *(*dev_lists + BONDS), *dev_workspace, system->N );
-        //CHANGE ORIGINAL
         cudaThreadSynchronize( );
         cudaCheckError( );
 
         /* reduction for E_Lp */
-        Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
-                system->n );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
+                    system->n );
+        }
 
         /* reduction for E_Ov */
-        Cuda_Reduction_Sum( spad + 2 * system->n,
-                &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
-                system->n );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 2 * system->n,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
+                    system->n );
+        }
 
         /* reduction for E_Un */
-        Cuda_Reduction_Sum( spad + 4 * system->n,
-                &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
-                system->n );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 4 * system->n,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_un,
+                    system->n );
+        }
 
         t_elapsed = Get_Timing_Info( t_start );
 
@@ -1534,23 +1319,32 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               (control_params *)control->d_control_params,
               *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
               system->n, system->N, system->reax_param.num_atom_types, 
-              spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N));
+              spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N) );
         cudaThreadSynchronize( );
         cudaCheckError( );
 
         /* reduction for E_Ang */
-        Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
-                system->N );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
+                    system->N );
+        }
 
         /* reduction for E_Pen */
-        Cuda_Reduction_Sum( spad + 2 * system->N,
-                &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
-                system->N );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 2 * system->N,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
+                    system->N );
+        }
 
         /* reduction for E_Coa */
-        Cuda_Reduction_Sum( spad + 4 * system->N,
-                &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
-                system->N );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 4 * system->N,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
+                    system->N );
+        }
 
         /* reduction for ext_pres */
         rvec_spad = (rvec *) (spad + 6 * system->N);
@@ -1596,24 +1390,30 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaCheckError( );
 
         /* reduction for E_Tor */
-        Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
-                system->n );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
+                    system->n );
+        }
 
         /* reduction for E_Con */
-        Cuda_Reduction_Sum( spad + 2 * system->n,
-                &((simulation_data *)data->d_simulation_data)->my_en.e_con,
-                system->n );
+        if ( update_energy == TRUE )
+        {
+            Cuda_Reduction_Sum( spad + 2 * system->n,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_con,
+                    system->n );
+        }
 
         /* reduction for ext_pres */
         rvec_spad = (rvec *) (spad + 4 * system->n);
-        k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
-            (rvec_spad, rvec_spad + system->n,  system->n);
+        k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
+            ( rvec_spad, rvec_spad + system->n, system->n );
         cudaThreadSynchronize( );
         cudaCheckError( );
 
-        k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
+        k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
                 ( rvec_spad + system->n,
-                &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS );
+                  &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS );
         cudaThreadSynchronize( );
         cudaCheckError( );
 //        Cuda_Reduction_Sum( rvec_spad,
@@ -1635,7 +1435,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 #endif
 
         /* 6. Hydrogen Bonds Interactions */
-        // FIX - 4 - Added additional check here
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
         {
             t_start = Get_Time( );
@@ -1658,18 +1457,21 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             cudaCheckError( );
 
             /* reduction for E_HB */
-            Cuda_Reduction_Sum( spad,
-                    &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
-                    system->n );
+            if ( update_energy == TRUE )
+            {
+                Cuda_Reduction_Sum( spad,
+                        &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
+                        system->n );
+            }
 
             /* reduction for ext_pres */
             rvec_spad = (rvec *) (spad + 2 * system->n);
-            k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
+            k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
                 (rvec_spad, rvec_spad + system->n,  system->n);
             cudaThreadSynchronize( );
             cudaCheckError( );
 
-            k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
+            k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
                 (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
             cudaThreadSynchronize( );
             cudaCheckError( );
@@ -1679,8 +1481,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 
             /* post process step1 */
             Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof(rvec) >>>
-                (  system->d_my_atoms, *dev_workspace,
-                   *(*dev_lists + BONDS), system->N );
+                ( system->d_my_atoms, *dev_workspace,
+                  *(*dev_lists + BONDS), system->N );
             cudaThreadSynchronize( );
             cudaCheckError( );
 
@@ -1788,39 +1590,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
     if ( charge_flag == TRUE )
     {
         retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
-
-//        int i;
-//        static reax_list **temp_lists;
-//       
-//        if ( data->step == 0 )
-//        {
-//            temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" );
-//            for ( i = 0; i < LIST_N; ++i )
-//            {
-//                temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-//                temp_lists[i]->allocated = FALSE;
-//            }
-//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
-//                    TYP_BOND, *temp_lists + BONDS );
-//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
-//                    TYP_HBOND, *temp_lists + HBONDS );
-//        }
-//        else
-//        {
-//            Delete_List( *temp_lists + BONDS );
-//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
-//                    TYP_BOND, *temp_lists + BONDS );
-//            Delete_List( *temp_lists + HBONDS );
-//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
-//                    TYP_HBOND, *temp_lists + HBONDS );
-//
-//        }
-//        Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND );
-//        Print_Bonds( system, temp_lists, control );
-//        Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND );
-//        Print_HBonds( system, temp_lists, control, data->step );
-//        Print_HBond_Indices( system, temp_lists, control, data->step );
-//        exit( 0 );
     }
     else
     {
@@ -1870,7 +1639,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         //MPI_Barrier( MPI_COMM_WORLD );
         if ( system->my_rank == MASTER_NODE )
         {
-            Update_Timing_Info( &t_start, &(data->timing.qEq) );
+            Update_Timing_Info( &t_start, &(data->timing.cm) );
         }
 #endif
 
@@ -1914,8 +1683,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         MPI_Barrier( MPI_COMM_WORLD );
 
 #endif
-
-//        Print_Forces( system );
     }
 
     return retVal;
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index 2bb2f89e7b76315462927b659c00efe8102b56c0..3162e7326b03fa9165772300bb4d926c900016f4 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -252,14 +252,10 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
 
     Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
 
-    /* estimate storage for bonds and hbonds */
+    /* estimate storage for bonds, hbonds, and the charge matrix */
     Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
 
-    /* estimate storage for charge sparse matrix */
-//    Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists );
-
     dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
-
     Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
 
     //MATRIX CHANGES
@@ -277,7 +273,6 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
     if ( control->hbond_cut > 0.0 &&  system->numH > 0 )
     {
         Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
-//        Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS );
 
         Cuda_Init_HBond_Indices( system );
 
@@ -290,7 +285,6 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
 
     /* bonds list */
     Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
-//    Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS );
 
     Cuda_Init_Bond_Indices( system );
 
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 25c0b17df74448775e5e37a61db25c151169e056..473ef77e6c1259eed6e7fd204fc79dcb36784b6d 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -30,13 +30,12 @@
 #include "../vector.h"
 
 
-//CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy(    
-CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, 
+//CUDA_GLOBAL void __launch_bounds__ (960) k_vdW_coulomb_energy(    
+CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, 
         two_body_parameters *tbp, global_parameters gp, control_params *control, 
         storage p_workspace, reax_list p_far_nbrs, int n, int N, int num_atom_types, 
         real *data_e_vdW, real *data_e_ele, rvec *data_ext_press )
 {
-
 #if defined(__SM_35__)
     real sh_vdw;
     real sh_ele;
@@ -44,13 +43,11 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
 #else
     extern __shared__ real _vdw[];
     extern __shared__ real _ele[];
-    extern __shared__ rvec _force [];
-
+    extern __shared__ rvec _force[];
     real *sh_vdw;
     real *sh_ele;
     rvec *sh_force;
 #endif
-
     int i, j, pj, natoms;
     int start_i, end_i, orig_i, orig_j;
     real p_vdW1, p_vdW1i;
@@ -64,12 +61,13 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
     far_neighbor_data *nbr_pj;
     reax_list *far_nbrs;
     storage *workspace = &( p_workspace );
-    // rtensor temp_rtensor, total_rtensor;
-
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warpid = thread_id / VDW_KER_THREADS_PER_ATOM;
-    int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); 
+    int thread_id;
+    int warpid;
+    int laneid; 
 
+    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    warpid = thread_id / VDW_KER_THREADS_PER_ATOM;
+    laneid = thread_id & (VDW_KER_THREADS_PER_ATOM - 1);
 #if defined(__SM_35__)
     sh_vdw = 0.0;
     sh_ele = 0.0;
@@ -83,12 +81,11 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
     sh_ele[threadIdx.x] = 0.0;
     rvec_MakeZero ( sh_force [threadIdx.x] );
 #endif
-
     //i = blockIdx.x * blockDim.x + threadIdx.x;
     //if (i >= N) return;
     i = warpid;
 
-    if (i < N)
+    if ( i < N )
     {
         natoms = n;
         far_nbrs = &( p_far_nbrs );
@@ -97,18 +94,18 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
         e_core = 0;
         e_vdW = 0;
 
-        data_e_vdW [i] = 0;
-        data_e_ele [i] = 0;
+        data_e_vdW[i] = 0;
+        data_e_ele[i] = 0;
 
         //for( i = 0; i < natoms; ++i ) {
         start_i = Dev_Start_Index(i, far_nbrs);
-        end_i   = Dev_End_Index(i, far_nbrs);
-        orig_i  = my_atoms[i].orig_id;
+        end_i = Dev_End_Index(i, far_nbrs);
+        orig_i = my_atoms[i].orig_id;
         //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i );
 
         //for( pj = start_i; pj < end_i; ++pj )
         pj = start_i + laneid;
-        while (pj < end_i)
+        while ( pj < end_i )
         {
 
             nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
@@ -133,18 +130,18 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
                 Tap = Tap * r_ij + workspace->Tap[1];
                 Tap = Tap * r_ij + workspace->Tap[0];
 
-                dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6];
+                dTap = 7 * workspace->Tap[7] * r_ij + 6 * workspace->Tap[6];
                 dTap = dTap * r_ij + 5*workspace->Tap[5];
                 dTap = dTap * r_ij + 4*workspace->Tap[4];
                 dTap = dTap * r_ij + 3*workspace->Tap[3];
                 dTap = dTap * r_ij + 2*workspace->Tap[2];
-                dTap += workspace->Tap[1]/r_ij;
+                dTap += workspace->Tap[1] / r_ij;
 
-                /*vdWaals Calculations*/
-                if(gp.vdw_type==1 || gp.vdw_type==3)
-                { // shielding
-                    powr_vdW1 = POW(r_ij, p_vdW1);
-                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+                /* van der Waals calculations, with shielding */
+                if ( gp.vdw_type == 1 || gp.vdw_type == 3 )
+                {
+                    powr_vdW1 = POW( r_ij, p_vdW1 );
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1 );
 
                     fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
                     exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
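Note: Tap and dTap above are unrolled Horner evaluations of the degree-7 taper polynomial and its derivative, one multiply and one add per coefficient. The generic form of the scheme, as a sketch:

    /* sketch: Horner evaluation of p(x) = c[n]*x^n + ... + c[1]*x + c[0] */
    static CUDA_DEVICE real horner( const real * const c, int n, real x )
    {
        real p;
        int k;

        p = c[n];
        for ( k = n - 1; k >= 0; --k )
        {
            p = p * x + c[k];
        }

        return p;
    }
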
@@ -152,44 +149,47 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
 
                     e_vdW = twbp->D * (exp1 - 2.0 * exp2);      
 
-                    //data_e_vdW [i] += Tap * e_vdW;
-                    //     data_e_vdW [i] += Tap * e_vdW / 2.0;
+                    //data_e_vdW[i] += Tap * e_vdW;
+                    //data_e_vdW[i] += Tap * e_vdW / 2.0;
 #if defined(__SM_35__)
-                    sh_vdw  += Tap * e_vdW / 2.0;
+                    sh_vdw += Tap * e_vdW / 2.0;
 #else
-                    sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
+                    sh_vdw[threadIdx.x] += Tap * e_vdW / 2.0;
 #endif
 
-                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-                        POW(r_ij, p_vdW1 - 2.0);
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0 ) *
+                        POW( r_ij, p_vdW1 - 2.0 );
 
                     CEvd = dTap * e_vdW - 
                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
                 }
-                else{ // no shielding
+                /* no shielding */
+                else
+                {
                     exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
                     exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
                     e_vdW = twbp->D * (exp1 - 2.0 * exp2);
 
-                    //data_e_vdW [i] += Tap * e_vdW;
-                    //data_e_vdW [i] += Tap * e_vdW / 2.0;
+                    //data_e_vdW[i] += Tap * e_vdW;
+                    //data_e_vdW[i] += Tap * e_vdW / 2.0;
 #if defined(__SM_35__)
                     sh_vdw += Tap * e_vdW / 2.0;
 #else
-                    sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
+                    sh_vdw[threadIdx.x] += Tap * e_vdW / 2.0;
 #endif
 
                     CEvd = dTap * e_vdW - 
                         Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
                 }
 
-                if(gp.vdw_type==2 || gp.vdw_type==3)
-                { // innner wall
+                /* inner wall */
+                if ( gp.vdw_type == 2 || gp.vdw_type == 3 )
+                {
                     e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
 
-                    //data_e_vdW [i] += Tap * e_core;
-                    //data_e_vdW [i] += Tap * e_core / 2.0;
+                    //data_e_vdW[i] += Tap * e_core;
+                    //data_e_vdW[i] += Tap * e_core / 2.0;
 #if defined(__SM_35__)
                     sh_vdw += Tap * e_core / 2.0;
 #else
@@ -201,46 +201,53 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
                 }
 
                 /*Coulomb Calculations*/
-                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
+                dr3gamij_1 = r_ij * r_ij * r_ij + twbp->gamma;
+                dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 );
 
                 tmp = Tap / dr3gamij_3;
-                //data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
+                //data_e_ele[i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
                 e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
-                //data_e_ele [i] += e_ele;
-                //data_e_ele [i] += e_ele  / 2.0;
+                //data_e_ele[i] += e_ele;
+                //data_e_ele[i] += e_ele  / 2.0;
 #if defined(__SM_35__)
                 sh_ele += e_ele  / 2.0;
 #else
-                sh_ele [ threadIdx.x ] += e_ele  / 2.0;
+                sh_ele[ threadIdx.x ] += e_ele / 2.0;
 #endif
 
-
                 CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * 
                     ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+
                 // fprintf( fout, "%5d %5d %10.6f %10.6f\n",
                 //   MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ),
                 //   MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), 
                 //   CEvd, CEclmb );                  
 
-                if( control->virial == 0 ) {
+                if ( control->virial == 0 )
+                {
                     if ( i < j ) 
+                    {
                         //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
 #if defined (__SM_35__)
                         rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec );
 #else
                         rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec );
 #endif
+                    }
                     else 
+                    {
                         //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
 #if defined (__SM_35__)
                         rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec );
 #else
-                        rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force[ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec );
 #endif
                         //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+                    }
                 }
-                else { /* NPT, iNPT or sNPT */
+                /* NPT, iNPT or sNPT */
+                else
+                {
                     /* for pressure coupling, terms not related to bond order 
                        derivatives are added directly into pressure vector/tensor */
                     rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
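Note: in the virial == 0 branch each thread accumulates only into its own f[i]; the commented-out f[j] updates show the original serial form, which would race between threads on the GPU. The i < j test selects the sign for each side's visit of the pair. Restated compactly without the #if branches (a sketch):

    /* sketch: owner-writes-only force update; the serial version also
     * updated f[j], which is unsafe with one thread group per atom */
    if ( i < j )
    {
        rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
    }
    else
    {
        rvec_ScaledAdd( workspace->f[i], CEvd + CEclmb, nbr_pj->dvec );
    }
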
@@ -274,6 +281,7 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
                         r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
                         e_ele, data->my_en.e_ele );
 #endif
+
 #ifdef TEST_FORCES
                 rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
                 rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
@@ -289,7 +297,8 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
     } // if i < N
 
 #if defined( __SM_35__)
-    for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){
+    for ( int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x /= 2 )
+    {
         sh_vdw += shfl( sh_vdw, x);
         sh_ele += shfl( sh_ele, x );
         sh_force[0] += shfl( sh_force[0], x );
@@ -297,57 +306,64 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms,
         sh_force[2] += shfl( sh_force[2], x );
     }
 
-    if (laneid == 0) {
+    if ( laneid == 0 )
+    {
         data_e_vdW[i] += sh_vdw;
         data_e_ele[i] += sh_ele;
-        rvec_Add (workspace->f[i], sh_force );
+        rvec_Add( workspace->f[i], sh_force );
     }
 
 #else
 
-    __syncthreads ();
+    __syncthreads( );
 
-    if (laneid < 16) {
+    if ( laneid < 16 )
+    {
         sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
         sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 16] );
     }
-    __syncthreads ();
-    if (laneid < 8) {
+    __syncthreads( );
+    if ( laneid < 8 )
+    {
         sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
         sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 8] );
     }
-    __syncthreads ();
-    if (laneid < 4) {
+    __syncthreads( );
+    if ( laneid < 4 )
+    {
         sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
         sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 4] );
     }
-    __syncthreads ();
-    if (laneid < 2) {
+    __syncthreads( );
+    if ( laneid < 2 )
+    {
         sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
         sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 2] );
     }
-    __syncthreads ();
-    if (laneid < 1) {
+    __syncthreads( );
+    if ( laneid < 1 )
+    {
         sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
         sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
+        rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 1] );
     }
-    __syncthreads ();
-    if (laneid == 0) {
+    __syncthreads( );
+    if ( laneid == 0 )
+    {
         data_e_vdW[i] += sh_vdw[threadIdx.x];
         data_e_ele[i] += sh_ele[threadIdx.x];
-        rvec_Add (workspace->f[i], sh_force [ threadIdx.x ]);
+        rvec_Add( workspace->f[i], sh_force[ threadIdx.x ] );
     }
 #endif
 
 }
 
 
-CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, 
         global_parameters gp, control_params *control, 
         storage p_workspace, reax_list p_far_nbrs, 
         LR_lookup_table *t_LR, int n, int N, int num_atom_types, 
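Note: on the __SM_35__ path the per-lane partial sums are combined with warp shuffles, while the fallback path performs the same tree reduction through shared memory with __syncthreads() between halving steps. On CUDA 9 and later the shuffle loop would use the _sync variants; a sketch under that assumption (this tree uses an older shfl() wrapper instead):

    /* sketch: warp-level sum using CUDA 9+ primitives */
    for ( int offset = VDW_KER_THREADS_PER_ATOM >> 1; offset > 0; offset >>= 1 )
    {
        sh_vdw += __shfl_down_sync( 0xFFFFFFFF, sh_vdw, offset );
        sh_ele += __shfl_down_sync( 0xFFFFFFFF, sh_ele, offset );
    }
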
@@ -364,27 +380,30 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms,
     far_neighbor_data *nbr_pj;
     reax_list *far_nbrs;
     LR_lookup_table *t;
+    storage *workspace;
 
-    storage *workspace = &( p_workspace );
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
+    if ( i >= N )
+    {
+        return;
+    }
+
+    workspace = &( p_workspace );
     natoms = n;
     far_nbrs = &( p_far_nbrs );
     steps = step - prev_steps;
     update_freq = energy_update_freq;
     update_energies = update_freq > 0 && steps % update_freq == 0;
     e_ele = e_vdW = 0;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
-    data_e_vdW [i] = 0;
-    data_e_ele [i] = 0;
+    data_e_vdW[i] = 0;
+    data_e_ele[i] = 0;
 
     //for( i = 0; i < natoms; ++i ) {
-    type_i  = my_atoms[i].type;
+    type_i = my_atoms[i].type;
     start_i = Dev_Start_Index(i,far_nbrs);
-    end_i   = Dev_End_Index(i,far_nbrs);
-    orig_i  = my_atoms[i].orig_id;
+    end_i = Dev_End_Index(i,far_nbrs);
+    orig_i = my_atoms[i].orig_id;
 
     for( pj = start_i; pj < end_i; ++pj )
     {
@@ -404,18 +423,22 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms,
             tmin  = MIN( type_i, type_j );
             tmax  = MAX( type_i, type_j );
 
-            t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]);    
+            t = &( t_LR[ index_lr(tmin, tmax, num_atom_types) ] );
 
-            // table = &( LR[type_i][type_j] ); 
+            //table = &( LR[type_i][type_j] ); 
 
             /* Cubic Spline Interpolation */
             r = (int)(r_ij * t->inv_dx);
-            if( r == 0 )  ++r;
+            if ( r == 0 )
+            {
+                ++r;
+            }
             base = (real)(r+1) * t->dx;
             dif = r_ij - base;
             //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
 
-            if( update_energies ) {
+            if ( update_energies )
+            {
                 e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
                     t->vdW[r].a;
 
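Note: the lookup picks spline segment r from r_ij * t->inv_dx and evaluates the cubic at the local offset dif by Horner's rule, exactly as Compute_tabH does earlier in this patch. The per-segment evaluation, spelled out (a sketch; a..d are the stored spline coefficients):

    /* sketch: y(dif) = ((d*dif + c)*dif + b)*dif + a
     *                = a + b*dif + c*dif^2 + d*dif^3 for one segment */
    e_vdW = ((t->vdW[r].d * dif + t->vdW[r].c) * dif + t->vdW[r].b) * dif
        + t->vdW[r].a;
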
@@ -423,28 +446,35 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms,
                     t->ele[r].a;
                 e_ele *= my_atoms[i].q * my_atoms[j].q;
 
-                //data_e_vdW [i] += e_vdW;
-                data_e_vdW [i] += e_vdW / 2.0;
-                //data_e_ele [i] += e_ele;
-                data_e_ele [i] += e_ele / 2.0;
+                //data_e_vdW[i] += e_vdW;
+                data_e_vdW[i] += e_vdW / 2.0;
+                //data_e_ele[i] += e_ele;
+                data_e_ele[i] += e_ele / 2.0;
             }    
 
-            CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
+            CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif + 
                 t->CEvd[r].a;
 
-            CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
+            CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif + 
                 t->CEclmb[r].a;
             CEclmb *= my_atoms[i].q * my_atoms[j].q;
 
-            if( control->virial == 0 ) {
+            if ( control->virial == 0 )
+            {
                 if ( i < j ) 
+                {
                     rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+                }
                 else 
+                {
                     rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
+                }
                 //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
                 //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
             }
-            else { // NPT, iNPT or sNPT
+            /* NPT, iNPT or sNPT */
+            else
+            {
                 /* for pressure coupling, terms not related to bond order derivatives
                    are added directly into pressure vector/tensor */
                 rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
@@ -467,6 +497,7 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms,
                     r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
                     e_ele, data->my_en.e_ele );
 #endif
+
 #ifdef TEST_FORCES
             rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
             rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
@@ -479,25 +510,25 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms,
 }
 
 
-CUDA_GLOBAL void ker_pol_energy( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_pol_energy( reax_atom *my_atoms, 
         single_body_parameters *sbp, int n, real *data_e_pol )
 {
-    int type_i;
+    int i, type_i;
     real q;
 
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= n) return;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    data_e_pol [i] = 0;
+    if ( i >= n )
+    {
+        return;
+    }
 
-    //for( i = 0; i < system->n; i++ ) {
     q = my_atoms[i].q;
     type_i = my_atoms[i].type;
 
-    data_e_pol[i] += 
+    data_e_pol[i] = 
         KCALpMOL_to_EV * (sbp[type_i].chi * q + 
                 (sbp[type_i].eta / 2.) * SQR(q));
-    //}
 }
 
 
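Note: k_pol_energy writes each atom's polarization self-energy chi_i * q_i + (eta_i / 2) * q_i^2, converted by KCALpMOL_to_EV; the switch from += to = is safe because each thread owns exactly one output slot, which also makes the preceding memset of this region redundant for indices below n. A serial reference loop for checking the kernel, as a sketch:

    /* sketch: host-side reference for the per-atom polarization energy */
    real e_pol_total = 0.0, q;
    int i, type_i;

    for ( i = 0; i < n; ++i )
    {
        q = my_atoms[i].q;
        type_i = my_atoms[i].type;
        e_pol_total += KCALpMOL_to_EV
            * (sbp[type_i].chi * q + (sbp[type_i].eta / 2.0) * SQR(q));
    }
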
@@ -505,17 +536,18 @@ void Cuda_Compute_Polarization_Energy( reax_system *system, simulation_data *dat
 {
     int blocks;
     real *spad = (real *) scratch;
+
     cuda_memset( spad, 0, sizeof(real) * 2 * system->n, "pol_energy" );
 
     blocks = system->n / DEF_BLOCK_SIZE + 
         ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>>
+
+    k_pol_energy <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           system->n, spad );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    /* reduction for polarization energy */
     Cuda_Reduction_Sum( spad,
             &((simulation_data *)data->d_simulation_data)->my_en.e_pol,
             system->n );
@@ -526,22 +558,23 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
         storage *workspace, simulation_data *data,  reax_list **lists,
         output_controls *out_control, bool isTabulated )
 {
-    int blocks;
-    int rblocks;
+    int blocks, rblocks, update_energy;
     int size = (2 * system->N + 2 * system->N ) * sizeof(real) +
         2 * system->N * sizeof(rvec);
-
     rvec *spad_rvec;
     real *spad = (real *) scratch;
-    cuda_memset (spad, 0, size, "pol_energy");
 
+    update_energy = (out_control->energy_update_freq > 0
+            && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
     rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
     blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) 
         + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    if (!isTabulated)
+    cuda_memset( spad, 0, size, "pol_energy" );
+
+    if ( !isTabulated )
     {
-        ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>>
+        k_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
               system->reax_param.d_gp, (control_params *)control->d_control_params, 
               *(dev_workspace), *(*dev_lists + FAR_NBRS), 
@@ -552,7 +585,7 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
     }
     else
     {
-        ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>>
+        k_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>>
             ( system->d_my_atoms, system->reax_param.d_gp, 
               (control_params *)control->d_control_params, 
               *(dev_workspace), *(*dev_lists + FAR_NBRS), 
@@ -567,14 +600,20 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
     }
 
     /* reduction for vdw */
-    Cuda_Reduction_Sum( spad,
-            &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
-            system->N );
+    if ( update_energy == TRUE )
+    {
+        Cuda_Reduction_Sum( spad,
+                &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+                system->N );
+    }
 
-    /* reduction for  ele */
-    Cuda_Reduction_Sum( spad + 2 * system->N,
-            &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
-            system->N );
+    /* reduction for ele */
+    if ( update_energy == TRUE )
+    {
+        Cuda_Reduction_Sum( spad + 2 * system->N,
+                &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+                system->N );
+    }
 
     /* reduction for ext_press */
     spad_rvec = (rvec *) (spad + 4 * system->N);
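Note: the scratch region is partitioned as two length-2N real arrays (vdW energies at offset 0, Coulomb at 2N) followed by 2N rvecs for the pressure reduction, which is why the rvec view starts at spad + 4 * system->N. The offsets, spelled out as a sketch:

    /* sketch: layout of the nonbonded scratch buffer sized above */
    real *e_vdw_part = spad;                            /* reals [0, 2N)  */
    real *e_ele_part = spad + 2 * system->N;            /* reals [2N, 4N) */
    rvec *press_part = (rvec *) (spad + 4 * system->N); /* 2N rvecs       */
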
@@ -588,5 +627,8 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    Cuda_Compute_Polarization_Energy( system, data );
+    if ( update_energy == TRUE )
+    {
+        Cuda_Compute_Polarization_Energy( system, data );
+    }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
index bfc4256d1408e3a2947714d3adde62899b144f55..f4e49a10b65429f0910f5e374a2f15790ff0c89b 100644
--- a/PG-PuReMD/src/cuda/cuda_utils.h
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -43,7 +43,7 @@ static inline void __cudaCheckError( const char *file, const int line )
         exit( RUNTIME_ERROR );
     }
 
-#if defined(DEBUG)
+#if defined(DEBUG_FOCUS)
     /* More careful checking. However, this will affect performance. */
     err = cudaDeviceSynchronize( );
     if( cudaSuccess != err )
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index 19133fcee9aaeeb9dcca9e736a1859920a3acd19..f210c3b4ba45ec46f6bdbc5d4a2f23e24629a02d 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -1751,7 +1751,7 @@ int Compute_Forces( reax_system *system, control_params *control,
         //MPI_Barrier( MPI_COMM_WORLD );
         if ( system->my_rank == MASTER_NODE )
         {
-            Update_Timing_Info( &t_start, &(data->timing.qEq) );
+            Update_Timing_Info( &t_start, &(data->timing.cm) );
         }
 #endif
 #if defined(DEBUG_FOCUS)
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index 131f8a2e955fefd4fec111f5b5e6c1b1ad4f2de7..91783d67ceb452962c64576ea2b424a1445ba539 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -117,9 +117,9 @@ int Init_Output_Files( reax_system *system, control_params *control,
             sprintf( temp, "%s.log", control->sim_name );
             if ( (out_control->log = fopen( temp, "w" )) != NULL )
             {
-                fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s\n",
+                fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n",
                          "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
-                         "qeq", "matvecs" );
+                         "charges", "l iters", "retries" );
                 fflush( out_control->log );
             }
             else
@@ -1156,9 +1156,9 @@ void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname )
 
 
 void Print_Total_Force( reax_system *system, simulation_data *data,
-                        storage *workspace )
+        storage *workspace )
 {
-    int    i;
+    int i;
 
     fprintf( stderr, "step: %d\n", data->step );
     fprintf( stderr, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]");
@@ -1179,10 +1179,10 @@ void Output_Results( reax_system *system, control_params *control,
     real t_elapsed, denom;
 #endif
 
-    if ((out_control->energy_update_freq > 0 &&
+    if ( (out_control->energy_update_freq > 0 &&
             data->step % out_control->energy_update_freq == 0) ||
             (out_control->write_steps > 0 &&
-             data->step % out_control->write_steps == 0))
+             data->step % out_control->write_steps == 0) )
     {
         /* update system-wide energies */
         Compute_System_Energy( system, data, MPI_COMM_WORLD );
@@ -1192,7 +1192,7 @@ void Output_Results( reax_system *system, control_params *control,
                 out_control->energy_update_freq > 0 &&
                 data->step % out_control->energy_update_freq == 0 )
         {
-#if defined(DEBUG) && defined(DEBUG_FOCUS)
+#if !defined(DEBUG) && !defined(DEBUG_FOCUS)
             fprintf( out_control->out,
                      "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n",
                      data->step, data->sys_en.e_tot, data->sys_en.e_pot,
@@ -1229,19 +1229,21 @@ void Output_Results( reax_system *system, control_params *control,
 #if defined(LOG_PERFORMANCE)
             t_elapsed = Get_Timing_Info( data->timing.total );
             if ( data->step - data->prev_steps > 0 )
+            {
                 denom = 1.0 / out_control->energy_update_freq;
-            else denom = 1;
+            }
+            else
+            {
+                denom = 1;
+            }
 
-            fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%6d\n",
-                     data->step,
-                     t_elapsed * denom,
-                     data->timing.comm * denom,
-                     data->timing.nbrs * denom,
-                     data->timing.init_forces * denom,
-                     data->timing.bonded * denom,
-                     data->timing.nonb * denom,
-                     data->timing.qEq * denom,
-                     (int)((data->timing.s_matvecs + data->timing.t_matvecs)*denom) );
+            fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n",
+                    data->step, t_elapsed * denom, data->timing.comm * denom,
+                    data->timing.nbrs * denom, data->timing.init_forces * denom,
+                    data->timing.bonded * denom, data->timing.nonb * denom,
+                    data->timing.cm * denom,
+                    (int)((data->timing.s_matvecs + data->timing.t_matvecs) * denom),
+                    data->timing.num_retries );
 
             Reset_Timing( &(data->timing) );
             fflush( out_control->log );
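Note: the header and data format strings must stay paired field-for-field: seven timing columns printed with %8.3f under %8s labels, then the linear solver iteration count (s_matvecs + t_matvecs) and the retry count as %8d. A sketch of one log row, with hypothetical timings:

    /* hypothetical output matching the formats above:
     *   step   total    comm    nbrs    init  bonded    nonb charges l iters retries
     *    100   1.234   0.012   0.210   0.150   0.480   0.240   0.096      25       0
     */
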
diff --git a/PG-PuReMD/src/list.h b/PG-PuReMD/src/list.h
index df6ec82f24b9002f51e0b93a6461a0d7ca14bbeb..a6a82d204a9c6409553cbd8a4e4ab3a7992959a0 100644
--- a/PG-PuReMD/src/list.h
+++ b/PG-PuReMD/src/list.h
@@ -25,7 +25,7 @@
 #include "reax_types.h"
 
 
-#ifdef _cplusplus
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -35,7 +35,7 @@ void Make_List( int, int, int, reax_list* );
 
 void Delete_List( reax_list* );
 
-#ifdef _cplusplus
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 30c2372227d3ce6d7fe3e81e0e4789f1fe61537a..304bb1f6e1825d0953c68014c7125427564b8a9a 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -109,7 +109,7 @@ void Post_Evolve( reax_system* system, control_params* control,
     int i;
     rvec diff, cross;
 
-    /* remove trans & rot velocity of the center of mass from system */
+    /* remove translational and rotational velocity of the center of mass from the system */
     if ( control->ensemble != NVE && control->remove_CoM_vel &&
             data->step % control->remove_CoM_vel == 0 )
     {
@@ -118,17 +118,17 @@ void Post_Evolve( reax_system* system, control_params* control,
 
         for ( i = 0; i < system->n; i++ )
         {
-            /* remove translational vel */
+            /* remove translational term */
             rvec_ScaledAdd( system->my_atoms[i].v, -1., data->vcm );
 
-            /* remove rotational */
+            /* remove rotational term */
             rvec_ScaledSum( diff, 1., system->my_atoms[i].x, -1., data->xcm );
             rvec_Cross( cross, data->avcm, diff );
             rvec_ScaledAdd( system->my_atoms[i].v, -1., cross );
         }
     }
 
-    /* compute kinetic energy of the system */
+    /* compute the kinetic energy of the system */
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 }
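
In vector form, the loop above computes v_i <- v_i - v_cm - w_cm x (x_i - x_cm), i.e. it strips the rigid-body (translational plus rotational) component from each atom's velocity. A standalone sketch of the same update, with plain double[3] vectors standing in for rvec and a hypothetical cross() helper:

/* c = a x b */
static void cross( const double a[3], const double b[3], double c[3] )
{
    c[0] = a[1] * b[2] - a[2] * b[1];
    c[1] = a[2] * b[0] - a[0] * b[2];
    c[2] = a[0] * b[1] - a[1] * b[0];
}

/* v <- v - vcm - avcm x (x - xcm) */
static void remove_rigid_motion( double v[3], const double x[3],
        const double vcm[3], const double xcm[3], const double avcm[3] )
{
    double diff[3], rot[3];
    int d;

    for ( d = 0; d < 3; ++d )
    {
        diff[d] = x[d] - xcm[d];
    }

    cross( avcm, diff, rot );

    for ( d = 0; d < 3; ++d )
    {
        v[d] -= vcm[d] + rot[d];
    }
}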
 
@@ -183,27 +183,6 @@ int main( int argc, char* argv[] )
 
 #ifdef HAVE_CUDA
 
-    /* Remove this debug information later */
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf( stderr, " Size of LR Lookup table %d \n", sizeof(LR_lookup_table) );
-#endif
-
-#if defined( __SM_35__)
-    fprintf( stderr, " nbrs block size: %d \n", NBRS_BLOCK_SIZE );
-    fprintf( stderr, " nbrs threads per atom: %d \n", NB_KER_THREADS_PER_ATOM );
-
-    fprintf( stderr, " hbonds block size: %d \n", HB_BLOCK_SIZE );
-    fprintf( stderr, " hbonds threads per atom: %d \n", HB_KER_THREADS_PER_ATOM );
-
-    fprintf( stderr, " vdw block size: %d \n", VDW_BLOCK_SIZE );
-    fprintf( stderr, " vdw threads per atom: %d \n", VDW_KER_THREADS_PER_ATOM );
-
-    fprintf( stderr, " matvec block size: %d \n", MATVEC_BLOCK_SIZE );
-    fprintf( stderr, " matvec threads per atom: %d \n", MATVEC_KER_THREADS_PER_ROW);
-
-    fprintf( stderr, " General block size: %d \n", DEF_BLOCK_SIZE );
-#endif
-
     /* allocate main data structures */
     system = (reax_system *) smalloc( sizeof(reax_system), "system" );
     control = (control_params *) smalloc( sizeof(control_params), "control" );
@@ -249,17 +228,11 @@ int main( int argc, char* argv[] )
     Read_System( argv[1], argv[2], argv[3], system, control,
             data, workspace, out_control, mpi_data );
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
     /* setup the CUDA Device for this process */
     Setup_Cuda_Environment( system->my_rank, control->nprocs, control->gpus_per_node );
 
 #if defined(DEBUG)
     print_device_mem_usage( );
-    fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id);
 #endif
 
     /* init the blocks sizes for cuda kernels */
@@ -278,67 +251,31 @@ int main( int argc, char* argv[] )
     Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
 
-//#if defined(DEBUG)
+#if defined(DEBUG)
     print_device_mem_usage( );
-//#endif
+#endif
 
     /* init the blocks sizes for cuda kernels */
     init_blocks( system );
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-    //END OF FIRST STEP
-
-    // compute f_0
+    /* compute f_0 */
     Comm_Atoms( system, control, data, workspace, lists, mpi_data, TRUE );
     Sync_Atoms( system );
     Sync_Grid( &system->my_grid, &system->d_my_grid );
     init_blocks( system );
 
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank );
-#endif
-
-    //Second step
     Cuda_Reset( system, control, data, workspace, lists );
 
 #if defined(__CUDA_DEBUG__)
     Reset( system, control, data, workspace, lists );
 #endif
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
-#endif
-
-    //Third Step
     Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
 
 #if defined(__CUDA_DEBUG__)
     Generate_Neighbor_Lists( system, data, workspace, lists );
 #endif
 
-//    reax_list **temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" );
-//    for ( i = 0; i < LIST_N; ++i )
-//    {
-//        temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-//        temp_lists[i]->allocated = FALSE;
-//    }
-//    Make_List( (*dev_lists + FAR_NBRS)->n, (*dev_lists + FAR_NBRS)->num_intrs,
-//            TYP_FAR_NEIGHBOR, (*temp_lists + FAR_NBRS) );
-//    Output_Sync_Lists( (*temp_lists + FAR_NBRS), (*dev_lists + FAR_NBRS), TYP_FAR_NEIGHBOR );
-//    Print_Far_Neighbors( system, temp_lists, control );
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank );
-#endif
-
-    //Fourth Step
-#if defined(DEBUG)
-    fprintf( stderr, " Host Compute Forces begin.... \n" );
-#endif
-
 #if defined(__CUDA_DEBUG__)
     Compute_Forces( system, control, data, workspace,
             lists, out_control, mpi_data );
@@ -347,20 +284,12 @@ int main( int argc, char* argv[] )
     Cuda_Compute_Forces( system, control, data, workspace, lists,
             out_control, mpi_data );
 
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank );
-#endif
-
 #if defined (__CUDA_DEBUG__)
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 #endif
 
     Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank);
-#endif
-
 #if defined(__CUDA_DEBUG__)
     validate_device( system, data, workspace, lists );
 #endif
@@ -368,16 +297,13 @@ int main( int argc, char* argv[] )
 #if !defined(__CUDA_DEBUG__)
     Output_Results( system, control, data, lists, out_control, mpi_data );
 #endif
-#if defined(DEBUG)
-    fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
-#endif
 
 #if defined(DEBUG)
-    fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
+    fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-    // start the simulation
+    /* begin main simulation loop */
     ++data->step;
     retries = 0;
     while ( data->step <= control->nsteps && retries < MAX_RETRIES )
@@ -423,20 +349,18 @@ int main( int argc, char* argv[] )
         fprintf( stderr, " Post Evolve time: %f \n", t_end );
 #endif
 
-#if !defined(__CUDA_DEBUG__)
         if ( ret == SUCCESS )
         {
+            data->timing.num_retries = retries;
+
+#if !defined(__CUDA_DEBUG__)
             Output_Results( system, control, data, lists, out_control, mpi_data );
-        }
 #endif
 
-//        if ( ret == SUCCESS )
-//        {
-//            Analysis(system, control, data, workspace, lists, out_control, mpi_data);
-//        }
+//        Analysis(system, control, data, workspace, lists, out_control, mpi_data);
 
         /* dump restart info */
-//        if ( ret == SUCCESS && out_control->restart_freq &&
+//        if ( out_control->restart_freq &&
 //                (data->step-data->prev_steps) % out_control->restart_freq == 0 )
 //        {
 //            if( out_control->restart_format == WRITE_ASCII )
@@ -449,8 +373,6 @@ int main( int argc, char* argv[] )
 //            }
 //        }
 
-        if ( ret == SUCCESS )
-        {
 #if defined(DEBUG)
             fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
             MPI_Barrier( MPI_COMM_WORLD );
@@ -462,7 +384,9 @@ int main( int argc, char* argv[] )
         else
         {
             ++retries;
+#if defined(DEBUG)
             fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step );
+#endif
         }
     }
 
@@ -576,16 +500,12 @@ int main( int argc, char* argv[] )
 
         if ( ret == SUCCESS )
         {
+            data->timing.num_retries = retries;
+
             Output_Results( system, control, data, lists, out_control, mpi_data );
-        }
 
-        if ( ret == SUCCESS )
-        {
 //            Analysis(system, control, data, workspace, lists, out_control, mpi_data);
-        }
 
-        if ( ret == SUCCESS )
-        {
             /* dump restart info */
             if ( out_control->restart_freq &&
                     (data->step - data->prev_steps) % out_control->restart_freq == 0 )
@@ -599,10 +519,7 @@ int main( int argc, char* argv[] )
                     Write_Binary_Restart( system, control, data, out_control, mpi_data );
                 }
             }
-        }
 
-        if ( ret == SUCCESS )
-        {
 #if defined(DEBUG)
             fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
             MPI_Barrier( mpi_data->world );
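
Both integration loops now stamp the retry counter into data->timing.num_retries before Output_Results, so the count reaches the log alongside the other per-window numbers. A self-contained sketch of the retry control flow; simulate_one_step() is a stand-in for one full integration step, not a function from this codebase:

#include <stdio.h>

#define SUCCESS     1
#define MAX_RETRIES 5

/* stand-in for one MD step; fails once at step 7 to exercise retries */
static int simulate_one_step( int step )
{
    static int failed_once = 0;

    if ( step == 7 && failed_once == 0 )
    {
        failed_once = 1;
        return 0;
    }

    return SUCCESS;
}

int main( void )
{
    int step, nsteps, retries;

    nsteps = 100;
    step = 1;
    retries = 0;

    while ( step <= nsteps && retries < MAX_RETRIES )
    {
        if ( simulate_one_step( step ) == SUCCESS )
        {
            /* record retries, write output, then advance the step */
            ++step;
        }
        else
        {
            ++retries;   /* the same step is attempted again */
        }
    }

    printf( "finished at step %d with %d retries\n", step - 1, retries );

    return 0;
}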
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index 38810bd6e6bb43d38f7e20ac47d69f1241cc10f6..d683df1cb80d925a35b79bdec5f2e11a61ad4c23 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -1456,11 +1456,13 @@ typedef struct
     /* non-bonded force calculation time */
     real nonb;
     /* atomic charge distribution calculation time */
-    real qEq;
+    real cm;
     /* num. of steps in iterative linear solver for charge distribution (QEq, first solve) */
-    int  s_matvecs;
+    int s_matvecs;
     /* num. of steps in iterative linear solver for charge distribution (QEq, second solve) */
-    int  t_matvecs;
+    int t_matvecs;
+    /* num. of retries in main sim. loop */
+    int num_retries;
 } reax_timing;
 
 
diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c
index c3778145e1913b16e5299bd5e3c2cf5f3f263b5f..f2a24753f8748cdacf00a93160b687e14d28dfa8 100644
--- a/PG-PuReMD/src/reset_tools.c
+++ b/PG-PuReMD/src/reset_tools.c
@@ -112,9 +112,10 @@ void Reset_Timing( reax_timing *rt )
     rt->init_forces = 0;
     rt->bonded = 0;
     rt->nonb = 0;
-    rt->qEq = 0;
+    rt->cm = 0;
     rt->s_matvecs = 0;
     rt->t_matvecs = 0;
+    rt->num_retries = 0;
 }
 
 
diff --git a/PG-PuReMD/src/tool_box.c b/PG-PuReMD/src/tool_box.c
index a6f570c8c2b3fdb2a7d1afd789490f8a1f5fcd57..58772d773252982721b9ba1385ca932123632f02 100644
--- a/PG-PuReMD/src/tool_box.c
+++ b/PG-PuReMD/src/tool_box.c
@@ -186,7 +186,7 @@ void Update_Timing_Info( real *t_start, real *timing )
     struct timeval tim;
     real t_end;
 
-    gettimeofday(&tim, NULL );
+    gettimeofday( &tim, NULL );
 
     t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
     *timing += (t_end - *t_start);
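
Update_Timing_Info above folds the timeval into fractional seconds and adds the interval since *t_start onto the running total. The same accumulation pattern in a self-contained example:

#include <stdio.h>
#include <sys/time.h>

/* current wall-clock time in seconds, as in Update_Timing_Info */
static double wall_seconds( void )
{
    struct timeval tim;

    gettimeofday( &tim, NULL );

    return tim.tv_sec + (tim.tv_usec / 1000000.0);
}

int main( void )
{
    double timing, t_start;
    volatile long i, sum;

    timing = 0.0;
    sum = 0;
    t_start = wall_seconds( );

    for ( i = 0; i < 10000000L; ++i )
    {
        sum += i;   /* stand-in workload */
    }

    timing += wall_seconds( ) - t_start;

    printf( "elapsed: %.6f s (sum = %ld)\n", timing, (long) sum );

    return 0;
}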
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index b7ba111220dd8874726590c294cc72e07cb06b50..331e443057455b8e74b003a0e20a6f81fba3d550 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -1032,7 +1032,6 @@ int Append_Frame( reax_system *system, control_params *control,
 
     if ( out_control->write_atoms )
     {
-        //Sync atoms here
 #ifdef HAVE_CUDA
         Output_Sync_Atoms( system );
 #endif
@@ -1041,22 +1040,21 @@ int Append_Frame( reax_system *system, control_params *control,
 
     if ( out_control->write_bonds )
     {
-        //sync bonds here
 #ifdef HAVE_CUDA
-        Output_Sync_Lists((*lists + BONDS), (*dev_lists + BONDS), TYP_BOND);
+        Output_Sync_Lists( (*lists + BONDS), (*dev_lists + BONDS), TYP_BOND );
 #endif
         Write_Bonds( system, control, (*lists + BONDS), out_control, mpi_data );
     }
 
     if ( out_control->write_angles )
     {
-        //sync three body interactions here
 #ifdef HAVE_CUDA
-        Output_Sync_Lists((*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY);
+        Output_Sync_Lists( (*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY );
 #endif
         Write_Angles( system, control, (*lists + BONDS), (*lists + THREE_BODIES),
                       out_control, mpi_data );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: appended frame %d\n", system->my_rank, data->step );
 #endif