diff --git a/PG-PuReMD/src/analyze.c b/PG-PuReMD/src/analyze.c
index 8b8b42673be48103e8bf1247dea700e2c5cc4958..f395ad05b30aa2d2ff1054498db71b1ff96f30c4 100644
--- a/PG-PuReMD/src/analyze.c
+++ b/PG-PuReMD/src/analyze.c
@@ -158,7 +158,9 @@ void Analyze_Fragments( reax_system *system, control_params *control,
         }
     }
     fprintf( fout, "\n" );
+#if defined(DEBUG)
     fflush( fout );
+#endif
 }
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index dd791df7996fdd2bd7b487f629eebb1adf4c64dc..a1ca790f8752eeaf2ee0e7283c277fde34f8508e 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -85,7 +85,6 @@ void Cuda_Allocate_Grid( reax_system *system )
 //
 //    k_init_nbrs <<< blocks, block_size >>>
 //        ( nbrs_x, host->max_nbrs );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 //
 //    cuda_malloc( (void **)& device->cells, sizeof(grid_cell) * total,
@@ -406,9 +405,15 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
         storage *workspace, int total_cap )
 {
     int total_real, total_rvec;
+#if defined(DUAL_SOLVER)
+    int total_rvec2;
+#endif
 
     total_real = sizeof(real) * total_cap;
     total_rvec = sizeof(rvec) * total_cap;
+#if defined(DUAL_SOLVER)
+    total_rvec2 = sizeof(rvec2) * total_cap;
+#endif
 
     /* bond order related storage  */
     cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" );
@@ -431,22 +436,22 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
     /* charge matrix storage */
     if ( control->cm_solver_pre_comp_type == JACOBI_PC )
     {
-        cuda_malloc( (void **) &workspace->Hdia_inv, sizeof(real) * total_cap, TRUE, "Hdia_inv" );
+        cuda_malloc( (void **) &workspace->Hdia_inv, total_real, TRUE, "Hdia_inv" );
     }
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
             || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
-        cuda_malloc( (void **) &workspace->droptol, sizeof(real) * total_cap, TRUE, "droptol" );
+        cuda_malloc( (void **) &workspace->droptol, total_real, TRUE, "droptol" );
     }
-    cuda_malloc( (void **) &workspace->b_s, sizeof(real) * total_cap, TRUE, "b_s" );
-    cuda_malloc( (void **) &workspace->b_t, sizeof(real) * total_cap, TRUE, "b_t" );
-    cuda_malloc( (void **) &workspace->s, sizeof(real) * total_cap, TRUE, "s" );
-    cuda_malloc( (void **) &workspace->t, sizeof(real) * total_cap, TRUE, "t" );
+    cuda_malloc( (void **) &workspace->b_s, total_real, TRUE, "b_s" );
+    cuda_malloc( (void **) &workspace->b_t, total_real, TRUE, "b_t" );
+    cuda_malloc( (void **) &workspace->s, total_real, TRUE, "s" );
+    cuda_malloc( (void **) &workspace->t, total_real, TRUE, "t" );
 #if defined(DUAL_SOLVER)
-    cuda_malloc( (void **) &workspace->b, sizeof(rvec2) * total_cap, TRUE, "b" );
-    cuda_malloc( (void **) &workspace->x, sizeof(rvec2) * total_cap, TRUE, "x" );
+    cuda_malloc( (void **) &workspace->b, total_rvec2, TRUE, "b" );
+    cuda_malloc( (void **) &workspace->x, total_rvec2, TRUE, "x" );
 #endif
 
     switch ( control->cm_solver_type )
@@ -454,9 +459,9 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
     case GMRES_S:
     case GMRES_H_S:
         cuda_malloc( (void **) &workspace->b_prc,
-                sizeof(real) * total_cap, TRUE, "b_prc" );
+                total_real, TRUE, "b_prc" );
         cuda_malloc( (void **) &workspace->b_prm,
-                sizeof(real) * total_cap, TRUE, "b_prm" );
+                total_real, TRUE, "b_prm" );
         cuda_malloc( (void **) &workspace->y,
                 (control->cm_solver_restart + 1) * sizeof(real), TRUE, "y" );
         cuda_malloc( (void **) &workspace->z,
@@ -474,45 +479,45 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
         break;
 
     case SDM_S:
-        cuda_malloc( (void **) &workspace->r, sizeof(real) * total_cap, TRUE, "r" );
-        cuda_malloc( (void **) &workspace->d, sizeof(real) * total_cap, TRUE, "d" );
-        cuda_malloc( (void **) &workspace->q, sizeof(real) * total_cap, TRUE, "q" );
-        cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
+        cuda_malloc( (void **) &workspace->r, total_real, TRUE, "r" );
+        cuda_malloc( (void **) &workspace->d, total_real, TRUE, "d" );
+        cuda_malloc( (void **) &workspace->q, total_real, TRUE, "q" );
+        cuda_malloc( (void **) &workspace->p, total_real, TRUE, "p" );
         break;
 
     case CG_S:
-        cuda_malloc( (void **) &workspace->r, sizeof(real) * total_cap, TRUE, "r" );
-        cuda_malloc( (void **) &workspace->d, sizeof(real) * total_cap, TRUE, "d" );
-        cuda_malloc( (void **) &workspace->q, sizeof(real) * total_cap, TRUE, "q" );
-        cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
+        cuda_malloc( (void **) &workspace->r, total_real, TRUE, "r" );
+        cuda_malloc( (void **) &workspace->d, total_real, TRUE, "d" );
+        cuda_malloc( (void **) &workspace->q, total_real, TRUE, "q" );
+        cuda_malloc( (void **) &workspace->p, total_real, TRUE, "p" );
 #if defined(DUAL_SOLVER)
-        cuda_malloc( (void **) &workspace->r2, sizeof(rvec2) * total_cap, TRUE, "r2" );
-        cuda_malloc( (void **) &workspace->d2, sizeof(rvec2) * total_cap, TRUE, "d2" );
-        cuda_malloc( (void **) &workspace->q2, sizeof(rvec2) * total_cap, TRUE, "q2" );
-        cuda_malloc( (void **) &workspace->p2, sizeof(rvec2) * total_cap, TRUE, "p2" );
+        cuda_malloc( (void **) &workspace->r2, total_rvec2, TRUE, "r2" );
+        cuda_malloc( (void **) &workspace->d2, total_rvec2, TRUE, "d2" );
+        cuda_malloc( (void **) &workspace->q2, total_rvec2, TRUE, "q2" );
+        cuda_malloc( (void **) &workspace->p2, total_rvec2, TRUE, "p2" );
 #endif
         break;
 
     case BiCGStab_S:
-        cuda_malloc( (void **) &workspace->y, sizeof(real) * total_cap, TRUE, "y" );
-        cuda_malloc( (void **) &workspace->g, sizeof(real) * total_cap, TRUE, "g" );
-        cuda_malloc( (void **) &workspace->z, sizeof(real) * total_cap, TRUE, "z" );
-        cuda_malloc( (void **) &workspace->r, sizeof(real) * total_cap, TRUE, "r" );
-        cuda_malloc( (void **) &workspace->d, sizeof(real) * total_cap, TRUE, "d" );
-        cuda_malloc( (void **) &workspace->q, sizeof(real) * total_cap, TRUE, "q" );
-        cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
-        cuda_malloc( (void **) &workspace->r_hat, sizeof(real) * total_cap, TRUE, "r_hat" );
-        cuda_malloc( (void **) &workspace->q_hat, sizeof(real) * total_cap, TRUE, "q_hat" );
+        cuda_malloc( (void **) &workspace->y, total_real, TRUE, "y" );
+        cuda_malloc( (void **) &workspace->g, total_real, TRUE, "g" );
+        cuda_malloc( (void **) &workspace->z, total_real, TRUE, "z" );
+        cuda_malloc( (void **) &workspace->r, total_real, TRUE, "r" );
+        cuda_malloc( (void **) &workspace->d, total_real, TRUE, "d" );
+        cuda_malloc( (void **) &workspace->q, total_real, TRUE, "q" );
+        cuda_malloc( (void **) &workspace->p, total_real, TRUE, "p" );
+        cuda_malloc( (void **) &workspace->r_hat, total_real, TRUE, "r_hat" );
+        cuda_malloc( (void **) &workspace->q_hat, total_real, TRUE, "q_hat" );
 #if defined(DUAL_SOLVER)
-        cuda_malloc( (void **) &workspace->y2, sizeof(rvec2) * total_cap, TRUE, "y" );
-        cuda_malloc( (void **) &workspace->g2, sizeof(rvec2) * total_cap, TRUE, "g" );
-        cuda_malloc( (void **) &workspace->z2, sizeof(rvec2) * total_cap, TRUE, "z" );
-        cuda_malloc( (void **) &workspace->r2, sizeof(rvec2) * total_cap, TRUE, "r" );
-        cuda_malloc( (void **) &workspace->d2, sizeof(rvec2) * total_cap, TRUE, "d" );
-        cuda_malloc( (void **) &workspace->q2, sizeof(rvec2) * total_cap, TRUE, "q" );
-        cuda_malloc( (void **) &workspace->p2, sizeof(rvec2) * total_cap, TRUE, "p" );
-        cuda_malloc( (void **) &workspace->r_hat2, sizeof(rvec2) * total_cap, TRUE, "r_hat" );
-        cuda_malloc( (void **) &workspace->q_hat2, sizeof(rvec2) * total_cap, TRUE, "q_hat" );
+        cuda_malloc( (void **) &workspace->y2, total_rvec2, TRUE, "y" );
+        cuda_malloc( (void **) &workspace->g2, total_rvec2, TRUE, "g" );
+        cuda_malloc( (void **) &workspace->z2, total_rvec2, TRUE, "z" );
+        cuda_malloc( (void **) &workspace->r2, total_rvec2, TRUE, "r" );
+        cuda_malloc( (void **) &workspace->d2, total_rvec2, TRUE, "d" );
+        cuda_malloc( (void **) &workspace->q2, total_rvec2, TRUE, "q" );
+        cuda_malloc( (void **) &workspace->p2, total_rvec2, TRUE, "p" );
+        cuda_malloc( (void **) &workspace->r_hat2, total_rvec2, TRUE, "r_hat" );
+        cuda_malloc( (void **) &workspace->q_hat2, total_rvec2, TRUE, "q_hat" );
 #endif
         break;
 
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 0c11e58fbaf40d8237c16e28d5aea1b966a14919..43d8f988dd28191d46f4d8b2818bc0a02b1e5247 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -767,7 +767,6 @@ void Cuda_Total_Forces( reax_system *system, control_params *control,
           (control_params *) control->d_control_params, 
           (simulation_data *)data->d_simulation_data, 
           spad_rvec, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     if ( control->virial != 0 )
@@ -775,19 +774,16 @@ void Cuda_Total_Forces( reax_system *system, control_params *control,
         /* reduction for ext_press */
         k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * (DEF_BLOCK_SIZE / 32) >>> 
             ( spad_rvec, &spad_rvec[system->N], system->N );
-        cudaDeviceSynchronize( ); 
         cudaCheckError( ); 
 
         k_reduction_rvec <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec) * ((blocks + 31) / 32) >>>
             ( &spad_rvec[system->N], &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks );
-        cudaDeviceSynchronize( ); 
         cudaCheckError( ); 
     }
 
     /* post processing for the atomic forces */
     k_total_forces_part2  <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *(lists[BONDS]), *(workspace->d_workspace), system->N );
-    cudaDeviceSynchronize( ); 
     cudaCheckError( ); 
 }
 
@@ -801,6 +797,5 @@ void Cuda_Total_Forces_PURE( reax_system *system, storage *workspace )
 
     k_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->n, *(workspace->d_workspace) );
-    cudaDeviceSynchronize( ); 
     cudaCheckError( ); 
 }
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index a8d4bc86feff484d8e143167e27837b849870a35..6688eb854b91e42efb7e511295b9b133a00d5ac8 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -63,7 +63,6 @@ static void jacobi( reax_system const * const system,
     k_jacobi <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -133,7 +132,6 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system
         cub::DeviceRadixSort::SortPairs( d_temp_storage, temp_storage_bytes,
                 &d_j_temp[start[i]], &A->j[start[i]],
                 &d_val_temp[start[i]], &A->val[start[i]], num_entries );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -245,7 +243,6 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           (control_params *)control->d_control_params,
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -421,7 +418,6 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system,
 
     k_extrapolate_charges_qeq_part2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( q, spad, sizeof(real) * system->n, 
@@ -463,7 +459,6 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system,
 
     k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, spad, system->n, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -502,13 +497,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
                       sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>>
         ( workspace->d_workspace->x, spad_rvec2, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
                       sizeof(rvec2) * ((blocks + 31) / 32) >>>
         ( spad_rvec2, &spad_rvec2[blocks], blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( &my_sum, &spad_rvec2[blocks],
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index 01d8b831a2f3528cbdb5b8d097d894f77fc41e58..2dcd614bcc2c8ced5fb9a2486fa0827eca6ee9e8 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -320,7 +320,6 @@ void Vector_MakeZero( real * const v, unsigned int k )
 
     k_vector_makezero <<< blocks, DEF_BLOCK_SIZE >>>
         ( v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -343,7 +342,6 @@ void Vector_Copy( real * const dest, real const * const v,
 
     k_vector_copy <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -366,7 +364,6 @@ void Vector_Copy_rvec2( rvec2 * const dest, rvec2 const * const v,
 
     k_vector_copy_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -381,7 +378,6 @@ void Vector_Copy_From_rvec2( real * const dst, rvec2 const * const src,
 
     k_vector_copy_from_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dst, src, index, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -396,7 +392,6 @@ void Vector_Copy_To_rvec2( rvec2 * const dst, real const * const src,
 
     k_vector_copy_to_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dst, src, index, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -420,7 +415,6 @@ void Vector_Scale( real * const dest, real c, real const * const v,
 
     k_vector_scale <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -445,7 +439,6 @@ void Vector_Sum( real * const dest, real c, real const * const v,
 
     k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c, v, d, y, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -460,7 +453,6 @@ void Vector_Sum_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const
 
     k_vector_sum_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> 
         ( dest, c0, c1, v, d0, d1, y, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -485,7 +477,6 @@ void Vector_Add( real * const dest, real c, real const * const v,
 
     k_vector_add <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -510,7 +501,6 @@ void Vector_Add_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const
 
     k_vector_add_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c0, c1, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -533,7 +523,6 @@ void Vector_Mult( real * const dest, real const * const v1,
 
     k_vector_mult <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, v1, v2, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -556,7 +545,6 @@ void Vector_Mult_rvec2( rvec2 * const dest, rvec2 const * const v1,
 
     k_vector_mult_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, v1, v2, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -687,13 +675,11 @@ void Dot_local_rvec2( control_params const * const control,
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
                       sizeof(rvec2) * DEF_BLOCK_SIZE >>>
         ( spad, &spad[k], k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
                       sizeof(rvec2) * ((blocks + 31) / 32) >>>
         ( &spad[k], &spad[k + blocks], blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 1da8cb67c990b80a013e423fa82b9f206194b54a..6d6f5aec22d3fe984d80863de630b4cf796f1eab 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1203,7 +1203,6 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params
     Cuda_Estimate_Valence_Angles <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, (control_params *)control->d_control_params, 
           *(lists[BONDS]), system->n, system->N, thbody );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );
@@ -1252,7 +1251,6 @@ static void Print_Forces( reax_system *system )
 
     k_print_forces <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, workspace->d_workspace->f, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1266,7 +1264,6 @@ static void Print_HBonds( reax_system *system, int step )
 
     k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *(lists[HBONDS]), system->n, system->my_rank, step );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 #endif
@@ -1288,7 +1285,6 @@ void Cuda_Init_Neighbor_Indices( reax_system *system, reax_list *far_nbr_list )
     /* init end_indices */
     k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_far_nbrs, far_nbr_list->index, far_nbr_list->end_index, far_nbr_list->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1312,7 +1308,6 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace,
     /* init Hindices */
     k_init_hindex <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->total_cap );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* init indices and end_indices */
@@ -1321,7 +1316,6 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace,
     k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, 
           hbond_list->index, hbond_list->end_index, system->total_cap );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1342,7 +1336,6 @@ void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list )
     /* init end_indices */
     k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_bonds, bond_list->index, bond_list->end_index, system->total_cap );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1365,7 +1358,6 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
     /* init end_indices */
     k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_cm_entries, H->start, H->end, H->n_max );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1411,7 +1403,6 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
                   workspace->d_workspace->H.n_max,
                   system->d_cm_entries, system->d_max_cm_entries );
         }
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         Cuda_Reduction_Sum( system->d_max_cm_entries,
@@ -1432,7 +1423,6 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
               system->n, system->N, system->total_cap,
               system->d_bonds, system->d_max_bonds,
               system->d_hbonds, system->d_max_hbonds );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -1475,7 +1465,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         reax_list **lists, output_controls *out_control ) 
 {
-    int renbr, blocks, ret, ret_bonds, ret_hbonds, ret_cm;
+    int renbr, blocks, ret, realloc_bonds, realloc_hbonds, realloc_cm;
     static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE;
 #if defined(LOG_PERFORMANCE)
     double time;
@@ -1492,7 +1482,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 //       (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 //    k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
 //       ( system->n, (system->N - system->n), workspace->d_workspace->bond_mark );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 
     /* reset reallocation flags on device */
@@ -1507,7 +1496,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     {
         k_init_distance <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         dist_done = TRUE;
@@ -1563,7 +1551,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
                       system->d_max_cm_entries, system->d_realloc_cm_entries );
             }
         }
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -1581,7 +1568,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
               workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
               system->d_max_bonds, system->d_realloc_bonds,
               system->d_max_hbonds, system->d_realloc_hbonds );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -1590,21 +1576,21 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 #endif
 
     /* check reallocation flags on device */
-    copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), 
+    copy_host_device( &realloc_cm, system->d_realloc_cm_entries, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );
-    copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), 
+    copy_host_device( &realloc_bonds, system->d_realloc_bonds, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
-    copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), 
+    copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
 
-    ret = (ret_cm == FALSE && ret_bonds == FALSE && ret_hbonds == FALSE)
+    ret = (realloc_cm == FALSE && realloc_bonds == FALSE && realloc_hbonds == FALSE)
         ? SUCCESS : FAILURE;
 
-    if ( ret_cm == FALSE )
+    if ( realloc_cm == FALSE )
     {
         cm_done = TRUE;
     }
-    if ( ret_bonds == FALSE && ret_hbonds == FALSE )
+    if ( realloc_bonds == FALSE && realloc_hbonds == FALSE )
     {
         bonds_done = TRUE;
     }
@@ -1613,7 +1599,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     {
         k_update_sym_dbond_indices <<< control->blocks_n, control->block_size_n >>> 
             ( *(lists[BONDS]), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
@@ -1624,13 +1609,11 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             /* make hbond_list symmetric */
             k_update_sym_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
                 ( system->d_my_atoms, *(lists[HBONDS]), system->N );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
         }
 
         /* update bond_mark */
 //        k_bond_mark <<< control->blocks_n, control->block_size_n >>>
-//        cudaDeviceSynchronize( );
 //        cudaCheckError( );
 
         dist_done = FALSE;
@@ -1640,11 +1623,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     else
     {
         Cuda_Estimate_Storages( system, control, workspace, lists,
-               ret_bonds, ret_hbonds, ret_cm, data->step - data->prev_steps );
+               realloc_bonds, realloc_hbonds, realloc_cm,
+               data->step - data->prev_steps );
 
-        workspace->d_workspace->realloc.bonds = ret_bonds;
-        workspace->d_workspace->realloc.hbonds = ret_hbonds;
-        workspace->d_workspace->realloc.cm = ret_cm;
+        /* schedule reallocations after updating allocation sizes */
+        workspace->d_workspace->realloc.bonds = realloc_bonds;
+        workspace->d_workspace->realloc.hbonds = realloc_hbonds;
+        workspace->d_workspace->realloc.cm = realloc_cm;
     }
 
     return ret;
@@ -1692,7 +1677,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         Cuda_BO_Part1 <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, system->reax_param.d_sbp, 
               *(workspace->d_workspace), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         Cuda_BO_Part2 <<< control->blocks_n, control->block_size_n >>>
@@ -1700,18 +1684,15 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               system->reax_param.d_tbp, *(workspace->d_workspace), 
               *(lists[BONDS]),
               system->reax_param.num_atom_types, system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         Cuda_BO_Part3 <<< control->blocks_n, control->block_size_n >>>
             ( *(workspace->d_workspace), *(lists[BONDS]), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         Cuda_BO_Part4 <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
              *(workspace->d_workspace), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         /* 2. Bond Energy Interactions */
@@ -1722,7 +1703,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
               *(workspace->d_workspace), *(lists[BONDS]), 
               system->n, system->reax_param.num_atom_types, spad );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         /* reduction for E_BE */
@@ -1741,12 +1721,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace),
               *(lists[BONDS]), system->n, system->reax_param.num_atom_types,
               spad, &spad[system->n], &spad[2 * system->n] );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         Cuda_Atom_Energy_Part2 <<< control->blocks, control->block_size >>>
             ( *(lists[BONDS]), *(workspace->d_workspace), system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( update_energy == TRUE )
@@ -1795,7 +1773,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               *(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]),
               system->n, system->N, system->reax_param.num_atom_types, 
               spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( update_energy == TRUE )
@@ -1824,7 +1801,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             k_reduction_rvec <<< control->blocks_n, control->block_size_n,
                              sizeof(rvec) * (control->block_size_n / 32) >>>
                 ( rvec_spad, &rvec_spad[system->N], system->N );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
 
             k_reduction_rvec <<< 1, control->blocks_pow_2_n,
@@ -1832,7 +1808,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 ( &rvec_spad[system->N],
                   &((simulation_data *)data->d_simulation_data)->my_ext_press,
                   control->blocks_n );
-            cudaDeviceSynchronize ();
             cudaCheckError( );
 //            Cuda_Reduction_Sum( rvec_spad,
 //                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
@@ -1842,7 +1817,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, (control_params *) control->d_control_params,
               *(workspace->d_workspace), *(lists[BONDS]), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         /* 5. Torsion Angles Interactions */
@@ -1855,7 +1829,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
               system->reax_param.num_atom_types, 
               spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( update_energy == TRUE )
@@ -1879,7 +1852,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             k_reduction_rvec <<< control->blocks, control->block_size,
                              sizeof(rvec) * (control->block_size / 32) >>>
                 ( rvec_spad, &rvec_spad[system->n], system->n );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
 
             k_reduction_rvec <<< 1, control->blocks_pow_2,
@@ -1887,7 +1859,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                     ( &rvec_spad[system->n],
                       &((simulation_data *)data->d_simulation_data)->my_ext_press,
                       control->blocks );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
 //            Cuda_Reduction_Sum( rvec_spad,
 //                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
@@ -1897,7 +1868,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
                 ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]),
                   system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         /* 6. Hydrogen Bonds Interactions */
@@ -1920,7 +1890,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                       *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
                       system->n, system->reax_param.num_atom_types,
                       spad, (rvec *) (&spad[system->n]), system->my_rank );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
 
             if ( update_energy == TRUE )
@@ -1939,7 +1908,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 k_reduction_rvec <<< control->blocks, control->block_size,
                                  sizeof(rvec) * (control->block_size / 32) >>>
                     ( rvec_spad, &rvec_spad[system->n], system->n );
-                cudaDeviceSynchronize( );
                 cudaCheckError( );
 
                 k_reduction_rvec <<< 1, control->blocks_pow_2,
@@ -1947,7 +1915,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                     ( &rvec_spad[system->n],
                       &((simulation_data *)data->d_simulation_data)->my_ext_press,
                       control->blocks );
-                cudaDeviceSynchronize( );
                 cudaCheckError( );
 //                Cuda_Reduction_Sum( rvec_spad,
 //                        &((simulation_data *)data->d_simulation_data)->my_ext_press,
@@ -1957,7 +1924,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             Cuda_Hydrogen_Bonds_Part2 <<< control->blocks, control->block_size >>>
                 ( system->d_my_atoms, *(workspace->d_workspace),
                   *(lists[BONDS]), system->n );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
 
 //            hnbrs_blocks = (system->n * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
@@ -1968,7 +1934,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 //            Cuda_Hydrogen_Bonds_Part3_opt <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, 
 //                    HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
 //                ( system->d_my_atoms, *(workspace->d_workspace), *(lists[HBONDS]), system->n );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
         }
 
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index 1c3f30682893640e272135c788ddce273fab919f..8478249e5e7da1e4d2744fb763eca204d4dd3565 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -191,7 +191,6 @@ void Velocity_Verlet_Part1( reax_system *system, real dt )
 
     k_velocity_verlet_part1 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -205,7 +204,6 @@ void Velocity_Verlet_Part2( reax_system *system, real dt )
 
     k_velocity_verlet_part2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -219,7 +217,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Part1( reax_system *system, real dt )
 
     k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -234,7 +231,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, storage *worksp
     k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, workspace->v_const,
           system->reax_param.d_sbp, dt, v_xi, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -251,7 +247,6 @@ real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *worksp
     k_velocity_verlet_nose_hoover_nvt_part3 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp,
           dt, v_xi_old, d_my_ekin, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n );
@@ -273,7 +268,6 @@ static void Cuda_Scale_Velocities_Berendsen_NVT( reax_system *system, real lambd
 
     k_scale_velocites_berendsen_nvt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, lambda, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -287,7 +281,6 @@ void Cuda_Scale_Velocities_NPT( reax_system *system, real lambda, rvec mu )
 
     k_scale_velocities_npt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, lambda, mu[0], mu[1], mu[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index 62878b2fef5df0326a0ee5dcdd320e6b29c14f2c..d937b35191273a00bff94ba9873b01638c842b77 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -587,7 +587,6 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
           system->d_my_grid, *(lists[FAR_NBRS]),
           system->n, system->N,
           system->d_far_nbrs, system->d_max_far_nbrs, system->d_realloc_far_nbrs );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* multiple threads per atom implementation */
@@ -598,7 +597,6 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 //        sizeof(int) * 2 * NBRS_BLOCK_SIZE >>>
 //            ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
 //              *(lists[FAR_NBRS]), system->n, system->N );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 
     /* check reallocation flag on device */
@@ -635,7 +633,6 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
         ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
           system->n, system->N, system->total_cap,
           system->d_far_nbrs, system->d_max_far_nbrs );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 37e0e28b1912c155efcf7df1482f28c3aa016868..ab1c21224ca9363914d1210055cb05c8c073603b 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -680,7 +680,6 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system, storage *work
     k_compute_polarization_energy <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           system->n, spad );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( spad,
@@ -720,7 +719,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
 //              *(workspace->d_workspace), *(lists[FAR_NBRS]), 
 //              system->n, system->reax_param.num_atom_types, 
 //              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
     else
@@ -734,7 +732,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
               data->step, data->prev_steps, 
               out_control->energy_update_freq,
               spad, &spad[system->n], (rvec *) (&spad[2 * system->n]));
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -759,7 +756,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
         k_reduction_rvec <<< control->blocks, control->block_size,
                          sizeof(rvec) * (control->block_size / 32) >>>
             ( spad_rvec, &spad_rvec[system->n], system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
@@ -767,7 +763,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
             ( &spad_rvec[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
diff --git a/PG-PuReMD/src/cuda/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
index 7d419aa49db0fecca903f3ae3d151bdca96aed07..ea0e3f163342ff7f4d2611aa2ac77df2888cf7d8 100644
--- a/PG-PuReMD/src/cuda/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
@@ -35,6 +35,5 @@ extern "C" void Cuda_Remove_CoM_Velocities( reax_system *system,
 {
     k_remove_center_of_mass_velocities <<< control->blocks, control->block_size >>>
         ( system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
index 71dce725d14ae8576bad491a8b75e66a801e1d8c..16ed9d253a1113cf30d2f8689233f323b33304c2 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -37,7 +37,6 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -47,7 +46,6 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -67,7 +65,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -77,7 +74,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -99,7 +95,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 //    /* determine temporary device storage requirements */
 //    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
 //            d_array, d_dest, n, sum_op, init );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 //
 //    /* allocate temporary storage */
@@ -109,7 +104,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 //    /* run sum-reduction */
 //    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
 //            d_array, d_dest, n, sum_op, init );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 //
 //    /* deallocate temporary storage */
@@ -129,7 +123,6 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -139,7 +132,6 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
     /* run exclusive prefix sum */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -159,7 +151,6 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
             d_src, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -169,7 +160,6 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
     /* run exclusive prefix sum */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
             d_src, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
index 5d1af4df2d6fae5a376665a7d87bcfd2f360944e..99d55dd095d2431527931b262cdb2a7ed8bd47a5 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -62,7 +62,6 @@ void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
 
     k_reset_workspace <<< blocks, DEF_BLOCK_SIZE >>>
         ( *(workspace->d_workspace), system->total_cap );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -79,7 +78,6 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro
 
     k_reset_hindex <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, hindex, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( hindex, system->d_numH, system->N );
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 39edee8b1cf89af02f01a391e365065e4e9ca526..2964a08199a509727bd33bc8c5eb158574bf3064 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -532,7 +532,6 @@ void dual_jacobi_apply( real const * const Hdia_inv, rvec2 const * const y,
 
     k_dual_jacobi_apply <<< blocks, DEF_BLOCK_SIZE >>>
         ( Hdia_inv, y, x, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -547,7 +546,6 @@ void jacobi_apply( real const * const Hdia_inv, real const * const y,
 
     k_jacobi_apply <<< blocks, DEF_BLOCK_SIZE >>>
         ( Hdia_inv, y, x, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -638,7 +636,6 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
                 ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -818,7 +815,6 @@ static void Sparse_MatVec_local( control_params const * const control,
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
              ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index e13bb8b9c135ea93d4404db0d76c9caf5f570738..88d0e1a84a296cb4b7b17fbc1b4429c8123a749e 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -576,13 +576,11 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     k_center_of_mass_blocks_xcm <<< control->blocks, control->block_size,
                                 sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
     
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
             ( spad, &spad[control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( xcm, &spad[control->blocks], sizeof(rvec),
@@ -595,13 +593,11 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     k_center_of_mass_blocks_vcm <<< control->blocks, control->block_size,
                                 sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
     
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
         ( spad, &spad[control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( vcm, &spad[control->blocks], sizeof(rvec),
@@ -614,13 +610,11 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     k_center_of_mass_blocks_amcm <<< control->blocks, control->block_size,
                                  sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
     
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
         ( spad, &spad[control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( amcm, &spad[control->blocks], sizeof(rvec),
@@ -644,28 +638,24 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
                                 sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_compute_inertial_tensor_xz_yy <<< control->blocks, control->block_size,
                                 sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_compute_inertial_tensor_yz_zz <<< control->blocks, control->block_size,
                                 sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* reduction of block-level partial sums for inertial tensor */
     k_compute_inertial_tensor_blocks <<< 1, control->blocks_pow_2,
                               sizeof(real) * 6 * control->blocks_pow_2 >>>
         ( spad, &spad[6 * control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( t, &spad[6 * control->blocks],
@@ -710,7 +700,6 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system,
     k_compute_kinetic_energy <<< control->blocks, control->block_size,
                              sizeof(real) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* note: above kernel sums the kinetic energy contribution within blocks,
@@ -751,7 +740,6 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control,
     k_compute_total_mass <<< control->blocks, control->block_size,
                          sizeof(real) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad_real, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* note: above kernel sums the mass contribution within blocks,
@@ -905,14 +893,12 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control,
         k_reduction_rvec <<< control->blocks, control->block_size,
                          sizeof(rvec) * (control->block_size / 32) >>>
             ( rvec_spad, &rvec_spad[system->n],  system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
                          sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
             ( &rvec_spad[system->n], &rvec_spad[system->n + control->blocks],
               control->blocks );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         copy_host_device( &int_press, &rvec_spad[system->n + control->blocks],
diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
index b40fa6d1896da3d5760249c843104f20fdad2c1b..5d039c57a72d1ae5a3d89f86479e065763830560 100644
--- a/PG-PuReMD/src/cuda/cuda_utils.h
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -22,26 +22,33 @@ void Cuda_Print_Mem_Usage( );
 #define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
 static inline void __cudaCheckError( const char *file, const int line )
 {
+#if defined(DEBUG)
     cudaError err;
 
-    err = cudaGetLastError();
+#if defined(DEBUG_FOCUS)
+    /* Block until tasks in stream are complete in order to enable
+     * more pinpointed error checking. However, this will affect performance. */
+    err = cudaDeviceSynchronize( );
     if ( cudaSuccess != err )
     {
-        fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
+        fprintf( stderr, "[ERROR] runtime error encountered with cudaDeviceSynchronize( ) at: %s:%d\n", file, line );
         fprintf( stderr, "    [INFO] CUDA API error code: %d\n", err );
+        fprintf( stderr, "    [INFO] CUDA API error name: %s\n", cudaGetErrorName( err ) );
+        fprintf( stderr, "    [INFO] CUDA API error text: %s\n", cudaGetErrorString( err ) );
         exit( RUNTIME_ERROR );
     }
+#endif
 
-#if defined(DEBUG_FOCUS)
-    /* More careful checking. However, this will affect performance. */
-    err = cudaDeviceSynchronize( );
-    if( cudaSuccess != err )
+    err = cudaGetLastError();
+    if ( cudaSuccess != err )
     {
-       exit( RUNTIME_ERROR );
+        fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
+        fprintf( stderr, "    [INFO] CUDA API error code: %d\n", err );
+        fprintf( stderr, "    [INFO] CUDA API error name: %s\n", cudaGetErrorName( err ) );
+        fprintf( stderr, "    [INFO] CUDA API error text: %s\n", cudaGetErrorString( err ) );
+        exit( RUNTIME_ERROR );
     }
 #endif
-
-    return;
 }
 
 
diff --git a/PG-PuReMD/src/geo_tools.c b/PG-PuReMD/src/geo_tools.c
index 7ec19c85dad48eb2ae3f3ede092f548b21f1bdda..3b3b0c1a2cef4f9eb947fcf5b88925e233deffbb 100644
--- a/PG-PuReMD/src/geo_tools.c
+++ b/PG-PuReMD/src/geo_tools.c
@@ -532,7 +532,9 @@ void Write_PDB_File( reax_system * const system, reax_list * const bond_list,
                  system->big_box.box_norms[2],
                  RAD2DEG(alpha), RAD2DEG(beta), RAD2DEG(gamma), " ", 0 );
         fprintf( out_control->log, "Box written\n" );
+#if defined(DEBUG)
         fflush( out_control->log );
+#endif
     }
 
     /* write atom lines to buffer */
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index a139964e61b5bfd15e1bd0176d8a5dab44acf9d7..d97655b798ff1abe1a2b052ca75aab06873ccc9a 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -81,8 +81,8 @@ void Init_Output_Files( reax_system *system, control_params *control,
             fprintf( out_control->out, "%-6s%24s%24s%24s%13s%16s%13s\n",
                      "step", "total energy", "potential", "kinetic",
                      "T(K)", "V(A^3)", "P(GPa)" );
-#endif
             fflush( out_control->out );
+#endif
 
             /* potential file */
             sprintf( temp, "%s.pot", control->sim_name );
@@ -101,8 +101,8 @@ void Init_Output_Files( reax_system *system, control_params *control,
                      "step", "ebond", "eatom", "elp",
                      "eang", "ecoa", "ehb", "etor", "econj",
                      "evdw", "ecoul", "epol" );
-#endif
             fflush( out_control->pot );
+#endif
 
 #if defined(LOG_PERFORMANCE)
             /* log file */
@@ -114,7 +114,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
                      "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
                      "charges", "siters", "retries" );
 
+#if defined(DEBUG)
             fflush( out_control->log );
+#endif
 #endif
         }
 
@@ -130,7 +132,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
             fprintf( out_control->prs, "%8s%13s%13s%13s%13s%13s%13s%13s\n",
                     "step", "Pint/norm[x]", "Pint/norm[y]", "Pint/norm[z]",
                     "Pext/Ptot[x]", "Pext/Ptot[y]", "Pext/Ptot[z]", "Pkin/V" );
+#if defined(DEBUG)
             fflush( out_control->prs );
+#endif
         }
 
         /* electric dipole moment analysis file */
@@ -143,7 +147,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
 
             fprintf( out_control->dpl, "%6s%20s%30s",
                      "step", "molecule count", "avg dipole moment norm" );
+#if defined(DEBUG)
             fflush( out_control->dpl );
+#endif
         }
 
         /* diffusion coef analysis file */
@@ -155,7 +161,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
 
             fprintf( out_control->drft, "%7s%20s%20s\n",
                      "step", "type count", "avg disp^2" );
+#if defined(DEBUG)
             fflush( out_control->drft );
+#endif
         }
     }
 
@@ -1295,10 +1303,10 @@ void Output_Results( reax_system *system, control_params *control,
                         data->sys_en.e_hb,
                         data->sys_en.e_tor, data->sys_en.e_con,
                         data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
-#endif //DEBUG
 
                 fflush( out_control->out );
                 fflush( out_control->pot );
+#endif //DEBUG
 
 #if defined(LOG_PERFORMANCE)
                 t_elapsed = Get_Elapsed_Time( data->timing.total );
@@ -1346,7 +1354,9 @@ void Output_Results( reax_system *system, control_params *control,
 //                        data->timing.cm_solver_orthog * denom,
 //                        data->timing.cm_solver_tri_solve * denom );
 
+#if defined(DEBUG)
                 fflush( out_control->log );
+#endif
 #endif //LOG_PERFORMANCE
 
                 if ( control->virial )
@@ -1365,7 +1375,9 @@ void Output_Results( reax_system *system, control_params *control,
                              data->tot_press[0], data->tot_press[1], data->tot_press[2],
                              system->big_box.V );
 
+#if defined(DEBUG)
                     fflush( out_control->prs );
+#endif
                 }
             }
 
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 8001090c5466b99a7a8ed15c202e1a41fc0edbf4..7d470e3e9d1c40471e3162ba3acac70ecb205877 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -964,7 +964,7 @@ real sparse_approx_inverse( reax_system const * const system,
     t_start = Get_Time( );
     Dist( system, mpi_data, row_nnz, REAL_PTR_TYPE, MPI_INT );
     t_comm += Get_Time( ) - t_start;
-    fprintf( stdout,"SAI after Dist call\n");
+    fprintf( stdout,"SAI after Dist call\n" );
     fflush( stdout );
 
     out_bufs = mpi_data->out_nt_buffers;
@@ -992,7 +992,7 @@ real sparse_approx_inverse( reax_system const * const system,
                 j_recv[d] = (int *) malloc( sizeof(int) * cnt );
                 val_recv[d] = (real *) malloc( sizeof(real) * cnt );
 
-                fprintf( stdout,"Dist communication receive phase direction %d will receive %d\n", d, cnt);
+                fprintf( stdout, "Dist communication receive phase direction %d will receive %d\n", d, cnt );
                 fflush( stdout );
                 t_start = Get_Time( );
                 ret = MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank,
@@ -1023,7 +1023,7 @@ real sparse_approx_inverse( reax_system const * const system,
                 }
                //     row_nnz[ out_bufs[d].index[i] ];
             }
-            fprintf( stdout,"Dist communication    send phase direction %d should  send %d\n", d, cnt);
+            fprintf( stdout,"Dist communication    send phase direction %d should  send %d\n", d, cnt );
             fflush( stdout );
 
             if ( cnt )
@@ -1050,18 +1050,18 @@ real sparse_approx_inverse( reax_system const * const system,
                 ret = MPI_Send( j_send, cnt, MPI_INT, nbr->rank,
                         d, mpi_data->comm_mesh3D );
                 Check_MPI_Error( ret, __FILE__, __LINE__ );
-                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt);
+                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt );
                 fflush( stdout );
                 ret = MPI_Send( val_send, cnt, MPI_DOUBLE, nbr->rank,
                         d, mpi_data->comm_mesh3D );
                 Check_MPI_Error( ret, __FILE__, __LINE__ );
-                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt);
+                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt );
                 fflush( stdout );
                 t_comm += Get_Time( ) - t_start;
             }
         }
     }
-    fprintf( stdout," Dist communication for sending row info before waitany\n");
+    fprintf( stdout," Dist communication for sending row info before waitany\n" );
     fflush( stdout );
     ///////////////////////
     for ( d = 0; d < count; ++d )
@@ -1097,7 +1097,7 @@ real sparse_approx_inverse( reax_system const * const system,
         }
     }
     //////////////////////
-    fprintf( stdout," wow wow wow, Dist communication for sending row info worked\n");
+    fprintf( stdout, "Dist communication for sending row info worked\n" );
     fflush( stdout );
     //TODO: size?
     X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
@@ -1180,7 +1180,7 @@ real sparse_approx_inverse( reax_system const * const system,
             local_pos = A_spar_patt->j[ A_spar_patt->start[i] + d_j ];
             if ( local_pos < 0 || local_pos >= system->N )
             {
-                fprintf( stderr, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n");
+                fprintf( stderr, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n" );
                 fflush( stderr );
 
             }
diff --git a/PG-PuReMD/src/nonbonded.c b/PG-PuReMD/src/nonbonded.c
index f2bad81a279f47236feafc7193144fad819e0595..cc86a3a2c60d165abe045308b74f20d7a7767c47 100644
--- a/PG-PuReMD/src/nonbonded.c
+++ b/PG-PuReMD/src/nonbonded.c
@@ -182,7 +182,8 @@ void vdW_Coulomb_Energy( reax_system const * const system,
 #if defined(DEBUG_FOCUS)
                 fprintf( stderr, "%6d%6d%24.12f%24.12f%24.12f%24.12f\n",
                         i + 1, j + 1, 
-                        e_base, de_base, e_core, de_core ); fflush( stderr );
+                        e_base, de_base, e_core, de_core );
+                fflush( stderr );
 #endif
 
                 /* Coulomb Calculations */
@@ -198,7 +199,8 @@ void vdW_Coulomb_Energy( reax_system const * const system,
 
 #if defined(DEBUG_FOCUS)
                 fprintf( stderr, "%6d%6d%24.12f%24.12f\n",
-                        i + 1, j + 1, e_clb, de_clb ); fflush( stderr );
+                        i + 1, j + 1, e_clb, de_clb );
+                fflush( stderr );
 #endif
 
                 if ( control->virial == 0 )
diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c
index 81ded98bee6acd1509e16e3e58e0953144ae8b63..ef932e36a61729d8642c6e9a0c4bdd69d638be5c 100644
--- a/PG-PuReMD/src/restart.c
+++ b/PG-PuReMD/src/restart.c
@@ -147,7 +147,9 @@ void Write_Restart_File( reax_system *system, control_params *control,
                  system->big_box.box[1][2],
                  system->big_box.box[2][0], system->big_box.box[2][1],
                  system->big_box.box[2][2]);
-        fflush(fres);
+#if defined(DEBUG)
+        fflush( fres );
+#endif
 
         buffer_req = system->bigN * RESTART_LINE_LEN + 1;
     }
@@ -312,7 +314,8 @@ void Read_Binary_Restart_File( const char * const res_file, reax_system *system,
                    " name = %8s, x = (%f, %f, %f), v = (%f, %f, %f)\n",
                     top, p_atom->orig_id, p_atom->type, p_atom->name,
                     p_atom->x[0], p_atom->x[1], p_atom->x[2],
-                    p_atom->v[0], p_atom->v[1], p_atom->v[2] ); fflush( stderr );
+                    p_atom->v[0], p_atom->v[1], p_atom->v[2] );
+            fflush( stderr );
 #endif
 
             top++;