From 688fdb8f625621f7b2695124e9aae1e841a57deb Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Sat, 23 Sep 2017 17:16:25 -0400
Subject: [PATCH] PG-PuReMD: continue merging EE/ACKS2 code from sPuReMD.
 Remove unused arrays. Other clean-up.

---
 PG-PuReMD/Makefile.am                |   9 +-
 PG-PuReMD/configure.ac               |   6 +-
 PG-PuReMD/src/allocate.c             |   2 -
 PG-PuReMD/src/cuda/cuda_allocate.cu  | 127 +++++++++------
 PG-PuReMD/src/cuda/cuda_charges.cu   | 148 +++++++++++++++---
 PG-PuReMD/src/cuda/cuda_charges.h    |   2 +-
 PG-PuReMD/src/cuda/cuda_forces.cu    |   6 +-
 PG-PuReMD/src/cuda/cuda_init_md.cu   |   4 +-
 PG-PuReMD/src/cuda/cuda_lin_alg.cu   | 222 ++-------------------------
 PG-PuReMD/src/cuda/cuda_reduction.cu |  37 +++--
 PG-PuReMD/src/forces.c               |  11 +-
 PG-PuReMD/src/lin_alg.c              |   6 +-
 PG-PuReMD/src/parallelreax.c         |   5 +-
 PG-PuReMD/src/reax_types.h           |   2 -
 environ/parallel_control             |   2 +-
 15 files changed, 279 insertions(+), 310 deletions(-)

diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index d16fd249..6c23c15d 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -42,7 +42,7 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu
       src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
       src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \
       src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \
-      src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
+      src/cuda/cuda_init_md.cu src/cuda/cuda_lookup.cu
 include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
       src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
       src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \
@@ -52,7 +52,12 @@ include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
       src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \
       src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \
       src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \
-      src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h
+      src/cuda/cuda_init_md.h src/cuda/cuda_lookup.h
+
+if DEBUG
+bin_pg_puremd_SOURCES += src/cuda/cuda_validation.cu
+include_HEADERS += src/cuda/cuda_validation.h
+endif
 
 # dummy source to cause C linking
 nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c
diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac
index 6e9f3fb5..5a5ac9e1 100644
--- a/PG-PuReMD/configure.ac
+++ b/PG-PuReMD/configure.ac
@@ -133,18 +133,20 @@ if test "x$BUILD_GPU" = "xyes"; then
 
         NVCCFLAGS=
 	NVCCFLAGS+=" -ccbin=$CXX"
-	if test "BUILD_DEBUG" = "true"
+	if test "x${BUILD_DEBUG}" = "xtrue"
 	then
 		NVCCFLAGS+=" -g -G"
+
 	fi
 	AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
 else
 	AM_CONDITIONAL(USE_CUDA, test "x" = "xyes")
 fi
+AM_CONDITIONAL(DEBUG, test "x${BUILD_DEBUG}" = "xtrue")
 
 if test "BUILD_PROF" = "true"
 then
-	if test "x$BUILD_GPU" = "xyes"; then
+	if test "x${BUILD_GPU}" = "xyes"; then
 		NVCCFLAGS+=" --compiler-options ${gprof_flags}"
 	else
 		CFLAGS+=" ${gprof_flags}"
diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 039e65d2..c2de4893 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -117,7 +117,6 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     sfree( workspace->Clp, "Clp" );
     sfree( workspace->vlpex, "vlpex" );
     sfree( workspace->bond_mark, "bond_mark" );
-    sfree( workspace->done_after, "done_after" );
 
     /* QEq storage */
     sfree( workspace->Hdia_inv, "Hdia_inv" );
@@ -245,7 +244,6 @@ void Allocate_Workspace( reax_system *system, control_params *control,
     workspace->Clp = (real*) smalloc( total_real, "Clp" );
     workspace->vlpex = (real*) smalloc( total_real, "vlpex" );
     workspace->bond_mark = (int*) scalloc(total_cap, sizeof(int), "bond_mark");
-    workspace->done_after = (int*) scalloc(total_cap, sizeof(int), "done_after");
 
     /* charge method storage */
     switch ( control->charge_method )
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 5df47e6f..d683bba5 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -358,7 +358,6 @@ void dev_alloc_workspace( reax_system *system, control_params *control,
      */
 
     /* bond order related storage  */
-    cuda_malloc( (void **) &workspace->within_bond_box, total_cap * sizeof (int), TRUE, "skin" );
     cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" );
     cuda_malloc( (void **) &workspace->Deltap, total_real, TRUE, "Deltap" );
     cuda_malloc( (void **) &workspace->Deltap_boc, total_real, TRUE, "Deltap_boc" );
@@ -375,39 +374,58 @@ void dev_alloc_workspace( reax_system *system, control_params *control,
     cuda_malloc( (void **) &workspace->Clp, total_real, TRUE, "Clp" );
     cuda_malloc( (void **) &workspace->vlpex, total_real, TRUE, "vlpex" );
     cuda_malloc( (void **) &workspace->bond_mark, total_real, TRUE, "bond_mark" );
-    cuda_malloc( (void **) &workspace->done_after, total_real, TRUE, "done_after" );
-
 
     /* charge matrix storage */
-    cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" );
+    if ( control->cm_solver_pre_comp_type == DIAG_PC )
+    {
+        cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" );
+    }
     cuda_malloc( (void **) &workspace->b_s, total_cap * sizeof(real), TRUE, "b_s" );
     cuda_malloc( (void **) &workspace->b_t, total_cap * sizeof(real), TRUE, "b_t" );
     cuda_malloc( (void **) &workspace->b_prc, total_cap * sizeof(real), TRUE, "b_prc" );
     cuda_malloc( (void **) &workspace->b_prm, total_cap * sizeof(real), TRUE, "b_prm" );
     cuda_malloc( (void **) &workspace->s, total_cap * sizeof(real), TRUE, "s" );
     cuda_malloc( (void **) &workspace->t, total_cap * sizeof(real), TRUE, "t" );
-    cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" );
+    }
     cuda_malloc( (void **) &workspace->b, total_cap * sizeof(rvec2), TRUE, "b" );
     cuda_malloc( (void **) &workspace->x, total_cap * sizeof(rvec2), TRUE, "x" );
 
-    /* GMRES storage */
-    cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" );
-    cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" );
-    cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" );
-    cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" );
-    cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" );
-    cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" );
-    cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" );
-
-    /* CG storage */
-    cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" );
-    cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" );
-    cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" );
-    cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" );
-    cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" );
-    cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" );
-    cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" );
-    cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" );
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+        cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" );
+        cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" );
+        cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" );
+        cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" );
+        cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" );
+        cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" );
+        cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" );
+        break;
+
+    case SDM_S:
+        break;
+
+    case CG_S:
+        cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" );
+        cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" );
+        cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" );
+        cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" );
+        cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" );
+        cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" );
+        cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" );
+        cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" );
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     /* integrator storage */
     cuda_malloc( (void **) &workspace->v_const, local_rvec, TRUE, "v_const" );
@@ -458,7 +476,6 @@ void dev_dealloc_workspace( control_params *control, storage *workspace )
      */
 
     /* bond order related storage  */
-    cuda_free( workspace->within_bond_box, "skin" );
     cuda_free( workspace->total_bond_order, "total_bo" );
     cuda_free( workspace->Deltap, "Deltap" );
     cuda_free( workspace->Deltap_boc, "Deltap_boc" );
@@ -475,38 +492,58 @@ void dev_dealloc_workspace( control_params *control, storage *workspace )
     cuda_free( workspace->Clp, "Clp" );
     cuda_free( workspace->vlpex, "vlpex" );
     cuda_free( workspace->bond_mark, "bond_mark" );
-    cuda_free( workspace->done_after, "done_after" );
 
     /* charge matrix storage */
-    cuda_free( workspace->Hdia_inv, "Hdia_inv" );
+    if ( control->cm_solver_pre_comp_type == DIAG_PC )
+    {
+        cuda_free( workspace->Hdia_inv, "Hdia_inv" );
+    }
     cuda_free( workspace->b_s, "b_s" );
     cuda_free( workspace->b_t, "b_t" );
     cuda_free( workspace->b_prc, "b_prc" );
     cuda_free( workspace->b_prm, "b_prm" );
     cuda_free( workspace->s, "s" );
     cuda_free( workspace->t, "t" );
-    cuda_free( workspace->droptol, "droptol" );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        cuda_free( workspace->droptol, "droptol" );
+    }
     cuda_free( workspace->b, "b" );
     cuda_free( workspace->x, "x" );
 
-    /* GMRES storage */
-    cuda_free( workspace->y, "y" );
-    cuda_free( workspace->z, "z" );
-    cuda_free( workspace->g, "g" );
-    cuda_free( workspace->h, "h" );
-    cuda_free( workspace->hs, "hs" );
-    cuda_free( workspace->hc, "hc" );
-    cuda_free( workspace->v, "v" );
-
-    /* CG storage */
-    cuda_free( workspace->r, "r" );
-    cuda_free( workspace->d, "d" );
-    cuda_free( workspace->q, "q" );
-    cuda_free( workspace->p, "p" );
-    cuda_free( workspace->r2, "r2" );
-    cuda_free( workspace->d2, "d2" );
-    cuda_free( workspace->q2, "q2" );
-    cuda_free( workspace->p2, "p2" );
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+        cuda_free( workspace->y, "y" );
+        cuda_free( workspace->z, "z" );
+        cuda_free( workspace->g, "g" );
+        cuda_free( workspace->h, "h" );
+        cuda_free( workspace->hs, "hs" );
+        cuda_free( workspace->hc, "hc" );
+        cuda_free( workspace->v, "v" );
+        break;
+
+    case SDM_S:
+        break;
+
+    case CG_S:
+        cuda_free( workspace->r, "r" );
+        cuda_free( workspace->d, "d" );
+        cuda_free( workspace->q, "q" );
+        cuda_free( workspace->p, "p" );
+        cuda_free( workspace->r2, "r2" );
+        cuda_free( workspace->d2, "d2" );
+        cuda_free( workspace->q2, "q2" );
+        cuda_free( workspace->p2, "p2" );
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     /* integrator storage */
     cuda_free( workspace->v_const, "v_const" );
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 8452548d..c22058e9 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -24,7 +24,9 @@
 #include "cuda_lin_alg.h"
 #include "cuda_reduction.h"
 #include "cuda_utils.h"
-#include "cuda_validation.h"
+#if defined(DEBUG)
+  #include "cuda_validation.h"
+#endif
 
 #include "../basic_comm.h"
 
@@ -259,32 +261,114 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data
     // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
     //}
 
-//#ifdef __CUDA_DEBUG__
-//  Init_MatVec( system, data, control, workspace, mpi_data );
-//#endif
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+        fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
+    case CG_S:
+        s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H,
+                dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data,
+                out_control->log, data );
+        t_matvecs = 0;
+        break;
+
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
-    //Print_Linear_System( system, control, workspace, data->step );
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
 #endif
+}
 
-    //MATRIX CHANGES
-    s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H,
-            dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data,
-            out_control->log, data );
-    t_matvecs = 0;
-    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
-#endif
+void Cuda_EE( reax_system *system, control_params *control, simulation_data
+        *data, storage *workspace, output_controls *out_control, mpi_datatypes
+        *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+        fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
+    case CG_S:
+        s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H,
+                dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data );
+        t_matvecs = 0;
+        break;
+
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     Cuda_Calculate_Charges( system, workspace, mpi_data );
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
-    //Print_Charges( system );
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
 #endif
+}
+
+
+void Cuda_ACKS2( reax_system *system, control_params *control, simulation_data
+        *data, storage *workspace, output_controls *out_control, mpi_datatypes
+        *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+        fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
+    case CG_S:
+        s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H,
+                dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data );
+        t_matvecs = 0;
+        break;
+
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
 
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
@@ -294,3 +378,29 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data
     }
 #endif
 }
+
+
+void Cuda_Compute_Charges( reax_system *system, control_params *control, simulation_data
+        *data, storage *workspace, output_controls *out_control, mpi_datatypes
+        *mpi_data )
+{
+    switch ( control->charge_method )
+    {
+    case QEQ_CM:
+        Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+        break;
+
+    case EE_CM:
+        Cuda_EE( system, control, data, workspace, out_control, mpi_data );
+        break;
+
+    case ACKS2_CM:
+        Cuda_ACKS2( system, control, data, workspace, out_control, mpi_data );
+        break;
+
+    default:
+        fprintf( stderr, "Invalid charge method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+}
diff --git a/PG-PuReMD/src/cuda/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h
index d1922a48..dc8c16b6 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.h
+++ b/PG-PuReMD/src/cuda/cuda_charges.h
@@ -37,7 +37,7 @@ void cuda_charges_st( reax_system *, storage *, real *, real );
 
 void cuda_charges_updateq( reax_system *, real * );
 
-void Cuda_QEq( reax_system*, control_params*, simulation_data*,
+void Cuda_Compute_Charges( reax_system*, control_params*, simulation_data*,
         storage*, output_controls*, mpi_datatypes* );
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 28982bff..bc86bf41 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -15,7 +15,9 @@
 #include "cuda_torsion_angles.h"
 #include "cuda_utils.h"
 #include "cuda_valence_angles.h"
-#include "cuda_validation.h"
+#if defined(DEBUG)
+  #include "cuda_validation.h"
+#endif
 
 #include "../basic_comm.h"
 #include "../forces.h"
@@ -1849,7 +1851,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
 #if defined(PURE_REAX)
         if ( charge_flag == TRUE )
         {
-            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+            Cuda_Compute_Charges( system, control, data, workspace, out_control, mpi_data );
         }
 
 #if defined(LOG_PERFORMANCE)
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index b115f287..81b9b3e1 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -10,7 +10,9 @@
 #include "cuda_reset_tools.h"
 #include "cuda_system_props.h"
 #include "cuda_utils.h"
-#include "cuda_validation.h"
+#if defined(DEBUG)
+  #include "cuda_validation.h"
+#endif
 
 #if defined(PURE_REAX)
   #include "../box.h"
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
index bb35d181..1b51216a 100644
--- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -336,7 +336,7 @@ void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
 }
 
 
-CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
+CUDA_GLOBAL void k_diagonal_preconditioner( storage p_workspace, rvec2 *b, int n )
 {
     storage *workspace;
     int j;
@@ -350,15 +350,13 @@ CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
 
     workspace = &( p_workspace );
 
-    //for( j = 0; j < system->n; ++j ) {
-    // residual 
+    /* compute residuals */
     workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
     workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
 
-    // apply diagonal pre-conditioner
+    /* apply diagonal preconditioner to residuals */
     workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
     workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
-    //}
 }
 
 
@@ -370,7 +368,7 @@ void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, b, n);
+        ( *workspace, b, n );
 
     cudaThreadSynchronize( );
     cudaCheckError( );
@@ -396,7 +394,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x,
     alpha[1] = alpha_1;
     my_dot[j][0] = my_dot[j][1] = 0.0;
 
-    //for( j = 0; j < system->n; ++j ) {
     // update x 
     x[j][0] += alpha[0] * workspace->d2[j][0];
     x[j][1] += alpha[1] * workspace->d2[j][1];      
@@ -412,7 +409,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x,
     // dot product: r.p 
     my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
     my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
-    //}
 }
 
 
@@ -433,7 +429,7 @@ void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    //Reduction to calculate my_dot
+    /* reduction to calculate my_dot */
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
         ( tmp, tmp + n, n);
 
@@ -515,9 +511,11 @@ void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
 
 CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
 
-    if (i >= n)
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
     {
         return;
     }
@@ -542,9 +540,11 @@ void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
 
 CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (i >= n)
+    if ( i >= n )
     {
         return;
     }
@@ -651,46 +651,14 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa
     }
 #endif
 
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
-//#endif
-
-//  check_zeros_device( x, system->N, "x" );
-
     copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
     Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
     copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
 
-//  check_zeros_device( x, system->N, "x" );
-
-//  compare_rvec2 (workspace->x, x, N, "x");
-//  if (data->step > 0) {
-//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
-//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
-//
-//      exit (0);
-//  }
-
-
-//#ifdef __CUDA_DEBUG__
-//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
-//#endif
     //originally we were using only H->n which was system->n (init_md.c)
     //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
-    
     Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap );
 
-//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-//  if (data->step > 0) exit (0);
-
-    // tryQEq
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-    
     copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
             cudaMemcpyDeviceToHost, "CG:q2:get" );
     Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
@@ -704,63 +672,23 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa
     }
 #endif
 
-//#ifdef __CUDA_DEBUG__
-//  for( j = 0; j < system->n; ++j ) {
-//    // residual
-//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
-//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
-//    // apply diagonal pre-conditioner
-//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//  }
-//#endif
-    
     Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
 
-//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
-//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
-
-    /* norm of b */
-//#ifdef __CUDA_DEBUG__
-//  my_sum[0] = my_sum[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_sum[0] += SQR( workspace->b[j][0] );
-//    my_sum[1] += SQR( workspace->b[j][1] );
-//  }
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-//#endif
-
     my_sum[0] = 0;
     my_sum[1] = 0;
     Cuda_Norm( b, n, my_sum );
 
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-
     MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
     b_norm[0] = SQRT( norm_sqr[0] );
     b_norm[1] = SQRT( norm_sqr[1] );
-    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
 
     /* dot product: r.d */
-//#ifdef __CUDA_DEBUG__
-//  my_dot[0] = my_dot[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-//  }
-//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
     my_dot[0] = 0;
     my_dot[1] = 0;
     Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n );
-
-// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
     
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
 
-    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
@@ -770,53 +698,21 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa
 
     for ( i = 1; i < control->cm_solver_max_iters; ++i )
     {
-        //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
-//#endif
-        
         copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cg:d2:get" );
         Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
         copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyHostToDevice, "cg:d2:put" );
 
-        //print_device_rvec2 (dev_workspace->d2, N);
-
-//#ifdef __CUDA_DEBUG__
-//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
-//#endif
-        
         Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
                 system->total_cap );
 
-        /*
-        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
-        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
-        print_sparse_matrix_host (&workspace->H);
-        fprintf (stderr, " ******* HOST Vector ***************\n");
-        print_host_rvec2 (workspace->d2, system->N);
-        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
-        print_sparse_matrix (&dev_workspace->H);
-        fprintf (stderr, " ******* Device Vector ***************\n");
-        print_device_rvec2 (dev_workspace->d2, system->N);
-        */
-        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-        // tryQEq
-        // MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-
         copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cg:q2:get" );
         Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
         copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyHostToDevice, "cg:q2:put" );
 
-//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
         {
@@ -825,61 +721,23 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa
 #endif
 
         /* dot product: d.q */
-//#ifdef __CUDA_DEBUG__
-//    my_dot[0] = my_dot[1] = 0;
-//    for( j = 0; j < n; ++j ) {
-//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
-//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
+        my_dot[0] = 0;
+        my_dot[1] = 0;
         Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
 
         MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
 
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
         my_dot[0] = 0;
         my_dot[1] = 0;
 
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // update x
-//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
-//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
-//      // update residual
-//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
-//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-//      // apply diagonal pre-conditioner
-//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//      // dot product: r.p
-//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
-//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = 0;
-        my_dot[1] = 0;
         Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
 
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
-//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
-//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
-
         sig_old[0] = sig_new[0];
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
 
-        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
         {
@@ -895,60 +753,26 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
 
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // d = p + beta * d
-//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
-//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
-//    }
-//#endif
-
         Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
                 dev_workspace->d2, system->n );
-
-//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
     }
 
-
     if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
     {
-        //for( j = 0; j < n; ++j )
-        //  workspace->t[j] = workspace->x[j][1];
-        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
-
         Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
 
-        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
-
         matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
                 mpi_data );
 
-        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
-        //for( j = 0; j < n; ++j )
-        //  workspace->x[j][1] = workspace->t[j];
-
         Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
     }
     else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
     {
-        //for( j = 0; j < n; ++j )
-        //  workspace->s[j] = workspace->x[j][0];
-
         Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
 
-        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
-        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-
-        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
-
         matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
                 mpi_data );
 
-        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->x[j][0] = workspace->s[j];
-
         Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
     }
 
@@ -983,25 +807,19 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
 
     scale = sizeof(real) / sizeof(void);
 
-    /* x is on the device */
-    //MVAPICH2
     memset( spad, 0, sizeof(real) * system->total_cap );
     copy_host_device( spad, x, sizeof(real) * system->total_cap,
             cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
     Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
 
-    //MVAPICH2
     copy_host_device( spad, x, sizeof(real) * system->total_cap,
             cudaMemcpyHostToDevice, "cuda_cg:x:put" );
     Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
 
-    // tryQEq
-    // MVAPICH2
     copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
             cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
     Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
 
-    //MVAPICH2
     copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
             cudaMemcpyHostToDevice, "cuda_cg:q:put" );
 
@@ -1014,8 +832,7 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
 
     Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
             system->n );
-    //for( j = 0; j < system->n; ++j )
-    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+
     Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
             dev_workspace->Hdia_inv, system->n );
 
@@ -1041,7 +858,6 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
 
     for ( i = 1; i < control->cm_solver_max_iters && SQRT(sig_new) / b_norm > tol; ++i )
     {
-        //MVAPICH2
         copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
         Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
@@ -1050,7 +866,6 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
 
         Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
 
-        //tryQEq
         copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
         Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
@@ -1078,9 +893,7 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
         //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
         Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
                 dev_workspace->r, system->n );
-        /* pre-conditioning */
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
         Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
                 dev_workspace->Hdia_inv, system->n );
 
@@ -1092,7 +905,6 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
         copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
                 cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
         sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
-        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
 
         beta = sig_new / sig_old;
         Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
index 01bd3c81..49224a06 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -662,8 +662,8 @@ CUDA_GLOBAL void k_norm_rvec2( const rvec2 *input, rvec2 *per_block_results,
 }
 
 
-CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res,
-        const size_t n)
+CUDA_GLOBAL void k_dot_rvec2( const rvec2 *a, rvec2 *b, rvec2 *res,
+        const size_t n )
 {
 #if defined(__SM_35__)
     extern __shared__ rvec2 my_dot2[];
@@ -746,11 +746,13 @@ CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res,
 //vector functions
 //////////////////////////////////////////////////
 CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y,
-        int k )
+        int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if( i >= k )
+    if ( i >= n )
     {
         return;
     }
@@ -759,11 +761,13 @@ CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y,
 }
 
 
-CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k )
+CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if( i >= k )
+    if ( i >= n )
     {
         return;
     }
@@ -772,11 +776,13 @@ CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k )
 }
 
 
-CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
+CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
 
-    if( i >= k )
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
     {
         return;
     }
@@ -787,12 +793,13 @@ CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
 
 
 CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *dest, rvec2 *a, 
-        real beta0, real beta1, 
-        rvec2 *b, int n )
+        real beta0, real beta1, rvec2 *b, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if( i >= n )
+    if ( i >= n )
     {
         return;
     }
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index d3e295cf..91fbfd17 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -800,7 +800,6 @@ int Init_Forces( reax_system *system, control_params *control,
     {
         /* put ghost atoms to an infinite distance */
         workspace->bond_mark[i] = 1000;
-        //workspace->done_after[i] = Start_Index( i, far_nbrs );
     }
 
     H = &(workspace->H); //MATRIX CHANGES
@@ -969,8 +968,6 @@ int Init_Forces( reax_system *system, control_params *control,
                     else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                     {
                         workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                        //if( workspace->bond_mark[i] == 1000 )
-                        //  workspace->done_after[i] = pj;
                     }
 
                     //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
@@ -1005,7 +1002,6 @@ int Init_Forces( reax_system *system, control_params *control,
 
     /*for( i = system->n; i < system->N; ++i ) {
       start_i = Start_Index(i, far_nbrs);
-      end_i = workspace->done_after[i];
 
       if( workspace->bond_mark[i] >= 2 && start_i < end_i ) {
         atom_i = &(system->my_atoms[i]);
@@ -1088,7 +1084,6 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control,
     for ( i = system->n; i < system->N; ++i )
     {
         workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
-        //workspace->done_after[i] = Start_Index( i, far_nbrs );
     }
 
     num_bonds = 0;
@@ -1214,15 +1209,13 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control,
                     ++btop_i;
 
                     if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+                    {
                         workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+                    }
                     else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                     {
                         workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                        //if( workspace->bond_mark[i] == 1000 )
-                        //  workspace->done_after[i] = pj;
                     }
-                    //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
-                    //   i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
                 }
             }
         }
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 15d8ad96..5de64a8a 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -28,7 +28,7 @@
 #include "tool_box.h"
 #include "vector.h"
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) && defined(DEBUG)
   #include "cuda/cuda_validation.h"
 #endif
 
@@ -98,13 +98,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     }
 #endif
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) && defined(DEBUG)
     check_zeros_host( x, N, "x" );
 #endif
 
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) && defined(DEBUG)
     check_zeros_host( x, N, "x" );
 #endif
 
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 9ace1c32..da907412 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -49,9 +49,12 @@
   #include "cuda/cuda_reset_tools.h"
   #include "cuda/cuda_system_props.h"
   #include "cuda/cuda_utils.h"
-  #include "cuda/cuda_validation.h"
+  #if defined(DEBUG)
+    #include "cuda/cuda_validation.h"
+  #endif
 #endif
 
+
 evolve_function Evolve;
 evolve_function Cuda_Evolve;
 LR_lookup_table *LR;
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index 192cecbe..80a4dafc 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -1919,8 +1919,6 @@ typedef struct
     rvec *dDeltap_self;
     /**/
     int *bond_mark;
-    /**/
-    int *done_after;
 
     /* charge matrix storage */
     /* charge matrix */
diff --git a/environ/parallel_control b/environ/parallel_control
index cfe9fa5a..4f6d75b2 100644
--- a/environ/parallel_control
+++ b/environ/parallel_control
@@ -21,7 +21,7 @@ hbond_cutoff             7.50                   ! cutoff distance for hydrogen b
 charge_method                     0             ! charge method: 0 = QEq, 1 = EEM, 2 = ACKS2
 charge_freq                       1             ! frequency (sim step) at which atomic charges are computed
 cm_q_net                          0.0           ! net system charge
-cm_solver_type                    0             ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM
+cm_solver_type                    2             ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM
 cm_solver_max_iters               1000          ! max solver iterations
 cm_solver_restart                 100           ! inner iterations of GMRES before restarting
 cm_solver_q_err                   1e-6          ! relative residual norm threshold used in solver
-- 
GitLab