diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index d16fd2499993a8a324e414f265d14da31057eea0..6c23c15d9ae6af2cb62f9621acea4c6f23b914f8 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -42,7 +42,7 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu
       src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
       src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \
       src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \
-      src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
+      src/cuda/cuda_init_md.cu src/cuda/cuda_lookup.cu
 include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
       src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
       src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \
@@ -52,7 +52,12 @@ include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
       src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \
       src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \
       src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \
-      src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h
+      src/cuda/cuda_init_md.h src/cuda/cuda_lookup.h
+
+if DEBUG
+bin_pg_puremd_SOURCES += src/cuda/cuda_validation.cu
+include_HEADERS += src/cuda/cuda_validation.h
+endif
 
 # dummy source to cause C linking
 nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c
diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac
index 6e9f3fb5b91f9727f5762fc23bf9d2d400a10ee4..5a5ac9e11e3d9d2849bf9e39b83f99de7bd7db0d 100644
--- a/PG-PuReMD/configure.ac
+++ b/PG-PuReMD/configure.ac
@@ -133,18 +133,20 @@ if test "x$BUILD_GPU" = "xyes"; then
 
         NVCCFLAGS=
 	NVCCFLAGS+=" -ccbin=$CXX"
-	if test "BUILD_DEBUG" = "true"
+	if test "x${BUILD_DEBUG}" = "xtrue"
 	then
 		NVCCFLAGS+=" -g -G"
+
 	fi
 	AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
 else
 	AM_CONDITIONAL(USE_CUDA, test "x" = "xyes")
 fi
+AM_CONDITIONAL(DEBUG, test "x${BUILD_DEBUG}" = "xtrue")
 
 if test "BUILD_PROF" = "true"
 then
-	if test "x$BUILD_GPU" = "xyes"; then
+	if test "x${BUILD_GPU}" = "xyes"; then
 		NVCCFLAGS+=" --compiler-options ${gprof_flags}"
 	else
 		CFLAGS+=" ${gprof_flags}"
diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 54614694e37590dce079f9edb6c47a150c7baebe..c2de48939bf36905db729e0d3abc36c6460eb7d9 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -89,7 +89,7 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
         return;
     }
 
-    workspace->allocated = 0;
+    workspace->allocated = FALSE;
 
     /* communication storage */
     for ( i = 0; i < MAX_NBRS; ++i )
@@ -117,7 +117,6 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     sfree( workspace->Clp, "Clp" );
     sfree( workspace->vlpex, "vlpex" );
     sfree( workspace->bond_mark, "bond_mark" );
-    sfree( workspace->done_after, "done_after" );
 
     /* QEq storage */
     sfree( workspace->Hdia_inv, "Hdia_inv" );
@@ -213,7 +212,7 @@ void Allocate_Workspace( reax_system *system, control_params *control,
 {
     int i, total_real, total_rvec, local_rvec;
 
-    workspace->allocated = 1;
+    workspace->allocated = TRUE;
     total_real = total_cap * sizeof(real);
     total_rvec = total_cap * sizeof(rvec);
     local_rvec = local_cap * sizeof(rvec);
@@ -245,9 +244,28 @@
     workspace->Clp = (real*) smalloc( total_real, "Clp" );
     workspace->vlpex = (real*) smalloc( total_real, "vlpex" );
     workspace->bond_mark = (int*) scalloc(total_cap, sizeof(int), "bond_mark");
-    workspace->done_after = (int*) scalloc(total_cap, sizeof(int), "done_after");
 
-    /* QEq storage */
+    /* charge method storage */
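+    /* dimension (N_cm) of the charge method linear system: one row per atom
+     * for QEq, an extra net charge constraint row for EE, and 2 * N + 2 rows
+     * for ACKS2 (atomic charges plus its extended variables and constraints) */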
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            system->N_cm = system->N;
+            break;
+        case EE_CM:
+            system->N_cm = system->N + 1;
+            break;
+        case ACKS2_CM:
+            system->N_cm = 2 * system->N + 2;
+            break;
+        default:
+            fprintf( stderr, "[ERROR] Unknown charge method type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
+
     workspace->Hdia_inv = (real*) scalloc( total_cap, sizeof(real), "Hdia_inv" );
     workspace->b_s = (real*) scalloc( total_cap, sizeof(real), "b_s" );
     workspace->b_t = (real*) scalloc( total_cap, sizeof(real), "b_t" );
@@ -255,39 +270,56 @@ void Allocate_Workspace( reax_system *system, control_params *control,
     workspace->b_prm = (real*) scalloc( total_cap, sizeof(real), "b_prm" );
     workspace->s = (real*) scalloc( total_cap, sizeof(real), "s" );
     workspace->t = (real*) scalloc( total_cap, sizeof(real), "t" );
-    workspace->droptol = (real*) scalloc( total_cap, sizeof(real), "droptol" );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        workspace->droptol = (real*) scalloc( total_cap, sizeof(real), "droptol" );
+    }
     workspace->b = (rvec2*) scalloc( total_cap, sizeof(rvec2), "b" );
     workspace->x = (rvec2*) scalloc( total_cap, sizeof(rvec2), "x" );
 
-    /* GMRES storage */
-    workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y" );
-    workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z" );
-    workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g" );
-    //SUHDIR
-    //workspace->h = (real**) scalloc( RESTART+1, sizeof(real*), "h" );
-    workspace->h = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "h");
-    workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs" );
-    workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc" );
-    //SUDHIR
-    //workspace->v = (real**) scalloc( RESTART+1, sizeof(real*), "v" );
-    workspace->v = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "v");
-
-    /*
-    for( i = 0; i < RESTART+1; ++i ) {
-      workspace->h[i] = (real*) scalloc( RESTART+1, sizeof(real), "h[i]" );
-      workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]" );
+    switch ( control->cm_solver_type )
+    {
+        /* GMRES storage */
+        case GMRES_S:
+        case GMRES_H_S:
+            workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y" );
+            workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z" );
+            workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g" );
+            workspace->h = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "h");
+            workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs" );
+            workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc" );
+            workspace->v = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "v");
+            break;
+
+        /* CG storage */
+        case CG_S:
+            workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" );
+            workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" );
+            workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" );
+            workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" );
+            workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" );
+            workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" );
+            workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" );
+            workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" );
+            break;
+
+        case SDM_S:
+            workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" );
+            workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" );
+            workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" );
+            workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" );
+            workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" );
+            workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" );
+            workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" );
+            workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" );
+            break;
+
+        default:
+            fprintf( stderr, "[ERROR] Unknown charge method linear solver type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
-    */
-
-    /* CG storage */
-    workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" );
-    workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" );
-    workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" );
-    workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" );
-    workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" );
-    workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" );
-    workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" );
-    workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" );
 
     /* integrator storage */
     workspace->v_const = (rvec*) smalloc( local_rvec, "v_const" );
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index 8f53b65d4b49a95fc68f4acf7698802e650137df..791804f6485ce0837a19a64dacc3ad395b812568 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -258,12 +258,12 @@ void ICHOLT( sparse_matrix *A, real *droptol,
 
 
 void Init_MatVec( reax_system *system, simulation_data *data,
-        control_params *control,  storage *workspace, mpi_datatypes *mpi_data )
+        control_params *control, storage *workspace, mpi_datatypes *mpi_data )
 {
     int i; //, fillin;
     reax_atom *atom;
 
-//    if( (data->step - data->prev_steps) % control->refactor == 0 ||
+//    if( (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0 ||
 //            workspace->L == NULL )
 //    {
 ////        Print_Linear_System( system, control, workspace, data->step );
@@ -308,12 +308,8 @@ void Init_MatVec( reax_system *system, simulation_data *data,
     {
         atom = &( system->my_atoms[i] );
 
-        /* init pre-conditioner for H and init solution vectors */
+        /* initialize diagonal inverse preconditioner vectors */
         workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
-        workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b_t[i] = -1.0;
-        workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
-        workspace->b[i][1] = -1.0;
 
         /* linear extrapolation for s and for t */
         // newQEq: no extrapolation!
@@ -334,6 +330,74 @@
 
 //        fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
     }
+
+    /* initialize solution vectors for linear solves in charge method */
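+    /* the right-hand side depends on the charge method: QEq uses -chi per atom
+     * (and -1.0 for its second system), EE and ACKS2 add a net charge
+     * constraint row, and ACKS2 zeroes its remaining extended rows */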
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+                workspace->b_t[i] = -1.0;
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+                workspace->b[i][1] = -1.0;
+            }
+            break;
+
+        case EE_CM:
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+            }
+
+            if ( system->my_rank == 0 )
+            {
+                workspace->b_s[system->n] = control->cm_q_net;
+                workspace->b[system->n][0] = control->cm_q_net;
+            }
+            break;
+
+        case ACKS2_CM:
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+            }
+
+            if ( system->my_rank == 0 )
+            {
+                workspace->b_s[system->n] = control->cm_q_net;
+                workspace->b[system->n][0] = control->cm_q_net;
+            }
+
+            for ( i = system->n + 1; i < system->N_cm; ++i )
+            {
+                workspace->b_s[i] = 0.0;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = 0.0;
+            }
+            break;
+
+        default:
+            fprintf( stderr, "[ERROR] Unknown charge method type. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+    }
 }
 
 
@@ -359,7 +428,7 @@ void Calculate_Charges( reax_system *system, storage *workspace,
     }
 
 #if defined(DEBUG)
-    fprintf (stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]);
+    fprintf( stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1] );
 #endif
 
     MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
@@ -367,7 +436,7 @@ void Calculate_Charges( reax_system *system, storage *workspace,
     u = all_sum[0] / all_sum[1];
 
 #if defined(DEBUG)
-    fprintf (stderr, "Host : u: %f \n", u);
+    fprintf( stderr, "Host : u: %f \n", u );
 #endif
 
     for ( i = 0; i < system->n; ++i )
@@ -421,14 +490,14 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 
     //MATRIX CHANGES
     s_matvecs = dual_CG( system, workspace, &workspace->H, workspace->b,
-            control->q_err, workspace->x, mpi_data, out_control->log, data );
+            control->cm_solver_q_err, workspace->x, mpi_data, out_control->log, data );
     t_matvecs = 0;
     //fprintf (stderr, "Host: First CG complated with iterations: %d \n", s_matvecs);
 
     //s_matvecs = CG(system, workspace, workspace->H, workspace->b_s, //newQEq sCG
-    // control->q_err, workspace->s, mpi_data, out_control->log );
+    // control->cm_solver_q_err, workspace->s, mpi_data, out_control->log );
     //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s,
-    //   control->q_err, workspace->L, workspace->U, workspace->s,
+    //   control->cm_solver_q_err, workspace->L, workspace->U, workspace->s,
     //   mpi_data, out_control->log );
 
 #if defined(DEBUG)
@@ -436,9 +505,9 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 #endif
 
     //t_matvecs = CG(system, workspace, workspace->H, workspace->b_t, //newQEq sCG
-    // control->q_err, workspace->t, mpi_data, out_control->log );
+    // control->cm_solver_q_err, workspace->t, mpi_data, out_control->log );
     //t_matvecs = PCG( system, workspace, workspace->H, workspace->b_t,
-    //   control->q_err, workspace->L, workspace->U, workspace->t,
+    //   control->cm_solver_q_err, workspace->L, workspace->U, workspace->t,
     //   mpi_data, out_control->log );
 
 #if defined(DEBUG)
diff --git a/PG-PuReMD/src/control.c b/PG-PuReMD/src/control.c
index 268b6a1750874a63f3fbbc3a7e339582f5b231f4..5cb8ce6f1099fde86343d81d8c7b147d41bc4864 100644
--- a/PG-PuReMD/src/control.c
+++ b/PG-PuReMD/src/control.c
@@ -47,7 +47,7 @@ char Read_Control_File( char *control_file, control_params* control,
     }
 
     /* assign default values */
-    strcpy( control->sim_name, "simulate" );
+    strcpy( control->sim_name, "default.sim" );
     control->ensemble        = NVE;
     control->nsteps          = 0;
     control->dt              = 0.25;
@@ -77,10 +77,21 @@ char Read_Control_File( char *control_file, control_params* control,
 
     control->tabulate = 0;
 
+    control->charge_method = QEQ_CM;
     control->charge_freq = 1;
-    control->q_err = 1e-6;
-    control->refactor = 100;
-    control->droptol = 1e-2;;
+    control->cm_q_net = 0.0;
+    control->cm_solver_type = GMRES_S;
+    control->cm_solver_max_iters = 100;
+    control->cm_solver_restart = 50;
+    control->cm_solver_q_err = 0.000001;
+    control->cm_domain_sparsify_enabled = FALSE;
+    control->cm_domain_sparsity = 1.0;
+    control->cm_solver_pre_comp_type = DIAG_PC;
+    control->cm_solver_pre_comp_sweeps = 3;
+    control->cm_solver_pre_comp_refactor = 100;
+    control->cm_solver_pre_comp_droptol = 0.01;
+    control->cm_solver_pre_app_type = TRI_SOLVE_PA;
+    control->cm_solver_pre_app_jacobi_iters = 50;
 
     control->T_init = 0.;
     control->T_final = 300.;
@@ -247,25 +258,79 @@ char Read_Control_File( char *control_file, control_params* control,
             ival = atoi( tmp[1] );
             control->tabulate = ival;
         }
+        else if ( strcmp(tmp[0], "charge_method") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->charge_method = ival;
+        }
         else if ( strcmp(tmp[0], "charge_freq") == 0 )
         {
             ival = atoi( tmp[1] );
             control->charge_freq = ival;
         }
-        else if ( strcmp(tmp[0], "q_err") == 0 )
+        else if ( strcmp(tmp[0], "cm_q_net") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->cm_q_net = val;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_type = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_max_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_max_iters = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_restart") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_restart = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_q_err") == 0 )
+        {
+            val = atof( tmp[1] );
+            control->cm_solver_q_err = val;
+        }
+        else if ( strcmp(tmp[0], "cm_domain_sparsity") == 0 )
         {
             val = atof( tmp[1] );
-            control->q_err = val;
+            control->cm_domain_sparsity = val;
+            if ( val < 1.0 )
+            {
+                control->cm_domain_sparsify_enabled = TRUE;
+            }
         }
-        else if ( strcmp(tmp[0], "ilu_refactor") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_type") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->refactor = ival;
+            control->cm_solver_pre_comp_type = ival;
         }
-        else if ( strcmp(tmp[0], "ilu_droptol") == 0 )
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_refactor") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_comp_refactor = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_droptol") == 0 )
         {
             val = atof( tmp[1] );
-            control->droptol = val;
+            control->cm_solver_pre_comp_droptol = val;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_sweeps") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_comp_sweeps = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_app_type = ival;
+        }
+        else if ( strcmp(tmp[0], "cm_solver_pre_app_jacobi_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_pre_app_jacobi_iters = ival;
         }
         else if ( strcmp(tmp[0], "temp_init") == 0 )
         {
diff --git a/PG-PuReMD/src/cub b/PG-PuReMD/src/cub
index 01347a797c620618d09e7d2d90bce4be4c42513e..d622848f9fb62f13e5e064e1deb43b6bcbb12bad 160000
--- a/PG-PuReMD/src/cub
+++ b/PG-PuReMD/src/cub
@@ -1 +1 @@
-Subproject commit 01347a797c620618d09e7d2d90bce4be4c42513e
+Subproject commit d622848f9fb62f13e5e064e1deb43b6bcbb12bad
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 5df47e6f4fd55744b44b190d011f2e8a5b04f617..d683bba502aff5eae65bbd0f7e064768b62725d2 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -358,7 +358,6 @@ void dev_alloc_workspace( reax_system *system, control_params *control,
      */
 
     /* bond order related storage  */
-    cuda_malloc( (void **) &workspace->within_bond_box, total_cap * sizeof (int), TRUE, "skin" );
     cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" );
     cuda_malloc( (void **) &workspace->Deltap, total_real, TRUE, "Deltap" );
     cuda_malloc( (void **) &workspace->Deltap_boc, total_real, TRUE, "Deltap_boc" );
@@ -375,39 +374,58 @@ void dev_alloc_workspace( reax_system *system, control_params *control,
     cuda_malloc( (void **) &workspace->Clp, total_real, TRUE, "Clp" );
     cuda_malloc( (void **) &workspace->vlpex, total_real, TRUE, "vlpex" );
     cuda_malloc( (void **) &workspace->bond_mark, total_real, TRUE, "bond_mark" );
-    cuda_malloc( (void **) &workspace->done_after, total_real, TRUE, "done_after" );
-
 
     /* charge matrix storage */
-    cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" );
+    if ( control->cm_solver_pre_comp_type == DIAG_PC )
+    {
+        cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" );
+    }
     cuda_malloc( (void **) &workspace->b_s, total_cap * sizeof(real), TRUE, "b_s" );
     cuda_malloc( (void **) &workspace->b_t, total_cap * sizeof(real), TRUE, "b_t" );
     cuda_malloc( (void **) &workspace->b_prc, total_cap * sizeof(real), TRUE, "b_prc" );
     cuda_malloc( (void **) &workspace->b_prm, total_cap * sizeof(real), TRUE, "b_prm" );
     cuda_malloc( (void **) &workspace->s, total_cap * sizeof(real), TRUE, "s" );
     cuda_malloc( (void **) &workspace->t, total_cap * sizeof(real), TRUE, "t" );
-    cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" );
+    }
     cuda_malloc( (void **) &workspace->b, total_cap * sizeof(rvec2), TRUE, "b" );
     cuda_malloc( (void **) &workspace->x, total_cap * sizeof(rvec2), TRUE, "x" );
 
-    /* GMRES storage */
-    cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" );
-    cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" );
-    cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" );
-    cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" );
-    cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" );
-    cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" );
-    cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" );
-
-    /* CG storage */
-    cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" );
-    cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" );
-    cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" );
-    cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" );
-    cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" );
-    cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" );
-    cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" );
-    cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" );
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+        cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" );
+        cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" );
+        cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" );
+        cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" );
+        cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" );
+        cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" );
+        cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" );
+        break;
+
+    case SDM_S:
+        break;
+
+    case CG_S:
+        cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" );
+        cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" );
+        cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" );
+        cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" );
+        cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" );
+        cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" );
+        cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" );
+        cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" );
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized charge method solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     /* integrator storage */
     cuda_malloc( (void **) &workspace->v_const, local_rvec, TRUE, "v_const" );
@@ -458,7 +476,6 @@ void dev_dealloc_workspace( control_params *control, storage *workspace )
      */
 
     /* bond order related storage  */
-    cuda_free( workspace->within_bond_box, "skin" );
     cuda_free( workspace->total_bond_order, "total_bo" );
     cuda_free( workspace->Deltap, "Deltap" );
     cuda_free( workspace->Deltap_boc, "Deltap_boc" );
@@ -475,38 +492,58 @@ void dev_dealloc_workspace( control_params *control, storage *workspace )
     cuda_free( workspace->Clp, "Clp" );
     cuda_free( workspace->vlpex, "vlpex" );
     cuda_free( workspace->bond_mark, "bond_mark" );
-    cuda_free( workspace->done_after, "done_after" );
 
     /* charge matrix storage */
-    cuda_free( workspace->Hdia_inv, "Hdia_inv" );
+    if ( control->cm_solver_pre_comp_type == DIAG_PC )
+    {
+        cuda_free( workspace->Hdia_inv, "Hdia_inv" );
+    }
     cuda_free( workspace->b_s, "b_s" );
     cuda_free( workspace->b_t, "b_t" );
     cuda_free( workspace->b_prc, "b_prc" );
     cuda_free( workspace->b_prm, "b_prm" );
     cuda_free( workspace->s, "s" );
     cuda_free( workspace->t, "t" );
-    cuda_free( workspace->droptol, "droptol" );
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        cuda_free( workspace->droptol, "droptol" );
+    }
     cuda_free( workspace->b, "b" );
     cuda_free( workspace->x, "x" );
 
-    /* GMRES storage */
-    cuda_free( workspace->y, "y" );
-    cuda_free( workspace->z, "z" );
-    cuda_free( workspace->g, "g" );
-    cuda_free( workspace->h, "h" );
-    cuda_free( workspace->hs, "hs" );
-    cuda_free( workspace->hc, "hc" );
-    cuda_free( workspace->v, "v" );
-
-    /* CG storage */
-    cuda_free( workspace->r, "r" );
-    cuda_free( workspace->d, "d" );
-    cuda_free( workspace->q, "q" );
-    cuda_free( workspace->p, "p" );
-    cuda_free( workspace->r2, "r2" );
-    cuda_free( workspace->d2, "d2" );
-    cuda_free( workspace->q2, "q2" );
-    cuda_free( workspace->p2, "p2" );
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+        cuda_free( workspace->y, "y" );
+        cuda_free( workspace->z, "z" );
+        cuda_free( workspace->g, "g" );
+        cuda_free( workspace->h, "h" );
+        cuda_free( workspace->hs, "hs" );
+        cuda_free( workspace->hc, "hc" );
+        cuda_free( workspace->v, "v" );
+        break;
+
+    case SDM_S:
+        break;
+
+    case CG_S:
+        cuda_free( workspace->r, "r" );
+        cuda_free( workspace->d, "d" );
+        cuda_free( workspace->q, "q" );
+        cuda_free( workspace->p, "p" );
+        cuda_free( workspace->r2, "r2" );
+        cuda_free( workspace->d2, "d2" );
+        cuda_free( workspace->q2, "q2" );
+        cuda_free( workspace->p2, "p2" );
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized charge method solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     /* integrator storage */
     cuda_free( workspace->v_const, "v_const" );
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 9ea976537c2527ae52616edab5e5b367615f81e1..c22058e9e81af68be9e7da236ca27c8e62eae10c 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -24,7 +24,9 @@
 #include "cuda_lin_alg.h"
 #include "cuda_reduction.h"
 #include "cuda_utils.h"
-#include "cuda_validation.h"
+#if defined(DEBUG)
+  #include "cuda_validation.h"
+#endif
 
 #include "../basic_comm.h"
 
@@ -259,32 +261,114 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data
     // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
     //}
 
-//#ifdef __CUDA_DEBUG__
-//  Init_MatVec( system, data, control, workspace, mpi_data );
-//#endif
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+        fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
+    case CG_S:
+        s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H,
+                dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data,
+                out_control->log, data );
+        t_matvecs = 0;
+        break;
+
+
+    default:
+        fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
-    //Print_Linear_System( system, control, workspace, data->step );
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
 #endif
+}
 
-    //MATRIX CHANGES
-    s_matvecs = Cuda_dual_CG( system, workspace, &dev_workspace->H,
-            dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
-            out_control->log, data );
-    t_matvecs = 0;
-    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
-#endif
+void Cuda_EE( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+        fprintf( stderr, "Unsupported EE solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
+    case CG_S:
+        s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H,
+                dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data );
+        t_matvecs = 0;
+        break;
+
+
+    default:
+        fprintf( stderr, "Unrecognized EE solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     Cuda_Calculate_Charges( system, workspace, mpi_data );
 
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
-    //Print_Charges( system );
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
 #endif
+}
+
+
+void Cuda_ACKS2( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+        fprintf( stderr, "Unsupported ACKS2 solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+
+    case CG_S:
+        s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H,
+                dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data );
+        t_matvecs = 0;
+        break;
+
+
+    default:
+        fprintf( stderr, "Unrecognized ACKS2 solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
 
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
@@ -294,3 +378,29 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data
     }
 #endif
 }
+
+
+void Cuda_Compute_Charges( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    switch ( control->charge_method )
+    {
+    case QEQ_CM:
+        Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+        break;
+
+    case EE_CM:
+        Cuda_EE( system, control, data, workspace, out_control, mpi_data );
+        break;
+
+    case ACKS2_CM:
+        Cuda_ACKS2( system, control, data, workspace, out_control, mpi_data );
+        break;
+
+    default:
+        fprintf( stderr, "Invalid charge method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+}
diff --git a/PG-PuReMD/src/cuda/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h
index d1922a48b83a6df1b27a14efb38e63d153f89111..dc8c16b601820def6227da5710662305dee7c49b 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.h
+++ b/PG-PuReMD/src/cuda/cuda_charges.h
@@ -37,7 +37,7 @@ void cuda_charges_st( reax_system *, storage *, real *, real );
 
 void cuda_charges_updateq( reax_system *, real * );
 
-void Cuda_QEq( reax_system*, control_params*, simulation_data*,
+void Cuda_Compute_Charges( reax_system*, control_params*, simulation_data*,
         storage*, output_controls*, mpi_datatypes* );
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 4960038cb48d0c2c2a127f05ad1146d63f7ebf8a..bc86bf41cc30e7a7d43fd57c2585991db34c8263 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -15,7 +15,9 @@
 #include "cuda_torsion_angles.h"
 #include "cuda_utils.h"
 #include "cuda_valence_angles.h"
-#include "cuda_validation.h"
+#if defined(DEBUG)
+  #include "cuda_validation.h"
+#endif
 
 #include "../basic_comm.h"
 #include "../forces.h"
@@ -212,7 +214,7 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
     int local;
     int my_bonds, my_hbonds, my_cm_entries;
     real cutoff;
-    real r_ij, r2; 
+    real r_ij;
     real C12, C34, C56;
     real BO, BO_s, BO_pi, BO_pi2;
     single_body_parameters *sbp_i, *sbp_j;
@@ -334,8 +336,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
                 /* uncorrected bond orders */
                 if ( nbr_pj->d <= control->bond_cut )
                 {
-                    r2 = SQR( r_ij );
-
                     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
                     {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
@@ -1074,6 +1074,7 @@ CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n )
 }
 
 
+#if defined(DEBUG)
 CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
 {
     int i; 
@@ -1151,6 +1152,7 @@ static void Print_HBonds( reax_system *system, int step )
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
+#endif
 
 
 CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, 
@@ -1345,7 +1347,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, 
         reax_list **lists, output_controls *out_control )
 {
-    int hbs, hnbrs_blocks, update_energy, ret;
+    int update_energy, ret;
+//    int hbs, hnbrs_blocks;
     int *thbody;
     static int compute_bonded_part1 = FALSE;
     real *spad = (real *) scratch;
@@ -1643,8 +1646,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             cuda_memset( spad, 0,
                     2 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" );
 
-            hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
-                (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
+//            hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
+//                (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
 
             Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>>
 //            Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, 
@@ -1694,8 +1697,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             cudaCheckError( );
 
             /* post process step2 */
-            hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
-                (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
+//            hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
+//                (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
 
             Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>>
                 ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) );
@@ -1848,7 +1851,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
 #if defined(PURE_REAX)
         if ( charge_flag == TRUE )
         {
-            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+            Cuda_Compute_Charges( system, control, data, workspace, out_control, mpi_data );
         }
 
 #if defined(LOG_PERFORMANCE)
@@ -1898,7 +1901,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d @ step%d: total forces computed\n",
                 system->my_rank, data->step );
-        //Print_Total_Force( system, data, workspace );
+//        Print_Total_Force( system, data, workspace );
         MPI_Barrier( MPI_COMM_WORLD );
 
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index b115f2870de362525282cbaea220c762b706baf1..81b9b3e13f82f5eb0755fc4c3980738e99be324b 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -10,7 +10,9 @@
 #include "cuda_reset_tools.h"
 #include "cuda_system_props.h"
 #include "cuda_utils.h"
-#include "cuda_validation.h"
+#if defined(DEBUG)
+  #include "cuda_validation.h"
+#endif
 
 #if defined(PURE_REAX)
   #include "../box.h"
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
index d47a08562b9993f8cd2d760069eeb802a010bd8f..1b51216ac4f142c328c6e1469d3661e1a7bb7ebd 100644
--- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -336,7 +336,7 @@ void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
 }
 
 
-CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
+CUDA_GLOBAL void k_diagonal_preconditioner( storage p_workspace, rvec2 *b, int n )
 {
     storage *workspace;
     int j;
@@ -350,15 +350,13 @@ CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
 
     workspace = &( p_workspace );
 
-    //for( j = 0; j < system->n; ++j ) {
-    // residual 
+    /* compute residuals */
     workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
     workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
 
-    // apply diagonal pre-conditioner
+    /* apply diagonal preconditioner to residuals */
     workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
     workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
-    //}
 }
 
 
@@ -370,7 +368,7 @@ void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, b, n);
+        ( *workspace, b, n );
 
     cudaThreadSynchronize( );
     cudaCheckError( );
@@ -396,7 +394,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x,
     alpha[1] = alpha_1;
     my_dot[j][0] = my_dot[j][1] = 0.0;
 
-    //for( j = 0; j < system->n; ++j ) {
     // update x 
     x[j][0] += alpha[0] * workspace->d2[j][0];
     x[j][1] += alpha[1] * workspace->d2[j][1];      
@@ -412,7 +409,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x,
     // dot product: r.p 
     my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
     my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
-    //}
 }
 
 
@@ -433,7 +429,7 @@ void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    //Reduction to calculate my_dot
+    /* reduction to calculate my_dot */
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
         ( tmp, tmp + n, n);
 
@@ -515,9 +511,11 @@ void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
 
 CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
 
-    if (i >= n)
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
     {
         return;
     }
@@ -542,9 +540,11 @@ void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
 
 CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (i >= n)
+    if ( i >= n )
     {
         return;
     }
@@ -598,35 +598,35 @@ void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
 
 void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
 {
-    int blocks;
+//    int blocks;
 
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+//    blocks = (n / DEF_BLOCK_SIZE) + 
+//        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
     cuda_memset( b, 0, sizeof(real) * size, "dual_matvec:result" );
 
-    //one thread per row implementation
-    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
+    /* one thread per row implementation */
+//    k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+//        ( *H, a, b, n );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
 
 #if defined(__SM_35__)
     k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
 #else
     k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                 sizeof(real) * MATVEC_BLOCK_SIZE>>>
+                 sizeof(real) * MATVEC_BLOCK_SIZE >>>
 #endif
-                     (*H, a, b, n);
+         ( *H, a, b, n );
 
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
 
 
-int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
-        simulation_data *data )
+int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspace,
+        sparse_matrix *H, rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data,
+        FILE *fout, simulation_data *data )
 {
     int i, n, matvecs, scale;
 //    int j, N;
@@ -651,46 +651,14 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 #endif
 
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
-//#endif
-
-//  check_zeros_device( x, system->N, "x" );
-
     copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
     Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
     copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
 
-//  check_zeros_device( x, system->N, "x" );
-
-//  compare_rvec2 (workspace->x, x, N, "x");
-//  if (data->step > 0) {
-//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
-//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
-//
-//      exit (0);
-//  }
-
-
-//#ifdef __CUDA_DEBUG__
-//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
-//#endif
     //originally we were using only H->n which was system->n (init_md.c)
     //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
-    
     Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap );
 
-//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-//  if (data->step > 0) exit (0);
-
-    // tryQEq
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-    
     copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
             cudaMemcpyDeviceToHost, "CG:q2:get" );
     Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
@@ -704,63 +672,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 #endif
 
-//#ifdef __CUDA_DEBUG__
-//  for( j = 0; j < system->n; ++j ) {
-//    // residual
-//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
-//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
-//    // apply diagonal pre-conditioner
-//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//  }
-//#endif
-    
     Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
 
-//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
-//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
-
-    /* norm of b */
-//#ifdef __CUDA_DEBUG__
-//  my_sum[0] = my_sum[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_sum[0] += SQR( workspace->b[j][0] );
-//    my_sum[1] += SQR( workspace->b[j][1] );
-//  }
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-//#endif
-
     my_sum[0] = 0;
     my_sum[1] = 0;
     Cuda_Norm( b, n, my_sum );
 
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-
     MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
     b_norm[0] = SQRT( norm_sqr[0] );
     b_norm[1] = SQRT( norm_sqr[1] );
-    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
 
     /* dot product: r.d */
-//#ifdef __CUDA_DEBUG__
-//  my_dot[0] = my_dot[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-//  }
-//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
     my_dot[0] = 0;
     my_dot[1] = 0;
     Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n );
-
-// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
     
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
 
-    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
@@ -768,55 +696,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 #endif
 
-    for ( i = 1; i < 1000; ++i )
+    for ( i = 1; i < control->cm_solver_max_iters; ++i )
     {
-        //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
-//#endif
-        
         copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cg:d2:get" );
         Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
         copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyHostToDevice, "cg:d2:put" );
 
-        //print_device_rvec2 (dev_workspace->d2, N);
-
-//#ifdef __CUDA_DEBUG__
-//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
-//#endif
-        
         Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
                 system->total_cap );
 
-        /*
-        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
-        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
-        print_sparse_matrix_host (&workspace->H);
-        fprintf (stderr, " ******* HOST Vector ***************\n");
-        print_host_rvec2 (workspace->d2, system->N);
-        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
-        print_sparse_matrix (&dev_workspace->H);
-        fprintf (stderr, " ******* Device Vector ***************\n");
-        print_device_rvec2 (dev_workspace->d2, system->N);
-        */
-        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-        // tryQEq
-        // MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-
         copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cg:q2:get" );
         Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
         copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
                 cudaMemcpyHostToDevice, "cg:q2:put" );
 
-//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
         {
@@ -825,61 +721,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 #endif
 
         /* dot product: d.q */
-//#ifdef __CUDA_DEBUG__
-//    my_dot[0] = my_dot[1] = 0;
-//    for( j = 0; j < n; ++j ) {
-//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
-//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
+        my_dot[0] = 0;
+        my_dot[1] = 0;
         Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
 
         MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
 
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
         my_dot[0] = 0;
         my_dot[1] = 0;
 
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // update x
-//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
-//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
-//      // update residual
-//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
-//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-//      // apply diagonal pre-conditioner
-//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//      // dot product: r.p
-//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
-//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = 0;
-        my_dot[1] = 0;
         Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
 
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
-//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
-//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
-
         sig_old[0] = sig_new[0];
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
 
-        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
         {
@@ -895,64 +753,30 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
 
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // d = p + beta * d
-//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
-//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
-//    }
-//#endif
-
         Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
                 dev_workspace->d2, system->n );
-
-//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
     }
 
-
     if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
     {
-        //for( j = 0; j < n; ++j )
-        //  workspace->t[j] = workspace->x[j][1];
-        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
-
         Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
 
-        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
-        //for( j = 0; j < n; ++j )
-        //  workspace->x[j][1] = workspace->t[j];
+        matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
+                mpi_data );
 
         Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
     }
     else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
     {
-        //for( j = 0; j < n; ++j )
-        //  workspace->s[j] = workspace->x[j][0];
-
         Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
 
-        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
-        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-
-        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->x[j][0] = workspace->s[j];
+        matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
+                mpi_data );
 
         Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
     }
 
-    if ( i >= 1000 )
+    if ( i >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
                 system->my_rank, i );
@@ -972,8 +796,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 }
 
 
-int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
-        *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+int Cuda_CG( reax_system *system, control_params *control, storage *workspace,
+        sparse_matrix *H, real *b, real tol, real *x, mpi_datatypes* mpi_data )
 {
     int  i, scale;
 //    int j;
@@ -983,25 +807,19 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
 
     scale = sizeof(real) / sizeof(void);
 
-    /* x is on the device */
-    //MVAPICH2
     memset( spad, 0, sizeof(real) * system->total_cap );
     copy_host_device( spad, x, sizeof(real) * system->total_cap,
             cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
     Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
 
-    //MVAPICH2
     copy_host_device( spad, x, sizeof(real) * system->total_cap,
             cudaMemcpyHostToDevice, "cuda_cg:x:put" );
     Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
 
-    // tryQEq
-    // MVAPICH2
     copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
             cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
     Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
 
-    //MVAPICH2
     copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
             cudaMemcpyHostToDevice, "cuda_cg:q:put" );
 
@@ -1014,8 +832,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
 
     Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
             system->n );
-    //for( j = 0; j < system->n; ++j )
-    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+
     Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
             dev_workspace->Hdia_inv, system->n );
 
@@ -1039,9 +856,8 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
     }
 #endif
 
-    for ( i = 1; i < 1000 && SQRT(sig_new) / b_norm > tol; ++i )
+    for ( i = 1; i < control->cm_solver_max_iters && SQRT(sig_new) / b_norm > tol; ++i )
     {
-        //MVAPICH2
         copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
         Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
@@ -1050,7 +866,6 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
 
         Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
 
-        //tryQEq
         copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
                 cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
         Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
@@ -1078,9 +893,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
         //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
         Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
                 dev_workspace->r, system->n );
-        /* pre-conditioning */
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
         Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
                 dev_workspace->Hdia_inv, system->n );
 
@@ -1092,7 +905,6 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
         copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
                 cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
         sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
-        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
 
         beta = sig_new / sig_old;
         Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.h b/PG-PuReMD/src/cuda/cuda_lin_alg.h
index aa31c126642b8eb560256390c89698df99b34a73..768c0a36a1582a585f5d45161844a23640a68212 100644
--- a/PG-PuReMD/src/cuda/cuda_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.h
@@ -51,11 +51,11 @@ void Cuda_Dual_Matvec( sparse_matrix *, rvec2 *, rvec2 *, int , int );
 
 void Cuda_Matvec( sparse_matrix *, real *, real *, int , int );
 
-int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*,
+int Cuda_dual_CG( reax_system*, control_params*, storage*, sparse_matrix*,
         rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );
 
-int Cuda_CG( reax_system*, storage*, sparse_matrix*,
-        real*, real, real*, mpi_datatypes*, FILE* );
+int Cuda_CG( reax_system*, control_params*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes* );
 
 #ifdef __cplusplus
 }
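
Note on the interface change above: Cuda_CG now takes control_params and bounds its iteration loop by control->cm_solver_max_iters instead of a hard-coded 1000, and the unused FILE* argument is gone. As a point of reference only, here is a minimal, self-contained host-side sketch of a CG loop with a caller-supplied iteration cap and tolerance; the names cg_sketch, dot_sketch, and matvec_fn are illustrative and are not part of the project.

#include <math.h>
#include <stdlib.h>

/* caller supplies the sparse matrix-vector product; ctx carries its data */
typedef void (*matvec_fn)( const double *x, double *y, int n, void *ctx );

static double dot_sketch( const double *a, const double *b, int n )
{
    double s;
    int i;

    s = 0.0;
    for ( i = 0; i < n; ++i )
    {
        s += a[i] * b[i];
    }
    return s;
}

/* unpreconditioned CG on an n-by-n system; returns iterations performed */
static int cg_sketch( matvec_fn A, void *ctx, const double *b, double *x,
        int n, unsigned int max_iters, double tol )
{
    unsigned int i;
    int j;
    double alpha, beta, sig_old, sig_new, b_norm;
    double *r, *d, *q;

    r = (double *) malloc( sizeof(double) * n );
    d = (double *) malloc( sizeof(double) * n );
    q = (double *) malloc( sizeof(double) * n );

    b_norm = sqrt( dot_sketch( b, b, n ) );
    if ( b_norm == 0.0 )
    {
        b_norm = 1.0;
    }

    /* r = b - A*x, d = r */
    A( x, q, n, ctx );
    for ( j = 0; j < n; ++j )
    {
        r[j] = b[j] - q[j];
        d[j] = r[j];
    }
    sig_new = dot_sketch( r, r, n );

    /* iteration cap and tolerance are both parameters, not constants */
    for ( i = 1; i < max_iters && sqrt( sig_new ) / b_norm > tol; ++i )
    {
        A( d, q, n, ctx );
        alpha = sig_new / dot_sketch( d, q, n );

        for ( j = 0; j < n; ++j )
        {
            x[j] += alpha * d[j];
            r[j] -= alpha * q[j];
        }

        sig_old = sig_new;
        sig_new = dot_sketch( r, r, n );
        beta = sig_new / sig_old;

        for ( j = 0; j < n; ++j )
        {
            d[j] = r[j] + beta * d[j];
        }
    }

    free( q );
    free( d );
    free( r );

    return i;
}
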
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
index 01bd3c8199a76144703738188713190e02b35e9f..49224a06aec4157eed74a977ec4b99cf22c5cdc7 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -662,8 +662,8 @@ CUDA_GLOBAL void k_norm_rvec2( const rvec2 *input, rvec2 *per_block_results,
 }
 
 
-CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res,
-        const size_t n)
+CUDA_GLOBAL void k_dot_rvec2( const rvec2 *a, rvec2 *b, rvec2 *res,
+        const size_t n )
 {
 #if defined(__SM_35__)
     extern __shared__ rvec2 my_dot2[];
@@ -746,11 +746,13 @@ CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res,
 //vector functions
 //////////////////////////////////////////////////
 CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y,
-        int k )
+        int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if( i >= k )
+    if ( i >= n )
     {
         return;
     }
@@ -759,11 +761,13 @@ CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y,
 }
 
 
-CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k )
+CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if( i >= k )
+    if ( i >= n )
     {
         return;
     }
@@ -772,11 +776,13 @@ CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k )
 }
 
 
-CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
+CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
 
-    if( i >= k )
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
     {
         return;
     }
@@ -787,12 +793,13 @@ CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
 
 
 CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *dest, rvec2 *a, 
-        real beta0, real beta1, 
-        rvec2 *b, int n )
+        real beta0, real beta1, rvec2 *b, int n )
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if( i >= n )
+    if ( i >= n )
     {
         return;
     }
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index d3e295cf392f45f400596f041f1995f93ed2e02f..91fbfd1792f479c8810b783877a837c7d3ec93c6 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -800,7 +800,6 @@ int Init_Forces( reax_system *system, control_params *control,
     {
         /* put ghost atoms to an infinite distance */
         workspace->bond_mark[i] = 1000;
-        //workspace->done_after[i] = Start_Index( i, far_nbrs );
     }
 
     H = &(workspace->H); //MATRIX CHANGES
@@ -969,8 +968,6 @@ int Init_Forces( reax_system *system, control_params *control,
                     else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                     {
                         workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                        //if( workspace->bond_mark[i] == 1000 )
-                        //  workspace->done_after[i] = pj;
                     }
 
                     //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
@@ -1005,7 +1002,6 @@ int Init_Forces( reax_system *system, control_params *control,
 
     /*for( i = system->n; i < system->N; ++i ) {
       start_i = Start_Index(i, far_nbrs);
-      end_i = workspace->done_after[i];
 
       if( workspace->bond_mark[i] >= 2 && start_i < end_i ) {
         atom_i = &(system->my_atoms[i]);
@@ -1088,7 +1084,6 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control,
     for ( i = system->n; i < system->N; ++i )
     {
         workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
-        //workspace->done_after[i] = Start_Index( i, far_nbrs );
     }
 
     num_bonds = 0;
@@ -1214,15 +1209,13 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control,
                     ++btop_i;
 
                     if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+                    {
                         workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+                    }
                     else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                     {
                         workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                        //if( workspace->bond_mark[i] == 1000 )
-                        //  workspace->done_after[i] = pj;
                     }
-                    //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
-                    //   i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
                 }
             }
         }
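
The bond_mark updates retained above implement a simple neighbor relaxation: ghost atoms start at the sentinel "distance" 1000 and are pulled down to one more than their nearest already-marked neighbor. Restated as a standalone helper purely for illustration (relax_bond_mark is not a function in the code base):

/* illustrative restatement of the bond_mark relaxation in Init_Forces and
 * Init_Forces_No_Charges; not project code */
static void relax_bond_mark( int *bond_mark, int i, int j )
{
    if ( bond_mark[j] > bond_mark[i] + 1 )
    {
        bond_mark[j] = bond_mark[i] + 1;
    }
    else if ( bond_mark[i] > bond_mark[j] + 1 )
    {
        bond_mark[i] = bond_mark[j] + 1;
    }
}
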
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 3d27ecf6ec83c3d1da8b298e37301c6944bc7c6a..d071f0dd3e5175ca55bc00d36a266344b41bd059 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -117,16 +117,16 @@ void Generate_Initial_Velocities( reax_system *system, real T )
 
             rvec_Scale( system->my_atoms[i].v, 1. / scale, system->my_atoms[i].v );
 
-            // fprintf( stderr, "v = %f %f %f\n",
-            // system->my_atoms[i].v[0],
-            // system->my_atoms[i].v[1],
-            // system->my_atoms[i].v[2] );
-
-            // fprintf( stderr, "scale = %f\n", scale );
-            // fprintf( stderr, "v = %f %f %f\n",
-            // system->my_atoms[i].v[0],
-            // system->my_atoms[i].v[1],
-            // system->my_atoms[i].v[2] );
+//            fprintf( stderr, "v = %f %f %f\n",
+//                    system->my_atoms[i].v[0],
+//                    system->my_atoms[i].v[1],
+//                    system->my_atoms[i].v[2] );
+//
+//            fprintf( stderr, "scale = %f\n", scale );
+//            fprintf( stderr, "v = %f %f %f\n",
+//                    system->my_atoms[i].v[0],
+//                    system->my_atoms[i].v[1],
+//                    system->my_atoms[i].v[2] );
         }
     }
 }
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index e9ce62e72411dd1e7739d79b3a2875476b63ed48..5de64a8a8d242c8d5968594878d7add6b64c6ac6 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -28,7 +28,7 @@
 #include "tool_box.h"
 #include "vector.h"
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) && defined(DEBUG)
   #include "cuda/cuda_validation.h"
 #endif
 
@@ -44,7 +44,8 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 
     for ( i = 0; i < N; ++i )
     {
-        b[i][0] = b[i][1] = 0;
+        b[i][0] = 0;
+        b[i][1] = 0;
     }
 
     /* perform multiplication */
@@ -76,14 +77,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
         simulation_data *data )
 {
-    int  i, j, n, N, matvecs, scale;
+    int i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
     rvec2 sig_old, sig_new;
     MPI_Comm comm;
 
-    int a;
-
     n = system->n;
     N = system->N;
     comm = mpi_data->world;
@@ -99,14 +98,14 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     }
 #endif
 
-#ifdef HAVE_CUDA
-    check_zeros_host( x, system->N, "x" );
+#if defined(HAVE_CUDA) && defined(DEBUG)
+    check_zeros_host( x, N, "x" );
 #endif
 
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
 
-#ifdef HAVE_CUDA
-    check_zeros_host( x, system->N, "x" );
+#if defined(HAVE_CUDA) && defined(DEBUG)
+    check_zeros_host( x, N, "x" );
 #endif
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
@@ -121,7 +120,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         Update_Timing_Info( &t_start, &matvec_time );
 #endif
 
-    for ( j = 0; j < system->n; ++j )
+    for ( j = 0; j < n; ++j )
     {
         /* residual */
         workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
@@ -168,7 +167,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     for ( i = 1; i < 300; ++i )
     {
         Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
-        //print_host_rvec2 (workspace->d2, N);
+        //print_host_rvec2( workspace->d2, N );
 
         dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
 
@@ -193,7 +192,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
         my_dot[0] = my_dot[1] = 0;
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             /* update x */
             x[j][0] += alpha[0] * workspace->d2[j][0];
@@ -225,7 +224,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             /* d = p + beta * d */
             workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
@@ -240,7 +239,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
             workspace->t[j] = workspace->x[j][1];
         }
         matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
-                      mpi_data, fout );
+                mpi_data );
 
 #if defined(DEBUG)
         fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
@@ -258,13 +257,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
             workspace->s[j] = workspace->x[j][0];
         }
         matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-                      mpi_data, fout );
+                mpi_data );
 
 #if defined(DEBUG)
         fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
 #endif
 
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             workspace->x[j][0] = workspace->s[j];
         }
@@ -313,7 +312,7 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 
 
 int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
-        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
+        real tol, real *x, mpi_datatypes* mpi_data )
 {
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;
@@ -762,7 +761,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
-           real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -883,8 +882,8 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int GMRES_HouseHolder( reax_system *system, storage *workspace,
-                       sparse_matrix *H, real *b, real tol, real *x,
-                       mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
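
For context on the dual solver touched above: it advances two right-hand sides at once, storing them interleaved as rvec2 (a pair of reals per entry) and applying an independent step length to each component, which is why alpha, beta, sig_old, and sig_new are all two-element arrays in dual_CG and Cuda_dual_CG. A reduced sketch of that componentwise update (rvec2_sketch and dual_update are illustrative names only):

typedef double rvec2_sketch[2];

/* x += alpha .* d and r -= alpha .* q, componentwise for the two systems */
static void dual_update( rvec2_sketch *x, rvec2_sketch *r,
        const double alpha[2], const rvec2_sketch *d,
        const rvec2_sketch *q, int n )
{
    int j;

    for ( j = 0; j < n; ++j )
    {
        x[j][0] += alpha[0] * d[j][0];
        x[j][1] += alpha[1] * d[j][1];
        r[j][0] -= alpha[0] * q[j][0];
        r[j][1] -= alpha[1] * q[j][1];
    }
}
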
diff --git a/PG-PuReMD/src/lin_alg.h b/PG-PuReMD/src/lin_alg.h
index 3663978e4b6cc412a1b9d8fe2cc613443383c3fc..4b7ba2f0368e81d498f93d91e33db074ebddd9ca 100644
--- a/PG-PuReMD/src/lin_alg.h
+++ b/PG-PuReMD/src/lin_alg.h
@@ -36,10 +36,10 @@ int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes*, FILE* );
 
 int dual_CG( reax_system*, storage*, sparse_matrix*,
-        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );
 
 int CG( reax_system*, storage*, sparse_matrix*,
-        real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes* );
 
 int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
         sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 9ace1c3276e9598f40d5567460f9c5a57db0955f..da9074127fec719403a9ea0e892961314c7a7b29 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -49,9 +49,12 @@
   #include "cuda/cuda_reset_tools.h"
   #include "cuda/cuda_system_props.h"
   #include "cuda/cuda_utils.h"
-  #include "cuda/cuda_validation.h"
+  #if defined(DEBUG)
+    #include "cuda/cuda_validation.h"
+  #endif
 #endif
 
+
 evolve_function Evolve;
 evolve_function Cuda_Evolve;
 LR_lookup_table *LR;
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index a6d16f8ef08ef7c40d2770d2f466b2089a468d1a..80a4dafc7b12dc1ca06fcb63dbd090f1b8bef638 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -277,24 +277,6 @@
 
 
 /******************* ENUMERATIONS *************************/
-/* geometry file format */
-enum geo_formats
-{
-    CUSTOM = 0,
-    PDB = 1,
-    ASCII_RESTART = 2,
-    BINARY_RESTART = 3,
-    GF_N = 4,
-};
-
-/* restart file format */
-enum restart_formats
-{
-    WRITE_ASCII = 0,
-    WRITE_BINARY = 1,
-    RF_N = 2,
-};
-
 /* ensemble type */
 enum ensembles
 {
@@ -369,6 +351,57 @@ enum errors
     RUNTIME_ERROR = -19,
 };
 
+/* restart file format */
+enum restart_formats
+{
+    WRITE_ASCII = 0,
+    WRITE_BINARY = 1,
+    RF_N = 2,
+};
+
+/* geometry file format */
+enum geo_formats
+{
+    CUSTOM = 0,
+    PDB = 1,
+    ASCII_RESTART = 2,
+    BINARY_RESTART = 3,
+    GF_N = 4,
+};
+
+enum charge_method
+{
+    QEQ_CM = 0,
+    EE_CM = 1,
+    ACKS2_CM = 2,
+};
+
+enum solver
+{
+    GMRES_S = 0,
+    GMRES_H_S = 1,
+    CG_S = 2,
+    SDM_S = 3,
+};
+
+enum pre_comp
+{
+    NONE_PC = 0,
+    DIAG_PC = 1,
+    ICHOLT_PC = 2,
+    ILU_PAR_PC = 3,
+    ILUT_PAR_PC = 4,
+    ILU_SUPERLU_MT_PC = 5,
+};
+
+enum pre_app
+{
+    TRI_SOLVE_PA = 0,
+    TRI_SOLVE_LEVEL_SCHED_PA = 1,
+    TRI_SOLVE_GC_PA = 2,
+    JACOBI_ITER_PA = 3,
+};
+
 /* ??? */
 enum exchanges
 {
@@ -1126,8 +1159,11 @@ typedef struct
     int N;
     /* num. atoms within simulation */
     int bigN;
+    /* dimension of sparse charge method matrix */
+    int N_cm;
     /* num. hydrogen atoms */
     int numH;
+    /* num. hydrogen atoms (GPU) */
     int *d_numH;
     /**/
     int local_cap;
@@ -1274,8 +1310,9 @@ typedef struct
     /* format of geometry input file */
     int geo_format;
     /* format of restart file */
-    int  restart;
+    int restart;
 
+    /**/
     int restrict_bonds;
     /* flag to control if center of mass velocity is removed */
     int remove_CoM_vel;
@@ -1313,18 +1350,40 @@ typedef struct
     /* flag to control if force computations are tablulated */
     int tabulate;
 
+    /**/
+    unsigned int charge_method;
     /* frequency (in terms of simulation time steps) at which to
      * re-compute atomic charge distribution */
     int charge_freq;
+    /**/
+    unsigned int cm_solver_type;
+    /**/
+    real cm_q_net;
+    /**/
+    unsigned int cm_solver_max_iters;
+    /**/
+    unsigned int cm_solver_restart;
     /* error tolerance of solution produced by charge distribution
      * sparse iterative linear solver */
-    real q_err;
+    real cm_solver_q_err;
+    /**/
+    real cm_domain_sparsity;
+    /**/
+    unsigned int cm_domain_sparsify_enabled;
+    /**/
+    unsigned int cm_solver_pre_comp_type;
     /* frequency (in terms of simulation time steps) at which to recompute
      * incomplete factorizations */
-    int refactor;
+    unsigned int cm_solver_pre_comp_refactor;
     /* drop tolerance of incomplete factorization schemes (ILUT, ICHOLT, etc.)
      * used for preconditioning the iterative linear solver used in charge distribution */
-    real droptol;
+    real cm_solver_pre_comp_droptol;
+    /**/
+    unsigned int cm_solver_pre_comp_sweeps;
+    /**/
+    unsigned int cm_solver_pre_app_type;
+    /**/
+    unsigned int cm_solver_pre_app_jacobi_iters;
 
     /* initial temperature of simulation, in Kelvin */
     real T_init;
@@ -1860,8 +1919,6 @@ typedef struct
     rvec *dDeltap_self;
     /**/
     int *bond_mark;
-    /**/
-    int *done_after;
 
     /* charge matrix storage */
     /* charge matrix */
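
The new charge_method, solver, pre_comp, and pre_app enumerations are intended to be switched on when configuring the charge solver from the control parameters. A reduced sketch of that kind of dispatch is below; the _SKETCH enum names, the cm_control_sketch struct, and cm_solver_name are local stand-ins so the example compiles on its own and are not the project's identifiers.

/* local copy mirroring the solver enumeration introduced above */
enum solver_sketch
{
    GMRES_S_SKETCH = 0,
    GMRES_H_S_SKETCH = 1,
    CG_S_SKETCH = 2,
    SDM_S_SKETCH = 3,
};

/* stand-in holding only a few of the new control_params fields */
typedef struct
{
    unsigned int charge_method;
    unsigned int cm_solver_type;
    unsigned int cm_solver_max_iters;
    double cm_solver_q_err;
} cm_control_sketch;

/* hypothetical helper: map the solver enum value to a printable name */
static const char * cm_solver_name( const cm_control_sketch *control )
{
    switch ( control->cm_solver_type )
    {
        case GMRES_S_SKETCH:
            return "GMRES";
        case GMRES_H_S_SKETCH:
            return "GMRES (Householder)";
        case CG_S_SKETCH:
            return "CG";
        case SDM_S_SKETCH:
            return "SDM";
        default:
            return "unknown";
    }
}
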
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index af27e1cb04c9061f3bb52cb4a290cc999c93deac..14560661f57d23918774095a4df4c2d3a41fae4b 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -273,7 +273,7 @@ int Write_Header( reax_system *system, control_params *control,
                  control->thb_cut );
         strncat( out_control->buffer, out_control->line, HEADER_LINE_LEN + 1 );
 
-        sprintf( out_control->line, SCI_LINE, "QEq_tolerance:", control->q_err );
+        sprintf( out_control->line, SCI_LINE, "QEq_tolerance:", control->cm_solver_q_err );
         strncat( out_control->buffer, out_control->line, HEADER_LINE_LEN + 1 );
 
         /* temperature controls */
diff --git a/environ/parallel_control b/environ/parallel_control
index 3b608f1b0ddccdaf224dd254b5266ee33a148c82..4f6d75b2e1a4f572dc3e8dc03cc63eeb4896780c 100644
--- a/environ/parallel_control
+++ b/environ/parallel_control
@@ -18,8 +18,20 @@ bond_graph_cutoff        0.3                    ! bond strength cutoff for bond
 thb_cutoff               0.001                  ! cutoff value for three body interactions (Angstroms)
 hbond_cutoff             7.50                   ! cutoff distance for hydrogen bond interactions (Angstroms)
 
-charge_freq              1                      ! frequency (sim step) at which atomic charges are computed
-q_err                    1e-6                   ! norm of the relative residual in QEq solve
+charge_method                     0             ! charge method: 0 = QEq, 1 = EEM, 2 = ACKS2
+charge_freq                       1             ! frequency (sim step) at which atomic charges are computed
+cm_q_net                          0.0           ! net system charge
+cm_solver_type                    2             ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM
+cm_solver_max_iters               1000          ! max solver iterations
+cm_solver_restart                 100           ! inner iterations of GMRES before restarting
+cm_solver_q_err                   1e-6          ! relative residual norm threshold used in solver
+cm_domain_sparsity                1.0           ! scalar for scaling cut-off distance, used to sparsify charge matrix (between 0.0 and 1.0)
+cm_solver_pre_comp_type           1             ! method used to compute preconditioner, if applicable
+cm_solver_pre_comp_refactor       1000          ! number of steps before recomputing preconditioner
+cm_solver_pre_comp_droptol        0.0           ! threshold tolerance for dropping values in preconditioner computation, if applicable
+cm_solver_pre_comp_sweeps         3             ! number of sweeps used to compute preconditioner (ILU_PAR)
+cm_solver_pre_app_type            1             ! method used to apply preconditioner
+cm_solver_pre_app_jacobi_iters    50            ! number of Jacobi iterations used for applying preconditioner, if applicable
 
 temp_init                0.01                   ! desired initial temperature of the simulated system
 temp_final               300.0                  ! desired final temperature of the simulated system
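
The keywords added above are plain "keyword value" lines with trailing '!' comments. A generic sketch of splitting such a line into its tokens follows; this is not the project's actual parser, which is not shown in this patch.

#include <stdio.h>
#include <string.h>

/* split one "keyword value ! comment" line into keyword and value tokens */
static int parse_control_line( const char *line, char key[64], char val[64] )
{
    char buf[256];
    char *bang;

    strncpy( buf, line, sizeof(buf) - 1 );
    buf[sizeof(buf) - 1] = '\0';

    /* drop the trailing '!' comment, if present */
    bang = strchr( buf, '!' );
    if ( bang != NULL )
    {
        *bang = '\0';
    }

    /* first token is the keyword, second is its value */
    return sscanf( buf, "%63s %63s", key, val ) == 2;
}

For example, the line "cm_solver_max_iters 1000 ! max solver iterations" would yield key "cm_solver_max_iters" and val "1000", leaving numeric conversion (atoi/strtod) to the caller.
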
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 28b7c9560d4ab5f11e8161f8a9ff867ab5f3dc1a..208791b8c297d0083489ca9faf837a746bdc0411 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -45,6 +45,7 @@ unsigned int *top = NULL;
 unsigned int *color = NULL;
 unsigned int *to_color = NULL;
 unsigned int *conflict = NULL;
+unsigned int *conflict_cnt = NULL;
 unsigned int *temp_ptr;
 unsigned int *recolor = NULL;
 unsigned int recolor_cnt;
@@ -530,8 +531,16 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
     {
 #define MAX_COLOR (500)
         int i, pj, v;
-        unsigned int temp;
-        int *fb_color;
+        unsigned int temp, recolor_cnt_local, *conflict_local;
+        int tid, num_thread, *fb_color;
+
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+        num_thread = omp_get_num_threads();
+#else
+        tid = 0;
+        num_thread = 1;
+#endif
 
         #pragma omp master
         {
@@ -558,7 +567,8 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             }
         }
 
-        if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL )
+        if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL ||
+                (conflict_local = (unsigned int*) malloc(sizeof(unsigned int) * A->n)) == NULL )
         {
             fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
             exit( INSUFFICIENT_MEMORY );
@@ -594,29 +604,63 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             }
 
             /* determine if recoloring required */
-            //TODO: switch to reduction on recolor_cnt (+) via parallel scan through recolor
+            temp = recolor_cnt;
+            recolor_cnt_local = 0;
+
+            #pragma omp barrier
+
             #pragma omp master
             {
-                temp = recolor_cnt;
                 recolor_cnt = 0;
+            }
+
+            #pragma omp barrier
 
-                for ( i = 0; i < temp; ++i )
-                {
-                    v = to_color[i];
+            #pragma omp for schedule(static)
+            for ( i = 0; i < temp; ++i )
+            {
+                v = to_color[i];
 
-                    /* search for color conflicts with adjacent vertices */
-                    for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+                /* search for color conflicts with adjacent vertices */
+                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+                {
+                    if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
                     {
-                        if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
-                        {
-                            conflict[recolor_cnt] = v;
-                            color[v] = 0;
-                            ++recolor_cnt;
-                            break;
-                        }
+                        conflict_local[recolor_cnt_local] = v;
+                        ++recolor_cnt_local;
+                        break;
                     }
                 }
+            }
+
+            /* count thread-local conflicts and compute offsets for copying into shared buffer */
+            conflict_cnt[tid + 1] = recolor_cnt_local;
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                conflict_cnt[0] = 0;
+                for ( i = 1; i < num_thread + 1; ++i )
+                {
+                    conflict_cnt[i] += conflict_cnt[i - 1];
+                }
+                recolor_cnt = conflict_cnt[num_thread];
+            }
+
+            #pragma omp barrier
 
+            /* copy thread-local conflicts into shared buffer */
+            for ( i = 0; i < recolor_cnt_local; ++i )
+            {
+                conflict[conflict_cnt[tid] + i] = conflict_local[i];
+                color[conflict_local[i]] = 0;
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
                 temp_ptr = to_color;
                 to_color = conflict;
                 conflict = temp_ptr;
@@ -625,16 +669,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             #pragma omp barrier
         }
 
+        free( conflict_local );
         free( fb_color );
 
-//#if defined(DEBUG)
-//    #pragma omp master
-//    {
-//        for ( i = 0; i < A->n; ++i )
-//            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
-//    }
-//#endif
-
         #pragma omp barrier
     }
 }
@@ -873,12 +910,24 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
  */
 sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
 {
+    int num_thread;
+
     if ( color == NULL )
     {
+#ifdef _OPENMP
+        #pragma omp parallel
+        {
+            num_thread = omp_get_num_threads();
+        }
+#else
+        num_thread = 1;
+#endif
+
         /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */
         if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (to_color =(unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
+                (conflict_cnt = (unsigned int*) malloc(sizeof(unsigned int) * (num_thread + 1))) == NULL ||
                 (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (color_top = (unsigned int*) malloc(sizeof(unsigned int) * (H->n + 1))) == NULL ||
                 (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
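
The conflict-resolution change above replaces the serial, master-only scan with a standard three-phase OpenMP pattern: each thread filters into a private buffer, the per-thread counts are prefix-summed into write offsets, and each thread copies its hits into the shared array without locking. A standalone, compilable illustration of that pattern (not project code; it collects multiples of 7 instead of color conflicts):

#include <stdio.h>
#include <stdlib.h>
#ifdef _OPENMP
  #include <omp.h>
#endif

int main( void )
{
    const int n = 1000;
    int *in, *out, *offsets;
    int num_thread, total = 0;

    in = malloc( sizeof(int) * n );
    out = malloc( sizeof(int) * n );
    for ( int i = 0; i < n; ++i )
    {
        in[i] = i;
    }

#ifdef _OPENMP
    num_thread = omp_get_max_threads();
#else
    num_thread = 1;
#endif
    /* offsets[t + 1] first holds thread t's count, then its start offset */
    offsets = calloc( num_thread + 1, sizeof(int) );

    #pragma omp parallel
    {
        int tid, cnt = 0;
        int *local = malloc( sizeof(int) * n );

#ifdef _OPENMP
        tid = omp_get_thread_num();
#else
        tid = 0;
#endif

        /* phase 1: each thread filters its share into a private buffer */
        #pragma omp for schedule(static)
        for ( int i = 0; i < n; ++i )
        {
            if ( in[i] % 7 == 0 )
            {
                local[cnt++] = in[i];
            }
        }
        offsets[tid + 1] = cnt;

        #pragma omp barrier

        /* phase 2: one thread turns the counts into exclusive offsets */
        #pragma omp master
        {
            for ( int t = 1; t <= num_thread; ++t )
            {
                offsets[t] += offsets[t - 1];
            }
            total = offsets[num_thread];
        }

        #pragma omp barrier

        /* phase 3: every thread copies its hits into its own slice */
        for ( int i = 0; i < cnt; ++i )
        {
            out[offsets[tid] + i] = local[i];
        }

        free( local );
    }

    printf( "collected %d multiples of 7\n", total );
    free( offsets );
    free( out );
    free( in );
    return 0;
}
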
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 9ff0f571be830e29591aaeee2695b8f441077fda..2f14e38ff616e6bcbf40f0de61dab90712cda7a8 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -584,7 +584,7 @@ void Output_Results( reax_system *system, control_params *control,
     e_pol = 0.0;
 
 #ifdef _OPENMP
-    #pragma omp parallel for default(none) private(q, type_i,) shared(system) \
+    #pragma omp parallel for default(none) private(q, type_i) shared(system) \
         reduction(+: e_pol) schedule(static)
 #endif
     for ( i = 0; i < system->N; i++ )