diff --git a/environ/param.gpu.water b/environ/param.gpu.water
index a1366285d5e1ba655f66d9aea163c5ff3e5dc18e..90288d3da7b9a108722c74a061ce47dd1d76c5e8 100644
--- a/environ/param.gpu.water
+++ b/environ/param.gpu.water
@@ -1,53 +1,55 @@
-simulation_name	        water.6.notab     		! output files will carry this name + their specific extension
-ensemble_type	 	0			! 0: NVE, 1: NVT, 2: anisotropic NPT, 3: semi-isotropic NPT, 4: isotropic NPT 6: berendsen NVT
-nsteps			100       	! number of simulation steps
-dt			0.25	    	   	! time step in fs
+simulation_name         water.6.notab           ! output files will carry this name + their specific extension
+ensemble_type           0                       ! 0: NVE, 1: NVT, 2: anisotropic NPT, 3: semi-isotropic NPT, 4: isotropic NPT 6: berendsen NVT
+nsteps                  100                     ! number of simulation steps
+dt                      0.25                    ! time step in fs
 
-reposition_atoms	0			! 0: just fit to periodic boundaries, 1: CoM to the center of box, 3: CoM to the origin
-restrict_bonds 		0			! enforce the bonds given in CONECT lines of pdb file for this many steps
-tabulate_long_range	0			! denotes the granularity of long range tabulation, 0 means no tabulation
-energy_update_freq 	1
-remove_CoM_vel	        500			! remove the translational and rotational vel around the center of mass at every 'this many' steps
+reposition_atoms        0                       ! 0: just fit to periodic boundaries, 1: CoM to the center of box, 3: CoM to the origin
+restrict_bonds          0                       ! enforce the bonds given in CONECT lines of pdb file for this many steps
+tabulate_long_range     0                       ! denotes the granularity of long range tabulation, 0 means no tabulation
+energy_update_freq      1
+remove_CoM_vel          500                     ! remove the translational and rotational vel around the center of mass at every 'this many' steps
 
-nbrhood_cutoff		5.0 	   		! near neighbors cutoff for bond calculations in A
-bond_graph_cutoff	0.3			! bond strength cutoff for bond graphs
-thb_cutoff		0.001	  		! cutoff value for three body interactions
-hbond_cutoff		7.50			! cutoff distance for hydrogen bond interactions
-q_err			1e-6		     	! relative residual norm threshold used in GMRES
-solver			2			! method used to solve charge equilibration phase (QEq)
-pre_comp		1			! method used to compute QEq preconditioner, if applicable
-pre_j_iters		50			! number of Jacobi iterations used for applying QEq precondition, if applicable
-pre_refactor		100			! nsteps to recompute preconditioner
-pre_droptol		0.01			! threshold tolerance for dropping values in preconditioner computation, if applicable
+nbrhood_cutoff          5.0                     ! near neighbors cutoff for bond calculations in A
+bond_graph_cutoff       0.3                     ! bond strength cutoff for bond graphs
+thb_cutoff              0.001                   ! cutoff value for three body interactions
+hbond_cutoff            7.50                    ! cutoff distance for hydrogen bond interactions
 
-temp_init	      	0.0             	! desired initial temperature of the simulated system
-temp_final 	      	300.0	        	! desired final temperature of the simulated system
-t_mass		      	0.16666			! 0.16666 for Nose-Hoover nvt ! 100.0 for npt! in fs, thermal inertia parameter
+qeq_solver_type         2                       ! method used to solve charge equilibration phase (QEq)
+qeq_solver_q_err        1e-6                    ! relative residual norm threshold used in solver
+pre_comp_type           2                       ! method used to compute QEq preconditioner, if applicable
+pre_comp_refactor       100                     ! nsteps to recompute preconditioner
+pre_comp_droptol        0.0                     ! threshold tolerance for dropping values in preconditioner computation, if applicable
+pre_comp_sweeps         3                       ! sweeps to compute preconditioner (ILU_PAR/ICHOL_PAR)
+pre_app_jacobi_iters    50                      ! number of Jacobi iterations used for applying QEq precondition, if applicable
+
+temp_init               0.0                     ! desired initial temperature of the simulated system
+temp_final              300.0                   ! desired final temperature of the simulated system
+t_mass                  0.16666                 ! 0.16666 for Nose-Hoover nvt ! 100.0 for npt! in fs, thermal inertia parameter
 t_mode                  0                       ! 0: T-coupling only, 1: step-wise, 2: constant slope
-t_rate                 -100.0                   ! in K
+t_rate                  -100.0                  ! in K
 t_freq                  4.0                     ! in ps
 
-pressure 		0.000101325	       	! desired pressure of the simulated system in GPa, 1atm = 0.000101325 GPa
-p_mass		        5000.00        		! in fs, pressure inertia parameter
-compress 		0.008134     		! in ps^2 * A / amu ( 4.5X10^(-5) bar^(-1) )
-press_mode		0			! 0: internal + external pressure, 1: ext only, 2: int only
+pressure                0.000101325             ! desired pressure of the simulated system in GPa, 1atm = 0.000101325 GPa
+p_mass                  5000.00                 ! in fs, pressure inertia parameter
+compress                0.008134                ! in ps^2 * A / amu ( 4.5X10^(-5) bar^(-1) )
+press_mode              0                       ! 0: internal + external pressure, 1: ext only, 2: int only
 
-geo_format		1			! 0: xyz, 1: pdb, 2: bgf
-write_freq		1000  			! write trajectory after so many steps
-traj_compress		0			! 0: no compression  1: uses zlib to compress trajectory output
-traj_format		0			! 0: our own format (below options apply to this only), 1: xyz, 2: bgf, 3: pdb
-traj_title		SILICA_NVT		! (no white spaces)
-atom_info		1			! 0: no atom info, 1: print basic atom info in the trajectory file
-atom_forces		0			! 0: basic atom format, 1: print force on each atom in the trajectory file
-atom_velocities		0			! 0: basic atom format, 1: print the velocity of each atom in the trajectory file
-bond_info		1			! 0: do not print bonds, 1: print bonds in the trajectory file
-angle_info		1			! 0: do not print angles, 1: print angles in the trajectory file
-test_forces		0			! 0: normal run, 1: at every timestep print each force type into a different file
+geo_format              1                       ! 0: xyz, 1: pdb, 2: bgf
+write_freq              1000                    ! write trajectory after so many steps
+traj_compress           0                       ! 0: no compression  1: uses zlib to compress trajectory output
+traj_format             0                       ! 0: our own format (below options apply to this only), 1: xyz, 2: bgf, 3: pdb
+traj_title              WATER_NVT               ! (no white spaces)
+atom_info               1                       ! 0: no atom info, 1: print basic atom info in the trajectory file
+atom_forces             0                       ! 0: basic atom format, 1: print force on each atom in the trajectory file
+atom_velocities         0                       ! 0: basic atom format, 1: print the velocity of each atom in the trajectory file
+bond_info               1                       ! 0: do not print bonds, 1: print bonds in the trajectory file
+angle_info              1                       ! 0: do not print angles, 1: print angles in the trajectory file
+test_forces             0                       ! 0: normal run, 1: at every timestep print each force type into a different file
 
-molec_anal		0			! 1: outputs newly formed molecules as the simulation progresses
-freq_molec_anal		0			! perform molecular analysis at every 'this many' timesteps
-dipole_anal		0			! 1: calculate a electric dipole moment of the system
-freq_dipole_anal	1			! calculate electric dipole moment at every 'this many' steps
-diffusion_coef		0			! 1: calculate diffusion coefficient of the system
-freq_diffusion_coef	1			! calculate diffusion coefficient at every 'this many' steps
-restrict_type		2			! -1: all types of atoms, 0 and up: only this type of atoms
+molec_anal              0                       ! 1: outputs newly formed molecules as the simulation progresses
+freq_molec_anal         0                       ! perform molecular analysis at every 'this many' timesteps
+dipole_anal             0                       ! 1: calculate a electric dipole moment of the system
+freq_dipole_anal        1                       ! calculate electric dipole moment at every 'this many' steps
+diffusion_coef          0                       ! 1: calculate diffusion coefficient of the system
+freq_diffusion_coef     1                       ! calculate diffusion coefficient at every 'this many' steps
+restrict_type           2                       ! -1: all types of atoms, 0 and up: only this type of atoms
diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac
index be425c8afcd700a20e48bf620f1f80bcd3dcd316..66172fa1c903671ba3d1d32c8db448fcc108b6d2 100644
--- a/sPuReMD/configure.ac
+++ b/sPuReMD/configure.ac
@@ -31,7 +31,7 @@ AC_SEARCH_LIBS([gzopen], [z])
 AC_SEARCH_LIBS([gzeof], [z])
 AC_SEARCH_LIBS([gzgets], [z])
 AC_SEARCH_LIBS([gzseek], [z])
-AC_SEARCH_LIBS([gzclose, [z]])
+AC_SEARCH_LIBS([gzclose], [z])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_INLINE
@@ -59,9 +59,43 @@ fi
 
 if test "x$BUILD_SUPERLU_MT" != "xno"
 then
+	CPPFLAGS="${CPPFLAGS} -I${BUILD_SUPERLU_MT}/include"
+	LDFLAGS="${LDFLAGS} -L${BUILD_SUPERLU_MT}/lib"
+	#TODO: implement better BLAS detection
+	LIBS="${LIBS} -lblas"
+#	BLAS_FOUND_LIBS="yes"
+#	AC_SEARCH_LIBS([dtrsv_], [blas blas_OPENMP],
+#		        [], [BLAS_FOUND_LIBS="no"], [])
+#	AS_IF([test "x${BLAS_FOUND_LIBS}" != "xyes"],
+#	      [AC_MSG_ERROR([Unable to find BLAS library.])])
+	AC_CHECK_HEADERS([slu_mt_ddefs.h], [SUPERLU_MT_FOUND_HEADERS="yes"])
+	AS_IF([test "x${SUPERLU_MT_FOUND_HEADERS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find SuperLU MT headers.])])
+	SUPERLU_MT_FOUND_LIBS="yes"
+	#TODO: fix issue where multiple -l flags added, one for each call below
+	AC_SEARCH_LIBS([intMalloc], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([get_perm_c], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([pdgstrf_init], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([pdgstrf], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([pxgstrf_finalize], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp -lblas -lblas_OPENMP])
+	AC_SEARCH_LIBS([StatAlloc], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([StatInit], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([StatFree], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([Destroy_SuperNode_SCP], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AC_SEARCH_LIBS([Destroy_CompCol_NCP], [superlu_mt superlu_mt_OPENMP],
+		        [], [SUPERLU_MT_FOUND_LIBS="no"], [-lgomp])
+	AS_IF([test "x${SUPERLU_MT_FOUND_LIBS}" != "xyes"],
+	      [AC_MSG_ERROR([Unable to find SuperLU MT library.])])
 	AC_DEFINE([HAVE_SUPERLU_MT], [1], [Define to 1 if you have SuperLU_MT support enabled.])
-	AC_SUBST(AM_CPPFLAGS, "-I${BUILD_SUPERLU_MT}/include")
-	AC_SUBST(AM_LDFLAGS, "-L${BUILD_SUPERLU_MT}/lib")
 fi
 
 AC_CONFIG_FILES([Makefile])
diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c
index 4aec2adea2bf820ebae9e1a02331e87748557028..fae32bb8f66fe1d110503e1d71d0bfe3f1cb3365 100644
--- a/sPuReMD/src/GMRES.c
+++ b/sPuReMD/src/GMRES.c
@@ -27,6 +27,13 @@
 #include <omp.h>
 
 
+typedef enum
+{
+    LOWER = 0,
+    UPPER = 1,
+} TRIANGULARITY;
+
+
 /* sparse matrix-vector product Ax=b
  * where:
  *   A: lower triangular matrix
@@ -199,7 +206,7 @@ static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
 
 
 /* solve sparse lower triangular linear system using forward substitution */
-void Forward_Subs( sparse_matrix *L, real *b, real *y )
+static void Forward_Subs( const sparse_matrix * const L, const real * const b, real * const y )
 {
     int i, pj, j, si, ei;
     real val;
@@ -222,7 +229,7 @@ void Forward_Subs( sparse_matrix *L, real *b, real *y )
 
 
 /* solve sparse upper triangular linear system using backward substitution */
-void Backward_Subs( sparse_matrix *U, real *y, real *x )
+static void Backward_Subs( const sparse_matrix * const U, const real * const y, real * const x )
 {
     int i, pj, j, si, ei;
     real val;
@@ -1222,7 +1229,7 @@ int SDM( static_storage *workspace, sparse_matrix *H,
 }
 
 
-real condest( sparse_matrix *L, sparse_matrix *U )
+real condest( const sparse_matrix * const L, const sparse_matrix * const U )
 {
     unsigned int i, N;
     real *e, c;
diff --git a/sPuReMD/src/GMRES.h b/sPuReMD/src/GMRES.h
index 391c4777b95131e55fbcfac8393638f06a3686de..2acf61f2c2df46a175b4bccf31d229c147611755 100644
--- a/sPuReMD/src/GMRES.h
+++ b/sPuReMD/src/GMRES.h
@@ -22,16 +22,8 @@
 #ifndef __GMRES_H_
 #define __GMRES_H_
 
-#define SIGN(x) (x < 0.0 ? -1 : 1);
-
 #include "mytypes.h"
 
-typedef enum
-{
-    LOWER = 0,
-    UPPER = 1,
-} TRIANGULARITY;
-
 int GMRES( static_storage*, sparse_matrix*,
            real*, real, real*, FILE*, real*, real* );
 
@@ -50,9 +42,9 @@ int PCG( static_storage*, sparse_matrix*, real*, real,
 int CG( static_storage*, sparse_matrix*,
         real*, real, real*, FILE* );
 
-int uyduruk_GMRES( static_storage*, sparse_matrix*,
-                   real*, real, real*, int, FILE* );
+int SDM( static_storage*, sparse_matrix*,
+         real*, real, real*, FILE* );
 
-real condest( sparse_matrix*, sparse_matrix* );
+real condest( const sparse_matrix * const, const sparse_matrix * const );
 
 #endif
diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c
index 7db8083933c35d3c1b234835d6b8ad7a5c54f77d..8306ac1db912641a66dfedfe40ae691e6748812a 100644
--- a/sPuReMD/src/QEq.c
+++ b/sPuReMD/src/QEq.c
@@ -28,14 +28,14 @@
 #include "slu_mt_ddefs.h"
 #endif
 
-int compare_matrix_entry(const void *v1, const void *v2)
+static int compare_matrix_entry(const void *v1, const void *v2)
 {
     /* larger element has larger column index */
     return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
 }
 
 
-void Sort_Matrix_Rows( sparse_matrix *A )
+static void Sort_Matrix_Rows( sparse_matrix * const A )
 {
     int i, si, ei;
 
@@ -51,7 +51,71 @@ void Sort_Matrix_Rows( sparse_matrix *A )
 }
 
 
-void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
+/* transpose A and copy into A^T */
+static void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t )
+{
+    unsigned int i, j, pj, *A_t_top;
+
+    if ( (A_t_top = (unsigned int*) calloc( A->n + 1, sizeof(unsigned int))) == NULL )
+    {
+	fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" );
+	exit( INSUFFICIENT_SPACE );
+    }
+
+    memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );
+
+    /* count nonzeros in each column of A^T */
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+                ++A_t->start[A->entries[pj].j + 1];
+        }
+    }
+
+    /* setup the row pointers for A^T */
+    for ( i = 1; i <= A->n; ++i )
+    {
+        A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
+    }
+
+    /* fill in A^T */
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            j = A->entries[pj].j;
+            A_t->entries[A_t_top[j]].j = i;
+            A_t->entries[A_t_top[j]].val = A->entries[pj].val;
+            ++A_t_top[j];
+        }
+    }
+
+    free( A_t_top );
+}
+
+
+/* transpose A in-place */
+static void Transpose_I( sparse_matrix * const A )
+{
+    sparse_matrix * A_t;
+
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == 0 )
+    {
+        fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
+        exit(INSUFFICIENT_SPACE);
+    }
+
+    Transpose( A, A_t );
+
+    memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
+    memcpy( A->entries, A_t->entries, sizeof(sparse_matrix_entry) * (A_t->start[A_t->n]) );
+
+    Deallocate_Matrix( A_t );
+}
+
+
+static void Calculate_Droptol( const sparse_matrix * const A, real * const droptol, real dtol )
 {
     int i, j, k;
     real val;
@@ -88,7 +152,7 @@ void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
 }
 
 
-int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
+static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
 {
     int i, j, pj;
     int fillin;
@@ -117,19 +181,21 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
     sparse_matrix * const L, sparse_matrix * const U )
 {
     unsigned int i, pj, count, *Ltop, *Utop, r;
+    sparse_matrix *A_t;
     SuperMatrix A_S, AC_S, L_S, U_S;
+    NCformat *A_S_store;
     SCPformat *L_S_store;
     NCPformat *U_S_store;
     superlumt_options_t superlumt_options;
-//    pxgstrf_shared_t pxgstrf_shared;
-//    pdgstrf_threadarg_t *pdgstrf_threadarg;
+    pxgstrf_shared_t pxgstrf_shared;
+    pdgstrf_threadarg_t *pdgstrf_threadarg;
     int_t nprocs;
     fact_t fact;
     trans_t trans;
     yes_no_t refact, usepr;
     real u, drop_tol;
-    real *a;
-    int_t *asub, *xa;
+    real *a, *at;
+    int_t *asub, *atsub, *xa, *xat;
     int_t *perm_c; /* column permutation vector */
     int_t *perm_r; /* row permutations from partial pivoting */
     void *work;
@@ -153,6 +219,7 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
 #else
     nprocs = 1;
 #endif
+
 //    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
     fact = DOFACT; /* factor from scratch */
     trans = NOTRANS;
@@ -160,17 +227,23 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
     //TODO: add to control file and use the value there to set these
     panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
     relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
-    u = 1.0;
+    u = 1.0; /* diagonal pivoting threshold */
     usepr = NO;
     drop_tol = 0.0;
     work = NULL;
     lwork = 0;
 
-    if (!(perm_r = intMalloc(A->n)))
+//#if defined(DEBUG)
+    fprintf( stderr, "nprocs = %d\n", nprocs );
+    fprintf( stderr, "Panel size = %d\n", panel_size );
+    fprintf( stderr, "Relax = %d\n", relax );
+//#endif
+
+    if ( !(perm_r = intMalloc(A->n)) )
     {
         SUPERLU_ABORT("Malloc fails for perm_r[].");
     }
-    if (!(perm_c = intMalloc(A->n)))
+    if ( !(perm_c = intMalloc(A->n)) )
     {
         SUPERLU_ABORT("Malloc fails for perm_c[].");
     }
@@ -184,43 +257,84 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
     }
     if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
     {
-	SUPERLU_ABORT("Malloc fails for colcnt_h[].");
+	SUPERLU_ABORT("Malloc fails for part_super__h[].");
     }
-    if ( (a = (real*) malloc( A->start[A->n] * sizeof(real))) == NULL
-       || (asub = (int_t*) malloc( A->start[A->n] * sizeof(int_t))) == NULL 
-       || (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
+    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
+       || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
+       || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
+       || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
+       || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
     {
-
+	fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
+	exit( INSUFFICIENT_SPACE );
+    }
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == 0 )
+    {
+        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+        exit(INSUFFICIENT_SPACE);
     }
 
     /* Set up the sparse matrix data structure for A. */
+    Transpose( A, A_t );
+
     count = 0;
     for( i = 0; i < A->n; ++i )
     {
+        xa[i] = count;
         for( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
         {
             a[count] = A->entries[pj].val;
             asub[count] = A->entries[pj].j;
             ++count;
         }
+        for( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
+        {
+            a[count] = A_t->entries[pj].val;
+            asub[count] = A_t->entries[pj].j;
+            ++count;
+        }
     }
-    memcpy( xa, A->start, sizeof(int) * (A->n + 1) );
-    dCreate_CompRow_Matrix(&A_S, A->n, A->n, A->start[A->n], a, asub, xa,
-        SLU_NR, /* row-wise, no supernode */
-        SLU_D, /* double-precision */
-        SLU_SYL /* symmetric, store lower half */
-    );
-
-
-    /********************************
-     * THE FIRST TIME FACTORIZATION *
-     ********************************/
+    xa[i] = count;
+
+    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
+        &at, &atsub, &xat );
+
+    for( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", asub[i] );
+    fprintf( stderr, "\n" );
+    for( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", a[i] );
+    fprintf( stderr, "\n" );
+    for( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xa[i] );
+    fprintf( stderr, "\n" );
+    for( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", atsub[i] );
+    fprintf( stderr, "\n" );
+    for( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", at[i] );
+    fprintf( stderr, "\n" );
+    for( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xat[i] );
+    fprintf( stderr, "\n" );
+
+    A_S.Stype = SLU_NC; /* column-wise, no supernode */
+    A_S.Dtype = SLU_D; /* double-precision */
+    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
+    A_S.nrow = A->n;
+    A_S.ncol = A->n;
+    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    A_S_store = (NCformat *) A_S.Store;
+    A_S_store->nnz = 2 * A->start[A->n] - A->n;
+    A_S_store->nzval = at;
+    A_S_store->rowind = atsub;
+    A_S_store->colptr = xat;
 
     /* ------------------------------------------------------------
        Allocate storage and initialize statistics variables. 
        ------------------------------------------------------------*/
-    StatAlloc(A->n, nprocs, panel_size, relax, &Gstat);
-    StatInit(A->n, nprocs, &Gstat);
+    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
+    StatInit( A->n, nprocs, &Gstat );
 
     /* ------------------------------------------------------------
        Get column permutation vector perm_c[], according to permc_spec:
@@ -230,22 +344,28 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
        permc_spec = 3: approximate minimum degree for unsymmetric matrices
        ------------------------------------------------------------*/ 	
     permc_spec = 0;
-    get_perm_c(permc_spec, &A_S, perm_c);
+    get_perm_c( permc_spec, &A_S, perm_c );
 
     /* ------------------------------------------------------------
        Initialize the option structure superlumt_options using the
        user-input parameters;
        Apply perm_c to the columns of original A to form AC.
        ------------------------------------------------------------*/
-    pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax,
+    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
         u, usepr, drop_tol, perm_c, perm_r,
-        work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat);
+        work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
+
+    for( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
+    fprintf( stderr, "\n" );
 
     /* ------------------------------------------------------------
        Compute the LU factorization of A.
        The following routine will create nprocs threads.
        ------------------------------------------------------------*/
-    pdgstrf(&superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info);
+    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
+
+    fprintf( stderr, "INFO: %d\n", info );
     
     flopcnt = 0;
     for (i = 0; i < nprocs; ++i)
@@ -254,70 +374,86 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
     }
     Gstat.ops[FACT] = flopcnt;
 
-    /* Free extra arrays in AC_S. */
-    Destroy_CompRow_Permuted(&AC_S);
-
-    /* ------------------------------------------------------------
-      Deallocate storage after factorization.
-      ------------------------------------------------------------*/
-    pxgstrf_finalize(&superlumt_options, &AC_S);
-
+//#if defined(DEBUG)
     printf("\n** Result of sparse LU **\n");
     L_S_store = (SCPformat *) L_S.Store;
     U_S_store = (NCPformat *) U_S.Store;
-    printf("No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz);
-    printf("No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz);
-    printf("No of nonzeros in L+U = " IFMT "\n", L_S_store->nnz + U_S_store->nnz - A->n);
-    fflush(stdout);
+    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
+    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
+    fflush( stdout );
+//#endif
 
-    /* convert L and R from SuperMatrix to CSR */
+    /* convert L and R from SuperLU formats to CSR */
     memset( Ltop, 0, (A->n + 1) * sizeof(int) );
     memset( Utop, 0, (A->n + 1) * sizeof(int) );
+    memset( L->start, 0, (A->n + 1) * sizeof(int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(int) );
+
+    for( i = 0; i < 2 * L_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+    for( i = 0; i < 2 * U_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
     for( i = 0; i < A->n; ++i )
     {
+        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
+        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
         //TODO: correct for SCPformat for L?
-        for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
-        {
-            ++Ltop[L_S_store->rowind[pj] + 1];
-        }
+        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
+//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
+//        {
+//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
+//        }
+        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
+        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
         for( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
         {
             ++Utop[U_S_store->rowind[pj] + 1];
+            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
         }
     }
     for( i = 1; i <= A->n; ++i )
     {
-        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
-        Utop[i] = U->start[i] = Ltop[i] + Utop[i - 1];
+//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
+        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
+//        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
+//        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
     }
     for( i = 0; i < A->n; ++i )
     {
-        //TODO: correct for SCPformat for L?
-        for( pj = L_S_store->nzval_colbeg[i]; pj < L_S_store->nzval_colend[i]; ++pj )
-        {
-            r = L_S_store->rowind[pj];
-            L->entries[Ltop[r]].j = i;
-            L->entries[Ltop[r]].val = L_S_store->nzval[pj];
-            ++Ltop[r];
-        }
+//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
+//        {
+//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
+//            L->entries[Ltop[r]].j = r;
+//            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
+//            ++Ltop[r];
+//        }
         for( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
         {
             r = U_S_store->rowind[pj];
             U->entries[Utop[r]].j = i;
-            U->entries[Utop[r]].val = U_S_store->nzval[pj];
+            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
             ++Utop[r];
         }
     }
 
+    /* ------------------------------------------------------------
+      Deallocate storage after factorization.
+      ------------------------------------------------------------*/
+    pxgstrf_finalize( &superlumt_options, &AC_S );
+    Deallocate_Matrix( A_t );
     free( xa );
     free( asub );
     free( a );
-    SUPERLU_FREE (perm_r);
-    SUPERLU_FREE (perm_c);
-    SUPERLU_FREE (superlumt_options.etree);
-    SUPERLU_FREE (superlumt_options.colcnt_h);
-    SUPERLU_FREE (superlumt_options.part_super_h);
-    Destroy_CompRow_Matrix(&A_S);
+    SUPERLU_FREE( perm_r );
+    SUPERLU_FREE( perm_c );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
+    SUPERLU_FREE( A_S.Store );
     if ( lwork == 0 )
     {
         Destroy_SuperNode_SCP(&L_S);
@@ -329,15 +465,39 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
     }
     StatFree(&Gstat);
 
+    free( Utop );
+    free( Ltop );
+
     //TODO: return iters
     return 0.;
 }
 #endif
 
 
+/* Diagonal preconditioner */
+static real diagonal_pre( const reax_system * const system, real * const Hdia_inv )
+{
+    unsigned int i;
+    struct timeval start, stop;
+
+    gettimeofday( &start, NULL );
+
+    #pragma omp parallel for schedule(guided) \
+        default(none) private(i)
+    for ( i = 0; i < system->N; ++i )
+    {
+        Hdia_inv[i] = 1.0 / system->reaxprm.sbp[system->atoms[i].type].eta;
+    }
+
+    gettimeofday( &stop, NULL );
+    return (stop.tv_sec + stop.tv_usec / 1000000.0)
+        - (start.tv_sec + start.tv_usec / 1000000.0);
+}
+
+
 /* Incomplete Cholesky factorization with thresholding */
-real ICHOLT( sparse_matrix *A, real *droptol,
-            sparse_matrix *L, sparse_matrix *U )
+static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
+            sparse_matrix * const L, sparse_matrix * const U )
 {
     sparse_matrix_entry tmp[1000];
     int i, j, pj, k1, k2, tmptop, Ltop;
@@ -655,31 +815,247 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
 }
 
 
-void Init_MatVec( reax_system *system, control_params *control,
-                  simulation_data *data, static_storage *workspace,
-                  list *far_nbrs )
+/* Fine-grained (parallel) incomplete LU factorization
+ * 
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ * 
+ * A: symmetric, half-format (lower triangular)
+ * sweeps: number of ILUL_PAR loops over non-zeros
+ * L / U: factorized matrices (A \approx LU) */
+static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+                       sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
+    real *D, *D_inv, sum;
+    sparse_matrix *DAD;
+    struct timeval start, stop;
+
+    gettimeofday( &start, NULL );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == 0 )
+    {
+        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+        exit(INSUFFICIENT_SPACE);
+    }
+
+    /*TODO: check malloc return status*/
+    D = (real*) malloc(A->n * sizeof(real));
+    D_inv = (real*) malloc(A->n * sizeof(real));
+
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(D, D_inv) private(i)
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->entries[A->start[i + 1] - 1].val );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./sqrt(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(DAD, D) private(i, pj)
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->entries[pj].j = A->entries[pj].j;
+            DAD->entries[pj].val =
+                D[i] * A->entries[pj].val * D[A->entries[pj].j];
+        }
+        /* diagonal */
+        DAD->entries[pj].j = A->entries[pj].j;
+        DAD->entries[pj].val = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L->entries, DAD->entries, sizeof(sparse_matrix_entry) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and tranpose later */
+    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U->entries, DAD->entries, sizeof(sparse_matrix_entry) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+    #pragma omp parallel for schedule(guided) \
+        default(none) private(i)
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->entries[L->start[i + 1] - 1].val = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+        #pragma omp parallel for schedule(guided) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->entries[j].j];
+            ei_y = DAD->start[DAD->entries[j].j + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L->entries[x].j < L->entries[j].j &&
+                    L->entries[y].j < L->entries[j].j &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L->entries[x].j == L->entries[y].j )
+                {
+                    sum += (L->entries[x].val * U->entries[y].val);
+                    ++x;
+                    ++y;
+                }
+                else if ( L->entries[x].j < L->entries[y].j )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if( j != ei_x - 1 )
+            {
+                L->entries[j].val = ( DAD->entries[j].val - sum ) / U->entries[ei_y - 1].val;
+            }
+        }
+
+        #pragma omp parallel for schedule(guided) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->entries[j].j];
+            ei_y = DAD->start[DAD->entries[j].j + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U->entries[x].j < U->entries[j].j &&
+                    U->entries[y].j < U->entries[j].j &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U->entries[x].j == U->entries[y].j )
+                {
+                    sum += (L->entries[y].val * U->entries[x].val);
+                    ++x;
+                    ++y;
+                }
+                else if ( U->entries[x].j < U->entries[y].j )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U->entries[j].val = DAD->entries[j].val - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(DAD, D_inv) private(i, pj)
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L->entries[pj].val = D_inv[i] * L->entries[pj].val;
+            /* currently storing U^T, so use row index instead of column index */
+            U->entries[pj].val = U->entries[pj].val * D_inv[i];
+        }
+    }
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
+    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    free( D_inv );
+    free( D );
+
+    gettimeofday( &stop, NULL );
+    return (stop.tv_sec + stop.tv_usec / 1000000.0)
+        - (start.tv_sec + start.tv_usec / 1000000.0);
+}
+
+
+static void Init_MatVec( const reax_system * const system, const control_params * const control,
+                  simulation_data * const data, static_storage * const workspace,
+                  const list * const far_nbrs )
 {
     int i, fillin;
     real s_tmp, t_tmp;
+    sparse_matrix *H_test, *L_test, *U_test;
 //    char fname[100];
 
-    if (control->refactor > 0 &&
-            ((data->step - data->prev_steps) % control->refactor == 0 || workspace->L == NULL))
+    if (control->pre_comp_refactor > 0 &&
+            ((data->step - data->prev_steps) % control->pre_comp_refactor == 0 || workspace->L == NULL))
     {
         //Print_Linear_System( system, control, workspace, data->step );
         Sort_Matrix_Rows( workspace->H );
         Sort_Matrix_Rows( workspace->H_sp );
-        //fprintf( stderr, "H matrix sorted\n" );
-	if ( control->pre_comp == ICHOLT_PC )
+#if defined(DEBUG)
+        fprintf( stderr, "H matrix sorted\n" );
+#endif
+
+	switch ( control->pre_comp_type )
 	{
-            Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
-            //fprintf( stderr, "drop tolerances calculated\n" );
-	}
-        if ( workspace->L == NULL )
-        {
-	    switch ( control->pre_comp )
-	    {
-	        case ICHOLT_PC:
+	    case DIAG_PC:
+                if ( workspace->Hdia_inv == NULL )
+                {
+                    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
+                }
+                data->timing.pre_comp += diagonal_pre( system, workspace->Hdia_inv );
+		break;
+	    case ICHOLT_PC:
+                Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol );
+#if defined(DEBUG_FOCUS)
+                fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+                if ( workspace->L == NULL )
+                {
                     fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
                     if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
                             Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 )
@@ -687,9 +1063,19 @@ void Init_MatVec( reax_system *system, control_params *control,
                         fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
                         exit(INSUFFICIENT_SPACE);
                     }
-    		    break;
-		case ICHOL_PAR_PC:
-                case ILU_SUPERLU_MT_PC:
+#if defined(DEBUG)
+                    fprintf( stderr, "fillin = %d\n", fillin );
+                    fprintf( stderr, "allocated memory: L = U = %ldMB\n",
+                                 fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) );
+#endif
+                }
+
+                data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
+//                data->timing.pre_comp += ICHOLT( workspace->H_sp, workspace->droptol, workspace->L, workspace->U );
+		break;
+	    case ICHOL_PAR_PC:
+                if ( workspace->L == NULL )
+                {
                     /* factors have sparsity pattern as H */
                     if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 ||
                             Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 )
@@ -697,34 +1083,73 @@ void Init_MatVec( reax_system *system, control_params *control,
                         fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
                         exit(INSUFFICIENT_SPACE);
                     }
-		    break;
-		default:
-		    break;
-	    }
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                     fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) );
-#endif
-        }
+                }
 
-	switch ( control->pre_comp )
-	{
-	    case ICHOLT_PC:
-                data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
-//                data->timing.pre_comp += ICHOLT( workspace->H_sp, workspace->droptol, workspace->L, workspace->U );
-		break;
-	    case ICHOL_PAR_PC:
-                data->timing.pre_comp += ICHOL_PAR( workspace->H, 1, workspace->L, workspace->U );
+//                data->timing.pre_comp += ICHOL_PAR( workspace->H, control->pre_comp_sweeps, workspace->L, workspace->U );
+                data->timing.pre_comp += ILU_PAR( workspace->H, control->pre_comp_sweeps, workspace->L, workspace->U );
+
+//                Print_Sparse_Matrix2( workspace->H, "H.out" );
+//                Print_Sparse_Matrix2( workspace->L, "L.out" );
+//                Print_Sparse_Matrix2( workspace->U, "U.out" );
+
+//                if ( Allocate_Matrix( &H_test, 3, 6 ) == 0 ||
+//                     Allocate_Matrix( &L_test, 3, 6 ) == 0 ||
+//                     Allocate_Matrix( &U_test, 3, 6 ) == 0 )
+//                {
+//                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+//                    exit(INSUFFICIENT_SPACE);
+//                }
+//
+//                //3x3, SPD, store lower half
+//                H_test->start[0] = 0;
+//                H_test->start[1] = 1;
+//                H_test->start[2] = 3;
+//                H_test->start[3] = 6;
+//                H_test->entries[0].j = 0;
+//                H_test->entries[0].val = 4.;
+//                H_test->entries[1].j = 0;
+//                H_test->entries[1].val = 12.;
+//                H_test->entries[2].j = 1;
+//                H_test->entries[2].val = 37.;
+//                H_test->entries[3].j = 0;
+//                H_test->entries[3].val = -16.;
+//                H_test->entries[4].j = 1;
+//                H_test->entries[4].val = -43.;
+//                H_test->entries[5].j = 2;
+//                H_test->entries[5].val = 98.;
+//                
+////                data->timing.pre_comp += ICHOLT( H_test, workspace->droptol, L_test, U_test );
+////                Print_Sparse_Matrix2( L_test, "L_ICHOLT.out" );
+////                Print_Sparse_Matrix2( U_test, "U_ICHOLT.out" );
+////                data->timing.pre_comp += ICHOL_PAR( H_test, 1, L_test, U_test );
+//                data->timing.pre_comp += ILU_PAR( H_test, 1, L_test, U_test );
+//                Print_Sparse_Matrix2( L_test, "L_SLU.out" );
+//                Print_Sparse_Matrix2( U_test, "U_SLU.out" );
+//                exit( 0 );
 		break;
 	    case ILU_SUPERLU_MT_PC:
+                if ( workspace->L == NULL )
+                {
+                    /* factors have sparsity pattern as H */
+                    if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 ||
+                            Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 )
+                    {
+                        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                        exit(INSUFFICIENT_SPACE);
+                    }
+                }
+
                 data->timing.pre_comp += SuperLU_Factorize( workspace->H, workspace->L, workspace->U );
 		break;
 	    default:
-		break;
+                fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+                exit( INVALID_INPUT );
+	        break;
 	}
 
-//        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+#if defined(DEBUG)
+        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+#endif
 
 #if defined(DEBUG_FOCUS)
         sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
@@ -781,8 +1206,7 @@ void Init_MatVec( reax_system *system, control_params *control,
 }
 
 
-
-void Calculate_Charges( reax_system *system, static_storage *workspace )
+static void Calculate_Charges( const reax_system * const system, static_storage * const workspace )
 {
     int i;
     real u, s_sum, t_sum;
@@ -800,9 +1224,9 @@ void Calculate_Charges( reax_system *system, static_storage *workspace )
 }
 
 
-void QEq( reax_system *system, control_params *control, simulation_data *data,
-          static_storage *workspace, list *far_nbrs,
-          output_controls *out_control )
+void QEq( reax_system * const system, control_params * const control, simulation_data * const data,
+          static_storage * const workspace, const list * const far_nbrs,
+          const output_controls * const out_control )
 {
     int matvecs;
 
@@ -811,55 +1235,55 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 //    if( data->step == 0 || data->step == 100 )
 //      Print_Linear_System( system, control, workspace, data->step );
 
-    switch ( control->solver )
+    switch ( control->qeq_solver_type )
     {
         case GMRES_S:
             matvecs = GMRES( workspace, workspace->H,
-                             workspace->b_s, control->q_err, workspace->s[0], out_control->log,
+                             workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log,
                              &(data->timing.pre_app), &(data->timing.spmv) );
             matvecs += GMRES( workspace, workspace->H,
-                              workspace->b_t, control->q_err, workspace->t[0], out_control->log, 
+                              workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log, 
                               &(data->timing.pre_app), &(data->timing.spmv) );
             break;
         case GMRES_H_S:
             matvecs = GMRES_HouseHolder( workspace, workspace->H,
-                                         workspace->b_s, control->q_err, workspace->s[0], out_control->log );
+                                         workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log );
             matvecs += GMRES_HouseHolder( workspace, workspace->H,
-                                          workspace->b_t, control->q_err, workspace->t[0], out_control->log );
+                                          workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log );
             break;
         case PGMRES_S:
-            matvecs = PGMRES( workspace, workspace->H, workspace->b_s, control->q_err,
+            matvecs = PGMRES( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
                               workspace->L, workspace->U, workspace->s[0], out_control->log,
                               &(data->timing.pre_app), &(data->timing.spmv) );
-            matvecs += PGMRES( workspace, workspace->H, workspace->b_t, control->q_err,
+            matvecs += PGMRES( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
                                workspace->L, workspace->U, workspace->t[0], out_control->log,
                                &(data->timing.pre_app), &(data->timing.spmv) );
             break;
         case PGMRES_J_S:
-            matvecs = PGMRES_Jacobi( workspace, workspace->H, workspace->b_s, control->q_err,
-                                     workspace->L, workspace->U, workspace->s[0], control->jacobi_iters,
+            matvecs = PGMRES_Jacobi( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                                     workspace->L, workspace->U, workspace->s[0], control->pre_app_jacobi_iters,
 				     out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
-            matvecs += PGMRES_Jacobi( workspace, workspace->H, workspace->b_t, control->q_err,
-                                      workspace->L, workspace->U, workspace->t[0], control->jacobi_iters,
+            matvecs += PGMRES_Jacobi( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                                      workspace->L, workspace->U, workspace->t[0], control->pre_app_jacobi_iters,
 				      out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
             break;
         case CG_S:
             matvecs = CG( workspace, workspace->H,
-                          workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
+                          workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log ) + 1;
             matvecs += CG( workspace, workspace->H,
-                           workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
+                           workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log ) + 1;
             break;
         case PCG_S:
-            matvecs = PCG( workspace, workspace->H, workspace->b_s, control->q_err,
+            matvecs = PCG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
                          workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
-            matvecs += PCG( workspace, workspace->H, workspace->b_t, control->q_err,
+            matvecs += PCG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
                             workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
             break;
         case SDM_S:
             matvecs = SDM( workspace, workspace->H,
-                           workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
+                           workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log ) + 1;
             matvecs += SDM( workspace, workspace->H,
-                            workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
+                            workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log ) + 1;
             break;
 	default:
             fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
diff --git a/sPuReMD/src/QEq.h b/sPuReMD/src/QEq.h
index f598061be2c530e58ad90f04f779adafeba2f44d..c7186aaee64b35ab67a6fe590de525d99db08c32 100644
--- a/sPuReMD/src/QEq.h
+++ b/sPuReMD/src/QEq.h
@@ -24,7 +24,8 @@
 
 #include "mytypes.h"
 
-void QEq( reax_system*, control_params*, simulation_data*, static_storage*,
-          list*, output_controls* );
+void QEq( reax_system* const, control_params* const, simulation_data* const,
+          static_storage* const, const list* const,
+          const output_controls* const );
 
 #endif
diff --git a/sPuReMD/src/grid.c b/sPuReMD/src/grid.c
index 20c87c12514caa6673c6212978bb8aaa45f987ef..0d618f98a664018ad672979bfc871d0797106628 100644
--- a/sPuReMD/src/grid.c
+++ b/sPuReMD/src/grid.c
@@ -454,7 +454,6 @@ void Copy_Storage( reax_system *system, static_storage *workspace,
 
     orig_id[top]  = workspace->orig_id[old_id];
 
-    workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta;
     workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi;
     workspace->b_t[top] = -1.0;
 
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index aaa1a61fd4a2ed9c445233b3c3cfae71e9f06e82..b6b5997db4000d3d67155189090b4be74e9318f2 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -251,7 +251,6 @@ void Init_Workspace( reax_system *system, control_params *control,
     workspace->U        = NULL;
     workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
     workspace->w        = (real *) calloc( system->N, sizeof( real ) );
-    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
     workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
     workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
     workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
@@ -272,7 +271,6 @@ void Init_Workspace( reax_system *system, control_params *control,
 
     for ( i = 0; i < system->N; ++i )
     {
-        workspace->Hdia_inv[i] = 1. / system->reaxprm.sbp[system->atoms[i].type].eta;
         workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
         workspace->b_t[i] = -1.0;
 
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index b3bb106c710dfe6101ffeb2699abc971236ab90f..9d5136efec2dfee42cfc8a0ecf9c6f02b869ccd8 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -443,7 +443,6 @@ typedef struct
     real thb_cut;
     real hb_cut;
     real Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0;
-    real q_err;
     int  max_far_nbrs;
 
     real T_init, T_final, T;
@@ -467,12 +466,13 @@ typedef struct
     int freq_diffusion_coef;
     int restrict_type;
 
-    unsigned int solver;
-    unsigned int pre_comp;
-    int refactor;
-    real droptol;
-    unsigned int jacobi_iters;
-
+    unsigned int qeq_solver_type;
+    real qeq_solver_q_err;
+    unsigned int pre_comp_type;
+    unsigned int pre_comp_refactor;
+    real pre_comp_droptol;
+    unsigned int pre_comp_sweeps;
+    unsigned int pre_app_jacobi_iters;
 
     int molec_anal;
     int freq_molec_anal;
diff --git a/sPuReMD/src/param.c b/sPuReMD/src/param.c
index 406a86935441302cd318b4ccf24af04ede5dfa0d..4cae89e79cfc93db2e32c9cec68563c76d7f4208 100644
--- a/sPuReMD/src/param.c
+++ b/sPuReMD/src/param.c
@@ -868,13 +868,15 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
     control->thb_cut = 0.001;
     control->hb_cut = 7.50;
 
-    control->q_err = 0.000001;
     control->tabulate = 0;
-    control->solver = PGMRES_S;
-    control->pre_comp = ICHOLT_PC;
-    control->refactor = 100;
-    control->droptol = 0.01;
-    control->jacobi_iters = 0;
+
+    control->qeq_solver_type = PGMRES_S;
+    control->qeq_solver_q_err = 0.000001;
+    control->pre_comp_type = ICHOLT_PC;
+    control->pre_comp_sweeps = ICHOLT_PC;
+    control->pre_comp_refactor = 100;
+    control->pre_comp_droptol = 0.01;
+    control->pre_app_jacobi_iters = 10;
 
     control->T_init = 0.;
     control->T_final = 300.;
@@ -1035,35 +1037,40 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
             val = atof( tmp[1] );
             control->hb_cut = val;
         }
-        else if ( strcmp(tmp[0], "q_err") == 0 )
+        else if ( strcmp(tmp[0], "qeq_solver_type") == 0 )
         {
-            val = atof( tmp[1] );
-            control->q_err = val;
+            ival = atoi( tmp[1] );
+            control->qeq_solver_type = ival;
         }
-        else if ( strcmp(tmp[0], "solver") == 0 )
+        else if ( strcmp(tmp[0], "qeq_solver_q_err") == 0 )
         {
-            ival = atoi( tmp[1] );
-            control->solver = ival;
+            val = atof( tmp[1] );
+            control->qeq_solver_q_err = val;
         }
-        else if ( strcmp(tmp[0], "pre_comp") == 0 )
+        else if ( strcmp(tmp[0], "pre_comp_type") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->pre_comp = ival;
+            control->pre_comp_type = ival;
         }
-        else if ( strcmp(tmp[0], "pre_refactor") == 0 )
+        else if ( strcmp(tmp[0], "pre_comp_refactor") == 0 )
         {
             ival = atoi( tmp[1] );
-            control->refactor = ival;
+            control->pre_comp_refactor = ival;
         }
-        else if ( strcmp(tmp[0], "pre_droptol") == 0 )
+        else if ( strcmp(tmp[0], "pre_comp_droptol") == 0 )
         {
             val = atof( tmp[1] );
-            control->droptol = val;
+            control->pre_comp_droptol = val;
+        }
+        else if ( strcmp(tmp[0], "pre_comp_sweeps") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_comp_sweeps = ival;
         }
-        else if ( strcmp(tmp[0], "pre_j_iters") == 0 )
+        else if ( strcmp(tmp[0], "pre_app_jacobi_iters") == 0 )
         {
             val = atof( tmp[1] );
-            control->jacobi_iters = val;
+            control->pre_app_jacobi_iters = val;
         }
         else if ( strcmp(tmp[0], "temp_init") == 0 )
         {
diff --git a/sPuReMD/src/traj.c b/sPuReMD/src/traj.c
index 49a1595adce827cc0926af740e9aae59d127bf1f..81207a00720ee20143fe066cf939d41e7f244130 100644
--- a/sPuReMD/src/traj.c
+++ b/sPuReMD/src/traj.c
@@ -53,7 +53,7 @@ int Write_Custom_Header(reax_system *system, control_params *control,
              control->bo_cut,
              control->thb_cut,
              control->hb_cut,
-             control->q_err,
+             control->qeq_solver_q_err,
              control->T_init,
              control->T_final,
              control->Tau_T,