diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c
index 6f526aeb12fd56b9d266ffbf670f5f5533b3a78b..9037d314162cc56ffb09778d630835181097d17f 100644
--- a/sPuReMD/src/GMRES.c
+++ b/sPuReMD/src/GMRES.c
@@ -22,6 +22,7 @@
 #include "GMRES.h"
 #include "allocate.h"
 #include "list.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
@@ -38,7 +39,7 @@ typedef enum
  *   x: vector
  *   b: vector (result) */
 static void Sparse_MatVec( const sparse_matrix * const A,
-                           const real * const x, real * const b )
+        const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
@@ -124,8 +125,8 @@ static void Sparse_MatVec( const sparse_matrix * const A,
  *   b: vector (result)
  *   D^{-1}b (Dinv_b): precomputed vector-vector product */
 static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
-                            const TRIANGULARITY tri, const real * const Dinv,
-                            const real * const x, real * const b, const real * const Dinv_b)
+        const TRIANGULARITY tri, const real * const Dinv,
+        const real * const x, real * const b, const real * const Dinv_b)
 {
     int i, k, si = 0, ei = 0;
 #ifdef _OPENMP
@@ -203,109 +204,157 @@ static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
 }
 
 
-/* solve sparse lower triangular linear system using forward substitution */
-static void Forward_Subs( const sparse_matrix * const L, const real * const b, real * const y )
+static void diag_pre_app( const real * const Hdia_inv, const real * const y,
+        real * const x, const int N )
 {
-    int i, pj, j, si, ei;
-    real val;
+    unsigned int i;
 
-    for ( i = 0; i < L->n; ++i )
+    #pragma omp parallel for schedule(guided) \
+        default(none) private(i) shared(stderr)
+    for ( i = 0; i < N; ++i )
     {
-        y[i] = b[i];
-        si = L->start[i];
-        ei = L->start[i + 1];
-        for ( pj = si; pj < ei - 1; ++pj )
-        {
-            j = L->j[pj];
-            val = L->val[pj];
-            y[i] -= val * y[j];
-        }
-        y[i] /= L->val[pj];
+        x[i] = y[i] * Hdia_inv[i];
     }
 }
 
 
-/* solve sparse upper triangular linear system using backward substitution */
-static void Backward_Subs( const sparse_matrix * const U, const real * const y, real * const x )
+/* Solve triangular system LU*x = y using level scheduling
+ *
+ * LU: lower/upper triangular, stored in CSR
+ * y: constants in linear system (RHS)
+ * x: solution
+ * tri: triangularity of LU (lower/upper)
+ * 
+ * Assumptions:
+ *   LU has non-zero diagonals
+ *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
+static void tri_solve( const sparse_matrix * const LU, const real * const y,
+        real * const x, const TRIANGULARITY tri )
 {
     int i, pj, j, si, ei;
     real val;
 
-    for ( i = U->n - 1; i >= 0; --i )
+    if ( tri == LOWER )
     {
-        x[i] = y[i];
-        si = U->start[i];
-        ei = U->start[i + 1];
-        for ( pj = si + 1; pj < ei; ++pj )
+        for ( i = 0; i < LU->n; ++i )
         {
-            j = U->j[pj];
-            val = U->val[pj];
-            x[i] -= val * x[j];
+            x[i] = y[i];
+            si = LU->start[i];
+            ei = LU->start[i + 1];
+            for ( pj = si; pj < ei - 1; ++pj )
+            {
+                j = LU->j[pj];
+                val = LU->val[pj];
+                x[i] -= val * x[j];
+            }
+            x[i] /= LU->val[pj];
+        }
+    }
+    else
+    {
+        for ( i = LU->n - 1; i >= 0; --i )
+        {
+            x[i] = y[i];
+            si = LU->start[i];
+            ei = LU->start[i + 1];
+            for ( pj = si + 1; pj < ei; ++pj )
+            {
+                j = LU->j[pj];
+                val = LU->val[pj];
+                x[i] -= val * x[j];
+            }
+            x[i] /= LU->val[si];
         }
-        x[i] /= U->val[si];
     }
 }
 
 
-/* Triangular solve LU*x = y using level scheduling
+/* Solve triangular system LU*x = y using level scheduling
  *
  * LU: lower/upper triangular, stored in CSR
  * y: constants in linear system (RHS)
  * x: solution
- * tri: triangularity of A (lower/upper)
+ * tri: triangularity of LU (lower/upper)
+ * find_levels: perform level search if positive, otherwise reuse existing levels
  * 
  * Assumptions:
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
-static void tri_solve_level_sched( sparse_matrix * const LU, const real * const y,
-    real * const x, const TRIANGULARITY tri )
+static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y,
+        real * const x, const TRIANGULARITY tri, int find_levels )
 {
-    int i, j, pj, local_row, local_level, levels = 1;
+    int i, j, pj, local_row, local_level, levels;
+    static int levels_L = 1, levels_U = 1;
+    static unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
+    static unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
     unsigned int *row_levels, *level_rows, *level_rows_cnt;
-    
-    if ( (row_levels = (unsigned int*) calloc(LU->n, sizeof(unsigned int))) == NULL
-        || (level_rows = (unsigned int*) malloc(LU->n * LU->n * sizeof(unsigned int))) == NULL
-        || (level_rows_cnt = (unsigned int*) calloc(LU->n, sizeof(unsigned int))) == NULL )
+
+    if ( tri == LOWER )
     {
-	fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
-	exit( INSUFFICIENT_MEMORY );
+        row_levels = row_levels_L;
+        level_rows = level_rows_L;
+        level_rows_cnt = level_rows_cnt_L;
+        levels = levels_L;
+    }
+    else
+    {
+        row_levels = row_levels_U;
+        level_rows = level_rows_U;
+        level_rows_cnt = level_rows_cnt_U;
+        levels = levels_U;
+    }
+
+    if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
+    {
+        if ( (row_levels = (unsigned int*) malloc(LU->n * sizeof(unsigned int))) == NULL
+            || (level_rows = (unsigned int*) malloc(LU->n * LU->n * sizeof(unsigned int))) == NULL
+            || (level_rows_cnt = (unsigned int*) malloc(LU->n * sizeof(unsigned int))) == NULL )
+        {
+            fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
     }
 
     /* find levels (row dependencies in substitutions) */
-    if ( tri == LOWER )
+    if ( find_levels )
     {
-        for ( i = 0; i < LU->n; ++i )
+        memset( row_levels, 0, LU->n * sizeof( unsigned int) );
+        memset( level_rows_cnt, 0, LU->n * sizeof( unsigned int) );
+
+        if ( tri == LOWER )
         {
-            local_level = 0;
-            for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
+            for ( i = 0; i < LU->n; ++i )
             {
-                local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                local_level = 0;
+                for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
+                {
+                    local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                }
+        
+                levels = MAX( levels, local_level + 1 );
+                row_levels[i] = local_level;
+                level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i;
+                ++level_rows_cnt[local_level];
             }
-    
-            levels = MAX( levels, local_level + 1 );
-            row_levels[i] = local_level;
-            level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i;
-            ++level_rows_cnt[local_level];
         }
-    }
-    else
-    {
-        for ( i = LU->n - 1; i >= 0; --i )
+        else
         {
-            local_level = 0;
-            for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
+            for ( i = LU->n - 1; i >= 0; --i )
             {
-                local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                local_level = 0;
+                for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
+                {
+                    local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
+                }
+        
+                levels = MAX( levels, local_level + 1 );
+                row_levels[i] = local_level;
+                level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i;
+                ++level_rows_cnt[local_level];
             }
-    
-            levels = MAX( levels, local_level + 1 );
-            row_levels[i] = local_level;
-            level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i;
-            ++level_rows_cnt[local_level];
         }
     }
 
-
     /* perform substitutions by level */
     if ( tri == LOWER )
     {
@@ -346,9 +395,21 @@ static void tri_solve_level_sched( sparse_matrix * const LU, const real * const
         }
     }
 
-    free( level_rows_cnt );
-    free( level_rows );
-    free( row_levels );
+    /* save level info for re-use if performing repeated triangular solves via preconditioning */
+    if ( tri == LOWER )
+    {
+        row_levels_L = row_levels;
+        level_rows_L = level_rows;
+        level_rows_cnt_L = level_rows_cnt;
+        levels_L = levels;
+    }
+    else
+    {
+        row_levels_U = row_levels;
+        level_rows_U = level_rows;
+        level_rows_cnt_U = level_rows_cnt;
+        levels_U = levels;
+    }
 }
 
 
@@ -363,9 +424,9 @@ static void tri_solve_level_sched( sparse_matrix * const LU, const real * const
  *
  * Note: Newmann series arises from series expansion of the inverse of
  * the coefficient matrix in the triangular system */
-static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
-                         const real * const Dinv, const unsigned int n,
-                         const real * const b, real * const x, const unsigned int maxiter )
+static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
+        const real * const b, real * const x, const TRIANGULARITY tri,
+        const unsigned int maxiter )
 {
     unsigned int i, k, si = 0, ei = 0, iter = 0;
 #ifdef _OPENMP
@@ -387,7 +448,7 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
              * overhead per Sparse_MatVec call*/
             if ( Dinv_b == NULL )
             {
-                if ( (Dinv_b = (real*) malloc(sizeof(real) * n)) == NULL )
+                if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL )
                 {
                     fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                     exit( INSUFFICIENT_MEMORY );
@@ -395,7 +456,7 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
             }
             if ( rp == NULL )
             {
-                if ( (rp = (real*) malloc(sizeof(real) * n)) == NULL )
+                if ( (rp = (real*) malloc(sizeof(real) * R->n)) == NULL )
                 {
                     fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                     exit( INSUFFICIENT_MEMORY );
@@ -403,7 +464,7 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
             }
             if ( rp2 == NULL )
             {
-                    if ( (rp2 = (real*) malloc(sizeof(real) * n)) == NULL )
+                    if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
                 {
                     fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                     exit( INSUFFICIENT_MEMORY );
@@ -420,14 +481,14 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
 	    }
 #endif
     
-            Vector_MakeZero( rp, n );
+            Vector_MakeZero( rp, R->n );
 	}
     
         #pragma omp barrier
     
         /* precompute and cache, as invariant in loop below */
         #pragma omp for schedule(guided)
-        for ( i = 0; i < n; ++i )
+        for ( i = 0; i < R->n; ++i )
         {
             Dinv_b[i] = Dinv[i] * b[i];
         }
@@ -456,7 +517,7 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
                     si = R->start[i];
                     ei = R->start[i + 1] - 1;
                 }
-                else if (tri == UPPER)
+                else
                 {
     
                     si = R->start[i] + 1;
@@ -506,42 +567,195 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
         while ( iter < maxiter );
     }
 
-    Vector_Copy( x, rp, n );
+    Vector_Copy( x, rp, R->n );
 }
 
 
-/* generalized minimual residual iterative solver for sparse linear systems,
- * diagonal preconditioner */
-int GMRES( static_storage *workspace, sparse_matrix *H,
-           real *b, real tol, real *x, FILE *fout, real *time, real *spmv_time )
+/* Solve triangular system LU*x = y using level scheduling
+ *
+ * workspace: lower/upper triangular, stored in CSR
+ * y: constants in linear system (RHS)
+ * x: solution
+ * tri: triangularity of preconditoning factor LU (lower/upper), if using ILU-based
+ * pc_type: preconditioner computation method used
+ * pa_type: preconditioner application method to use
+ * extra: parameter for some application methods, if applicable
+ * 
+ * Assumptions:
+ *   LU has non-zero diagonals
+ *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
+void apply_preconditioner( const static_storage * const workspace, real * const y,
+        real * const x, const TRIANGULARITY tri, int pc_type, int pa_type, int extra )
 {
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    struct timeval start, stop;
+    int i, si;
+    sparse_matrix *LU;
+    static real *Dinv_L = NULL, *Dinv_U = NULL;
+
+    if ( tri == LOWER )
+    {
+        LU = workspace->L;
+    }
+    else
+    {
+        LU = workspace->U;
+    }
+
+    if ( ( pc_type != DIAG_PC || tri != LOWER ) && LU == NULL )
+    {
+        return;
+    }
+
+    switch ( pa_type )
+    {
+        case NONE_PA:
+            break;
+        case TRI_SOLVE_PA:
+            switch ( pc_type )
+            {
+                case DIAG_PC:
+                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+                    break;
+                case ICHOLT_PC:
+                case ILU_PAR_PC:
+                    tri_solve( LU, y, x, tri );
+                    break;
+                default:
+                    break;
+            }
+            break;
+        case TRI_SOLVE_LEVEL_SCHED_PA:
+            switch ( pc_type )
+            {
+                case DIAG_PC:
+                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+                    break;
+                case ICHOLT_PC:
+                case ILU_PAR_PC:
+                    tri_solve_level_sched( LU, y, x, tri, extra );
+                    break;
+                default:
+                    break;
+            }
+            break;
+        case JACOBI_ITER_PA:
+            switch ( pc_type )
+            {
+                case DIAG_PC:
+                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                    exit( INVALID_INPUT );
+                    break;
+                case ICHOLT_PC:
+                case ILU_PAR_PC:
+                    if ( tri == LOWER )
+                    {
+                        if ( Dinv_L == NULL )
+                        {
+                            if ( (Dinv_L = (real*) malloc(sizeof(real) * LU->n)) == NULL )
+                            {
+                                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                                exit( INSUFFICIENT_MEMORY );
+                            }
+                        }
+
+                        /* construct D^{-1} */
+                        for ( i = 0; i < LU->n; ++i )
+                        {
+                            si = LU->start[i + 1] - 1;
+                            Dinv_L[i] = 1. / LU->val[si];
+                        }
+
+                        jacobi_iter( LU, Dinv_L, y, x, tri, extra );
+                    }
+                    else
+                    {
+                        if ( Dinv_U == NULL )
+                        {
+                            if ( (Dinv_U = (real*) malloc(sizeof(real) * LU->n)) == NULL )
+                            {
+                                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                                exit( INSUFFICIENT_MEMORY );
+                            }
+                        }
+
+                        /* construct D^{-1}_U */
+                        for ( i = 0; i < LU->n; ++i )
+                        {
+                            si = LU->start[i];
+                            Dinv_U[i] = 1. / LU->val[si];
+                        }
+
+                        jacobi_iter( LU, Dinv_U, y, x, tri, extra );
+                    }
+                    break;
+                default:
+                    break;
+            }
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+
+    }
+
+    return;
+}
+
+
+/* generalized minimual residual iterative solver for sparse linear systems */
+int GMRES( const static_storage * const workspace, const control_params * const control,
+        const sparse_matrix * const H, const real * const b, real tol, real * const x,
+        const FILE * const fout, real * const time, real * const spmv_time )
+{
+    int i, j, k, itr, N, si;
+    real cc, tmp1, tmp2, temp, bnorm, time_start;
 
     N = H->n;
     bnorm = Norm( b, N );
-    /* apply the diagonal pre-conditioner to rhs */
-    gettimeofday( &start, NULL );
-    for ( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
-    gettimeofday( &stop, NULL );
-    *time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-             - (start.tv_sec + start.tv_usec / 1000000.0);
+
+    if ( control->pre_comp_type == DIAG_PC )
+    {
+        /* apply precondioner to RHS */
+        apply_preconditioner( workspace, workspace->b, workspace->b_prc, LOWER,
+                control->pre_comp_type, control->pre_app_type, control->pre_app_jacobi_iters );
+    }
 
     /* GMRES outer-loop */
     for ( itr = 0; itr < MAX_ITR; ++itr )
     {
         /* calculate r0 */
-        gettimeofday( &start, NULL );
+        time_start = Get_Time( );
         Sparse_MatVec( H, x, workspace->b_prm );
-        gettimeofday( &stop, NULL );
-        *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                 - (start.tv_sec + start.tv_usec / 1000000.0);
-        for ( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
+        *spmv_time += Get_Timing_Info( time_start );
+
+        if ( control->pre_comp_type == DIAG_PC )
+        {
+            time_start = Get_Time( );
+            apply_preconditioner( workspace, workspace->b_prm, workspace->b_prm, LOWER,
+                    control->pre_comp_type, control->pre_app_type, control->pre_app_jacobi_iters );
+            *time += Get_Timing_Info( time_start );
+        }
+
+
+        if ( control->pre_comp_type == DIAG_PC )
+        {
+            Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N );
+        }
+        else
+        {
+            Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N );
+        }
+
+        if ( control->pre_comp_type != DIAG_PC )
+        {
+            time_start = Get_Time( );
+            apply_preconditioner( workspace, workspace->v[0], workspace->v[0], LOWER,
+                    control->pre_comp_type, control->pre_app_type, control->pre_app_jacobi_iters );
+            apply_preconditioner( workspace, workspace->v[0], workspace->v[0], UPPER,
+                    control->pre_comp_type, control->pre_app_type, control->pre_app_jacobi_iters );
+            *time += Get_Timing_Info( time_start );
+        }
 
-        Vector_Sum(workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N);
         workspace->g[0] = Norm( workspace->v[0], N );
         Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N );
         //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
@@ -550,26 +764,28 @@ int GMRES( static_storage *workspace, sparse_matrix *H,
         for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
         {
             /* matvec */
-            gettimeofday( &start, NULL );
+            time_start = Get_Time( );
             Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] );
-            gettimeofday( &stop, NULL );
-            *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                     - (start.tv_sec + start.tv_usec / 1000000.0);
-            /*pre-conditioner*/
-            gettimeofday( &start, NULL );
-            for ( k = 0; k < N; ++k )
-                workspace->v[j + 1][k] *= workspace->Hdia_inv[k];
-            gettimeofday( &stop, NULL );
-            *time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                     - (start.tv_sec + start.tv_usec / 1000000.0);
-            //fprintf( stderr, "%d-%d: matvec done.\n", itr, j );
+            *spmv_time += Get_Timing_Info( time_start );
+
+            time_start = Get_Time( );
+            apply_preconditioner( workspace, workspace->v[j + 1], workspace->v[j + 1], LOWER,
+                    control->pre_comp_type, control->pre_app_type, control->pre_app_jacobi_iters );
+            apply_preconditioner( workspace, workspace->v[j + 1], workspace->v[j + 1], UPPER,
+                    control->pre_comp_type, control->pre_app_type, control->pre_app_jacobi_iters );
+            *time += Get_Timing_Info( time_start );
 
             /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for ( i = 0; i <= j; i++ )
+            for ( i = 0; i < j - 1; i++ )
+            {
+                workspace->h[i][j] = 0;
+            }
+
+            //for( i = 0; i <= j; i++ ) {
+            for ( i = MAX(j - 1, 0); i <= j; i++ )
             {
                 workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N );
-                Vector_Add( workspace->v[j + 1],
-                            -workspace->h[i][j], workspace->v[i], N );
+                Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
             }
 
             workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
@@ -577,9 +793,8 @@ int GMRES( static_storage *workspace, sparse_matrix *H,
                           1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
             // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
 
-
             /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for ( i = 0; i <= j; i++ )
+            for ( i = MAX(j - 1, 0); i <= j; i++ )
             {
                 if ( i == j )
                 {
@@ -603,32 +818,41 @@ int GMRES( static_storage *workspace, sparse_matrix *H,
             workspace->g[j] = tmp1;
             workspace->g[j + 1] = tmp2;
 
-            // fprintf( stderr, "h: " );
-            // for( i = 0; i <= j+1; ++i )
-            //  fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            // fprintf( stderr, "\n" );
+            //fprintf( stderr, "h: " );
+            //for( i = 0; i <= j+1; ++i )
+            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
+            //fprintf( stderr, "\n" );
             //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
         }
 
 
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
+        /* TODO: solve using Jacobi iteration? */
+        /* solve Hy = g: H is now upper-triangular, do back-substitution */
         for ( i = j - 1; i >= 0; i-- )
         {
             temp = workspace->g[i];
             for ( k = j - 1; k > i; k-- )
+            {
                 temp -= workspace->h[i][k] * workspace->y[k];
+            }
 
             workspace->y[i] = temp / workspace->h[i][i];
         }
 
         /* update x = x_0 + Vy */
+        Vector_MakeZero( workspace->p, N );
         for ( i = 0; i < j; i++ )
-            Vector_Add( x, workspace->y[i], workspace->v[i], N );
+        {
+            Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N );
+        }
+
+        Vector_Add( x, 1., workspace->p, N );
 
         /* stopping condition */
         if ( fabs(workspace->g[j]) / bnorm <= tol )
+        {
             break;
+        }
     }
 
     // Sparse_MatVec( H, x, workspace->b_prm );
@@ -654,9 +878,9 @@ int GMRES( static_storage *workspace, sparse_matrix *H,
 }
 
 
-
-int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
-                       real *b, real tol, real *x, FILE *fout)
+int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control,
+        const sparse_matrix * const H, const real * const b, real tol, real * const x,
+        const FILE * const fout, real * const time, real * const spmv_time )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -844,330 +1068,7 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H,
 }
 
 
-/* generalized minimual residual iterative solver for sparse linear systems,
- * with preconditioner using factors LU \approx H
- * and forward / backward substitution */
-int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol,
-            sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, real *time, real *spmv_time )
-{
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    struct timeval start, stop;
-
-    N = H->n;
-    bnorm = Norm( b, N );
-
-    /* GMRES outer-loop */
-    for ( itr = 0; itr < MAX_ITR; ++itr )
-    {
-        /* calculate r0 */
-        gettimeofday( &start, NULL );
-        Sparse_MatVec( H, x, workspace->b_prm );
-        gettimeofday( &stop, NULL );
-        *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                 - (start.tv_sec + start.tv_usec / 1000000.0);
-        Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N );
-        gettimeofday( &start, NULL );
-        Forward_Subs( L, workspace->v[0], workspace->v[0] );
-        Backward_Subs( U, workspace->v[0], workspace->v[0] );
-        gettimeofday( &stop, NULL );
-        *time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                 - (start.tv_sec + start.tv_usec / 1000000.0);
-        workspace->g[0] = Norm( workspace->v[0], N );
-        Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N );
-        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
-
-        /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
-        {
-            /* matvec */
-            gettimeofday( &start, NULL );
-            Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] );
-            gettimeofday( &stop, NULL );
-            *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                     - (start.tv_sec + start.tv_usec / 1000000.0);
-            gettimeofday( &start, NULL );
-            Forward_Subs( L, workspace->v[j + 1], workspace->v[j + 1] );
-            Backward_Subs( U, workspace->v[j + 1], workspace->v[j + 1] );
-            gettimeofday( &stop, NULL );
-            *time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                     - (start.tv_sec + start.tv_usec / 1000000.0);
-
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for ( i = 0; i < j - 1; i++ ) workspace->h[i][j] = 0;
-
-            //for( i = 0; i <= j; i++ ) {
-            for ( i = MAX(j - 1, 0); i <= j; i++ )
-            {
-                workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N );
-                Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
-            }
-
-            workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
-            Vector_Scale( workspace->v[j + 1],
-                          1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for ( i = MAX(j - 1, 0); i <= j; i++ )
-            {
-                if ( i == j )
-                {
-                    cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
-                    workspace->hc[j] = workspace->h[j][j] / cc;
-                    workspace->hs[j] = workspace->h[j + 1][j] / cc;
-                }
-
-                tmp1 =  workspace->hc[i] * workspace->h[i][j] +
-                        workspace->hs[i] * workspace->h[i + 1][j];
-                tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                       workspace->hc[i] * workspace->h[i + 1][j];
-
-                workspace->h[i][j] = tmp1;
-                workspace->h[i + 1][j] = tmp2;
-            }
-
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j + 1] = tmp2;
-
-            //fprintf( stderr, "h: " );
-            //for( i = 0; i <= j+1; ++i )
-            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            //fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
-
-
-        /* solve Hy = g: H is now upper-triangular, do back-substitution */
-        for ( i = j - 1; i >= 0; i-- )
-        {
-            temp = workspace->g[i];
-            for ( k = j - 1; k > i; k-- )
-                temp -= workspace->h[i][k] * workspace->y[k];
-
-            workspace->y[i] = temp / workspace->h[i][i];
-        }
-
-        /* update x = x_0 + Vy */
-        Vector_MakeZero( workspace->p, N );
-        for ( i = 0; i < j; i++ )
-            Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N );
-        //Backward_Subs( U, workspace->p, workspace->p );
-        //Forward_Subs( L, workspace->p, workspace->p );
-        Vector_Add( x, 1., workspace->p, N );
-
-        /* stopping condition */
-        if ( fabs(workspace->g[j]) / bnorm <= tol )
-            break;
-    }
-
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n",
-    //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
-
-    if ( itr >= MAX_ITR )
-    {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART + 1) + j + 1;
-    }
-
-    return itr * (RESTART + 1) + j + 1;
-}
-
-
-/* generalized minimual residual iterative solver for sparse linear systems,
- * with preconditioner using factors LU \approx H
- * and Jacobi iteration for approximate factor application */
-int PGMRES_Jacobi( static_storage *workspace, sparse_matrix *H, real *b, real tol,
-                   sparse_matrix *L, sparse_matrix *U, real *x, unsigned int iters,
-		   FILE *fout, real *time, real *spmv_time )
-{
-    int i, j, k, itr, N, si;
-    real cc, tmp1, tmp2, temp, bnorm;
-    real *Dinv_L, *Dinv_U;
-    struct timeval start, stop;
-
-    N = H->n;
-    bnorm = Norm( b, N );
-
-    /* Compute Jacobi iteration matrices from
-     * truncated Newmann series: x_{k+1} = Gx_k + D^{-1}b
-     * where:
-     *   G = I - D^{-1}R
-     *   R = triangular matrix
-     *   D = diagonal matrix, diagonals from R */
-    if ( (Dinv_L = (real*) malloc(sizeof(real) * N)) == NULL
-            || (Dinv_U = (real*) malloc(sizeof(real) * N)) == NULL )
-    {
-        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    /* construct D^{-1}_L and D^{-1}_U */
-    for ( i = 0; i < N; ++i )
-    {
-        si = L->start[i + 1] - 1;
-        Dinv_L[i] = 1. / L->val[si];
-
-        si = U->start[i];
-        Dinv_U[i] = 1. / U->val[si];
-    }
-
-    /* GMRES outer-loop */
-    for ( itr = 0; itr < MAX_ITR; ++itr )
-    {
-        /* calculate r0 */
-        gettimeofday( &start, NULL );
-        Sparse_MatVec( H, x, workspace->b_prm );
-        gettimeofday( &stop, NULL );
-        *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                 - (start.tv_sec + start.tv_usec / 1000000.0);
-        Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N );
-        gettimeofday( &start, NULL );
-        Jacobi_Iter( L, LOWER, Dinv_L, N, workspace->v[0], workspace->v[0], iters );
-        Jacobi_Iter( U, UPPER, Dinv_U, N, workspace->v[0], workspace->v[0], iters );
-        gettimeofday( &stop, NULL );
-        *time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                 - (start.tv_sec + start.tv_usec / 1000000.0);
-        workspace->g[0] = Norm( workspace->v[0], N );
-        Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N );
-        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
-
-        /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
-        {
-            /* matvec */
-            gettimeofday( &start, NULL );
-            Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] );
-            gettimeofday( &stop, NULL );
-            *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                     - (start.tv_sec + start.tv_usec / 1000000.0);
-            gettimeofday( &start, NULL );
-            Jacobi_Iter( L, LOWER, Dinv_L, N, workspace->v[j + 1], workspace->v[j + 1], iters );
-            Jacobi_Iter( U, UPPER, Dinv_U, N, workspace->v[j + 1], workspace->v[j + 1], iters );
-            gettimeofday( &stop, NULL );
-            *time += (stop.tv_sec + stop.tv_usec / 1000000.0)
-                     - (start.tv_sec + start.tv_usec / 1000000.0);
-
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for ( i = 0; i < j - 1; i++ )
-            {
-                workspace->h[i][j] = 0;
-            }
-
-            //for( i = 0; i <= j; i++ ) {
-            for ( i = MAX(j - 1, 0); i <= j; i++ )
-            {
-                workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N );
-                Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
-            }
-
-            workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
-            Vector_Scale( workspace->v[j + 1],
-                          1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for ( i = MAX(j - 1, 0); i <= j; i++ )
-            {
-                if ( i == j )
-                {
-                    cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
-                    workspace->hc[j] = workspace->h[j][j] / cc;
-                    workspace->hs[j] = workspace->h[j + 1][j] / cc;
-                }
-
-                tmp1 =  workspace->hc[i] * workspace->h[i][j] +
-                        workspace->hs[i] * workspace->h[i + 1][j];
-                tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                       workspace->hc[i] * workspace->h[i + 1][j];
-
-                workspace->h[i][j] = tmp1;
-                workspace->h[i + 1][j] = tmp2;
-            }
-
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j + 1] = tmp2;
-
-            //fprintf( stderr, "h: " );
-            //for( i = 0; i <= j+1; ++i )
-            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            //fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
-
-
-        /* TODO: solve using Jacobi iteration? */
-        /* solve Hy = g: H is now upper-triangular, do back-substitution */
-        for ( i = j - 1; i >= 0; i-- )
-        {
-            temp = workspace->g[i];
-            for ( k = j - 1; k > i; k-- )
-            {
-                temp -= workspace->h[i][k] * workspace->y[k];
-            }
-
-            workspace->y[i] = temp / workspace->h[i][i];
-        }
-
-        /* update x = x_0 + Vy */
-        Vector_MakeZero( workspace->p, N );
-        for ( i = 0; i < j; i++ )
-        {
-            Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N );
-        }
-        //Backward_Subs( U, workspace->p, workspace->p );
-        //Forward_Subs( L, workspace->p, workspace->p );
-        Vector_Add( x, 1., workspace->p, N );
-
-        /* stopping condition */
-        if ( fabs(workspace->g[j]) / bnorm <= tol )
-        {
-            break;
-        }
-    }
-
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n",
-    //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
-
-    free( Dinv_U );
-    free( Dinv_L );
-
-    if ( itr >= MAX_ITR )
-    {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART + 1) + j + 1;
-    }
-
-    return itr * (RESTART + 1) + j + 1;
-}
-
-
+/* Preconditioned Conjugate Gradient */
 int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
          sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout )
 {
@@ -1185,8 +1086,8 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
     //Print_Soln( workspace, x, q, b, N );
     //fprintf( stderr, "res: %.15e\n", r_norm );
 
-    Forward_Subs( L, workspace->r, workspace->d );
-    Backward_Subs( U, workspace->d, workspace->p );
+    tri_solve( L, workspace->r, workspace->d, LOWER );
+    tri_solve( U, workspace->d, workspace->p, UPPER );
     sig_new = Dot( workspace->r, workspace->p, N );
     sig0 = sig_new;
 
@@ -1204,8 +1105,8 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
         r_norm = Norm(workspace->r, N);
         //fprintf( stderr, "res: %.15e\n", r_norm );
 
-        Forward_Subs( L, workspace->r, workspace->d );
-        Backward_Subs( U, workspace->d, workspace->d );
+        tri_solve( L, workspace->r, workspace->d, LOWER );
+        tri_solve( U, workspace->d, workspace->d, UPPER );
         sig_old = sig_new;
         sig_new = Dot( workspace->r, workspace->d, N );
         beta = sig_new / sig_old;
@@ -1223,6 +1124,7 @@ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol,
 }
 
 
+/* Conjugate Gradient */
 int CG( static_storage *workspace, sparse_matrix *H,
         real *b, real tol, real *x, FILE *fout )
 {
@@ -1237,7 +1139,9 @@ int CG( static_storage *workspace, sparse_matrix *H,
     Sparse_MatVec( H, x, workspace->q );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
     for ( j = 0; j < N; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }
 
     sig_new = Dot( workspace->r, workspace->d, N );
     sig0 = sig_new;
@@ -1280,7 +1184,6 @@ int CG( static_storage *workspace, sparse_matrix *H,
 }
 
 
-
 /* Steepest Descent */
 int SDM( static_storage *workspace, sparse_matrix *H,
          real *b, real tol, real *x, FILE *fout )
@@ -1296,7 +1199,9 @@ int SDM( static_storage *workspace, sparse_matrix *H,
     Sparse_MatVec( H, x, workspace->q );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
     for ( j = 0; j < N; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }
 
     sig = Dot( workspace->r, workspace->d, N );
     sig0 = sig;
@@ -1312,7 +1217,9 @@ int SDM( static_storage *workspace, sparse_matrix *H,
         Vector_Add( x, alpha, workspace->d, N );
         Vector_Add( workspace->r, -alpha, workspace->q, N );
         for ( j = 0; j < N; ++j )
+        {
             workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
         //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
@@ -1330,6 +1237,15 @@ int SDM( static_storage *workspace, sparse_matrix *H,
 }
 
 
+/* Estimate the stability of a 2-side preconditioning scheme
+ * using the factorization A \approx LU. Specifically, estimate the 1-norm of A^{-1}
+ * using the 1-norm of (LU)^{-1}e, with e = [1 1 ... 1]^T through 2 triangular solves:
+ *   1) Ly = e
+ *   2) Ux = y where y = Ux
+ * That is, we seek to solve e = LUx for unknown x 
+ *
+ * Reference: Incomplete LU Preconditioning with the Multilevel Fast Multipole Algorithm
+ *   for Electromagnetic Scattering, SIAM J. Sci. Computing, 2007 */
 real condest( const sparse_matrix * const L, const sparse_matrix * const U )
 {
     unsigned int i, N;
@@ -1348,9 +1264,10 @@ real condest( const sparse_matrix * const L, const sparse_matrix * const U )
         e[i] = 1.;
     }
 
-    Forward_Subs( L, e, e );
-    Backward_Subs( U, e, e );
+    tri_solve( L, e, e, LOWER );
+    tri_solve( U, e, e, UPPER );
 
+    /* compute 1-norm of vector e */
     c = fabs(e[0]);
     for ( i = 1; i < N; ++i)
     {
diff --git a/sPuReMD/src/GMRES.h b/sPuReMD/src/GMRES.h
index 2acf61f2c2df46a175b4bccf31d229c147611755..d6d1c4844a9ea2870edffbdf1d65a5da358b362f 100644
--- a/sPuReMD/src/GMRES.h
+++ b/sPuReMD/src/GMRES.h
@@ -24,20 +24,13 @@
 
 #include "mytypes.h"
 
-int GMRES( static_storage*, sparse_matrix*,
-           real*, real, real*, FILE*, real*, real* );
+int GMRES( const static_storage * const, const control_params * const,
+        const sparse_matrix * const, const real * const, real, real * const,
+        const FILE * const, real * const, real * const );
 
-int GMRES_HouseHolder( static_storage*, sparse_matrix*,
-                       real*, real, real*, FILE* );
-
-int PGMRES( static_storage*, sparse_matrix*, real*, real,
-            sparse_matrix*, sparse_matrix*, real*, FILE*, real*, real* );
-
-int PGMRES_Jacobi( static_storage*, sparse_matrix*, real*, real,
-                   sparse_matrix*, sparse_matrix*, real*, unsigned int, FILE*, real*, real* );
-
-int PCG( static_storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, FILE* );
+int GMRES_HouseHolder( const static_storage * const, const control_params * const,
+        const sparse_matrix * const, const real * const, real, real * const,
+        const FILE * const, real * const, real * const );
 
 int CG( static_storage*, sparse_matrix*,
         real*, real, real*, FILE* );
diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c
index 0355e68b0422af5a417b84b4958f8ff724f030d5..eca3d5516afba1184a61091c98798cadba6b4a57 100644
--- a/sPuReMD/src/QEq.c
+++ b/sPuReMD/src/QEq.c
@@ -506,8 +506,8 @@ static real SuperLU_Factorize( const sparse_matrix * const A,
 #endif
 
 
-/* Diagonal (Jacobi) preconditioner */
-static real diagonal_pre( const reax_system * const system, real * const Hdia_inv )
+/* Diagonal (Jacobi) preconditioner computation */
+static real diag_pre_comp( const reax_system * const system, real * const Hdia_inv )
 {
     unsigned int i;
     struct timeval start, stop;
@@ -1093,9 +1093,13 @@ static void Init_MatVec( const reax_system * const system, const control_params
 	    case DIAG_PC:
                 if ( workspace->Hdia_inv == NULL )
                 {
-                    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
+                    if ( ( workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ) ) == NULL )
+                    {
+                        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                        exit( INSUFFICIENT_MEMORY );
+                    }
                 }
-                data->timing.pre_comp += diagonal_pre( system, workspace->Hdia_inv );
+                data->timing.pre_comp += diag_pre_comp( system, workspace->Hdia_inv );
 		break;
 	    case ICHOLT_PC:
                 Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol );
@@ -1121,7 +1125,7 @@ static void Init_MatVec( const reax_system * const system, const control_params
                 data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
 //                data->timing.pre_comp += ICHOLT( workspace->H_sp, workspace->droptol, workspace->L, workspace->U );
 		break;
-	    case ICHOL_PAR_PC:
+	    case ILU_PAR_PC:
                 if ( workspace->L == NULL )
                 {
                     /* factors have sparsity pattern as H */
@@ -1290,52 +1294,40 @@ void QEq( reax_system * const system, control_params * const control, simulation
     switch ( control->qeq_solver_type )
     {
         case GMRES_S:
-            matvecs = GMRES( workspace, workspace->H,
-                             workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log,
-                             &(data->timing.pre_app), &(data->timing.spmv) );
-            matvecs += GMRES( workspace, workspace->H,
-                              workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log, 
-                              &(data->timing.pre_app), &(data->timing.spmv) );
+            matvecs = GMRES( workspace, control, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                    workspace->s[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
+            matvecs += GMRES( workspace, control, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                    workspace->t[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
             break;
         case GMRES_H_S:
-            matvecs = GMRES_HouseHolder( workspace, workspace->H,
-                                         workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log );
-            matvecs += GMRES_HouseHolder( workspace, workspace->H,
-                                          workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log );
-            break;
-        case PGMRES_S:
-            matvecs = PGMRES( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                              workspace->L, workspace->U, workspace->s[0], out_control->log,
-                              &(data->timing.pre_app), &(data->timing.spmv) );
-            matvecs += PGMRES( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                               workspace->L, workspace->U, workspace->t[0], out_control->log,
-                               &(data->timing.pre_app), &(data->timing.spmv) );
-            break;
-        case PGMRES_J_S:
-            matvecs = PGMRES_Jacobi( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                                     workspace->L, workspace->U, workspace->s[0], control->pre_app_jacobi_iters,
-				     out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
-            matvecs += PGMRES_Jacobi( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                                      workspace->L, workspace->U, workspace->t[0], control->pre_app_jacobi_iters,
-				      out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
+            matvecs = GMRES_HouseHolder( workspace, control, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                    workspace->s[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
+            matvecs += GMRES_HouseHolder( workspace, control, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                    workspace->t[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv) );
             break;
         case CG_S:
-            matvecs = CG( workspace, workspace->H,
-                          workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log ) + 1;
-            matvecs += CG( workspace, workspace->H,
-                           workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log ) + 1;
-            break;
-        case PCG_S:
-            matvecs = PCG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
-                         workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
-            matvecs += PCG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
-                            workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
+            matvecs = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                    workspace->s[0], out_control->log ) + 1;
+            matvecs += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                    workspace->t[0], out_control->log ) + 1;
+//            matvecs = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+//                    workspace->L, workspace->U, workspace->s[0], control->pre_app_type,
+//                    control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1;
+//            matvecs += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+//                    workspace->L, workspace->U, workspace->t[0], control->pre_app_type,
+//                    control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1;
             break;
         case SDM_S:
-            matvecs = SDM( workspace, workspace->H,
-                           workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log ) + 1;
-            matvecs += SDM( workspace, workspace->H,
-                            workspace->b_t, control->qeq_solver_q_err, workspace->t[0], out_control->log ) + 1;
+            matvecs = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+                    workspace->s[0], out_control->log ) + 1;
+            matvecs += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+                    workspace->t[0], out_control->log ) + 1;
+//            matvecs = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err,
+//                    workspace->L, workspace->U, workspace->s[0], control->pre_app_type,
+//                    control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1;
+//            matvecs += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err,
+//                    workspace->L, workspace->U, workspace->t[0], control->pre_app_type,
+//                    control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1;
             break;
 	default:
             fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" );
diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c
index c6bb58c356a1a6d2b846540b612a986ba2cced4d..d70f830979c6be19a65ade21da2d263150072ecd 100644
--- a/sPuReMD/src/control.c
+++ b/sPuReMD/src/control.c
@@ -69,12 +69,13 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
 
     control->tabulate = 0;
 
-    control->qeq_solver_type = PGMRES_S;
+    control->qeq_solver_type = GMRES_S;
     control->qeq_solver_q_err = 0.000001;
     control->pre_comp_type = ICHOLT_PC;
     control->pre_comp_sweeps = ICHOLT_PC;
     control->pre_comp_refactor = 100;
     control->pre_comp_droptol = 0.01;
+    control->pre_app_type = TRI_SOLVE_PA;
     control->pre_app_jacobi_iters = 10;
 
     control->T_init = 0.;
@@ -266,6 +267,11 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
             ival = atoi( tmp[1] );
             control->pre_comp_sweeps = ival;
         }
+        else if ( strcmp(tmp[0], "pre_app_type") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->pre_app_type = ival;
+        }
         else if ( strcmp(tmp[0], "pre_app_jacobi_iters") == 0 )
         {
             val = atof( tmp[1] );
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index 7b1407721b50cb2166c44a3cc7b45ab042b750e4..7dd51a7b949005993c94288f548f265d2eef2162 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -190,13 +190,17 @@ enum geo_formats
 
 enum solver
 {
-    GMRES_S = 0, GMRES_H_S = 1, PGMRES_S = 2,
-    PGMRES_J_S = 3, CG_S = 4, PCG_S = 5, SDM_S = 6,
+    GMRES_S = 0, GMRES_H_S = 1, CG_S = 2, SDM_S = 3,
 };
 
 enum pre_comp
 {
-    DIAG_PC = 0, ICHOLT_PC = 1, ICHOL_PAR_PC = 2, ILU_SUPERLU_MT_PC = 3,
+    DIAG_PC = 0, ICHOLT_PC = 1, ILU_PAR_PC = 2, ILU_SUPERLU_MT_PC = 3,
+};
+
+enum pre_app
+{
+    NONE_PA = 0, TRI_SOLVE_PA = 1, TRI_SOLVE_LEVEL_SCHED_PA = 2, JACOBI_ITER_PA = 3,
 };
 
 
@@ -510,6 +514,7 @@ typedef struct
     unsigned int pre_comp_refactor;
     real pre_comp_droptol;
     unsigned int pre_comp_sweeps;
+    unsigned int pre_app_type;
     unsigned int pre_app_jacobi_iters;
 
     int molec_anal;
diff --git a/tools/run_sim.py b/tools/run_sim.py
index 39342fdb03a3f71591453199b561fbc9ca9ba7c8..bcea5dbadefcf0484b189109b438c6cef4df679b 100644
--- a/tools/run_sim.py
+++ b/tools/run_sim.py
@@ -1,5 +1,6 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
+import argparse
 from fileinput import input
 from itertools import product
 from re import sub
@@ -12,7 +13,7 @@ from time import time
 
 class TestCase():
     def __init__(self, geo_file, ffield_file, control_file, params={}, result_header_fmt='',
-            result_header='', result_body_fmt='', result_file='results.txt'):
+            result_header='', result_body_fmt='', result_file='results.txt', geo_format='1'):
         self.__geo_file = geo_file
         self.__ffield_file = ffield_file
         self.__control_file = control_file
@@ -47,33 +48,35 @@ class TestCase():
                     r'(?P<key>geo_format\s+)\S+(?P<comment>.*)', r'\g<key>%s\g<comment>' % x, l), \
         }
 
+        self.__params['geo_format'] = geo_format
+
     def _setup(self, param, temp_file):
         fp = open(self.__control_file, 'r')
-	lines = fp.read()
-	fp.close()
+        lines = fp.read()
+        fp.close()
         for k in self.__control_res.keys():
             try:
                 lines = self.__control_res[k](lines, param[k])
             except KeyError:
                 pass
-        fp_temp = open(temp_file, 'w', 0)
-	fp_temp.write(lines)
-	fp_temp.close()
+        fp_temp = open(temp_file, 'w')
+        fp_temp.write(lines)
+        fp_temp.close()
 
     def run(self, bin_file='sPuReMD/bin/spuremd'):
         base_dir = getcwd()
-	bin_path = path.join(base_dir, bin_file)
+        bin_path = path.join(base_dir, bin_file)
         env = dict(environ)
 
         write_header = True
         if path.exists(self.__result_file):
             write_header = False
-        fout = open(self.__result_file, 'a', 0)
+        fout = open(self.__result_file, 'a')
         if write_header:
             fout.write(self.__result_header_fmt.format(*self.__result_header))
 
         temp_dir = mkdtemp()
-	temp_file = path.join(temp_dir, path.basename(self.__control_file))
+        temp_file = path.join(temp_dir, path.basename(self.__control_file))
 
         for p in product(*[self.__params[k] for k in self.__param_names]):
             param_dict = dict((k, v) for (k, v) in zip(self.__param_names, p))
@@ -95,7 +98,7 @@ class TestCase():
 
         fout.close()
         remove(temp_file)
-	rmdir(temp_dir)
+        rmdir(temp_dir)
 
     def _process_result(self, fout, time, param):
         qeq = 0.
@@ -129,69 +132,122 @@ class TestCase():
 
 
 if __name__ == '__main__':
+    DATA = [ \
+            'bilayer_56800', 'bilayer_340800', \
+            'dna_19733', \
+            'petn_48256', \
+            'silica_6000', 'silica_72000', 'silica_300000', \
+            'water_6540', 'water_78480', 'water_327000', \
+            ]
+
+    parser = argparse.ArgumentParser(description='Run molecular dynamics simulations on specified data sets.')
+    parser.add_argument('data', nargs='+',
+            choices=DATA, help='Data sets for which to run simulations.')
+
+    # parse args and take action
+    args = parser.parse_args()
+
     base_dir = getcwd()
     control_dir = path.join(base_dir, 'environ')
     data_dir = path.join(base_dir, 'data/benchmarks')
 
     header_fmt_str = '{:20}|{:5}|{:5}|{:5}|{:10}|{:10}|{:10}|{:10}|{:10}|{:10}|{:10}\n'
     header_str = ['Data Set', 'Steps', 'Q Tol', 'Pre T', 'Pre Comp',
-		    'Pre App', 'Iters', 'SpMV', 'QEq', 'Threads', 'Time (s)']
+            'Pre App', 'Iters', 'SpMV', 'QEq', 'Threads', 'Time (s)']
     body_fmt_str = '{:20} {:5} {:5} {:5} {:10.6f} {:10.6f} {:10.6f} {:10.6f} {:10.6f} {:10} {:10.6f}\n'
 
     params = {
             'ensemble_type': ['0'],
-            'nsteps': ['20', '100', '500', '1000'],
+            'nsteps': ['20'],
+#            'nsteps': ['20', '100', '500', '1000'],
             'qeq_solver_type': ['0'],
-            'qeq_solver_q_err': ['1e-6', '1e-10'],
+            'qeq_solver_q_err': ['1e-6'],
+#            'qeq_solver_q_err': ['1e-6', '1e-10'],
 #            'qeq_solver_q_err': ['1e-6', '1e-8', '1e-10', '1e-14'],
-            'pre_comp_type': ['0'],
+            'pre_comp_type': ['2'],
 #            'pre_comp_type': ['0', '1', '2'],
-            'pre_comp_refactor': ['1'],
-#            'pre_comp_sweeps': ['3'],
-#            'pre_app_type': ['0'],
-#            'pre_app_jacobi_iters': ['50'],
-            'threads': ['12'],
+            'pre_comp_refactor': ['100'],
+            'pre_comp_sweeps': ['3'],
+            'pre_app_type': ['2'],
+            'pre_app_jacobi_iters': ['50'],
+            'threads': ['2'],
 #            'threads': ['1', '2', '4', '12', '24'],
-            'geo_format': ['1'],
+            'geo_format': [],
     }
 
-    test_cases = [
+    test_cases = []
+    if 'water_6540' in args.data:
+        test_cases.append(
             TestCase(path.join(data_dir, 'water/water_6540.pdb'),
                 path.join(data_dir, 'water/ffield.water'),
                 path.join(control_dir, 'param.gpu.water'),
-                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
-#            TestCase(path.join(data_dir, 'water/water_78480.pdb'),
-#                path.join(data_dir, 'water/ffield.water'),
-#                path.join(control_dir, 'param.gpu.water'),
-#                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
-#            TestCase(path.join(data_dir, 'water/water_327000.geo'),
-#                path.join(data_dir, 'water/ffield.water'),
-#                path.join(control_dir, 'param.gpu.water'),
-#                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['1']))
+    if 'water_78480' in args.data:
+        test_cases.append(
+            TestCase(path.join(data_dir, 'water/water_78480.pdb'),
+                path.join(data_dir, 'water/ffield.water'),
+                path.join(control_dir, 'param.gpu.water'),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['0']))
+    if 'water_327000' in args.data:
+        test_cases.append(
+            TestCase(path.join(data_dir, 'water/water_327000.geo'),
+                path.join(data_dir, 'water/ffield.water'),
+                path.join(control_dir, 'param.gpu.water'),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['0']))
+    if 'bilayer_56800' in args.data:
+        test_cases.append(
             TestCase(path.join(data_dir, 'bilayer/bilayer_56800.pdb'),
                 path.join(data_dir, 'bilayer/ffield-bio'),
                 path.join(control_dir, 'param.gpu.water'),
-                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['1']))
+    if 'dna_19733' in args.data:
+        test_cases.append(
             TestCase(path.join(data_dir, 'dna/dna_19733.pdb'),
                 path.join(data_dir, 'dna/ffield-dna'),
                 path.join(control_dir, 'param.gpu.water'),
-                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['1']))
+    if 'silica_6000' in args.data:
+        test_cases.append(
             TestCase(path.join(data_dir, 'silica/silica_6000.pdb'),
                 path.join(data_dir, 'silica/ffield-bio'),
                 path.join(control_dir, 'param.gpu.water'),
-                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
-#            TestCase(path.join(data_dir, 'silica/silica_72000.geo'),
-#                path.join(data_dir, 'silica/ffield-bio'),
-#                path.join(control_dir, 'param.gpu.water'),
-#                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
-#            TestCase(path.join(data_dir, 'silica/silica_300000.geo'),
-#                path.join(data_dir, 'silica/ffield-bio'),
-#                path.join(control_dir, 'param.gpu.water'),
-#                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['1']))
+    if 'silica_72000' in args.data:
+        test_cases.append(
+            TestCase(path.join(data_dir, 'silica/silica_72000.geo'),
+                path.join(data_dir, 'silica/ffield-bio'),
+                path.join(control_dir, 'param.gpu.water'),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['0']))
+    if 'silica_300000' in args.data:
+        test_cases.append(
+            TestCase(path.join(data_dir, 'silica/silica_300000.geo'),
+                path.join(data_dir, 'silica/ffield-bio'),
+                path.join(control_dir, 'param.gpu.water'),
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['0']))
+    if 'petn_48256' in args.data:
+        test_cases.append(
             TestCase(path.join(data_dir, 'petn/petn_48256.pdb'),
                 path.join(data_dir, 'petn/ffield.petn'),
                 path.join(control_dir, 'param.gpu.water'),
-                params=params, result_header_fmt=header_fmt_str, result_header = header_str, result_body_fmt=body_fmt_str),
-                        ]
+                params=params, result_header_fmt=header_fmt_str,
+                result_header = header_str, result_body_fmt=body_fmt_str,
+                geo_format=['1']))
+
     for test in test_cases:
         test.run(bin_file='sPuReMD/bin/spuremd')