diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 7561df494dd1246dd0e51cd8a812d2bd9e90be4a..7b9cbdab2470ae42a1ea5556a8c8a53455aa9400 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -871,7 +871,6 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
         const control_params * const control,
         simulation_data * const data, static_storage * const workspace )
 {
-    int fillin;
     real time;
     sparse_matrix *Hptr;
 
@@ -968,7 +967,6 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         const control_params * const control,
         simulation_data * const data, static_storage * const workspace )
 {
-    int fillin;
     real time;
     sparse_matrix *Hptr;
 
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index 4461a9cd251894040eab34d58aec153e7a809f27..5faf0b8c0f302275fda257649537811ca643e17c 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -618,6 +618,20 @@ static void Init_Workspace( reax_system *system, control_params *control,
         workspace->rp2 = NULL;
     }
 
+    /* ILUTP preconditioner related */
+    if ( control->cm_solver_pre_comp_type == ILUTP_PC )
+    {
+        workspace->perm_ilutp = smalloc( sizeof( int ) * system->N_cm,
+               "Init_Workspace::workspace->perm_ilutp" );
+        workspace->r_p = smalloc( sizeof( real ) * system->N_cm,
+               "Init_Workspace::workspace->r_p" );
+    }
+    else
+    {
+        workspace->perm_ilutp = NULL;
+        workspace->r_p = NULL;
+    }
+
     /* integrator storage */
     workspace->a = smalloc( system->N * sizeof( rvec ),
            "Init_Workspace::workspace->a" );
@@ -1349,6 +1363,13 @@ static void Finalize_Workspace( reax_system *system, control_params *control,
         sfree( workspace->rp2, "Finalize_Workspace::rp2" );
     }
 
+    /* ILUTP preconditioner related */
+    if ( control->cm_solver_pre_comp_type == ILUTP_PC )
+    {
+        sfree( workspace->perm_ilutp, "Finalize_Workspace::workspace->perm_ilutp" );
+        sfree( workspace->r_p, "Finalize_Workspace::workspace->r_p" );
+    }
+
     /* integrator storage */
     sfree( workspace->a, "Finalize_Workspace::workspace->a" );
     sfree( workspace->f_old, "Finalize_Workspace::workspace->f_old" );
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 770c1141f18ab5eadafc4c7b6dc39e0cea9b86e1..f65b9f5f96f9b90d0f348956e5b06105a1838ddc 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -2209,6 +2209,28 @@ void sort_rows_by_colors( const static_storage * const workspace,
 }
 
 
+/* Apply permutation P*x = x_p
+ *
+ * x: vector to permute
+ * perm: vector containing mapping if computing P*x
+ * x_p (output): permuted vector
+ * n: number of entries in x
+ */
+static void permute_vector( const real * const x, const int * const perm,
+        real * const x_p, const unsigned int n )
+{
+    unsigned int i;
+
+#ifdef _OPENMP
+    #pragma omp for schedule(static)
+#endif
+    for ( i = 0; i < n; ++i )
+    {
+        x_p[i] = x[perm[i]];
+    }
+}
+
+
 /* Apply permutation Q^T*x or Q*x based on graph coloring
  *
  * workspace: storage container for workspace structures
@@ -2218,7 +2240,7 @@ void sort_rows_by_colors( const static_storage * const workspace,
  * invert_map: if TRUE, use Q^T, otherwise use Q
  * tri: coloring to triangular factor to use (lower/upper)
  */
-static void permute_vector( const static_storage * const workspace,
+static void permute_vector_gc( const static_storage * const workspace,
         real * const x, const unsigned int n, const int invert_map,
         const TRIANGULARITY tri )
 {
@@ -2514,7 +2536,7 @@ void jacobi_iter( const static_storage * const workspace,
  * control: data struct containing parameters
  * y: vector to which to apply preconditioning,
  *  specific to internals of iterative solver being used
- * x: preconditioned vector (output)
+ * x (output): preconditioned vector
  * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
  * side: used in determining how to apply preconditioner if the preconditioner is
  *  factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU)
@@ -2522,9 +2544,9 @@ void jacobi_iter( const static_storage * const workspace,
  * Assumptions:
  *   Matrices have non-zero diagonals
  *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
-static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
-                                  const real * const y, real * const x, const int fresh_pre,
-                                  const int side )
+static void apply_preconditioner( const static_storage * const workspace,
+        const control_params * const control, const real * const y, real * const x,
+        const int fresh_pre, const int side )
 {
     int i, si;
 
@@ -2551,15 +2573,18 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     break;
                 case ICHOLT_PC:
                 case ILUT_PC:
-                case ILUTP_PC:
                 case FG_ILUT_PC:
                     tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
                     break;
+                case ILUTP_PC:
+                    permute_vector( y, workspace->perm_ilutp, workspace->r_p, workspace->H->n );
+                    tri_solve( workspace->L, workspace->r_p, x, workspace->L->n, LOWER );
+                    break;
                 case SAI_PC:
                     Sparse_MatVec_full( workspace->H_app_inv, y, x );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
@@ -2572,16 +2597,20 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     break;
                 case ICHOLT_PC:
                 case ILUT_PC:
-                case ILUTP_PC:
                 case FG_ILUT_PC:
                     tri_solve_level_sched( (static_storage *) workspace,
                             workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                     break;
+                case ILUTP_PC:
+                    permute_vector( y, workspace->perm_ilutp, workspace->r_p, workspace->H->n );
+                    tri_solve_level_sched( (static_storage *) workspace,
+                            workspace->L, workspace->r_p, x, workspace->L->n, LOWER, fresh_pre );
+                    break;
                 case SAI_PC:
                     Sparse_MatVec_full( workspace->H_app_inv, y, x );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
@@ -2591,12 +2620,11 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 {
                 case JACOBI_PC:
                 case SAI_PC:
-                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unsupported preconditioner computation/application method combination. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 case ICHOLT_PC:
                 case ILUT_PC:
-                case ILUTP_PC:
                 case FG_ILUT_PC:
 #ifdef _OPENMP
                     #pragma omp for schedule(static)
@@ -2606,12 +2634,27 @@ static void apply_preconditioner( const static_storage * const workspace, const
                         workspace->y_p[i] = y[i];
                     }
 
-                    permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+                    permute_vector_gc( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+                    tri_solve_level_sched( (static_storage *) workspace,
+                            workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+                    break;
+                case ILUTP_PC:
+                    permute_vector( y, workspace->perm_ilutp, workspace->r_p, workspace->H->n );
+
+#ifdef _OPENMP
+                    #pragma omp for schedule(static)
+#endif
+                    for ( i = 0; i < workspace->H->n; ++i )
+                    {
+                        workspace->y_p[i] = workspace->r_p[i];
+                    }
+
+                    permute_vector_gc( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
                     tri_solve_level_sched( (static_storage *) workspace,
                             workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
@@ -2621,12 +2664,11 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 {
                 case JACOBI_PC:
                 case SAI_PC:
-                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unsupported preconditioner computation/application method combination. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 case ICHOLT_PC:
                 case ILUT_PC:
-                case ILUTP_PC:
                 case FG_ILUT_PC:
                     /* construct D^{-1}_L */
                     if ( fresh_pre == TRUE )
@@ -2644,14 +2686,33 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
                             y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
                     break;
+                case ILUTP_PC:
+                    permute_vector( y, workspace->perm_ilutp, workspace->r_p, workspace->H->n );
+
+                    /* construct D^{-1}_L */
+                    if ( fresh_pre == TRUE )
+                    {
+#ifdef _OPENMP
+                        #pragma omp for schedule(static)
+#endif
+                        for ( i = 0; i < workspace->L->n; ++i )
+                        {
+                            si = workspace->L->start[i + 1] - 1;
+                            workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
+                        }
+                    }
+
+                    jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
+                            workspace->r_p, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+                    break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
                 break;
             default:
-                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
                 break;
 
@@ -2678,7 +2739,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
@@ -2701,7 +2762,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
                             workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
@@ -2711,7 +2772,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 {
                 case JACOBI_PC:
                 case SAI_PC:
-                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unsupported preconditioner computation/application method combination. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 case ICHOLT_PC:
@@ -2720,10 +2781,10 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case FG_ILUT_PC:
                     tri_solve_level_sched( (static_storage *) workspace,
                             workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
-                    permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
+                    permute_vector_gc( workspace, x, workspace->H->n, TRUE, UPPER );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
@@ -2733,7 +2794,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 {
                 case JACOBI_PC:
                 case SAI_PC:
-                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unsupported preconditioner computation/application method combination. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 case ICHOLT_PC:
@@ -2757,13 +2818,13 @@ static void apply_preconditioner( const static_storage * const workspace, const
                             y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
                     break;
                 default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                    fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
                 }
                 break;
             default:
-                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                fprintf( stderr, "[ERROR] Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
                 break;
 
@@ -2781,7 +2842,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
            const real tol, real * const x, const int fresh_pre )
 {
     int i, j, k, itr, N, g_j, g_itr;
-    real cc, tmp1, tmp2, temp, bnorm;
+    real cc, tmp1, tmp2, temp, bnorm, g_bnorm;
     real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;
 
     N = H->n;
@@ -2796,7 +2857,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
 #ifdef _OPENMP
     #pragma omp parallel default(none) \
     private(i, j, k, itr, bnorm, temp, t_start) \
-    shared(N, cc, tmp1, tmp2, g_itr, g_j, stderr) \
+    shared(N, cc, tmp1, tmp2, g_itr, g_j, g_bnorm, stderr) \
     reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
 #endif
     {
@@ -2962,6 +3023,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
         {
             g_j = j;
             g_itr = itr;
+            g_bnorm = bnorm;
         }
     }
 
@@ -2974,7 +3036,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
     if ( g_itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
-        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", FABS(workspace->g[g_j]) / bnorm );
+        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", FABS(workspace->g[g_j]) / g_bnorm );
         return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
     }
 
@@ -2988,7 +3050,7 @@ int GMRES_HouseHolder( const static_storage * const workspace,
                        real * const x, const int fresh_pre )
 {
     int i, j, k, itr, N, g_j, g_itr;
-    real cc, tmp1, tmp2, temp, bnorm;
+    real cc, tmp1, tmp2, temp, bnorm, g_bnorm;
     real v[10000], z[control->cm_solver_restart + 2][10000], w[control->cm_solver_restart + 2];
     real u[control->cm_solver_restart + 2][10000];
     real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;
@@ -3004,7 +3066,7 @@ int GMRES_HouseHolder( const static_storage * const workspace,
 #ifdef _OPENMP
     #pragma omp parallel default(none) \
     private(i, j, k, itr, bnorm, temp, t_start) \
-    shared(v, z, w, u, tol, N, cc, tmp1, tmp2, g_itr, g_j, stderr) \
+    shared(v, z, w, u, tol, N, cc, tmp1, tmp2, g_itr, g_j, g_bnorm, stderr) \
     reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
 #endif
     {
@@ -3204,6 +3266,7 @@ int GMRES_HouseHolder( const static_storage * const workspace,
         {
             g_j = j;
             g_itr = itr;
+            g_bnorm = bnorm;
         }
     }
 
@@ -3216,7 +3279,7 @@ int GMRES_HouseHolder( const static_storage * const workspace,
     if ( g_itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
-        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", FABS(w[g_j]) / bnorm );
+        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", FABS(w[g_j]) / g_bnorm );
         return g_itr * (control->cm_solver_restart + 1) + j + 1;
     }
 
@@ -3230,7 +3293,7 @@ int CG( const static_storage * const workspace, const control_params * const con
         const real tol, real * const x, const int fresh_pre )
 {
     int i, g_itr, N;
-    real tmp, alpha, beta, b_norm, r_norm;
+    real tmp, alpha, beta, bnorm, g_bnorm, rnorm, g_rnorm;
     real *d, *r, *p, *z;
     real sig_old, sig_new;
     real t_start, t_pa, t_spmv, t_vops;
@@ -3246,9 +3309,9 @@ int CG( const static_storage * const workspace, const control_params * const con
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) \
-    private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new, t_start) \
+    private(i, tmp, alpha, beta, bnorm, rnorm, sig_old, sig_new, t_start) \
     reduction(+: t_pa, t_spmv, t_vops) \
-    shared(g_itr, N, d, r, p, z)
+    shared(g_itr, g_bnorm, g_rnorm, N, d, r, p, z)
 #endif
     {
         t_pa = 0.0;
@@ -3256,7 +3319,7 @@ int CG( const static_storage * const workspace, const control_params * const con
         t_vops = 0.0;
 
         t_start = Get_Time( );
-        b_norm = Norm( b, N );
+        bnorm = Norm( b, N );
         t_vops += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
@@ -3265,7 +3328,7 @@ int CG( const static_storage * const workspace, const control_params * const con
 
         t_start = Get_Time( );
         Vector_Sum( r, 1.0,  b, -1.0, d, N );
-        r_norm = Norm( r, N );
+        rnorm = Norm( r, N );
         t_vops += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
@@ -3278,7 +3341,7 @@ int CG( const static_storage * const workspace, const control_params * const con
         sig_new = Dot( r, p, N );
         t_vops += Get_Timing_Info( t_start );
 
-        for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
+        for ( i = 0; i < control->cm_solver_max_iters && rnorm / bnorm > tol; ++i )
         {
             t_start = Get_Time( );
             Sparse_MatVec( workspace, H, p, d );
@@ -3289,7 +3352,7 @@ int CG( const static_storage * const workspace, const control_params * const con
             alpha = sig_new / tmp;
             Vector_Add( x, alpha, p, N );
             Vector_Add( r, -1.0 * alpha, d, N );
-            r_norm = Norm( r, N );
+            rnorm = Norm( r, N );
             t_vops += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
@@ -3308,7 +3371,11 @@ int CG( const static_storage * const workspace, const control_params * const con
 #ifdef _OPENMP
         #pragma omp single
 #endif
-        g_itr = i;
+        {
+            g_itr = i;
+            g_bnorm = bnorm;
+            g_rnorm = rnorm;
+        }
     }
 
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
@@ -3318,7 +3385,7 @@ int CG( const static_storage * const workspace, const control_params * const con
     if ( g_itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", g_itr );
-        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", r_norm / b_norm );
+        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", g_rnorm / g_bnorm );
         return g_itr;
     }
 
@@ -3343,7 +3410,7 @@ int BiCGStab( const static_storage * const workspace, const control_params * con
         const real tol, real * const x, const int fresh_pre )
 {
     int i, g_itr, N;
-    real tmp, alpha, beta, omega, rho, rho_old, sigma, r_norm, b_norm;
+    real tmp, alpha, beta, omega, rho, rho_old, sigma, rnorm, g_rnorm, bnorm, g_bnorm;
     real t_start, t_pa, t_spmv, t_vops;
 
     N = H->n;
@@ -3353,9 +3420,9 @@ int BiCGStab( const static_storage * const workspace, const control_params * con
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) \
-    private(i, tmp, alpha, beta, omega, rho, rho_old, sigma, r_norm, b_norm, t_start) \
+    private(i, tmp, alpha, beta, omega, rho, rho_old, sigma, rnorm, bnorm, t_start) \
     reduction(+: t_pa, t_spmv, t_vops) \
-    shared(g_itr, N)
+    shared(g_itr, g_rnorm, g_bnorm, N)
 #endif
     {
         t_pa = 0.0;
@@ -3367,18 +3434,18 @@ int BiCGStab( const static_storage * const workspace, const control_params * con
         t_spmv += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
-        b_norm = Norm( b, N );
+        bnorm = Norm( b, N );
         Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->d, N );
         t_vops += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
         Vector_Copy( workspace->r_hat, workspace->r, N );
-        r_norm = Norm( workspace->r, N );
+        rnorm = Norm( workspace->r, N );
         Vector_Copy( workspace->p, workspace->r, N );
         rho_old = Dot( workspace->r, workspace->r_hat, N );
         t_vops += Get_Timing_Info( t_start );
 
-        for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
+        for ( i = 0; i < control->cm_solver_max_iters && rnorm / bnorm > tol; ++i )
         {
             t_start = Get_Time( );
             Sparse_MatVec( workspace, H, workspace->p, workspace->d );
@@ -3401,7 +3468,7 @@ int BiCGStab( const static_storage * const workspace, const control_params * con
             Vector_Sum( workspace->z, alpha, workspace->p, omega, workspace->q, N );
             Vector_Add( x, 1.0, workspace->z, N );
             Vector_Sum( workspace->r, 1.0, workspace->q, -1.0 * omega, workspace->y, N );
-            r_norm = Norm( workspace->r, N );
+            rnorm = Norm( workspace->r, N );
             t_vops += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
@@ -3416,7 +3483,11 @@ int BiCGStab( const static_storage * const workspace, const control_params * con
 #ifdef _OPENMP
         #pragma omp single
 #endif
-        g_itr = i;
+        {
+            g_itr = i;
+            g_rnorm = rnorm;
+            g_bnorm = bnorm;
+        }
     }
 
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
@@ -3426,7 +3497,7 @@ int BiCGStab( const static_storage * const workspace, const control_params * con
     if ( g_itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", g_itr );
-        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", r_norm / b_norm );
+        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", g_rnorm / g_bnorm );
         return g_itr;
     }
 
@@ -3440,8 +3511,8 @@ int SDM( const static_storage * const workspace, const control_params * const co
          const real tol, real * const x, const int fresh_pre )
 {
     int i, g_itr, N;
-    real tmp, alpha, b_norm;
-    real sig;
+    real tmp, alpha, bnorm, g_bnorm;
+    real sig, g_sig;
     real t_start, t_pa, t_spmv, t_vops;
 
     N = H->n;
@@ -3451,9 +3522,9 @@ int SDM( const static_storage * const workspace, const control_params * const co
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) \
-    private(i, tmp, alpha, b_norm, sig, t_start) \
+    private(i, tmp, alpha, bnorm, sig, t_start) \
     reduction(+: t_pa, t_spmv, t_vops) \
-    shared(g_itr, N)
+    shared(g_itr, g_sig, g_bnorm, N)
 #endif
     {
         t_pa = 0.0;
@@ -3461,7 +3532,7 @@ int SDM( const static_storage * const workspace, const control_params * const co
         t_vops = 0.0;
 
         t_start = Get_Time( );
-        b_norm = Norm( b, N );
+        bnorm = Norm( b, N );
         t_vops += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
@@ -3481,7 +3552,7 @@ int SDM( const static_storage * const workspace, const control_params * const co
         sig = Dot( workspace->r, workspace->d, N );
         t_vops += Get_Timing_Info( t_start );
 
-        for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i )
+        for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / bnorm > tol; ++i )
         {
             t_start = Get_Time( );
             Sparse_MatVec( workspace, H, workspace->d, workspace->q );
@@ -3514,7 +3585,11 @@ int SDM( const static_storage * const workspace, const control_params * const co
 #ifdef _OPENMP
         #pragma omp single
 #endif
-        g_itr = i;
+        {
+            g_itr = i;
+            g_sig = sig;
+            g_bnorm = bnorm;
+        }
     }
 
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
@@ -3524,7 +3599,7 @@ int SDM( const static_storage * const workspace, const control_params * const co
     if ( g_itr >= control->cm_solver_max_iters  )
     {
         fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", g_itr );
-        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", SQRT(sig) / b_norm );
+        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", SQRT(g_sig) / g_bnorm );
         return g_itr;
     }
 
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index aeabf83ca9e54fea713269cf7757748a8ad4f231..602cc4ada1586eb47cf1822e7c71ef0f63185855 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -884,7 +884,7 @@ void Read_Sparse_Matrix2( sparse_matrix *A, char *fname )
 
     A->start[cur_row] = top;
 
-    while ( fscanf( f, "%6d %6d %24.15lf", &row, &col, &val ) == 3 )
+    while ( fscanf( f, "%6d %6d %24lf", &row, &col, &val ) == 3 )
     {
         /* assumption: every row has at least one nonzero (diagonal) */
         if ( cur_row != row - 1 )
@@ -904,6 +904,28 @@ void Read_Sparse_Matrix2( sparse_matrix *A, char *fname )
 }
 
 
+/* Read permuation matrix in COO format (1-based indexing)
+ * and store in a dense vector (preallocated)
+ *
+ * Note: the file must be sorted in increasing order of
+ * columns than rows */
+void Read_Permutation_Matrix( int *v, char *fname )
+{
+    int row, col;
+    real val;
+    FILE *f;
+   
+    f = sfopen( fname, "r" );
+
+    while ( fscanf( f, "%6d %6d %24lf", &row, &col, &val ) == 3 )
+    {
+        v[row - 1] = col - 1;
+    }
+
+    sfclose( f, "Read_Permuation_Matrix::f" );
+}
+
+
 /* Note: watch out for portability issues with endianness
  * due to serialization of numeric types (integer, IEEE 754) */
 void Print_Sparse_Matrix_Binary( sparse_matrix *A, char *fname )
diff --git a/sPuReMD/src/print_utils.h b/sPuReMD/src/print_utils.h
index 8496a405d716a356d753ac24ee49ae400f3164ac..1852ca31e5d4a4d5ad27063633029f303edef5b7 100644
--- a/sPuReMD/src/print_utils.h
+++ b/sPuReMD/src/print_utils.h
@@ -56,6 +56,8 @@ void Print_Sparse_Matrix2( sparse_matrix*, char*, char* );
 
 void Read_Sparse_Matrix2( sparse_matrix *, char * );
 
+void Read_Permutation_Matrix( int *, char * );
+
 void Print_Sparse_Matrix_Binary( sparse_matrix*, char* );
 
 void Print_Bonds( reax_system*, reax_list*, char* );
diff --git a/sPuReMD/src/reax_types.h b/sPuReMD/src/reax_types.h
index e26363a48b042d661d554de7919ccfc59dc0c9c5..b0fa09a758288dff14e49750e8e44cce9caa2b0d 100644
--- a/sPuReMD/src/reax_types.h
+++ b/sPuReMD/src/reax_types.h
@@ -1116,7 +1116,7 @@ struct static_storage
     real *b_local;
 #endif
 
-    /* Level scheduling related storage for applying, e.g. ICHOLT and ILU(T),
+    /* Level scheduling related storage for applying ICHOLT/ILUT(P)/FG-ILUT
      * preconditioners */
     int levels_L;
     int levels_U;
@@ -1128,7 +1128,7 @@ struct static_storage
     unsigned int *level_rows_cnt_U;
     unsigned int *top;
 
-    /* Graph coloring related storage for applying, e.g. ICHOLT and ILU(T),
+    /* Graph coloring related storage for applying ICHOLT/ILUT(P)/FG-ILUT
      * preconditioners */
     unsigned int *color;
     unsigned int *to_color;
@@ -1142,7 +1142,7 @@ struct static_storage
     real *y_p;
     real *x_p;
 
-    /* Jacobi iteration related storage for applying, e.g. ICHOLT and ILU(T),
+    /* Jacobi iteration related storage for applying ICHOLT/ILUT(P)/FG-ILUT
      * preconditioners */
     real *Dinv_L;
     real *Dinv_U;
@@ -1150,6 +1150,11 @@ struct static_storage
     real *rp;
     real *rp2;
 
+    /* permutation for ILUTP */
+    int *perm_ilutp;
+    /* permuted residual for preconditioning with ILUTP */
+    real *r_p;
+
     int num_H;
     int *hbond_index; // for hydrogen bonds