diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c
index 35269e9045dd2dfb65fc64239e29ca8b68f2b7cc..ddbd20a046f9ecf636f8e734cebae06946289577 100644
--- a/sPuReMD/src/GMRES.c
+++ b/sPuReMD/src/GMRES.c
@@ -39,7 +39,7 @@ typedef enum
  *   x: vector
  *   b: vector (result) */
 static void Sparse_MatVec( const sparse_matrix * const A,
-        const real * const x, real * const b )
+                           const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
@@ -52,7 +52,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
     Vector_MakeZero( b, n );
 
     #pragma omp parallel \
-        default(none) shared(n, b_local) private(si, ei, H, i, j, k, tid)
+    default(none) shared(n, b_local) private(si, ei, H, i, j, k, tid)
     {
 #ifdef _OPENMP
         tid = omp_get_thread_num();
@@ -64,9 +64,9 @@ static void Sparse_MatVec( const sparse_matrix * const A,
             if ( b_local == NULL )
             {
                 if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL )
-		{
+                {
                     exit( INSUFFICIENT_MEMORY );
-		}
+                }
             }
 
             Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n );
@@ -125,8 +125,8 @@ static void Sparse_MatVec( const sparse_matrix * const A,
  *   b: vector (result)
  *   D^{-1}b (Dinv_b): precomputed vector-vector product */
 static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
-        const TRIANGULARITY tri, const real * const Dinv,
-        const real * const x, real * const b, const real * const Dinv_b)
+                                      const TRIANGULARITY tri, const real * const Dinv,
+                                      const real * const x, real * const b, const real * const Dinv_b)
 {
     int i, k, si = 0, ei = 0;
 #ifdef _OPENMP
@@ -137,7 +137,7 @@ static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
     Vector_MakeZero( b, R->n );
 
     #pragma omp parallel \
-        default(none) shared(b_local) private(si, ei, i, k, tid)
+    default(none) shared(b_local) private(si, ei, i, k, tid)
     {
 #ifdef _OPENMP
         tid = omp_get_thread_num();
@@ -152,10 +152,10 @@ static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
                 {
                     exit( INSUFFICIENT_MEMORY );
                 }
-	    }
+            }
 
-	    Vector_MakeZero( b_local, omp_get_num_threads() * R->n );
-	}
+            Vector_MakeZero( b_local, omp_get_num_threads() * R->n );
+        }
         #pragma omp barrier
 
 #endif
@@ -197,7 +197,7 @@ static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
                 b[i] += b_local[k * R->n + i];
             }
 
-	    b[i] += Dinv_b[i];
+            b[i] += Dinv_b[i];
         }
 #endif
     }
@@ -205,12 +205,12 @@ static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R,
 
 
 static void diag_pre_app( const real * const Hdia_inv, const real * const y,
-        real * const x, const int N )
+                          real * const x, const int N )
 {
     unsigned int i;
 
     #pragma omp parallel for schedule(guided) \
-        default(none) private(i)
+    default(none) private(i)
     for ( i = 0; i < N; ++i )
     {
         x[i] = y[i] * Hdia_inv[i];
@@ -224,12 +224,12 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y,
  * y: constants in linear system (RHS)
  * x: solution
  * tri: triangularity of LU (lower/upper)
- * 
+ *
  * Assumptions:
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
 static void tri_solve( const sparse_matrix * const LU, const real * const y,
-        real * const x, const TRIANGULARITY tri )
+                       real * const x, const TRIANGULARITY tri )
 {
     int i, pj, j, si, ei;
     real val;
@@ -276,12 +276,12 @@ static void tri_solve( const sparse_matrix * const LU, const real * const y,
  * x: solution
  * tri: triangularity of LU (lower/upper)
  * find_levels: perform level search if positive, otherwise reuse existing levels
- * 
+ *
  * Assumptions:
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
 static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y,
-        real * const x, const TRIANGULARITY tri, int find_levels )
+                                   real * const x, const TRIANGULARITY tri, int find_levels )
 {
     int i, j, pj, local_row, local_level, levels;
     static int levels_L = 1, levels_U = 1;
@@ -307,8 +307,8 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
     if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
     {
         if ( (row_levels = (unsigned int*) malloc(LU->n * sizeof(unsigned int))) == NULL
-            || (level_rows = (unsigned int*) malloc(LU->n * LU->n * sizeof(unsigned int))) == NULL
-            || (level_rows_cnt = (unsigned int*) malloc(LU->n * sizeof(unsigned int))) == NULL )
+                || (level_rows = (unsigned int*) malloc(LU->n * LU->n * sizeof(unsigned int))) == NULL
+                || (level_rows_cnt = (unsigned int*) malloc(LU->n * sizeof(unsigned int))) == NULL )
         {
             fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
             exit( INSUFFICIENT_MEMORY );
@@ -330,7 +330,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
                 {
                     local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                 }
-        
+
                 levels = MAX( levels, local_level + 1 );
                 row_levels[i] = local_level;
                 level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i;
@@ -346,7 +346,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
                 {
                     local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                 }
-        
+
                 levels = MAX( levels, local_level + 1 );
                 row_levels[i] = local_level;
                 level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i;
@@ -361,7 +361,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
         for ( i = 0; i < levels; ++i )
         {
             #pragma omp parallel for schedule(guided) \
-                default(none) private(j, pj, local_row) shared(stderr, i, level_rows_cnt, level_rows)
+            default(none) private(j, pj, local_row) shared(stderr, i, level_rows_cnt, level_rows)
             for ( j = 0; j < level_rows_cnt[i]; ++j )
             {
                 local_row = level_rows[i * LU->n + j];
@@ -369,7 +369,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
                 for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                 {
                     x[local_row] -= LU->val[pj] * x[LU->j[pj]];
-    
+
                 }
                 x[local_row] /= LU->val[pj];
             }
@@ -380,7 +380,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
         for ( i = 0; i < levels; ++i )
         {
             #pragma omp parallel for schedule(guided) \
-                default(none) private(j, pj, local_row) shared(i, level_rows_cnt, level_rows)
+            default(none) private(j, pj, local_row) shared(i, level_rows_cnt, level_rows)
             for ( j = 0; j < level_rows_cnt[i]; ++j )
             {
                 local_row = level_rows[i * LU->n + j];
@@ -388,7 +388,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
                 for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                 {
                     x[local_row] -= LU->val[pj] * x[LU->j[pj]];
-    
+
                 }
                 x[local_row] /= LU->val[LU->start[local_row]];
             }
@@ -425,8 +425,8 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
  * Note: Newmann series arises from series expansion of the inverse of
  * the coefficient matrix in the triangular system */
 static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
-        const real * const b, real * const x, const TRIANGULARITY tri,
-        const unsigned int maxiter )
+                         const real * const b, real * const x, const TRIANGULARITY tri,
+                         const unsigned int maxiter )
 {
     unsigned int i, k, si = 0, ei = 0, iter = 0;
 #ifdef _OPENMP
@@ -436,12 +436,12 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
     static real *Dinv_b, *rp, *rp2, *rp3;
 
     #pragma omp parallel \
-        default(none) shared(b_local, Dinv_b, rp, rp2, rp3, iter, stderr) private(si, ei, i, k, tid)
+    default(none) shared(b_local, Dinv_b, rp, rp2, rp3, iter, stderr) private(si, ei, i, k, tid)
     {
 #ifdef _OPENMP
         tid = omp_get_thread_num();
 #endif
-    
+
         #pragma omp master
         {
             /* keep b_local for program duration to avoid allocate/free
@@ -464,7 +464,7 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
             }
             if ( rp2 == NULL )
             {
-                    if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
+                if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
                 {
                     fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                     exit( INSUFFICIENT_MEMORY );
@@ -478,14 +478,14 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                     fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                     exit( INSUFFICIENT_MEMORY );
                 }
-	    }
+            }
 #endif
-    
+
             Vector_MakeZero( rp, R->n );
-	}
-    
+        }
+
         #pragma omp barrier
-    
+
         /* precompute and cache, as invariant in loop below */
         #pragma omp for schedule(guided)
         for ( i = 0; i < R->n; ++i )
@@ -498,15 +498,15 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
         do
         {
             // x_{k+1} = G*x_{k} + Dinv*b;
-	    //Sparse_MatVec_Vector_Add( (sparse_matrix*)R, tri, Dinv, rp, rp2, Dinv_b );
+            //Sparse_MatVec_Vector_Add( (sparse_matrix*)R, tri, Dinv, rp, rp2, Dinv_b );
             #pragma omp master
             {
                 Vector_MakeZero( rp2, R->n );
 #ifdef _OPENMP
-    	        Vector_MakeZero( b_local, omp_get_num_threads() * R->n );
+                Vector_MakeZero( b_local, omp_get_num_threads() * R->n );
 #endif
             }
-    
+
             #pragma omp barrier
 
             #pragma omp for schedule(guided)
@@ -519,11 +519,11 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                 }
                 else
                 {
-    
+
                     si = R->start[i] + 1;
                     ei = R->start[i + 1];
                 }
-    
+
                 for ( k = si; k < ei; ++k )
                 {
 #ifdef _OPENMP
@@ -538,7 +538,7 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                 rp2[i] *= -Dinv[i];
 #endif
             }
-	    
+
             #pragma omp barrier
 
             #pragma omp for schedule(guided)
@@ -551,7 +551,7 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                 }
 #endif
 
-    	        rp2[i] += Dinv_b[i];
+                rp2[i] += Dinv_b[i];
             }
 
             #pragma omp master
@@ -578,118 +578,118 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
  * y: constants in linear system (RHS)
  * x: solution
  * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
- * 
+ *
  * Assumptions:
  *   Matrices have non-zero diagonals
  *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
 void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
-        const real * const y, real * const x, const int fresh_pre )
+                           const real * const y, real * const x, const int fresh_pre )
 {
     int i, si;
     static real *Dinv_L = NULL, *Dinv_U = NULL;
 
     switch ( control->pre_app_type )
     {
-        case NONE_PA:
+    case NONE_PA:
+        break;
+    case TRI_SOLVE_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
             break;
-        case TRI_SOLVE_PA:
-            switch ( control->pre_comp_type )
-            {
-                case DIAG_PC:
-                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
-                    break;
-                case ICHOLT_PC:
-                case ILU_PAR_PC:
-                case ILUT_PAR_PC:
-                    tri_solve( workspace->L, y, x, LOWER );
-                    tri_solve( workspace->U, y, x, UPPER );
-                    break;
-                default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-                    exit( INVALID_INPUT );
-                    break;
-            }
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            tri_solve( workspace->L, y, x, LOWER );
+            tri_solve( workspace->U, y, x, UPPER );
             break;
-        case TRI_SOLVE_LEVEL_SCHED_PA:
-            switch ( control->pre_comp_type )
-            {
-                case DIAG_PC:
-                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
-                    break;
-                case ICHOLT_PC:
-                case ILU_PAR_PC:
-                case ILUT_PAR_PC:
-                    tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
-                    tri_solve_level_sched( workspace->U, y, x, UPPER, fresh_pre );
-                    break;
-                default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-                    exit( INVALID_INPUT );
-                    break;
-            }
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
             break;
-        case JACOBI_ITER_PA:
-            switch ( control->pre_comp_type )
+        }
+        break;
+    case TRI_SOLVE_LEVEL_SCHED_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
+            tri_solve_level_sched( workspace->U, y, x, UPPER, fresh_pre );
+            break;
+        default:
+            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        }
+        break;
+    case JACOBI_ITER_PA:
+        switch ( control->pre_comp_type )
+        {
+        case DIAG_PC:
+            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
+        case ICHOLT_PC:
+        case ILU_PAR_PC:
+        case ILUT_PAR_PC:
+            if ( Dinv_L == NULL )
             {
-                case DIAG_PC:
-                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
-                    exit( INVALID_INPUT );
-                    break;
-                case ICHOLT_PC:
-                case ILU_PAR_PC:
-                case ILUT_PAR_PC:
-                    if ( Dinv_L == NULL )
-                    {
-                        if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
-                        {
-                            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                            exit( INSUFFICIENT_MEMORY );
-                        }
-                    }
-
-                    /* construct D^{-1}_L */
-                    if ( fresh_pre )
-                    {
-                        for ( i = 0; i < workspace->L->n; ++i )
-                        {
-                            si = workspace->L->start[i + 1] - 1;
-                            Dinv_L[i] = 1. / workspace->L->val[si];
-                        }
-                    }
+                if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
 
-                    jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters );
+            /* construct D^{-1}_L */
+            if ( fresh_pre )
+            {
+                for ( i = 0; i < workspace->L->n; ++i )
+                {
+                    si = workspace->L->start[i + 1] - 1;
+                    Dinv_L[i] = 1. / workspace->L->val[si];
+                }
+            }
 
-                    if ( Dinv_U == NULL )
-                    {
-                        if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
-                        {
-                            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                            exit( INSUFFICIENT_MEMORY );
-                        }
-                    }
+            jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters );
 
-                    /* construct D^{-1}_U */
-                    if ( fresh_pre )
-                    {
-                        for ( i = 0; i < workspace->U->n; ++i )
-                        {
-                            si = workspace->U->start[i];
-                            Dinv_U[i] = 1. / workspace->U->val[si];
-                        }
-                    }
+            if ( Dinv_U == NULL )
+            {
+                if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
 
-                    jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->pre_app_jacobi_iters );
-                    break;
-                default:
-                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
-                    exit( INVALID_INPUT );
-                    break;
+            /* construct D^{-1}_U */
+            if ( fresh_pre )
+            {
+                for ( i = 0; i < workspace->U->n; ++i )
+                {
+                    si = workspace->U->start[i];
+                    Dinv_U[i] = 1. / workspace->U->val[si];
+                }
             }
+
+            jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->pre_app_jacobi_iters );
             break;
         default:
             fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
             exit( INVALID_INPUT );
             break;
+        }
+        break;
+    default:
+        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
 
     }
 
@@ -699,8 +699,8 @@ void apply_preconditioner( const static_storage * const workspace, const control
 
 /* generalized minimual residual iterative solver for sparse linear systems */
 int GMRES( const static_storage * const workspace, const control_params * const control,
-        const sparse_matrix * const H, const real * const b, real tol, real * const x,
-        const FILE * const fout, real * const time, real * const spmv_time, const int fresh_pre )
+           const sparse_matrix * const H, const real * const b, real tol, real * const x,
+           const FILE * const fout, real * const time, real * const spmv_time, const int fresh_pre )
 {
     int i, j, k, itr, N, si;
     real cc, tmp1, tmp2, temp, bnorm, time_start;
@@ -729,7 +729,6 @@ int GMRES( const static_storage * const workspace, const control_params * const
             *time += Get_Timing_Info( time_start );
         }
 
-
         if ( control->pre_comp_type == DIAG_PC )
         {
             Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N );
@@ -743,7 +742,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
         {
             time_start = Get_Time( );
             apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0],
-                    itr == 0 ? fresh_pre : 0 );
+                                  itr == 0 ? fresh_pre : 0 );
             *time += Get_Timing_Info( time_start );
         }
 
@@ -766,61 +765,60 @@ int GMRES( const static_storage * const workspace, const control_params * const
             if ( control->pre_comp_type == DIAG_PC )
             {
                 /* apply modified Gram-Schmidt to orthogonalize the new residual */
-                for( i = 0; i <= j; i++ )
+                for ( i = 0; i <= j; i++ )
                 {
-                    workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j+1], N );
-                    Vector_Add( workspace->v[j+1], -workspace->h[i][j], workspace->v[i], N );
-                }
-                
-                workspace->h[j+1][j] = Norm( workspace->v[j+1], N );
-                Vector_Scale( workspace->v[j+1], 1. / workspace->h[j+1][j], workspace->v[j+1], N );
-                // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-                
-                /* Givens rotations on the upper-Hessenberg matrix to make it U */
-                for( i = 0; i <= j; i++ )
-                {
-                    if( i == j )
-                    {
-                        cc = SQRT( SQR(workspace->h[j][j])+SQR(workspace->h[j+1][j]) );
-                        workspace->hc[j] = workspace->h[j][j] / cc;
-                        workspace->hs[j] = workspace->h[j+1][j] / cc;
-                    }
-                
-                    tmp1 =  workspace->hc[i] * workspace->h[i][j] +
-                    workspace->hs[i] * workspace->h[i+1][j];
-                    tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                    workspace->hc[i] * workspace->h[i+1][j];
-                    
-                    workspace->h[i][j] = tmp1;
-                    workspace->h[i+1][j] = tmp2;
+                    workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N );
+                    Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
                 }
-                
-                /* apply Givens rotations to the rhs as well */
-                tmp1 =  workspace->hc[j] * workspace->g[j];
-                tmp2 = -workspace->hs[j] * workspace->g[j];
-                workspace->g[j] = tmp1;
-                workspace->g[j+1] = tmp2;
             }
-	    else
+            else
             {
+                //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
                 /* apply modified Gram-Schmidt to orthogonalize the new residual */
                 for ( i = 0; i < j - 1; i++ )
                 {
                     workspace->h[i][j] = 0;
                 }
-    
-                //for( i = 0; i <= j; i++ ) {
+
                 for ( i = MAX(j - 1, 0); i <= j; i++ )
                 {
                     workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N );
                     Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
                 }
-    
-                workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
-                Vector_Scale( workspace->v[j + 1],
-                              1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
-                // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-    
+            }
+
+
+            workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
+            Vector_Scale( workspace->v[j + 1],
+                          1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
+#if defined(DEBUG)
+            fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+#endif
+
+            if ( control->pre_comp_type == DIAG_PC )
+            {
+                /* Givens rotations on the upper-Hessenberg matrix to make it U */
+                for ( i = 0; i <= j; i++ )
+                {
+                    if ( i == j )
+                    {
+                        cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
+                        workspace->hc[j] = workspace->h[j][j] / cc;
+                        workspace->hs[j] = workspace->h[j + 1][j] / cc;
+                    }
+
+                    tmp1 =  workspace->hc[i] * workspace->h[i][j] +
+                            workspace->hs[i] * workspace->h[i + 1][j];
+                    tmp2 = -workspace->hs[i] * workspace->h[i][j] +
+                           workspace->hc[i] * workspace->h[i + 1][j];
+
+                    workspace->h[i][j] = tmp1;
+                    workspace->h[i + 1][j] = tmp2;
+                }
+            }
+            else
+            {
+                //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
                 /* Givens rotations on the upper-Hessenberg matrix to make it U */
                 for ( i = MAX(j - 1, 0); i <= j; i++ )
                 {
@@ -830,23 +828,23 @@ int GMRES( const static_storage * const workspace, const control_params * const
                         workspace->hc[j] = workspace->h[j][j] / cc;
                         workspace->hs[j] = workspace->h[j + 1][j] / cc;
                     }
-    
+
                     tmp1 =  workspace->hc[i] * workspace->h[i][j] +
                             workspace->hs[i] * workspace->h[i + 1][j];
                     tmp2 = -workspace->hs[i] * workspace->h[i][j] +
                            workspace->hc[i] * workspace->h[i + 1][j];
-    
+
                     workspace->h[i][j] = tmp1;
                     workspace->h[i + 1][j] = tmp2;
                 }
-    
-                /* apply Givens rotations to the rhs as well */
-                tmp1 =  workspace->hc[j] * workspace->g[j];
-                tmp2 = -workspace->hs[j] * workspace->g[j];
-                workspace->g[j] = tmp1;
-                workspace->g[j + 1] = tmp2;
             }
 
+            /* apply Givens rotations to the rhs as well */
+            tmp1 =  workspace->hc[j] * workspace->g[j];
+            tmp2 = -workspace->hs[j] * workspace->g[j];
+            workspace->g[j] = tmp1;
+            workspace->g[j + 1] = tmp2;
+
             //fprintf( stderr, "h: " );
             //for( i = 0; i <= j+1; ++i )
             //fprintf( stderr, "%.6f ", workspace->h[i][j] );
@@ -854,7 +852,6 @@ int GMRES( const static_storage * const workspace, const control_params * const
             //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
         }
 
-
         /* TODO: solve using Jacobi iteration? */
         /* solve Hy = g: H is now upper-triangular, do back-substitution */
         for ( i = j - 1; i >= 0; i-- )
@@ -908,8 +905,8 @@ int GMRES( const static_storage * const workspace, const control_params * const
 
 
 int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control,
-        const sparse_matrix * const H, const real * const b, real tol, real * const x,
-        const FILE * const fout, real * const time, real * const spmv_time, const int fresh_pre )
+                       const sparse_matrix * const H, const real * const b, real tol, real * const x,
+                       const FILE * const fout, real * const time, real * const spmv_time, const int fresh_pre )
 {
     int  i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -1271,7 +1268,7 @@ int SDM( static_storage *workspace, sparse_matrix *H,
  * using the 1-norm of (LU)^{-1}e, with e = [1 1 ... 1]^T through 2 triangular solves:
  *   1) Ly = e
  *   2) Ux = y where y = Ux
- * That is, we seek to solve e = LUx for unknown x 
+ * That is, we seek to solve e = LUx for unknown x
  *
  * Reference: Incomplete LU Preconditioning with the Multilevel Fast Multipole Algorithm
  *   for Electromagnetic Scattering, SIAM J. Sci. Computing, 2007 */
diff --git a/sPuReMD/src/vector.c b/sPuReMD/src/vector.c
index bdbe19da73b56f3ee9a81aa7704ba4f6e9100556..f784ee11e73d4e2ce0b59fd00e67af6d025f903c 100644
--- a/sPuReMD/src/vector.c
+++ b/sPuReMD/src/vector.c
@@ -24,17 +24,18 @@
 
 inline int Vector_isZero( const real * const v, const unsigned int k )
 {
-    unsigned int i;
+    unsigned int i, ret = TRUE;
 
+    #pragma omp parallel for default(none) private(i) reduction(&&: ret) schedule(guided)
     for ( i = 0; i < k; ++i )
     {
         if ( fabs( v[i] ) > ALMOST_ZERO )
         {
-            return 0;
+            ret = FALSE;
         }
     }
 
-    return 1;
+    return ret;
 }
 
 
@@ -64,6 +65,7 @@ inline void Vector_Scale( real * const dest, const real c, const real * const v,
 {
     unsigned int i;
 
+    #pragma omp parallel for default(none) private(i) schedule(guided)
     for ( i = 0; i < k; ++i )
     {
         dest[i] = c * v[i];
@@ -76,6 +78,7 @@ inline void Vector_Sum( real * const dest, const real c, const real * const v, c
 {
     unsigned int i;
 
+    #pragma omp parallel for default(none) private(i) schedule(guided)
     for ( i = 0; i < k; ++i )
     {
         dest[i] = c * v[i] + d * y[i];
@@ -87,6 +90,7 @@ inline void Vector_Add( real * const dest, const real c, const real * const v, c
 {
     unsigned int i;
 
+    #pragma omp parallel for default(none) private(i) schedule(guided)
     for ( i = 0; i < k; ++i )
     {
         dest[i] += c * v[i];
@@ -113,6 +117,7 @@ inline real Dot( const real * const v1, const real * const v2, const unsigned in
     real ret = ZERO;
     unsigned int i;
 
+    #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(guided)
     for ( i = 0; i < k; ++i )
     {
         ret +=  v1[i] * v2[i];
@@ -127,6 +132,7 @@ inline real Norm( const real * const v1, const unsigned int k )
     real ret = ZERO;
     unsigned int i;
 
+    #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(guided)
     for ( i = 0; i < k; ++i )
     {
         ret +=  SQR( v1[i] );
@@ -138,24 +144,32 @@ inline real Norm( const real * const v1, const unsigned int k )
 
 inline void rvec_Copy( rvec dest, const rvec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 inline void rvec_Scale( rvec ret, const real c, const rvec v )
 {
-    ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
+    ret[0] = c * v[0];
+    ret[1] = c * v[1];
+    ret[2] = c * v[2];
 }
 
 
 inline void rvec_Add( rvec ret, const rvec v )
 {
-    ret[0] += v[0], ret[1] += v[1], ret[2] += v[2];
+    ret[0] += v[0];
+    ret[1] += v[1];
+    ret[2] += v[2];
 }
 
 
 inline void rvec_ScaledAdd( rvec ret, const real c, const rvec v )
 {
-    ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
+    ret[0] += c * v[0];
+    ret[1] += c * v[1];
+    ret[2] += c * v[2];
 }
 
 
@@ -269,9 +283,9 @@ inline int rvec_isZero( const rvec v )
             fabs(v[1]) > ALMOST_ZERO ||
             fabs(v[2]) > ALMOST_ZERO )
     {
-        return 0;
+        return FALSE;
     }
-    return 1;
+    return TRUE;
 }
 
 
@@ -455,10 +469,18 @@ inline void rtensor_MakeZero( rtensor t )
 
 inline void rtensor_Transpose( rtensor ret, rtensor t )
 {
-    ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-    ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-    ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-    ret[2][0] = t[0][2], ret[2][1] = t[1][2];
+    ret[0][0] = t[0][0];
+    ret[1][1] = t[1][1];
+    ret[2][2] = t[2][2];
+
+    ret[0][1] = t[1][0];
+    ret[0][2] = t[2][0];
+
+    ret[1][0] = t[0][1];
+    ret[1][2] = t[2][1];
+
+    ret[2][0] = t[0][2];
+    ret[2][1] = t[1][2];
 }
 
 
@@ -498,7 +520,9 @@ inline void ivec_MakeZero( ivec v )
 
 inline void ivec_Copy( ivec dest, const ivec src )
 {
-    dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    dest[0] = src[0];
+    dest[1] = src[1];
+    dest[2] = src[2];
 }
 
 
@@ -522,9 +546,9 @@ inline int ivec_isZero( const ivec v )
 {
     if ( v[0] == 0 && v[1] == 0 && v[2] == 0 )
     {
-        return 1;
+        return TRUE;
     }
-    return 0;
+    return FALSE;
 }
 
 
@@ -532,10 +556,10 @@ inline int ivec_isEqual( const ivec v1, const ivec v2 )
 {
     if ( v1[0] == v2[0] && v1[1] == v2[1] && v1[2] == v2[2] )
     {
-        return 1;
+        return TRUE;
     }
 
-    return 0;
+    return FALSE;
 }