diff --git a/puremd_rc_1003/sPuReMD/GMRES.c b/puremd_rc_1003/sPuReMD/GMRES.c
index 8f8d0d2040c79c055000a3dd2ee277daa0f8db45..2d4150ac86ddf5ff2f1ab3c9f2f6cb03c3592c14 100644
--- a/puremd_rc_1003/sPuReMD/GMRES.c
+++ b/puremd_rc_1003/sPuReMD/GMRES.c
@@ -63,7 +63,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 		}
             }
 
-            Vector_MakeZero( b_local, omp_get_num_threads() * n );
+            Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n );
         }
         #pragma omp barrier
 
@@ -259,52 +259,144 @@ static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri,
                          const real * const Dinv, const unsigned int n,
                          const real * const b, real * const x, const unsigned int maxiter )
 {
-    unsigned int i, iter = 0;
+    unsigned int i, k, si = 0, ei = 0, iter = 0;
+#ifdef _OPENMP
+    static real *b_local;
+    unsigned int tid;
+#endif
     static real *Dinv_b, *rp, *rp2, *rp3;
 
-    if ( Dinv_b == NULL )
+    #pragma omp parallel \
+        default(none) shared(b, b_local, Dinv_b, rp, rp2, rp3, iter, stderr) private(si, ei, i, k, tid)
     {
-        if ( (Dinv_b = (real*) malloc(sizeof(real) * n)) == NULL )
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+#endif
+    
+        #pragma omp master
         {
-            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-            exit(INSUFFICIENT_SPACE);
-        }
-    }
-    if ( rp == NULL )
-    {
-        if ( (rp = (real*) malloc(sizeof(real) * n)) == NULL )
+            /* keep b_local for program duration to avoid allocate/free
+             * overhead per Sparse_MatVec call*/
+            if ( Dinv_b == NULL )
+            {
+                if ( (Dinv_b = (real*) malloc(sizeof(real) * n)) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                    exit(INSUFFICIENT_SPACE);
+                }
+            }
+            if ( rp == NULL )
+            {
+                if ( (rp = (real*) malloc(sizeof(real) * n)) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                    exit(INSUFFICIENT_SPACE);
+                }
+            }
+            if ( rp2 == NULL )
+            {
+                    if ( (rp2 = (real*) malloc(sizeof(real) * n)) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                    exit(INSUFFICIENT_SPACE);
+                }
+            }
+#ifdef _OPENMP
+            if ( b_local == NULL )
+            {
+                if ( (b_local = (real*) malloc( omp_get_num_threads() * R->n * sizeof(real))) == NULL )
+                {
+                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
+                    exit( INSUFFICIENT_SPACE );
+                }
+	    }
+#endif
+    
+            Vector_MakeZero( rp, n );
+	}
+    
+        #pragma omp barrier
+    
+        /* precompute and cache, as invariant in loop below */
+        #pragma omp for schedule(guided)
+        for ( i = 0; i < n; ++i )
         {
-            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-            exit(INSUFFICIENT_SPACE);
+            Dinv_b[i] = Dinv[i] * b[i];
         }
-    }
-    if ( rp2 == NULL )
-    {
-            if ( (rp2 = (real*) malloc(sizeof(real) * n)) == NULL )
+
+        #pragma omp barrier
+
+        do
         {
-            fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-            exit(INSUFFICIENT_SPACE);
-        }
-    }
+            // x_{k+1} = G*x_{k} + Dinv*b;
+	    //Sparse_MatVec_Vector_Add( (sparse_matrix*)R, tri, Dinv, rp, rp2, Dinv_b );
+            #pragma omp master
+            {
+                Vector_MakeZero( rp2, R->n );
+#ifdef _OPENMP
+    	        Vector_MakeZero( b_local, omp_get_num_threads() * R->n );
+#endif
+            }
+    
+            #pragma omp barrier
 
-    Vector_MakeZero( rp, n );
+            #pragma omp for schedule(guided)
+            for ( i = 0; i < R->n; ++i )
+            {
+                if (tri == LOWER)
+                {
+                    si = R->start[i];
+                    ei = R->start[i + 1] - 1;
+                }
+                else if (tri == UPPER)
+                {
+    
+                    si = R->start[i] + 1;
+                    ei = R->start[i + 1];
+                }
+    
+                for ( k = si; k < ei; ++k )
+                {
+#ifdef _OPENMP
+                    b_local[tid * R->n + i] += R->entries[k].val * rp[R->entries[k].j];
+#else
+                    rp2[i] += R->entries[k].val * rp[R->entries[k].j];
+#endif
+                }
+#ifdef _OPENMP
+                b_local[tid * R->n + i] *= -Dinv[i];
+#else
+                rp2[i] *= -Dinv[i];
+#endif
+            }
+	    
+            #pragma omp barrier
 
-    /* precompute and cache, as invariant in loop below */
-    for ( i = 0; i < n; ++i )
-    {
-        Dinv_b[i] = Dinv[i] * b[i];
-    }
+            #pragma omp for schedule(guided)
+            for ( i = 0; i < R->n; ++i )
+            {
+#ifdef _OPENMP
+                for ( k = 0; k < omp_get_num_threads(); ++k )
+                {
+                    rp2[i] += b_local[k * R->n + i];
+                }
+#endif
 
-    do
-    {
-        // x_{k+1} = G*x_{k} + Dinv*b;
-        Sparse_MatVec_Vector_Add( (sparse_matrix*)R, tri, Dinv, rp, rp2, Dinv_b );
-        rp3 = rp;
-        rp = rp2;
-        rp2 = rp3;
-        ++iter;
+    	        rp2[i] += Dinv_b[i];
+            }
+
+            #pragma omp master
+            {
+                rp3 = rp;
+                rp = rp2;
+                rp2 = rp3;
+                ++iter;
+            }
+
+            #pragma omp barrier
+        }
+        while ( iter < maxiter );
     }
-    while ( iter < maxiter );
 
     Vector_Copy( x, rp, n );
 }