diff --git a/puremd_rc_1003/sPuReMD/GMRES.c b/puremd_rc_1003/sPuReMD/GMRES.c
index 0bf027085667549e5bfe12eb4285ba97f2c1635e..bdce70c9bcd97772151cfdd85f072bffe2c571b4 100644
--- a/puremd_rc_1003/sPuReMD/GMRES.c
+++ b/puremd_rc_1003/sPuReMD/GMRES.c
@@ -24,6 +24,8 @@
 #include "list.h"
 #include "vector.h"
 
+#include <omp.h>
+
 
 /* sparse matrix-vector product Ax=b
  * where:
@@ -36,20 +38,35 @@ static void Sparse_MatVec( const sparse_matrix * const A,
     int i, j, k, n, si, ei;
     real H;
 #ifdef _OPENMP
-    real *b_local;
+    static real **b_local;
+    unsigned int tid;
 #endif
 
     n = A->n;
     Vector_MakeZero( b, n );
 
-    #pragma omp parallel \
-    default(none) shared(n) private(b_local, si, ei, H, i, j, k)
-    {
 #ifdef _OPENMP
-        if ( (b_local = (real*) calloc(n, sizeof(real))) == NULL )
+    /* keep b_local for program duration to avoid allocate/free
+     * overhead per Sparse_MatVec call*/
+    if ( b_local == NULL )
+    {
+        b_local = (real**) malloc( omp_get_num_threads() * sizeof(real*));
+        for ( i = 0; i < omp_get_num_threads(); ++i )
         {
-            exit( INSUFFICIENT_SPACE );
+            if ( (b_local[i] = (real*) malloc( n * sizeof(real))) == NULL )
+            {
+                exit( INSUFFICIENT_SPACE );
+            }
         }
+    }
+#endif
+
+    #pragma omp parallel \
+        default(none) shared(n, b_local) private(si, ei, H, j, k, tid)
+    {
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+        Vector_MakeZero( b_local[tid], n );
 #endif
         #pragma omp for schedule(guided)
         for ( i = 0; i < n; ++i )
@@ -62,8 +79,8 @@ static void Sparse_MatVec( const sparse_matrix * const A,
                 j = A->entries[k].j;
                 H = A->entries[k].val;
 #ifdef _OPENMP
-                b_local[j] += H * x[i];
-                b_local[i] += H * x[j];
+                b_local[tid][j] += H * x[i];
+                b_local[tid][i] += H * x[j];
 #else
                 b[j] += H * x[i];
                 b[i] += H * x[j];
@@ -72,24 +89,22 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 
             // the diagonal entry is the last one in
 #ifdef _OPENMP
-            b_local[i] += A->entries[k].val * x[i];
+            b_local[tid][i] += A->entries[k].val * x[i];
 #else
             b[i] += A->entries[k].val * x[i];
 #endif
         }
+    }
 
 #ifdef _OPENMP
-        #pragma omp critical(redux)
+    for ( tid = 0; tid < omp_get_num_threads(); ++tid )
+    {
+        for ( i = 0; i < n; ++i )
         {
-            for ( i = 0; i < n; ++i )
-            {
-                b[i] += b_local[i];
-            }
+            b[i] += b_local[tid][i];
         }
-
-        free(b_local);
-#endif
     }
+#endif
 }
 
 
diff --git a/puremd_rc_1003/sPuReMD/QEq.c b/puremd_rc_1003/sPuReMD/QEq.c
index ca8efc670ed6ac913cafd4e106a78c2fcb763e4d..04c9a0cc568c4c4dc59311a7acb87889c4564400 100644
--- a/puremd_rc_1003/sPuReMD/QEq.c
+++ b/puremd_rc_1003/sPuReMD/QEq.c
@@ -297,7 +297,7 @@ static void ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     {
         /* for each nonzero */
         #pragma omp parallel for schedule(guided) \
-            default(none) private(sum, ei_x, ei_y) firstprivate(x, y)
+            default(none) shared(DAD) private(sum, ei_x, ei_y, k) firstprivate(x, y)
         for ( j = 0; j < A->start[A->n]; ++j )
         {
             sum = ZERO;