From 987e4f476a5fe3e6d3ae902cdf9124ddd57adeba Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu>
Date: Wed, 7 Sep 2016 17:46:31 -0400
Subject: [PATCH] sPuReMD: fix Jacobi iteration synchronization issues and
 change inner SpMV thread scheduling policy to guided for load balancing.

---
 sPuReMD/src/GMRES.c | 63 +++++++--------------------------------------
 1 file changed, 10 insertions(+), 53 deletions(-)

diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c
index c08019ff..9e6f83b3 100644
--- a/sPuReMD/src/GMRES.c
+++ b/sPuReMD/src/GMRES.c
@@ -367,24 +367,22 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                          const real * const b, real * const x, const TRIANGULARITY tri,
                          const unsigned int maxiter )
 {
-    unsigned int i, k, si = 0, ei = 0, iter = 0;
+    unsigned int i, k, si = 0, ei = 0, iter;
 #ifdef _OPENMP
-    static real *b_local;
     unsigned int tid;
 #endif
     static real *Dinv_b, *rp, *rp2, *rp3;
 
     #pragma omp parallel \
-    default(none) shared(b_local, Dinv_b, rp, rp2, rp3, iter, stderr) private(si, ei, i, k, tid)
+    default(none) shared(Dinv_b, rp, rp2, rp3, stderr) private(si, ei, i, k, iter, tid)
     {
 #ifdef _OPENMP
         tid = omp_get_thread_num();
 #endif
+        iter = 0;
 
         #pragma omp master
         {
-            /* keep b_local for program duration to avoid allocate/free
-             * overhead per Sparse_MatVec call*/
             if ( Dinv_b == NULL )
             {
                 if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL )
@@ -409,16 +407,6 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                     exit( INSUFFICIENT_MEMORY );
                 }
             }
-#ifdef _OPENMP
-            if ( b_local == NULL )
-            {
-                if ( (b_local = (real*) malloc( omp_get_num_threads() * R->n * sizeof(real))) == NULL )
-                {
-                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
-            }
-#endif
 
             Vector_MakeZero( rp, R->n );
         }
@@ -432,22 +420,12 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
             Dinv_b[i] = Dinv[i] * b[i];
         }
 
-        #pragma omp barrier
-
         do
         {
-            // x_{k+1} = G*x_{k} + Dinv*b;
-            #pragma omp master
-            {
-                Vector_MakeZero( rp2, R->n );
-#ifdef _OPENMP
-                Vector_MakeZero( b_local, omp_get_num_threads() * R->n );
-#endif
-            }
-
             #pragma omp barrier
 
-            #pragma omp for schedule(static)
+            // x_{k+1} = G*x_{k} + Dinv*b;
+            #pragma omp for schedule(guided)
             for ( i = 0; i < R->n; ++i )
             {
                 if (tri == LOWER)
@@ -462,33 +440,14 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                     ei = R->start[i + 1];
                 }
 
+                rp2[i] = 0.;
+
                 for ( k = si; k < ei; ++k )
                 {
-#ifdef _OPENMP
-                    b_local[tid * R->n + i] += R->val[k] * rp[R->j[k]];
-#else
                     rp2[i] += R->val[k] * rp[R->j[k]];
-#endif
                 }
-#ifdef _OPENMP
-                b_local[tid * R->n + i] *= -Dinv[i];
-#else
-                rp2[i] *= -Dinv[i];
-#endif
-            }
-
-            #pragma omp barrier
-
-            #pragma omp for schedule(static)
-            for ( i = 0; i < R->n; ++i )
-            {
-#ifdef _OPENMP
-                for ( k = 0; k < omp_get_num_threads(); ++k )
-                {
-                    rp2[i] += b_local[k * R->n + i];
-                }
-#endif
 
+                rp2[i] *= -Dinv[i];
                 rp2[i] += Dinv_b[i];
             }
 
@@ -497,12 +456,10 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                 rp3 = rp;
                 rp = rp2;
                 rp2 = rp3;
-                ++iter;
             }
 
-            #pragma omp barrier
-        }
-        while ( iter < maxiter );
+            ++iter;
+        } while ( iter < maxiter );
     }
 
     Vector_Copy( x, rp, R->n );
-- 
GitLab