diff --git a/puremd_rc_1003/sPuReMD/GMRES.c b/puremd_rc_1003/sPuReMD/GMRES.c index 0bf027085667549e5bfe12eb4285ba97f2c1635e..bdce70c9bcd97772151cfdd85f072bffe2c571b4 100644 --- a/puremd_rc_1003/sPuReMD/GMRES.c +++ b/puremd_rc_1003/sPuReMD/GMRES.c @@ -24,6 +24,8 @@ #include "list.h" #include "vector.h" +#include <omp.h> + /* sparse matrix-vector product Ax=b * where: @@ -36,20 +38,35 @@ static void Sparse_MatVec( const sparse_matrix * const A, int i, j, k, n, si, ei; real H; #ifdef _OPENMP - real *b_local; + static real **b_local; + unsigned int tid; #endif n = A->n; Vector_MakeZero( b, n ); - #pragma omp parallel \ - default(none) shared(n) private(b_local, si, ei, H, i, j, k) - { #ifdef _OPENMP - if ( (b_local = (real*) calloc(n, sizeof(real))) == NULL ) + /* keep b_local for program duration to avoid allocate/free + * overhead per Sparse_MatVec call*/ + if ( b_local == NULL ) + { + b_local = (real**) malloc( omp_get_num_threads() * sizeof(real*)); + for ( i = 0; i < omp_get_num_threads(); ++i ) { - exit( INSUFFICIENT_SPACE ); + if ( (b_local[i] = (real*) malloc( n * sizeof(real))) == NULL ) + { + exit( INSUFFICIENT_SPACE ); + } } + } +#endif + + #pragma omp parallel \ + default(none) shared(n, b_local) private(si, ei, H, j, k, tid) + { +#ifdef _OPENMP + tid = omp_get_thread_num(); + Vector_MakeZero( b_local[tid], n ); #endif #pragma omp for schedule(guided) for ( i = 0; i < n; ++i ) @@ -62,8 +79,8 @@ static void Sparse_MatVec( const sparse_matrix * const A, j = A->entries[k].j; H = A->entries[k].val; #ifdef _OPENMP - b_local[j] += H * x[i]; - b_local[i] += H * x[j]; + b_local[tid][j] += H * x[i]; + b_local[tid][i] += H * x[j]; #else b[j] += H * x[i]; b[i] += H * x[j]; @@ -72,24 +89,22 @@ static void Sparse_MatVec( const sparse_matrix * const A, // the diagonal entry is the last one in #ifdef _OPENMP - b_local[i] += A->entries[k].val * x[i]; + b_local[tid][i] += A->entries[k].val * x[i]; #else b[i] += A->entries[k].val * x[i]; #endif } + } #ifdef _OPENMP - #pragma omp critical(redux) + for ( tid = 0; tid < omp_get_num_threads(); ++tid ) + { + for ( i = 0; i < n; ++i ) { - for ( i = 0; i < n; ++i ) - { - b[i] += b_local[i]; - } + b[i] += b_local[tid][i]; } - - free(b_local); -#endif } +#endif } diff --git a/puremd_rc_1003/sPuReMD/QEq.c b/puremd_rc_1003/sPuReMD/QEq.c index ca8efc670ed6ac913cafd4e106a78c2fcb763e4d..04c9a0cc568c4c4dc59311a7acb87889c4564400 100644 --- a/puremd_rc_1003/sPuReMD/QEq.c +++ b/puremd_rc_1003/sPuReMD/QEq.c @@ -297,7 +297,7 @@ static void ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero */ #pragma omp parallel for schedule(guided) \ - default(none) private(sum, ei_x, ei_y) firstprivate(x, y) + default(none) shared(DAD) private(sum, ei_x, ei_y, k) firstprivate(x, y) for ( j = 0; j < A->start[A->n]; ++j ) { sum = ZERO;