From 3b478c091fed2d8f6ceadef2698a20a2a3afc8dd Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu> Date: Fri, 9 Sep 2016 23:41:35 -0400 Subject: [PATCH] sPuReMD: unify OpenMP constructs across GMRES for performance. --- sPuReMD/src/GMRES.c | 963 ++++++++++++++++++++++++------------------- sPuReMD/src/GMRES.h | 4 +- sPuReMD/src/QEq.c | 14 +- sPuReMD/src/vector.c | 56 ++- 4 files changed, 593 insertions(+), 444 deletions(-) diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c index 9e6f83b3..107dc959 100644 --- a/sPuReMD/src/GMRES.c +++ b/sPuReMD/src/GMRES.c @@ -33,95 +33,108 @@ typedef enum } TRIANGULARITY; +/* global to make OpenMP shared (Sparse_MatVec) */ +#ifdef _OPENMP +real *b_local = NULL; +#endif +/* global to make OpenMP shared (apply_preconditioner) */ +real *Dinv_L = NULL, *Dinv_U = NULL; +/* global to make OpenMP shared (tri_solve_level_sched) */ +int levels = 1; +int levels_L = 1, levels_U = 1; +unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL; +unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL; +unsigned int *row_levels, *level_rows, *level_rows_cnt; +unsigned int *top = NULL; +/* global to make OpenMP shared (jacobi_iter) */ +real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL; + + /* sparse matrix-vector product Ax=b * where: * A: lower triangular matrix * x: vector * b: vector (result) */ static void Sparse_MatVec( const sparse_matrix * const A, - const real * const x, real * const b ) + const real * const x, real * const b ) { int i, j, k, n, si, ei; real H; #ifdef _OPENMP - static real *b_local; unsigned int tid; #endif n = A->n; Vector_MakeZero( b, n ); - #pragma omp parallel \ - default(none) shared(n, b_local) private(si, ei, H, i, j, k, tid) - { #ifdef _OPENMP - tid = omp_get_thread_num(); + tid = omp_get_thread_num(); - #pragma omp master + #pragma omp master + { + + /* keep b_local for program duration to avoid allocate/free + * overhead per Sparse_MatVec call*/ + if ( b_local == NULL ) { - /* keep b_local for program duration to avoid allocate/free - * overhead per Sparse_MatVec call*/ - if ( b_local == NULL ) + if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL ) { - if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL ) - { - exit( INSUFFICIENT_MEMORY ); - } + exit( INSUFFICIENT_MEMORY ); } - - Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n ); } - #pragma omp barrier + } + + #pragma omp barrier + + Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n ); #endif - #pragma omp for schedule(static) - for ( i = 0; i < n; ++i ) - { - si = A->start[i]; - ei = A->start[i + 1] - 1; + #pragma omp for schedule(static) + for ( i = 0; i < n; ++i ) + { + si = A->start[i]; + ei = A->start[i + 1] - 1; - for ( k = si; k < ei; ++k ) - { - j = A->j[k]; - H = A->val[k]; + for ( k = si; k < ei; ++k ) + { + j = A->j[k]; + H = A->val[k]; #ifdef _OPENMP - b_local[tid * n + j] += H * x[i]; - b_local[tid * n + i] += H * x[j]; + b_local[tid * n + j] += H * x[i]; + b_local[tid * n + i] += H * x[j]; #else - b[j] += H * x[i]; - b[i] += H * x[j]; + b[j] += H * x[i]; + b[i] += H * x[j]; #endif - } + } - // the diagonal entry is the last one in + // the diagonal entry is the last one in #ifdef _OPENMP - b_local[tid * n + i] += A->val[k] * x[i]; + b_local[tid * n + i] += A->val[k] * x[i]; #else - b[i] += A->val[k] * x[i]; + b[i] += A->val[k] * x[i]; #endif - } + } #ifdef _OPENMP - #pragma omp for schedule(static) - for ( i = 0; i < n; ++i ) + #pragma omp for schedule(static) + for ( i = 0; i < n; ++i ) + { + for ( j = 0; j < omp_get_num_threads(); ++j ) { - for ( j = 0; j < omp_get_num_threads(); ++j ) - { - b[i] += b_local[j * n + i]; - } + b[i] += b_local[j * n + i]; } -#endif } +#endif } static void diag_pre_app( const real * const Hdia_inv, const real * const y, - real * const x, const int N ) + real * const x, const int N ) { unsigned int i; - #pragma omp parallel for schedule(static) \ - default(none) private(i) + #pragma omp for schedule(static) for ( i = 0; i < N; ++i ) { x[i] = y[i] * Hdia_inv[i]; @@ -140,41 +153,44 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y, * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ static void tri_solve( const sparse_matrix * const LU, const real * const y, - real * const x, const TRIANGULARITY tri ) + real * const x, const TRIANGULARITY tri ) { int i, pj, j, si, ei; real val; - if ( tri == LOWER ) + #pragma omp master { - for ( i = 0; i < LU->n; ++i ) + if ( tri == LOWER ) { - x[i] = y[i]; - si = LU->start[i]; - ei = LU->start[i + 1]; - for ( pj = si; pj < ei - 1; ++pj ) + for ( i = 0; i < LU->n; ++i ) { - j = LU->j[pj]; - val = LU->val[pj]; - x[i] -= val * x[j]; + x[i] = y[i]; + si = LU->start[i]; + ei = LU->start[i + 1]; + for ( pj = si; pj < ei - 1; ++pj ) + { + j = LU->j[pj]; + val = LU->val[pj]; + x[i] -= val * x[j]; + } + x[i] /= LU->val[pj]; } - x[i] /= LU->val[pj]; } - } - else - { - for ( i = LU->n - 1; i >= 0; --i ) + else { - x[i] = y[i]; - si = LU->start[i]; - ei = LU->start[i + 1]; - for ( pj = si + 1; pj < ei; ++pj ) + for ( i = LU->n - 1; i >= 0; --i ) { - j = LU->j[pj]; - val = LU->val[pj]; - x[i] -= val * x[j]; + x[i] = y[i]; + si = LU->start[i]; + ei = LU->start[i + 1]; + for ( pj = si + 1; pj < ei; ++pj ) + { + j = LU->j[pj]; + val = LU->val[pj]; + x[i] -= val * x[j]; + } + x[i] /= LU->val[si]; } - x[i] /= LU->val[si]; } } } @@ -192,162 +208,163 @@ static void tri_solve( const sparse_matrix * const LU, const real * const y, * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y, - real * const x, const TRIANGULARITY tri, int find_levels ) + real * const x, const TRIANGULARITY tri, int find_levels ) { - int i, j, pj, local_row, local_level, levels; - static int levels_L = 1, levels_U = 1; - static unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL; - static unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL; - static unsigned int *top = NULL; - unsigned int *row_levels, *level_rows, *level_rows_cnt; + int i, j, pj, local_row, local_level; - if ( tri == LOWER ) - { - row_levels = row_levels_L; - level_rows = level_rows_L; - level_rows_cnt = level_rows_cnt_L; - levels = levels_L; - } - else + #pragma omp master { - row_levels = row_levels_U; - level_rows = level_rows_U; - level_rows_cnt = level_rows_cnt_U; - levels = levels_U; - } - - if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL ) - { - if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL - || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL - || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL ) + if ( tri == LOWER ) { - fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); + row_levels = row_levels_L; + level_rows = level_rows_L; + level_rows_cnt = level_rows_cnt_L; + levels = levels_L; + } + else + { + row_levels = row_levels_U; + level_rows = level_rows_U; + level_rows_cnt = level_rows_cnt_U; + levels = levels_U; } - } - if ( top == NULL ) - { - if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL ) + if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL ) { - fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); + if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL + || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL + || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL ) + { + fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } } - } - /* find levels (row dependencies in substitutions) */ - if ( find_levels ) - { - memset( row_levels, 0, LU->n * sizeof(unsigned int) ); - memset( level_rows_cnt, 0, LU->n * sizeof(unsigned int) ); - memset( top, 0, LU->n * sizeof(unsigned int) ); + if ( top == NULL ) + { + if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL ) + { + fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } - if ( tri == LOWER ) + /* find levels (row dependencies in substitutions) */ + if ( find_levels ) { - for ( i = 0; i < LU->n; ++i ) + memset( row_levels, 0, LU->n * sizeof(unsigned int) ); + memset( level_rows_cnt, 0, LU->n * sizeof(unsigned int) ); + memset( top, 0, LU->n * sizeof(unsigned int) ); + levels = 1; + + if ( tri == LOWER ) { - local_level = 1; - for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj ) + for ( i = 0; i < LU->n; ++i ) { - local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 ); + local_level = 1; + for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj ) + { + local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 ); + } + + levels = MAX( levels, local_level ); + row_levels[i] = local_level; + ++level_rows_cnt[local_level]; } - levels = MAX( levels, local_level ); - row_levels[i] = local_level; - ++level_rows_cnt[local_level]; + fprintf(stderr, "levels(L): %d\n", levels); + fprintf(stderr, "NNZ(L): %d\n", LU->start[LU->n]); } - - printf("levels(L): %d\n", levels); - printf("NNZ(L): %d\n", LU->start[LU->n]); - } - else - { - for ( i = LU->n - 1; i >= 0; --i ) + else { - local_level = 1; - for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj ) + for ( i = LU->n - 1; i >= 0; --i ) { - local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 ); + local_level = 1; + for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj ) + { + local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 ); + } + + levels = MAX( levels, local_level ); + row_levels[i] = local_level; + ++level_rows_cnt[local_level]; } - levels = MAX( levels, local_level ); - row_levels[i] = local_level; - ++level_rows_cnt[local_level]; + fprintf(stderr, "levels(U): %d\n", levels); + fprintf(stderr, "NNZ(U): %d\n", LU->start[LU->n]); } - printf("levels(U): %d\n", levels); - printf("NNZ(U): %d\n", LU->start[LU->n]); - } - - for ( i = 1; i < levels + 1; ++i ) - { - level_rows_cnt[i] += level_rows_cnt[i - 1]; - top[i] = level_rows_cnt[i]; - } + for ( i = 1; i < levels + 1; ++i ) + { + level_rows_cnt[i] += level_rows_cnt[i - 1]; + top[i] = level_rows_cnt[i]; + } - for ( i = 0; i < LU->n; ++i ) - { - level_rows[top[row_levels[i] - 1]] = i; - ++top[row_levels[i] - 1]; + for ( i = 0; i < LU->n; ++i ) + { + level_rows[top[row_levels[i] - 1]] = i; + ++top[row_levels[i] - 1]; + } } } + #pragma omp barrier + /* perform substitutions by level */ - #pragma omp parallel default(none) private(i, j, pj, local_row) shared(levels, level_rows_cnt, level_rows) + if ( tri == LOWER ) { - if ( tri == LOWER ) + for ( i = 0; i < levels; ++i ) { - for ( i = 0; i < levels; ++i ) + #pragma omp for schedule(static) + for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) { - #pragma omp for schedule(static) - for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) + local_row = level_rows[j]; + x[local_row] = y[local_row]; + for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj ) { - local_row = level_rows[j]; - x[local_row] = y[local_row]; - for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj ) - { - x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + x[local_row] -= LU->val[pj] * x[LU->j[pj]]; - } - x[local_row] /= LU->val[pj]; } + x[local_row] /= LU->val[pj]; } } - else + } + else + { + for ( i = 0; i < levels; ++i ) { - for ( i = 0; i < levels; ++i ) + #pragma omp for schedule(static) + for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) { - #pragma omp for schedule(static) - for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) + local_row = level_rows[j]; + x[local_row] = y[local_row]; + for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj ) { - local_row = level_rows[j]; - x[local_row] = y[local_row]; - for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj ) - { - x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + x[local_row] -= LU->val[pj] * x[LU->j[pj]]; - } - x[local_row] /= LU->val[LU->start[local_row]]; } + x[local_row] /= LU->val[LU->start[local_row]]; } } } - /* save level info for re-use if performing repeated triangular solves via preconditioning */ - if ( tri == LOWER ) + #pragma omp master { - row_levels_L = row_levels; - level_rows_L = level_rows; - level_rows_cnt_L = level_rows_cnt; - levels_L = levels; - } - else - { - row_levels_U = row_levels; - level_rows_U = level_rows; - level_rows_cnt_U = level_rows_cnt; - levels_U = levels; + /* save level info for re-use if performing repeated triangular solves via preconditioning */ + if ( tri == LOWER ) + { + row_levels_L = row_levels; + level_rows_L = level_rows; + level_rows_cnt_L = level_rows_cnt; + levels_L = levels; + } + else + { + row_levels_U = row_levels; + level_rows_U = level_rows; + level_rows_cnt_U = level_rows_cnt; + levels_U = levels; + } } } @@ -364,103 +381,93 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real * * Note: Newmann series arises from series expansion of the inverse of * the coefficient matrix in the triangular system */ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, - const real * const b, real * const x, const TRIANGULARITY tri, - const unsigned int maxiter ) + const real * const b, real * const x, const TRIANGULARITY tri, + const unsigned int maxiter ) { unsigned int i, k, si = 0, ei = 0, iter; -#ifdef _OPENMP - unsigned int tid; -#endif - static real *Dinv_b, *rp, *rp2, *rp3; - #pragma omp parallel \ - default(none) shared(Dinv_b, rp, rp2, rp3, stderr) private(si, ei, i, k, iter, tid) - { -#ifdef _OPENMP - tid = omp_get_thread_num(); -#endif - iter = 0; + iter = 0; - #pragma omp master + #pragma omp master + { + if ( Dinv_b == NULL ) { - if ( Dinv_b == NULL ) + if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL ) { - if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } - if ( rp == NULL ) + } + if ( rp == NULL ) + { + if ( (rp = (real*) malloc(sizeof(real) * R->n)) == NULL ) { - if ( (rp = (real*) malloc(sizeof(real) * R->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } - if ( rp2 == NULL ) + } + if ( rp2 == NULL ) + { + if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL ) { - if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } - - Vector_MakeZero( rp, R->n ); } + } - #pragma omp barrier + #pragma omp barrier - /* precompute and cache, as invariant in loop below */ - #pragma omp for schedule(static) - for ( i = 0; i < R->n; ++i ) - { - Dinv_b[i] = Dinv[i] * b[i]; - } + Vector_MakeZero( rp, R->n ); - do - { - #pragma omp barrier + /* precompute and cache, as invariant in loop below */ + #pragma omp for schedule(static) + for ( i = 0; i < R->n; ++i ) + { + Dinv_b[i] = Dinv[i] * b[i]; + } - // x_{k+1} = G*x_{k} + Dinv*b; - #pragma omp for schedule(guided) - for ( i = 0; i < R->n; ++i ) + do + { + // x_{k+1} = G*x_{k} + Dinv*b; + #pragma omp for schedule(guided) + for ( i = 0; i < R->n; ++i ) + { + if (tri == LOWER) + { + si = R->start[i]; + ei = R->start[i + 1] - 1; + } + else { - if (tri == LOWER) - { - si = R->start[i]; - ei = R->start[i + 1] - 1; - } - else - { - - si = R->start[i] + 1; - ei = R->start[i + 1]; - } - - rp2[i] = 0.; - - for ( k = si; k < ei; ++k ) - { - rp2[i] += R->val[k] * rp[R->j[k]]; - } - rp2[i] *= -Dinv[i]; - rp2[i] += Dinv_b[i]; + si = R->start[i] + 1; + ei = R->start[i + 1]; } - #pragma omp master + rp2[i] = 0.; + + for ( k = si; k < ei; ++k ) { - rp3 = rp; - rp = rp2; - rp2 = rp3; + rp2[i] += R->val[k] * rp[R->j[k]]; } - ++iter; - } while ( iter < maxiter ); + rp2[i] *= -Dinv[i]; + rp2[i] += Dinv_b[i]; + } + + #pragma omp master + { + rp3 = rp; + rp = rp2; + rp2 = rp3; + } + + #pragma omp barrier + + ++iter; } + while ( iter < maxiter ); Vector_Copy( x, rp, R->n ); } @@ -477,11 +484,10 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, * Assumptions: * Matrices have non-zero diagonals * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ -void apply_preconditioner( const static_storage * const workspace, const control_params * const control, - const real * const y, real * const x, const int fresh_pre ) +static void apply_preconditioner( const static_storage * const workspace, const control_params * const control, + const real * const y, real * const x, const int fresh_pre ) { int i, si; - static real *Dinv_L = NULL, *Dinv_U = NULL; switch ( control->pre_app_type ) { @@ -533,18 +539,24 @@ void apply_preconditioner( const static_storage * const workspace, const control case ICHOLT_PC: case ILU_PAR_PC: case ILUT_PAR_PC: - if ( Dinv_L == NULL ) + #pragma omp master { - if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) + if ( Dinv_L == NULL ) { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); + if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } } } + #pragma omp barrier + /* construct D^{-1}_L */ if ( fresh_pre ) { + #pragma omp for schedule(static) for ( i = 0; i < workspace->L->n; ++i ) { si = workspace->L->start[i + 1] - 1; @@ -554,18 +566,24 @@ void apply_preconditioner( const static_storage * const workspace, const control jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters ); - if ( Dinv_U == NULL ) + #pragma omp master { - if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) + if ( Dinv_U == NULL ) { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); + if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } } } + #pragma omp barrier + /* construct D^{-1}_U */ if ( fresh_pre ) { + #pragma omp for schedule(static) for ( i = 0; i < workspace->U->n; ++i ) { si = workspace->U->start[i]; @@ -574,7 +592,7 @@ void apply_preconditioner( const static_storage * const workspace, const control } jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->pre_app_jacobi_iters ); - break; + break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -594,208 +612,317 @@ void apply_preconditioner( const static_storage * const workspace, const control /* generalized minimual residual iterative solver for sparse linear systems */ int GMRES( const static_storage * const workspace, const control_params * const control, - simulation_data * const data, const sparse_matrix * const H, - const real * const b, real tol, real * const x, - const FILE * const fout, const int fresh_pre ) + simulation_data * const data, const sparse_matrix * const H, + const real * const b, const real tol, real * const x, + const FILE * const fout, const int fresh_pre ) { - int i, j, k, itr, N, si; - real cc, tmp1, tmp2, temp, bnorm, time_start; + int i, j, k, itr, N, g_j, g_itr; + real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start; N = H->n; - time_start = Get_Time( ); - bnorm = Norm( b, N ); - data->timing.solver_vector_ops += Get_Timing_Info( time_start ); - - if ( control->pre_comp_type == DIAG_PC ) + #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \ + shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr) { - /* apply preconditioner to RHS */ - time_start = Get_Time( ); - apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre ); - data->timing.pre_app += Get_Timing_Info( time_start ); - } - - /* GMRES outer-loop */ - for ( itr = 0; itr < MAX_ITR; ++itr ) - { - /* calculate r0 */ - time_start = Get_Time( ); - Sparse_MatVec( H, x, workspace->b_prm ); - data->timing.solver_spmv += Get_Timing_Info( time_start ); - - if ( control->pre_comp_type == DIAG_PC ) - { - time_start = Get_Time( ); - apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, fresh_pre ); - data->timing.pre_app += Get_Timing_Info( time_start ); - } - - if ( control->pre_comp_type == DIAG_PC ) + #pragma omp master { time_start = Get_Time( ); - Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); - data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } - else + bnorm = Norm( b, N ); + #pragma omp master { - time_start = Get_Time( ); - Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } - if ( control->pre_comp_type != DIAG_PC ) + if ( control->pre_comp_type == DIAG_PC ) { - time_start = Get_Time( ); - apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0], - itr == 0 ? fresh_pre : 0 ); - data->timing.pre_app += Get_Timing_Info( time_start ); + /* apply preconditioner to RHS */ + #pragma omp master + { + time_start = Get_Time( ); + } + apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre ); + #pragma omp master + { + data->timing.pre_app += Get_Timing_Info( time_start ); + } } - time_start = Get_Time( ); - workspace->g[0] = Norm( workspace->v[0], N ); - Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N ); - data->timing.solver_vector_ops += Get_Timing_Info( time_start ); - //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); - - /* GMRES inner-loop */ - for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) + /* GMRES outer-loop */ + for ( itr = 0; itr < MAX_ITR; ++itr ) { - /* matvec */ - time_start = Get_Time( ); - Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); - data->timing.solver_spmv += Get_Timing_Info( time_start ); + /* calculate r0 */ + #pragma omp master + { + time_start = Get_Time( ); + } + Sparse_MatVec( H, x, workspace->b_prm ); + #pragma omp master + { + data->timing.solver_spmv += Get_Timing_Info( time_start ); + } - time_start = Get_Time( ); - apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], 0 ); - data->timing.pre_app += Get_Timing_Info( time_start ); + if ( control->pre_comp_type == DIAG_PC ) + { + #pragma omp master + { + time_start = Get_Time( ); + } + apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, fresh_pre ); + #pragma omp master + { + data->timing.pre_app += Get_Timing_Info( time_start ); + } + } if ( control->pre_comp_type == DIAG_PC ) { - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - time_start = Get_Time( ); - for ( i = 0; i <= j; i++ ) + #pragma omp master { - workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N ); - Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); + time_start = Get_Time( ); + } + Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + #pragma omp master + { + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } - data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } else { - //TODO: investigate correctness of not explicitly orthogonalizing first few vectors - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - time_start = Get_Time( ); - for ( i = 0; i < j - 1; i++ ) + #pragma omp master + { + time_start = Get_Time( ); + } + Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); + #pragma omp master { - workspace->h[i][j] = 0; + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } + } - for ( i = MAX(j - 1, 0); i <= j; i++ ) + if ( control->pre_comp_type != DIAG_PC ) + { + #pragma omp master { - workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N ); - Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); + time_start = Get_Time( ); } + apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0], + itr == 0 ? fresh_pre : 0 ); + #pragma omp master + { + data->timing.pre_app += Get_Timing_Info( time_start ); + } + } + + #pragma omp master + { + time_start = Get_Time( ); + } + ret_temp = Norm( workspace->v[0], N ); + #pragma omp single + { + workspace->g[0] = ret_temp; + } + Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N ); + #pragma omp master + { data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } - time_start = Get_Time( ); - workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N ); - Vector_Scale( workspace->v[j + 1], - 1. / workspace->h[j + 1][j], workspace->v[j + 1], N ); - data->timing.solver_vector_ops += Get_Timing_Info( time_start ); + /* GMRES inner-loop */ + for ( j = 0; j < RESTART && FABS(workspace->g[j]) / bnorm > tol; j++ ) + { + /* matvec */ + #pragma omp master + { + time_start = Get_Time( ); + } + Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); + #pragma omp master + { + data->timing.solver_spmv += Get_Timing_Info( time_start ); + } + + #pragma omp master + { + time_start = Get_Time( ); + } + apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], 0 ); + #pragma omp master + { + data->timing.pre_app += Get_Timing_Info( time_start ); + } + + if ( control->pre_comp_type == DIAG_PC ) + { + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + #pragma omp master + { + time_start = Get_Time( ); + } + for ( i = 0; i <= j; i++ ) + { + workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N ); + Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); + } + #pragma omp master + { + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); + } + } + else + { + //TODO: investigate correctness of not explicitly orthogonalizing first few vectors + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + #pragma omp master + { + time_start = Get_Time( ); + for ( i = 0; i < j - 1; i++ ) + { + workspace->h[i][j] = 0; + } + } + + for ( i = MAX(j - 1, 0); i <= j; i++ ) + { + ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); + #pragma omp single + { + workspace->h[i][j] = ret_temp; + } + Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); + } + #pragma omp master + { + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); + } + } + + #pragma omp master + { + time_start = Get_Time( ); + } + ret_temp = Norm( workspace->v[j + 1], N ); + #pragma omp single + { + workspace->h[j + 1][j] = ret_temp; + } + Vector_Scale( workspace->v[j + 1], + 1. / workspace->h[j + 1][j], workspace->v[j + 1], N ); + #pragma omp master + { + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); + } #if defined(DEBUG) - fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); + fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); #endif - time_start = Get_Time( ); - if ( control->pre_comp_type == DIAG_PC ) - { - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for ( i = 0; i <= j; i++ ) + #pragma omp master { - if ( i == j ) + time_start = Get_Time( ); + if ( control->pre_comp_type == DIAG_PC ) { - cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); - workspace->hc[j] = workspace->h[j][j] / cc; - workspace->hs[j] = workspace->h[j + 1][j] / cc; + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for ( i = 0; i <= j; i++ ) + { + if ( i == j ) + { + cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); + workspace->hc[j] = workspace->h[j][j] / cc; + workspace->hs[j] = workspace->h[j + 1][j] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[i][j] + + workspace->hs[i] * workspace->h[i + 1][j]; + tmp2 = -workspace->hs[i] * workspace->h[i][j] + + workspace->hc[i] * workspace->h[i + 1][j]; + + workspace->h[i][j] = tmp1; + workspace->h[i + 1][j] = tmp2; + } + } + else + { + //TODO: investigate correctness of not explicitly orthogonalizing first few vectors + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for ( i = MAX(j - 1, 0); i <= j; i++ ) + { + if ( i == j ) + { + cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); + workspace->hc[j] = workspace->h[j][j] / cc; + workspace->hs[j] = workspace->h[j + 1][j] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[i][j] + + workspace->hs[i] * workspace->h[i + 1][j]; + tmp2 = -workspace->hs[i] * workspace->h[i][j] + + workspace->hc[i] * workspace->h[i + 1][j]; + + workspace->h[i][j] = tmp1; + workspace->h[i + 1][j] = tmp2; + } } - tmp1 = workspace->hc[i] * workspace->h[i][j] + - workspace->hs[i] * workspace->h[i + 1][j]; - tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; - - workspace->h[i][j] = tmp1; - workspace->h[i + 1][j] = tmp2; + /* apply Givens rotations to the rhs as well */ + tmp1 = workspace->hc[j] * workspace->g[j]; + tmp2 = -workspace->hs[j] * workspace->g[j]; + workspace->g[j] = tmp1; + workspace->g[j + 1] = tmp2; + data->timing.solver_orthog += Get_Timing_Info( time_start ); } + + #pragma omp barrier + + //fprintf( stderr, "h: " ); + //for( i = 0; i <= j+1; ++i ) + //fprintf( stderr, "%.6f ", workspace->h[i][j] ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); } - else + + /* solve Hy = g: H is now upper-triangular, do back-substitution */ + #pragma omp master { - //TODO: investigate correctness of not explicitly orthogonalizing first few vectors - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for ( i = MAX(j - 1, 0); i <= j; i++ ) + time_start = Get_Time( ); + for ( i = j - 1; i >= 0; i-- ) { - if ( i == j ) + temp = workspace->g[i]; + for ( k = j - 1; k > i; k-- ) { - cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); - workspace->hc[j] = workspace->h[j][j] / cc; - workspace->hs[j] = workspace->h[j + 1][j] / cc; + temp -= workspace->h[i][k] * workspace->y[k]; } - tmp1 = workspace->hc[i] * workspace->h[i][j] + - workspace->hs[i] * workspace->h[i + 1][j]; - tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; - - workspace->h[i][j] = tmp1; - workspace->h[i + 1][j] = tmp2; + workspace->y[i] = temp / workspace->h[i][i]; } - } - - /* apply Givens rotations to the rhs as well */ - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j + 1] = tmp2; - data->timing.solver_orthog += Get_Timing_Info( time_start ); - - //fprintf( stderr, "h: " ); - //for( i = 0; i <= j+1; ++i ) - //fprintf( stderr, "%.6f ", workspace->h[i][j] ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); - } + data->timing.solver_tri_solve += Get_Timing_Info( time_start ); - /* TODO: solve using Jacobi iteration? */ - /* solve Hy = g: H is now upper-triangular, do back-substitution */ - time_start = Get_Time( ); - for ( i = j - 1; i >= 0; i-- ) - { - temp = workspace->g[i]; - for ( k = j - 1; k > i; k-- ) + /* update x = x_0 + Vy */ + time_start = Get_Time( ); + } + Vector_MakeZero( workspace->p, N ); + for ( i = 0; i < j; i++ ) { - temp -= workspace->h[i][k] * workspace->y[k]; + Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N ); } - workspace->y[i] = temp / workspace->h[i][i]; - } - data->timing.solver_tri_solve += Get_Timing_Info( time_start ); + Vector_Add( x, 1., workspace->p, N ); + #pragma omp master + { + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); + } - /* update x = x_0 + Vy */ - time_start = Get_Time( ); - Vector_MakeZero( workspace->p, N ); - for ( i = 0; i < j; i++ ) - { - Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N ); + /* stopping condition */ + if ( FABS(workspace->g[j]) / bnorm <= tol ) + { + break; + } } - Vector_Add( x, 1., workspace->p, N ); - data->timing.solver_vector_ops += Get_Timing_Info( time_start ); - - /* stopping condition */ - if ( fabs(workspace->g[j]) / bnorm <= tol ) + #pragma omp master { - break; + g_itr = itr; + g_j = j; } } @@ -811,21 +938,21 @@ int GMRES( const static_storage * const workspace, const control_params * const // itr, j, fabs( workspace->g[j] ) / bnorm ); // data->timing.solver_iters += itr * RESTART + j; - if ( itr >= MAX_ITR ) + if ( g_itr >= MAX_ITR ) { fprintf( stderr, "GMRES convergence failed\n" ); // return -1; - return itr * (RESTART + 1) + j + 1; + return g_itr * (RESTART + 1) + g_j + 1; } - return itr * (RESTART + 1) + j + 1; + return g_itr * (RESTART + 1) + g_j + 1; } int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control, - simulation_data * const data, const sparse_matrix * const H, - const real * const b, real tol, real * const x, - const FILE * const fout, const int fresh_pre ) + simulation_data * const data, const sparse_matrix * const H, + const real * const b, real tol, real * const x, + const FILE * const fout, const int fresh_pre ) { int i, j, k, itr, N; real cc, tmp1, tmp2, temp, bnorm; @@ -1015,7 +1142,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, const control_par /* Preconditioned Conjugate Gradient */ int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout ) + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout ) { int i, N; real tmp, alpha, beta, b_norm, r_norm; @@ -1131,7 +1258,7 @@ int CG( static_storage *workspace, sparse_matrix *H, /* Steepest Descent */ int SDM( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout ) + real *b, real tol, real *x, FILE *fout ) { int i, j, N; real tmp, alpha, beta, b_norm; diff --git a/sPuReMD/src/GMRES.h b/sPuReMD/src/GMRES.h index 77fa3e6a..37aed4bc 100644 --- a/sPuReMD/src/GMRES.h +++ b/sPuReMD/src/GMRES.h @@ -26,12 +26,12 @@ int GMRES( const static_storage * const, const control_params * const, simulation_data * const, const sparse_matrix * const, - const real * const, real, real * const, + const real * const, const real, real * const, const FILE * const, const int ); int GMRES_HouseHolder( const static_storage * const, const control_params * const, simulation_data * const, const sparse_matrix * const, - const real * const, real, real * const, + const real * const, const real, real * const, const FILE * const, const int ); int CG( static_storage*, sparse_matrix*, diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c index ddf0000d..d9f30b5d 100644 --- a/sPuReMD/src/QEq.c +++ b/sPuReMD/src/QEq.c @@ -621,15 +621,9 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, // clear variables Ltop = 0; tmptop = 0; - for ( i = 0; i <= A->n; ++i ) - { - L->start[i] = U->start[i] = 0; - } - - for ( i = 0; i < A->n; ++i ) - { - Utop[i] = 0; - } + memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) ); + memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) ); + memset( Utop, 0, A->n * sizeof(unsigned int) ); //fprintf( stderr, "n: %d\n", A->n ); for ( i = 0; i < A->n; ++i ) @@ -674,7 +668,6 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, //fprintf( stderr, " -- done\n" ); } - // compute the ith diagonal in L // sanity check if ( A->j[pj] != i ) { @@ -682,6 +675,7 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, exit( NUMERIC_BREAKDOWN ); } + // compute the ith diagonal in L val = A->val[pj]; for ( k1 = 0; k1 < tmptop; ++k1 ) { diff --git a/sPuReMD/src/vector.c b/sPuReMD/src/vector.c index 1525d94d..ba4a5ea3 100644 --- a/sPuReMD/src/vector.c +++ b/sPuReMD/src/vector.c @@ -22,14 +22,27 @@ #include "vector.h" +/* global to make OpenMP shared (Vector_isZero) */ +unsigned int ret; +/* global to make OpenMP shared (Dot, Norm) */ +real ret2; + + inline int Vector_isZero( const real * const v, const unsigned int k ) { - unsigned int i, ret = TRUE; + unsigned int i; - #pragma omp parallel for default(none) private(i) reduction(&&: ret) schedule(static) + #pragma omp master + { + ret = TRUE; + } + + #pragma omp barrier + + #pragma omp for reduction(&&: ret) schedule(static) for ( i = 0; i < k; ++i ) { - if ( fabs( v[i] ) > ALMOST_ZERO ) + if ( FABS( v[i] ) > ALMOST_ZERO ) { ret = FALSE; } @@ -43,6 +56,7 @@ inline void Vector_MakeZero( real * const v, const unsigned int k ) { unsigned int i; + #pragma omp for schedule(static) for ( i = 0; i < k; ++i ) { v[i] = ZERO; @@ -54,6 +68,7 @@ inline void Vector_Copy( real * const dest, const real * const v, const unsigned { unsigned int i; + #pragma omp for schedule(static) for ( i = 0; i < k; ++i ) { dest[i] = v[i]; @@ -65,7 +80,7 @@ inline void Vector_Scale( real * const dest, const real c, const real * const v, { unsigned int i; - #pragma omp parallel for default(none) private(i) schedule(static) + #pragma omp for schedule(static) for ( i = 0; i < k; ++i ) { dest[i] = c * v[i]; @@ -78,7 +93,7 @@ inline void Vector_Sum( real * const dest, const real c, const real * const v, c { unsigned int i; - #pragma omp parallel for default(none) private(i) schedule(static) + #pragma omp for schedule(static) for ( i = 0; i < k; ++i ) { dest[i] = c * v[i] + d * y[i]; @@ -90,7 +105,7 @@ inline void Vector_Add( real * const dest, const real c, const real * const v, c { unsigned int i; - #pragma omp parallel for default(none) private(i) schedule(static) + #pragma omp for schedule(static) for ( i = 0; i < k; ++i ) { dest[i] += c * v[i]; @@ -114,31 +129,44 @@ void Vector_Print( FILE * const fout, const char * const vname, const real * con inline real Dot( const real * const v1, const real * const v2, const unsigned int k ) { - real ret = ZERO; unsigned int i; - #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(static) + #pragma omp master + { + ret2 = ZERO; + } + + #pragma omp barrier + + + #pragma omp for reduction(+: ret2) schedule(static) for ( i = 0; i < k; ++i ) { - ret += v1[i] * v2[i]; + ret2 += v1[i] * v2[i]; } - return ret; + return ret2; } inline real Norm( const real * const v1, const unsigned int k ) { - real ret = ZERO; unsigned int i; - #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(static) + #pragma omp master + { + ret2 = ZERO; + } + + #pragma omp barrier + + #pragma omp for reduction(+: ret2) schedule(static) for ( i = 0; i < k; ++i ) { - ret += SQR( v1[i] ); + ret2 += SQR( v1[i] ); } - return SQRT( ret ); + return SQRT( ret2 ); } -- GitLab