diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c index 0779b950bed98d995582da9735f89ec9666614c5..6f526aeb12fd56b9d266ffbf670f5f5533b3a78b 100644 --- a/sPuReMD/src/GMRES.c +++ b/sPuReMD/src/GMRES.c @@ -19,13 +19,11 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "allocate.h" #include "GMRES.h" +#include "allocate.h" #include "list.h" #include "vector.h" -#include <omp.h> - typedef enum { @@ -218,7 +216,6 @@ static void Forward_Subs( const sparse_matrix * const L, const real * const b, r ei = L->start[i + 1]; for ( pj = si; pj < ei - 1; ++pj ) { - // TODO: remove assignments? compiler optimizes away? j = L->j[pj]; val = L->val[pj]; y[i] -= val * y[j]; @@ -241,7 +238,6 @@ static void Backward_Subs( const sparse_matrix * const U, const real * const y, ei = U->start[i + 1]; for ( pj = si + 1; pj < ei; ++pj ) { - // TODO: remove assignments? compiler optimizes away? j = U->j[pj]; val = U->val[pj]; x[i] -= val * x[j]; @@ -251,6 +247,111 @@ static void Backward_Subs( const sparse_matrix * const U, const real * const y, } +/* Triangular solve LU*x = y using level scheduling + * + * LU: lower/upper triangular, stored in CSR + * y: constants in linear system (RHS) + * x: solution + * tri: triangularity of A (lower/upper) + * + * Assumptions: + * LU has non-zero diagonals + * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ +static void tri_solve_level_sched( sparse_matrix * const LU, const real * const y, + real * const x, const TRIANGULARITY tri ) +{ + int i, j, pj, local_row, local_level, levels = 1; + unsigned int *row_levels, *level_rows, *level_rows_cnt; + + if ( (row_levels = (unsigned int*) calloc(LU->n, sizeof(unsigned int))) == NULL + || (level_rows = (unsigned int*) malloc(LU->n * LU->n * sizeof(unsigned int))) == NULL + || (level_rows_cnt = (unsigned int*) calloc(LU->n, sizeof(unsigned int))) == NULL ) + { + fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* find levels (row dependencies in substitutions) */ + if ( tri == LOWER ) + { + for ( i = 0; i < LU->n; ++i ) + { + local_level = 0; + for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj ) + { + local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 ); + } + + levels = MAX( levels, local_level + 1 ); + row_levels[i] = local_level; + level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i; + ++level_rows_cnt[local_level]; + } + } + else + { + for ( i = LU->n - 1; i >= 0; --i ) + { + local_level = 0; + for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj ) + { + local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 ); + } + + levels = MAX( levels, local_level + 1 ); + row_levels[i] = local_level; + level_rows[local_level * LU->n + level_rows_cnt[local_level]] = i; + ++level_rows_cnt[local_level]; + } + } + + + /* perform substitutions by level */ + if ( tri == LOWER ) + { + for ( i = 0; i < levels; ++i ) + { + #pragma omp parallel for schedule(guided) \ + default(none) private(j, pj, local_row) shared(stderr, i, level_rows_cnt, level_rows) + for ( j = 0; j < level_rows_cnt[i]; ++j ) + { + local_row = level_rows[i * LU->n + j]; + x[local_row] = y[local_row]; + for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj ) + { + x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + + } + x[local_row] /= LU->val[pj]; + } + } + } + else + { + for ( i = 0; i < levels; ++i ) + { + #pragma omp parallel for schedule(guided) \ + default(none) private(j, pj, local_row) shared(i, level_rows_cnt, level_rows) + for ( j = 0; j < level_rows_cnt[i]; ++j ) + { + local_row = level_rows[i * LU->n + j]; + x[local_row] = y[local_row]; + for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj ) + { + x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + + } + x[local_row] /= LU->val[LU->start[local_row]]; + } + } + } + + free( level_rows_cnt ); + free( level_rows ); + free( row_levels ); +} + + /* Jacobi iteration using truncated Neumann series: x_{k+1} = Gx_k + D^{-1}b * where: * G = I - D^{-1}R diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c index 20e29b1844796da6d99903679d9da4b4256d0bcb..0355e68b0422af5a417b84b4958f8ff724f030d5 100644 --- a/sPuReMD/src/QEq.c +++ b/sPuReMD/src/QEq.c @@ -506,7 +506,7 @@ static real SuperLU_Factorize( const sparse_matrix * const A, #endif -/* Diagonal preconditioner */ +/* Diagonal (Jacobi) preconditioner */ static real diagonal_pre( const reax_system * const system, real * const Hdia_inv ) { unsigned int i;