Skip to content
Snippets Groups Projects
GMRES.c 53.1 KiB
Newer Older
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
/*----------------------------------------------------------------------
  SerialReax - Reax Force Field Simulator
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  published by the Free Software Foundation; either version 2 of
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  the License, or (at your option) any later version.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

#include "GMRES.h"
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#include "list.h"
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#include "vector.h"


/* global to make OpenMP shared (Sparse_MatVec) */
#ifdef _OPENMP
real *b_local = NULL;
#endif
/* global to make OpenMP shared (apply_preconditioner) */
real *Dinv_L = NULL, *Dinv_U = NULL;
/* global to make OpenMP shared (tri_solve_level_sched) */
int levels = 1;
int levels_L = 1, levels_U = 1;
unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
unsigned int *row_levels, *level_rows, *level_rows_cnt;
unsigned int *top = NULL;
/* global to make OpenMP shared (tri_solve_gc) */
unsigned int *color = NULL;
unsigned int *to_color = NULL;
unsigned int *recolor = NULL;
unsigned int recolor_cnt;
unsigned int *color_top = NULL;
unsigned int *permuted_row_col = NULL;
sparse_matrix *H_full;
/* global to make OpenMP shared (jacobi_iter) */
real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;


/* sparse matrix-vector product Ax=b
 * where:
 *   A: lower triangular matrix
 *   x: vector
 *   b: vector (result) */
static void Sparse_MatVec( const sparse_matrix * const A,
                           const real * const x, real * const b )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, j, k, n, si, ei;
    real H;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    n = A->n;
    Vector_MakeZero( b, n );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    #pragma omp master
    {

        /* keep b_local for program duration to avoid allocate/free
         * overhead per Sparse_MatVec call*/
        if ( b_local == NULL )
            if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL )
    }

    #pragma omp barrier

    Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n );
    #pragma omp for schedule(static)
    for ( i = 0; i < n; ++i )
    {
        si = A->start[i];
        ei = A->start[i + 1] - 1;
        for ( k = si; k < ei; ++k )
        {
            j = A->j[k];
            H = A->val[k];
            b_local[tid * n + j] += H * x[i];
            b_local[tid * n + i] += H * x[j];
            b[j] += H * x[i];
            b[i] += H * x[j];
        // the diagonal entry is the last one in
        b_local[tid * n + i] += A->val[k] * x[i];
    #pragma omp for schedule(static)
    for ( i = 0; i < n; ++i )
    {
        for ( j = 0; j < omp_get_num_threads(); ++j )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }
/* Transpose A and copy into A^T
 *
 * A: symmetric, lower triangular (half-format), stored in CSR
 * A_t: symmetric, upper triangular (half-format), stored in CSR
 *
 * Assumptions:
 *   A has non-zero diagonals
 *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t )
{
    unsigned int i, j, pj, *A_t_top;

    if ( (A_t_top = (unsigned int*) calloc( A->n + 1, sizeof(unsigned int))) == NULL )
    {
        fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );

    /* count nonzeros in each column of A^T */
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            ++A_t->start[A->j[pj] + 1];
        }
    }

    /* setup the row pointers for A^T */
    for ( i = 1; i <= A->n; ++i )
    {
        A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
    }

    /* fill in A^T */
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            j = A->j[pj];
            A_t->j[A_t_top[j]] = i;
            A_t->val[A_t_top[j]] = A->val[pj];
            ++A_t_top[j];
        }
    }

    free( A_t_top );
}


/* Transpose A in-place
 *
 * A: symmetric, lower triangular (half-format), stored in CSR
 *
 * Assumptions:
 *   A has non-zero diagonals
 *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
void Transpose_I( sparse_matrix * const A )
{
    sparse_matrix * A_t;

    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    Transpose( A, A_t );

    memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
    memcpy( A->j, A_t->j, sizeof(int) * (A_t->start[A_t->n]) );
    memcpy( A->val, A_t->val, sizeof(real) * (A_t->start[A_t->n]) );

    Deallocate_Matrix( A_t );
}


/* Apply diagonal inverse (Jacobi) preconditioner to system residual
 *
 * Hdia_inv: diagonal inverse preconditioner (constructed using H)
 * y: current residual
 * x: preconditioned residual
 * N: length of preconditioner and vectors (# rows in H)
 */
static void diag_pre_app( const real * const Hdia_inv, const real * const y,
                          real * const x, const int N )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    #pragma omp for schedule(static)
/* Solve triangular system LU*x = y using level scheduling
 *
 * LU: lower/upper triangular, stored in CSR
 * y: constants in linear system (RHS)
 * x: solution
 * tri: triangularity of LU (lower/upper)
 * Assumptions:
 *   LU has non-zero diagonals
 *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
static void tri_solve( const sparse_matrix * const LU, const real * const y,
                       real * const x, const TRIANGULARITY tri )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, pj, j, si, ei;
    real val;

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
                x[i] = y[i];
                si = LU->start[i];
                ei = LU->start[i + 1];
                for ( pj = si; pj < ei - 1; ++pj )
                {
                    j = LU->j[pj];
                    val = LU->val[pj];
                    x[i] -= val * x[j];
                }
                x[i] /= LU->val[pj];
            for ( i = LU->n - 1; i >= 0; --i )
                x[i] = y[i];
                si = LU->start[i];
                ei = LU->start[i + 1];
                for ( pj = si + 1; pj < ei; ++pj )
                {
                    j = LU->j[pj];
                    val = LU->val[pj];
                    x[i] -= val * x[j];
                }
                x[i] /= LU->val[si];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }
/* Solve triangular system LU*x = y using level scheduling
 *
 * LU: lower/upper triangular, stored in CSR
 * y: constants in linear system (RHS)
 * x: solution
 * tri: triangularity of LU (lower/upper)
 * find_levels: perform level search if positive, otherwise reuse existing levels
 * Assumptions:
 *   LU has non-zero diagonals
 *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
static void tri_solve_level_sched( const sparse_matrix * const LU, const real * const y,
                                   real * const x, const TRIANGULARITY tri, int find_levels )
    int i, j, pj, local_row, local_level;
            row_levels = row_levels_L;
            level_rows = level_rows_L;
            level_rows_cnt = level_rows_cnt_L;
            levels = levels_L;
        }
        else
        {
            row_levels = row_levels_U;
            level_rows = level_rows_U;
            level_rows_cnt = level_rows_cnt_U;
            levels = levels_U;
        if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
            if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
                    || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
                    || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
            {
                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
                exit( INSUFFICIENT_MEMORY );
            }
        if ( top == NULL )
        {
            if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
            {
                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
                exit( INSUFFICIENT_MEMORY );
            }
        }
        /* find levels (row dependencies in substitutions) */
        if ( find_levels )
            memset( row_levels, 0, LU->n * sizeof(unsigned int) );
            memset( level_rows_cnt, 0, LU->n * sizeof(unsigned int) );
            memset( top, 0, LU->n * sizeof(unsigned int) );
            levels = 1;

            if ( tri == LOWER )
                    local_level = 1;
                    for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
                    {
                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                    }

                    levels = MAX( levels, local_level );
                    row_levels[i] = local_level;
                    ++level_rows_cnt[local_level];
                fprintf(stderr, "levels(L): %d\n", levels);
                fprintf(stderr, "NNZ(L): %d\n", LU->start[LU->n]);
                for ( i = LU->n - 1; i >= 0; --i )
                    local_level = 1;
                    for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
                    {
                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                    }

                    levels = MAX( levels, local_level );
                    row_levels[i] = local_level;
                    ++level_rows_cnt[local_level];
                fprintf(stderr, "levels(U): %d\n", levels);
                fprintf(stderr, "NNZ(U): %d\n", LU->start[LU->n]);
            for ( i = 1; i < levels + 1; ++i )
            {
                level_rows_cnt[i] += level_rows_cnt[i - 1];
                top[i] = level_rows_cnt[i];
            }
            for ( i = 0; i < LU->n; ++i )
            {
                level_rows[top[row_levels[i] - 1]] = i;
                ++top[row_levels[i] - 1];
            }
    /* perform substitutions by level */
        for ( i = 0; i < levels; ++i )
            #pragma omp for schedule(static)
            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
                local_row = level_rows[j];
                x[local_row] = y[local_row];
                for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
            #pragma omp for schedule(static)
            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
                local_row = level_rows[j];
                x[local_row] = y[local_row];
                for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
                x[local_row] /= LU->val[LU->start[local_row]];
        /* save level info for re-use if performing repeated triangular solves via preconditioning */
        if ( tri == LOWER )
        {
            row_levels_L = row_levels;
            level_rows_L = level_rows;
            level_rows_cnt_L = level_rows_cnt;
            levels_L = levels;
        }
        else
        {
            row_levels_U = row_levels;
            level_rows_U = level_rows;
            level_rows_cnt_U = level_rows_cnt;
            levels_U = levels;
        }
static void compute_H_full( const sparse_matrix * const H )
{
    int count, i, j, pj;
    sparse_matrix *H_t;

    #pragma omp master
    {
        if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE )
        {
            fprintf( stderr, "not enough memory for full H. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }

        /* Set up the sparse matrix data structure for A. */
        Transpose( H, H_t );

        count = 0;
        for ( i = 0; i < H->n; ++i )
        {
            H_full->start[i] = count;
            for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
            {
                H_full->val[count] = H->val[pj];
                H_full->j[count] = H->j[pj];
                ++count;
            }
            for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
            {
                H_full->val[count] = H_t->val[pj];
                H_full->j[count] = H_t->j[pj];
                ++count;
            }
        }
        H_full->start[i] = count;

        Deallocate_Matrix( H_t );
    }

    #pragma omp barrier
}


static void graph_coloring()
{
#define MAX_COLOR (500)
    int i, pj, v;
    int fb_color[MAX_COLOR];

    #pragma omp master
    {
        memset( color, 0, sizeof(unsigned int) * H_full->n );
        recolor_cnt = H_full->n;
        for ( i = 0; i < H_full->n; ++i )
        {
            to_color[i] = i;
        }
    }
    memset( fb_color, -1, sizeof(unsigned int) * MAX_COLOR );
    #pragma omp barrier

    while ( recolor_cnt > 0 )
    {
        #pragma omp for schedule(static)
        for ( i = 0; i < H_full->n; ++i )
        {
            v = to_color[i];

            for ( pj = H_full->start[v]; pj < H_full->start[v + 1]; ++pj )
            {
                fb_color[color[H_full->j[pj]]] = v;

            }

            for ( pj = 1; fb_color[pj] == v; ++pj );

            color[v] = pj;
        }

        #pragma omp for schedule(static)
        for ( i = 0; i < H_full->n; ++i )
        {
            v = to_color[i];
            recolor[i] = FALSE;

            for ( pj = H_full->start[v]; pj < H_full->start[v + 1]; ++pj )
            {
                if ( color[v] == color[H_full->j[pj]] && v > H_full->j[pj] )
                {
                    recolor[i] = TRUE;
                    break;
                }
            }

        }

        //TODO: switch to reduction on recolor_cnt (+) via parallel scan through recolor
        #pragma omp master
        {
            recolor_cnt = 0;
            for ( i = 0; i < H_full->n; ++i )
            {
                if ( recolor[i] == TRUE )
                {
                    to_color[recolor_cnt] = i;
                    color[i] = 0;
                    ++recolor_cnt;
                }

            }
        }

        #pragma omp barrier
    }
}


static void permute_factor( sparse_matrix * const LU, const TRIANGULARITY tri, const int find_mapping )
{
    unsigned int i, pj;
    sparse_matrix *LUtemp;

    #pragma omp master
    {
        memset( color_top, 0, sizeof(unsigned int) * (H_full->n + 1) );
        if ( Allocate_Matrix( &LUtemp, LU->n, LU->m ) == FAILURE )
        {
            fprintf( stderr, "Not enough space for graph coloring (factor permutation). Terminating...\n" );
            exit( INSUFFICIENT_MEMORY );
        }

        if ( find_mapping == TRUE )
        {
            for ( i = 0; i < H_full->n; ++i )
            {
                ++color_top[color[i]];
            }

            for ( i = 1; i < H_full->n + 1; ++i )
            {
                color_top[i] += color_top[i - 1];
            }

            for ( i = 0; i < H_full->n; ++i )
            {
                permuted_row_col[color_top[color[i] - 1]] = i;
                ++color_top[color[i] - 1];
            }

            /* invert mapping */
            memcpy( color_top, permuted_row_col, sizeof(unsigned int) * H_full->n );
            for ( i = 0; i < H_full->n; ++i )
            {
                permuted_row_col[color_top[i]] = i;
            }

        }

        memset( color_top, 0, sizeof(unsigned int) * (H_full->n + 1) );

        if ( tri == LOWER )
        {
            /* count nonzeros in each row of permuted factor */
            for ( i = 0; i < LU->n; ++i )
            {
                for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
                {
                    if ( permuted_row_col[i] < permuted_row_col[LU->j[pj]] )
                    {
                        ++color_top[permuted_row_col[i] + 1];
                    }
                    else
                    {
                        ++color_top[permuted_row_col[LU->j[pj]] + 1];
                    }
                }
            }

            for ( i = 1; i < LU->n + 1; ++i )
            {
                color_top[i] += color_top[i - 1];
            }

            memcpy( LUtemp->start, color_top, sizeof(unsigned int) * (LU->n + 1) );

            for ( i = 0; i < LU->n; ++i )
            {
                for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
                {
                    if ( permuted_row_col[i] < permuted_row_col[LU->j[pj]] )
                    {
                        LUtemp->j[color_top[permuted_row_col[i]]] = permuted_row_col[LU->j[pj]];
                        LUtemp->val[color_top[permuted_row_col[i]]] = LU->val[pj];
                        ++color_top[permuted_row_col[i]];
                    }
                    else
                    {
                        LUtemp->j[color_top[permuted_row_col[LU->j[pj]]]] = permuted_row_col[i];
                        LUtemp->val[color_top[permuted_row_col[LU->j[pj]]]] = LU->val[pj];
                        ++color_top[permuted_row_col[LU->j[pj]]];
                    }
                }
            }
        }

        memcpy( LU->start, LUtemp->start, sizeof(unsigned int) * (LU->n + 1) );
        memcpy( LU->j, LUtemp->j, sizeof(unsigned int) * LU->start[LU->n] );
        memcpy( LU->val, LUtemp->val, sizeof(real) * LU->start[LU->n] );

        Deallocate_Matrix( LUtemp );
    }

    #pragma omp barrier
}


/* Jacobi iteration using truncated Neumann series: x_{k+1} = Gx_k + D^{-1}b
 * where:
 *   G = I - D^{-1}R
 *   R = triangular matrix
 *   D = diagonal matrix, diagonals from R
 *
 * Note: used during the backsolves when applying preconditioners with
 * triangular factors in iterative linear solvers
 *
 * Note: Newmann series arises from series expansion of the inverse of
 * the coefficient matrix in the triangular system */
static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                         const real * const b, real * const x, const TRIANGULARITY tri,
                         const unsigned int maxiter )
    unsigned int i, k, si = 0, ei = 0, iter;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    #pragma omp master
    {
        if ( Dinv_b == NULL )
            if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL )
                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                exit( INSUFFICIENT_MEMORY );
        }
        if ( rp == NULL )
        {
            if ( (rp = (real*) malloc(sizeof(real) * R->n)) == NULL )
                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                exit( INSUFFICIENT_MEMORY );
        }
        if ( rp2 == NULL )
        {
            if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                exit( INSUFFICIENT_MEMORY );
    /* precompute and cache, as invariant in loop below */
    #pragma omp for schedule(static)
    for ( i = 0; i < R->n; ++i )
    {
        Dinv_b[i] = Dinv[i] * b[i];
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    do
    {
        // x_{k+1} = G*x_{k} + Dinv*b;
        #pragma omp for schedule(guided)
        for ( i = 0; i < R->n; ++i )
        {
            if (tri == LOWER)
            {
                si = R->start[i];
                ei = R->start[i + 1] - 1;
            }
            else
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

                si = R->start[i] + 1;
                ei = R->start[i + 1];
            rp2[i] = 0.;

            for ( k = si; k < ei; ++k )
                rp2[i] += R->val[k] * rp[R->j[k]];
            rp2[i] *= -Dinv[i];
            rp2[i] += Dinv_b[i];
        }

        #pragma omp master
        {
            rp3 = rp;
            rp = rp2;
            rp2 = rp3;
        }

        #pragma omp barrier

        ++iter;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

/* Solve triangular system LU*x = y using level scheduling
 *
 * workspace: data struct containing matrices, lower/upper triangular, stored in CSR
 * control: data struct containing parameters
 * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
 *   Matrices have non-zero diagonals
 *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
                                  const real * const y, real * const x, const int fresh_pre )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
    switch ( control->pre_app_type )
    case NONE_PA:
        break;
    case TRI_SOLVE_PA:
        switch ( control->pre_comp_type )
        {
        case DIAG_PC:
            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
        case ICHOLT_PC:
        case ILU_PAR_PC:
        case ILUT_PAR_PC:
            tri_solve( workspace->L, y, x, LOWER );
            tri_solve( workspace->U, y, x, UPPER );
        default:
            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
            exit( INVALID_INPUT );
        }
        break;
    case TRI_SOLVE_LEVEL_SCHED_PA:
        switch ( control->pre_comp_type )
        {
        case DIAG_PC:
            diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
            break;
        case ICHOLT_PC:
        case ILU_PAR_PC:
        case ILUT_PAR_PC:
            tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
            tri_solve_level_sched( workspace->U, y, x, UPPER, fresh_pre );
            break;
        default:
            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
            exit( INVALID_INPUT );
            break;
        }
        break;
    case TRI_SOLVE_GC_PA:
        switch ( control->pre_comp_type )
        {
        case DIAG_PC:
            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
            exit( INVALID_INPUT );
            break;
        case ICHOLT_PC:
        case ILU_PAR_PC:
        case ILUT_PAR_PC:
                if ( color == NULL )
                    if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * workspace->H->n)) == NULL ||
                            (to_color = (unsigned int*) malloc(sizeof(unsigned int) * workspace->H->n)) == NULL ||
                            (recolor = (unsigned int*) malloc(sizeof(unsigned int) * workspace->H->n)) == NULL ||
                            (color_top = (unsigned int*) malloc(sizeof(unsigned int) * (workspace->H->n + 1))) == NULL ||
                            (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * workspace->H->n)) == NULL ||
                            (Allocate_Matrix( &H_full, workspace->H->n, 2 * workspace->H->m - workspace->H->n ) == FAILURE ) )
                        fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
                compute_H_full( workspace->H );

                graph_coloring( );

                permute_factor( workspace->L, LOWER, TRUE );
                permute_factor( workspace->U, UPPER, FALSE );
            tri_solve_level_sched( workspace->L, y, x, LOWER, fresh_pre );
            tri_solve_level_sched( workspace->U, y, x, UPPER, fresh_pre );
        break;
        default:
            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
            exit( INVALID_INPUT );
            break;
        }
        break;
    case JACOBI_ITER_PA:
        switch ( control->pre_comp_type )
        {
        case DIAG_PC:
            fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
            exit( INVALID_INPUT );
            break;
        case ICHOLT_PC:
        case ILU_PAR_PC:
        case ILUT_PAR_PC:
        {
            if ( Dinv_L == NULL )
                if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                    exit( INSUFFICIENT_MEMORY );
        #pragma omp barrier
            /* construct D^{-1}_L */
        if ( fresh_pre )
        {
            #pragma omp for schedule(static)
            for ( i = 0; i < workspace->L->n; ++i )
                si = workspace->L->start[i + 1] - 1;
                Dinv_L[i] = 1. / workspace->L->val[si];
            }
        }

        jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->pre_app_jacobi_iters );

        #pragma omp master
        {
            if ( Dinv_U == NULL )
            {
                if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
                    fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
                    exit( INSUFFICIENT_MEMORY );
        #pragma omp barrier

            /* construct D^{-1}_U */
        if ( fresh_pre )
        {
            #pragma omp for schedule(static)
            for ( i = 0; i < workspace->U->n; ++i )
            {
                si = workspace->U->start[i];
                Dinv_U[i] = 1. / workspace->U->val[si];
            }
        }

        jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->pre_app_jacobi_iters );
        default:
            fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
            exit( INVALID_INPUT );
            break;
        }
        break;
    default:
        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
        exit( INVALID_INPUT );
        break;

    }

    return;
}


/* generalized minimual residual iterative solver for sparse linear systems */
int GMRES( const static_storage * const workspace, const control_params * const control,
           simulation_data * const data, const sparse_matrix * const H,
           const real * const b, const real tol, real * const x,
           const FILE * const fout, const int fresh_pre )
    int i, j, k, itr, N, g_j, g_itr;
    real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    N = H->n;
    #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \
    shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr)
        bnorm = Norm( b, N );
        #pragma omp master
            data->timing.solver_vector_ops += Get_Timing_Info( time_start );
        if ( control->pre_comp_type == DIAG_PC )
            /* apply preconditioner to RHS */
            #pragma omp master
            {
                time_start = Get_Time( );