/*----------------------------------------------------------------------
  SerialReax - Reax Force Field Simulator

  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of
  the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

#include "lin_alg.h"

#include "allocate.h"
#include "tool_box.h"
#include "vector.h"

#include "print_utils.h"

/* Intel MKL */
#if defined(HAVE_LAPACKE_MKL)
#include "mkl.h"
/* reference LAPACK */
#elif defined(HAVE_LAPACKE)
#include "lapacke.h"
#endif


typedef struct
{
    unsigned int j;
    real val;
} sparse_matrix_entry;


enum preconditioner_type
{
    LEFT = 0,
    RIGHT = 1,
};


#if defined(TEST_MAT)
static sparse_matrix * create_test_mat( void )
{
    unsigned int i, n;
    sparse_matrix *H_test;

    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    //3x3, SPD, store lower half
    i = 0;
    n = 0;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = 4.;
    ++i;
    ++n;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = 12.;
    ++i;
    H_test->j[i] = 1;
    H_test->val[i] = 37.;
    ++i;
    ++n;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = -16.;
    ++i;
    H_test->j[i] = 1;
    H_test->val[i] = -43.;
    ++i;
    H_test->j[i] = 2;
    H_test->val[i] = 98.;
    ++i;
    ++n;
    H_test->start[n] = i;

    return H_test;
}
#endif


/* Routine used with qsort for sorting nonzeros within a sparse matrix row
 *
 * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
 */
static int compare_matrix_entry(const void *v1, const void *v2)
{
    /* larger element has larger column index */
    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
}


/* Routine used for sorting nonzeros within a sparse matrix row;
 *  internally, a combination of qsort and manual sorting is utilized
 *  (parallel calls to qsort when multithreading, rows mapped to threads)
 *
 * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
 */
void Sort_Matrix_Rows( sparse_matrix * const A )
{
    unsigned int i, j, si, ei;
    sparse_matrix_entry *temp;

#ifdef _OPENMP
    //    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
#endif
    {
        temp = (sparse_matrix_entry *) smalloc( A->n * sizeof(sparse_matrix_entry),
                                                "Sort_Matrix_Rows::temp" );

        /* sort each row of A using column indices */
#ifdef _OPENMP
        //        #pragma omp for schedule(guided)
#endif
        for ( i = 0; i < A->n; ++i )
        {
            si = A->start[i];
            ei = A->start[i + 1];

            for ( j = 0; j < (ei - si); ++j )
            {
                (temp + j)->j = A->j[si + j];
                (temp + j)->val = A->val[si + j];
            }

            /* polymorphic sort in standard C library using column indices */
            qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );

            for ( j = 0; j < (ei - si); ++j )
            {
                A->j[si + j] = (temp + j)->j;
                A->val[si + j] = (temp + j)->val;
            }
        }

        sfree( temp, "Sort_Matrix_Rows::temp" );
    }
}


/* Convert a symmetric, half-sored sparse matrix into
 * a full-stored sparse matrix
 *
 * A: symmetric sparse matrix, lower half stored in CSR
 * A_full: resultant full sparse matrix in CSR
 *   If A_full is NULL, allocate space, otherwise do not
 *
 * Assumptions:
 *   A has non-zero diagonals
 *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
static void compute_full_sparse_matrix( const sparse_matrix * const A,
                                        sparse_matrix ** A_full )
{
    int count, i, pj;
    sparse_matrix *A_t;

    if ( *A_full == NULL )
    {
        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
        {
            fprintf( stderr, "not enough memory for full A. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }
    else if ( (*A_full)->m < 2 * A->m - A->n )
    {
        Deallocate_Matrix( *A_full );
        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
        {
            fprintf( stderr, "not enough memory for full A. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }

    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for full A. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    /* Set up the sparse matrix data structure for A. */
    Transpose( A, A_t );

    count = 0;
    for ( i = 0; i < A->n; ++i )
    {

        if ((*A_full)->start == NULL)
        {
        }
        (*A_full)->start[i] = count;

        /* A: symmetric, lower triangular portion only stored */
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            (*A_full)->val[count] = A->val[pj];
            (*A_full)->j[count] = A->j[pj];
            ++count;
        }
        /* A^T: symmetric, upper triangular portion only stored;
         * skip diagonal from A^T, as included from A above */
        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
        {
            (*A_full)->val[count] = A_t->val[pj];
            (*A_full)->j[count] = A_t->j[pj];
            ++count;
        }
    }
    (*A_full)->start[i] = count;

    Deallocate_Matrix( A_t );
}


/* Setup routines for sparse approximate inverse preconditioner
 *
 * A: symmetric sparse matrix, lower half stored in CSR
 * filter:
 * A_spar_patt:
 *
 * Assumptions:
 *   A has non-zero diagonals
 *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix ** A_full,
              sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full,
                    sparse_matrix ** A_app_inv, const real filter )
{
    int i, pj, size;
    int left, right, k, p, turn;
    real pivot, tmp;
    real threshold;
    real *list;

    if ( *A_spar_patt == NULL )
    {
        if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
        {
            fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }
    else if ( ((*A_spar_patt)->m) < (A->m) )
    {
        Deallocate_Matrix( *A_spar_patt );
        if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
        {
            fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }


    /* quick-select algorithm for finding the kth greatest element in the matrix*/
    /* list: values from the matrix*/
    /* left-right: search space of the quick-select */

    list = (real *) smalloc( sizeof(real) * (A->start[A->n]),"Sparse_Approx_Inverse::list" );

    left = 0;
    right = A->start[A->n] - 1;
    k = (int)( (A->start[A->n])*filter );
    threshold = 0.0;

    for( i = left; i <= right ; ++i )
    {
        list[i] = abs( A->val[i] );
    }

    turn = 0;
    while( k ) {

        p  = left;
        turn = 1 - turn;
        if( turn == 1)
        {
            pivot = list[right];
        }
        else
        {
            pivot = list[left];
        }
        for( i = left + 1 - turn; i <= right-turn; ++i )
        {
            if( list[i] > pivot )
            {
                tmp = list[i];
                list[i] = list[p];
                list[p] = tmp;
                p++;
            }
        }
        if(turn == 1)
        {
            tmp = list[p];
            list[p] = list[right];
            list[right] = tmp;
        }
        else
        {
            tmp = list[p];
            list[p] = list[left];
            list[left] = tmp;
        }

        if( p == k - 1)
        {
            threshold = list[p];
            break;
        }
        else if( p > k - 1 )
        {
            right = p - 1;
        }
        else
        {
            left = p + 1;
        }
    }

    sfree( list, "setup_sparse_approx_inverse::list" );

    /* fill sparsity pattern */
    /* diagonal entries are always included */
    for ( size = 0, i = 0; i < A->n; ++i )
    {
        (*A_spar_patt)->start[i] = size;

        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            if ( ( A->val[pj] >= threshold )  || ( A->j[pj] == i ) )
            {
                (*A_spar_patt)->val[size] = A->val[pj];
                (*A_spar_patt)->j[size] = A->j[pj];
                size++;
            }
        }
    }
    (*A_spar_patt)->start[A->n] = size;


    compute_full_sparse_matrix( A, A_full );
    compute_full_sparse_matrix( *A_spar_patt, A_spar_patt_full );

    /* A_app_inv has the same sparsity pattern
     * * as A_spar_patt_full (omit non-zero values) */
    if ( Allocate_Matrix( A_app_inv, (*A_spar_patt_full)->n, (*A_spar_patt_full)->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }
}


void Calculate_Droptol( const sparse_matrix * const A,
                        real * const droptol, const real dtol )
{
    int i, j, k;
    real val;
#ifdef _OPENMP
    static real *droptol_local;
    unsigned int tid;
#endif

#ifdef _OPENMP
    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
#endif
    {
#ifdef _OPENMP
        tid = omp_get_thread_num();

        #pragma omp master
        {
            if ( droptol_local == NULL )
            {
                droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real),
                                                 "Calculate_Droptol::droptol_local" );
            }
        }

        #pragma omp barrier
#endif

        /* init droptol to 0 */
        for ( i = 0; i < A->n; ++i )
        {
#ifdef _OPENMP
            droptol_local[tid * A->n + i] = 0.0;
#else
            droptol[i] = 0.0;
#endif
        }

#ifdef _OPENMP
        #pragma omp barrier
#endif

        /* calculate sqaure of the norm of each row */
#ifdef _OPENMP
        #pragma omp for schedule(static)
#endif
        for ( i = 0; i < A->n; ++i )
        {
            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
            {
                j = A->j[k];
                val = A->val[k];

#ifdef _OPENMP
                droptol_local[tid * A->n + i] += val * val;
                droptol_local[tid * A->n + j] += val * val;
#else
                droptol[i] += val * val;
                droptol[j] += val * val;
#endif
            }

            // diagonal entry
            val = A->val[k];
#ifdef _OPENMP
            droptol_local[tid * A->n + i] += val * val;
#else
            droptol[i] += val * val;
#endif
        }

#ifdef _OPENMP
        #pragma omp barrier

        #pragma omp for schedule(static)
        for ( i = 0; i < A->n; ++i )
        {
            droptol[i] = 0.0;
            for ( k = 0; k < omp_get_num_threads(); ++k )
            {
                droptol[i] += droptol_local[k * A->n + i];
            }
        }

        #pragma omp barrier
#endif

        /* calculate local droptol for each row */
        //fprintf( stderr, "droptol: " );
#ifdef _OPENMP
        #pragma omp for schedule(static)
#endif
        for ( i = 0; i < A->n; ++i )
        {
            //fprintf( stderr, "%f-->", droptol[i] );
            droptol[i] = SQRT( droptol[i] ) * dtol;
            //fprintf( stderr, "%f  ", droptol[i] );
        }
        //fprintf( stderr, "\n" );
    }
}


int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
{
    int i, pj;
    int fillin;
    real val;

    fillin = 0;

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) private(i, pj, val) reduction(+: fillin)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            val = A->val[pj];

            if ( FABS(val) > droptol[i] )
            {
                ++fillin;
            }
        }
    }

    return fillin + A->n;
}


#if defined(HAVE_SUPERLU_MT)
real SuperLU_Factorize( const sparse_matrix * const A,
                        sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, pj, count, *Ltop, *Utop, r;
    sparse_matrix *A_t;
    SuperMatrix A_S, AC_S, L_S, U_S;
    NCformat *A_S_store;
    SCPformat *L_S_store;
    NCPformat *U_S_store;
    superlumt_options_t superlumt_options;
    pxgstrf_shared_t pxgstrf_shared;
    pdgstrf_threadarg_t *pdgstrf_threadarg;
    int_t nprocs;
    fact_t fact;
    trans_t trans;
    yes_no_t refact, usepr;
    real u, drop_tol;
    real *a, *at;
    int_t *asub, *atsub, *xa, *xat;
    int_t *perm_c; /* column permutation vector */
    int_t *perm_r; /* row permutations from partial pivoting */
    void *work;
    int_t info, lwork;
    int_t permc_spec, panel_size, relax;
    Gstat_t Gstat;
    flops_t flopcnt;

    /* Default parameters to control factorization. */
#ifdef _OPENMP
    //TODO: set as global parameter and use
    #pragma omp parallel \
    default(none) shared(nprocs)
    {
        #pragma omp master
        {
            /* SuperLU_MT spawns threads internally, so set and pass parameter */
            nprocs = omp_get_num_threads();
        }
    }
#else
    nprocs = 1;
#endif

    //    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
    fact = DOFACT; /* factor from scratch */
    trans = NOTRANS;
    refact = NO; /* first time factorization */
    //TODO: add to control file and use the value there to set these
    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
    u = 1.0; /* diagonal pivoting threshold */
    usepr = NO;
    drop_tol = 0.0;
    work = NULL;
    lwork = 0;

#if defined(DEBUG)
    fprintf( stderr, "nprocs = %d\n", nprocs );
    fprintf( stderr, "Panel size = %d\n", panel_size );
    fprintf( stderr, "Relax = %d\n", relax );
#endif

    if ( !(perm_r = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for perm_r[].");
    }
    if ( !(perm_c = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for perm_c[].");
    }
    if ( !(superlumt_options.etree = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for etree[].");
    }
    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
    }
    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for part_super__h[].");
    }
    a = (real*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(real),
                         "SuperLU_Factorize::a" );
    asub = (int_t*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(int_t),
                             "SuperLU_Factorize::asub" );
    xa = (int_t*) smalloc( (A->n + 1) * sizeof(int_t),
                           "SuperLU_Factorize::xa" );
    Ltop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
                                    "SuperLU_Factorize::Ltop" );
    Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
                                    "SuperLU_Factorize::Utop" );
    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    /* Set up the sparse matrix data structure for A. */
    Transpose( A, A_t );

    count = 0;
    for ( i = 0; i < A->n; ++i )
    {
        xa[i] = count;
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            a[count] = A->entries[pj].val;
            asub[count] = A->entries[pj].j;
            ++count;
        }
        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
        {
            a[count] = A_t->entries[pj].val;
            asub[count] = A_t->entries[pj].j;
            ++count;
        }
    }
    xa[i] = count;

    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
                         &at, &atsub, &xat );

    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6d", asub[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6.1f", a[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i <= A->n; ++i )
        fprintf( stderr, "%6d", xa[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6d", atsub[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6.1f", at[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i <= A->n; ++i )
        fprintf( stderr, "%6d", xat[i] );
    fprintf( stderr, "\n" );

    A_S.Stype = SLU_NC; /* column-wise, no supernode */
    A_S.Dtype = SLU_D; /* double-precision */
    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
    A_S.nrow = A->n;
    A_S.ncol = A->n;
    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
    A_S_store = (NCformat *) A_S.Store;
    A_S_store->nnz = 2 * A->start[A->n] - A->n;
    A_S_store->nzval = at;
    A_S_store->rowind = atsub;
    A_S_store->colptr = xat;

    /* ------------------------------------------------------------
       Allocate storage and initialize statistics variables.
       ------------------------------------------------------------*/
    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
    StatInit( A->n, nprocs, &Gstat );

    /* ------------------------------------------------------------
       Get column permutation vector perm_c[], according to permc_spec:
       permc_spec = 0: natural ordering
       permc_spec = 1: minimum degree ordering on structure of A'*A
       permc_spec = 2: minimum degree ordering on structure of A'+A
       permc_spec = 3: approximate minimum degree for unsymmetric matrices
       ------------------------------------------------------------*/
    permc_spec = 0;
    get_perm_c( permc_spec, &A_S, perm_c );

    /* ------------------------------------------------------------
       Initialize the option structure superlumt_options using the
       user-input parameters;
       Apply perm_c to the columns of original A to form AC.
       ------------------------------------------------------------*/
    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
                  u, usepr, drop_tol, perm_c, perm_r,
                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );

    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
    fprintf( stderr, "\n" );

    /* ------------------------------------------------------------
       Compute the LU factorization of A.
       The following routine will create nprocs threads.
       ------------------------------------------------------------*/
    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );

    fprintf( stderr, "INFO: %d\n", info );

    flopcnt = 0;
    for (i = 0; i < nprocs; ++i)
    {
        flopcnt += Gstat.procstat[i].fcops;
    }
    Gstat.ops[FACT] = flopcnt;

#if defined(DEBUG)
    printf("\n** Result of sparse LU **\n");
    L_S_store = (SCPformat *) L_S.Store;
    U_S_store = (NCPformat *) U_S.Store;
    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
    fflush( stdout );
#endif

    /* convert L and R from SuperLU formats to CSR */
    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
    memset( Utop, 0, (A->n + 1) * sizeof(int) );
    memset( L->start, 0, (A->n + 1) * sizeof(int) );
    memset( U->start, 0, (A->n + 1) * sizeof(int) );

    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
    fprintf( stderr, "\n" );

    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
    for ( i = 0; i < A->n; ++i )
    {
        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
        //TODO: correct for SCPformat for L?
        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
        //        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
        //        {
        //            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
        //        }
        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
        {
            ++Utop[U_S_store->rowind[pj] + 1];
            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
        }
    }
    for ( i = 1; i <= A->n; ++i )
    {
        //        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
        //        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
        //        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
    }
    for ( i = 0; i < A->n; ++i )
    {
        //        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
        //        {
        //            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
        //            L->entries[Ltop[r]].j = r;
        //            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
        //            ++Ltop[r];
        //        }
        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
        {
            r = U_S_store->rowind[pj];
            U->entries[Utop[r]].j = i;
            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
            ++Utop[r];
        }
    }

    /* ------------------------------------------------------------
       Deallocate storage after factorization.
       ------------------------------------------------------------*/
    pxgstrf_finalize( &superlumt_options, &AC_S );
    Deallocate_Matrix( A_t );
    sfree( xa, "SuperLU_Factorize::xa" );
    sfree( asub, "SuperLU_Factorize::asub" );
    sfree( a, "SuperLU_Factorize::a" );
    SUPERLU_FREE( perm_r );
    SUPERLU_FREE( perm_c );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
    SUPERLU_FREE( A_S.Store );
    if ( lwork == 0 )
    {
        Destroy_SuperNode_SCP(&L_S);
        Destroy_CompCol_NCP(&U_S);
    }
    else if ( lwork > 0 )
    {
        SUPERLU_FREE(work);
    }
    StatFree(&Gstat);

    sfree( Utop, "SuperLU_Factorize::Utop" );
    sfree( Ltop, "SuperLU_Factorize::Ltop" );

    //TODO: return iters
    return 0.;
}
#endif


/* Diagonal (Jacobi) preconditioner computation */
real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
{
    unsigned int i;
    real start;

    start = Get_Time( );

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) private(i)
#endif
    for ( i = 0; i < H->n; ++i )
    {
        if ( H->val[H->start[i + 1] - 1] != 0.0 )
        {
            Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
        }
        else
        {
            Hdia_inv[i] = 1.0;
        }
    }

    return Get_Timing_Info( start );
}


/* Incomplete Cholesky factorization with dual thresholding */
real ICHOLT( const sparse_matrix * const A, const real * const droptol,
             sparse_matrix * const L, sparse_matrix * const U )
{
    int *tmp_j;
    real *tmp_val;
    int i, j, pj, k1, k2, tmptop, Ltop;
    real val, start;
    unsigned int *Utop;

    start = Get_Time( );

    Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
                                    "ICHOLT::Utop" );
    tmp_j = (int*) smalloc( A->n * sizeof(int),
                            "ICHOLT::Utop" );
    tmp_val = (real*) smalloc( A->n * sizeof(real),
                               "ICHOLT::Utop" );

    // clear variables
    Ltop = 0;
    tmptop = 0;
    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
    memset( Utop, 0, A->n * sizeof(unsigned int) );

    for ( i = 0; i < A->n; ++i )
    {
        L->start[i] = Ltop;
        tmptop = 0;

        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            j = A->j[pj];
            val = A->val[pj];

            if ( FABS(val) > droptol[i] )
            {
                k1 = 0;
                k2 = L->start[j];
                while ( k1 < tmptop && k2 < L->start[j + 1] )
                {
                    if ( tmp_j[k1] < L->j[k2] )
                    {
                        ++k1;
                    }
                    else if ( tmp_j[k1] > L->j[k2] )
                    {
                        ++k2;
                    }
                    else
                    {
                        val -= (tmp_val[k1++] * L->val[k2++]);
                    }
                }

                // L matrix is lower triangular,
                // so right before the start of next row comes jth diagonal
                val /= L->val[L->start[j + 1] - 1];

                tmp_j[tmptop] = j;
                tmp_val[tmptop] = val;
                ++tmptop;
            }
        }

        // sanity check
        if ( A->j[pj] != i )
        {
            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
            exit( NUMERIC_BREAKDOWN );
        }

        // compute the ith diagonal in L
        val = A->val[pj];
        for ( k1 = 0; k1 < tmptop; ++k1 )
        {
            val -= (tmp_val[k1] * tmp_val[k1]);
        }

#if defined(DEBUG)
        if ( val < 0.0 )
        {
            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
            exit( NUMERIC_BREAKDOWN );

        }
#endif

        tmp_j[tmptop] = i;
        tmp_val[tmptop] = SQRT( val );

        // apply the dropping rule once again
        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
        //for( k1 = 0; k1<= tmptop; ++k1 )
        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
        //fprintf( stderr, "\n" );
        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
        for ( k1 = 0; k1 < tmptop; ++k1 )
        {
            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
            {
                L->j[Ltop] = tmp_j[k1];
                L->val[Ltop] = tmp_val[k1];
                U->start[tmp_j[k1] + 1]++;
                ++Ltop;
                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
            }
        }
        // keep the diagonal in any case
        L->j[Ltop] = tmp_j[k1];
        L->val[Ltop] = tmp_val[k1];
        ++Ltop;
        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
    }

    L->start[i] = Ltop;
    //    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );

    /* U = L^T (Cholesky factorization) */
    Transpose( L, U );
    //    for ( i = 1; i <= U->n; ++i )
    //    {
    //        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
    //    }
    //    for ( i = 0; i < L->n; ++i )
    //    {
    //        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
    //        {
    //            j = L->j[pj];
    //            U->j[Utop[j]] = i;
    //            U->val[Utop[j]] = L->val[pj];
    //            Utop[j]++;
    //        }
    //    }

    //    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );

    sfree( tmp_val, "ICHOLT::tmp_val" );
    sfree( tmp_j, "ICHOLT::tmp_j" );
    sfree( Utop, "ICHOLT::Utop" );

    return Get_Timing_Info( start );
}


/* Fine-grained (parallel) incomplete Cholesky factorization
 *
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp. */
#if defined(TESTING)
real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
                sparse_matrix * const U_t, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
    real *D, *D_inv, sum, start;
    sparse_matrix *DAD;
    int *Utop;

    start = Get_Time( );

    D = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D" );
    D_inv = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D_inv" );
    Utop = (int*) smalloc( (A->n + 1) * sizeof(int), "ICHOL_PAR::Utop" );
    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(D_inv, D) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
        D[i] = 1. / D_inv[i];
    }

    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./SQRT(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
    #pragma omp parallel for schedule(guided) \
    default(none) shared(DAD, D_inv, D) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
        }
        /* diagonal */
        DAD->j[pj] = A->j[pj];
        DAD->val[pj] = 1.;
    }

    /* initial guesses for U^T,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero */
#ifdef _OPENMP
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
#endif
        for ( j = 0; j < A->start[A->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 0; k <= A->n; ++k )
            {
                if ( U_t->start[k] > j )
                {
                    x = U_t->start[k - 1];
                    ei_x = U_t->start[k];
                    break;
                }
            }
            /* column bounds of current nonzero */
            y = U_t->start[U_t->j[j]];
            ei_y = U_t->start[U_t->j[j] + 1];

            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
            while ( U_t->j[x] < U_t->j[j] &&
                    U_t->j[y] < U_t->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( U_t->j[x] == U_t->j[y] )
                {
                    sum += (U_t->val[x] * U_t->val[y]);
                    ++x;
                    ++y;
                }
                else if ( U_t->j[x] < U_t->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            sum = DAD->val[j] - sum;

            /* diagonal entries */
            if ( (k - 1) == U_t->j[j] )
            {
                /* sanity check */
                if ( sum < ZERO )
                {
                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
#if defined(DEBUG_FOCUS)
                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
                             k - 1, A->entries[j].j, A->entries[j].val );
                    fprintf( stderr, "sum = %10.3f\n", sum);
#endif
                    exit(NUMERIC_BREAKDOWN);
                }

                U_t->val[j] = SQRT( sum );
            }
            /* non-diagonal entries */
            else
            {
                U_t->val[j] = sum / U_t->val[ei_y - 1];
            }
        }
    }

    /* apply inverse transformation D^{-1}U^{T},
     * since DAD \approx U^{T}U, so
     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
#ifdef _OPENMP
    #pragma omp parallel for schedule(guided) \
    default(none) shared(D_inv) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            U_t->val[pj] *= D_inv[i];
        }
    }

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif

    /* transpose U^{T} and copy into U */
    Transpose( U_t, U );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif

    Deallocate_Matrix( DAD );
    sfree( D_inv, "ICHOL_PAR::D_inv" );
    sfree( D, "ICHOL_PAR::D" );
    sfree( Utop, "ICHOL_PAR::Utop" );

    return Get_Timing_Info( start );
}
#endif


/* Fine-grained (parallel) incomplete LU factorization
 *
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp.
 *
 * A: symmetric, half-stored (lower triangular), CSR format
 * sweeps: number of loops over non-zeros for computation
 * L / U: factorized triangular matrices (A \approx LU), CSR format */
real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
              sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
    real *D, *D_inv, sum, start;
    sparse_matrix *DAD;

    start = Get_Time( );

    D = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D" );
    D_inv = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D_inv" );
    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(D, D_inv) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
        D[i] = 1.0 / D_inv[i];
        //        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
    }

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(DAD, D) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
        }
        /* diagonal */
        DAD->j[pj] = A->j[pj];
        DAD->val[pj] = 1.0;
    }

    /* initial guesses for L and U,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
    /* store U^T in CSR for row-wise access and tranpose later */
    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );

    /* L has unit diagonal, by convention */
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        L->val[L->start[i + 1] - 1] = 1.0;
    }

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero in L */
#ifdef _OPENMP
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
            while ( L->j[x] < L->j[j] &&
                    L->j[y] < L->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( L->j[x] == L->j[y] )
                {
                    sum += (L->val[x] * U->val[y]);
                    ++x;
                    ++y;
                }
                else if ( L->j[x] < L->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            if ( j != ei_x - 1 )
            {
                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
            }
        }

#ifdef _OPENMP
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
            while ( U->j[x] < U->j[j] &&
                    U->j[y] < U->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( U->j[x] == U->j[y] )
                {
                    sum += (L->val[y] * U->val[x]);
                    ++x;
                    ++y;
                }
                else if ( U->j[x] < U->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            U->val[j] = DAD->val[j] - sum;
        }
    }

    /* apply inverse transformation:
     * since DAD \approx LU, then
     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(DAD, D_inv) private(i, pj)
#endif
    for ( i = 0; i < DAD->n; ++i )
    {
        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
        {
            L->val[pj] = D_inv[i] * L->val[pj];
            /* currently storing U^T, so use row index instead of column index */
            U->val[pj] = U->val[pj] * D_inv[i];
        }
    }

    Transpose_I( U );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif

    Deallocate_Matrix( DAD );
    sfree( D_inv, "ILU_PAR::D_inv" );
    sfree( D, "ILU_PAR::D_inv" );

    return Get_Timing_Info( start );
}


/* Fine-grained (parallel) incomplete LU factorization with thresholding
 *
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp.
 *
 * A: symmetric, half-stored (lower triangular), CSR format
 * droptol: row-wise tolerances used for dropping
 * sweeps: number of loops over non-zeros for computation
 * L / U: factorized triangular matrices (A \approx LU), CSR format */
real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
               const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
    real *D, *D_inv, sum, start;
    sparse_matrix *DAD, *L_temp, *U_temp;

    start = Get_Time( );

    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    D = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D" );
    D_inv = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D_inv" );

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(D, D_inv) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
        D[i] = 1.0 / D_inv[i];
    }

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./SQRT(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(DAD, D) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
        }
        /* diagonal */
        DAD->j[pj] = A->j[pj];
        DAD->val[pj] = 1.0;
    }

    /* initial guesses for L and U,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
    /* store U^T in CSR for row-wise access and tranpose later */
    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );

    /* L has unit diagonal, by convention */
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) private(i) shared(L_temp)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
    }

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero in L */
#ifdef _OPENMP
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
            while ( L_temp->j[x] < L_temp->j[j] &&
                    L_temp->j[y] < L_temp->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( L_temp->j[x] == L_temp->j[y] )
                {
                    sum += (L_temp->val[x] * U_temp->val[y]);
                    ++x;
                    ++y;
                }
                else if ( L_temp->j[x] < L_temp->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            if ( j != ei_x - 1 )
            {
                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
            }
        }

#ifdef _OPENMP
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
            while ( U_temp->j[x] < U_temp->j[j] &&
                    U_temp->j[y] < U_temp->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( U_temp->j[x] == U_temp->j[y] )
                {
                    sum += (L_temp->val[y] * U_temp->val[x]);
                    ++x;
                    ++y;
                }
                else if ( U_temp->j[x] < U_temp->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            U_temp->val[j] = DAD->val[j] - sum;
        }
    }

    /* apply inverse transformation:
     * since DAD \approx LU, then
     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) \
    default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
#endif
    for ( i = 0; i < DAD->n; ++i )
    {
        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
        {
            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
            /* currently storing U^T, so use row index instead of column index */
            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
        }
    }

    /* apply the dropping rule */
    Ltop = 0;
    Utop = 0;
    for ( i = 0; i < DAD->n; ++i )
    {
        L->start[i] = Ltop;
        U->start[i] = Utop;

        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
        {
            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
            {
                L->j[Ltop] = L_temp->j[pj];
                L->val[Ltop] = L_temp->val[pj];
                ++Ltop;
            }
        }

        /* diagonal */
        L->j[Ltop] = L_temp->j[pj];
        L->val[Ltop] = L_temp->val[pj];
        ++Ltop;

        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
        {
            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
            {
                U->j[Utop] = U_temp->j[pj];
                U->val[Utop] = U_temp->val[pj];
                ++Utop;
            }
        }

        /* diagonal */
        U->j[Utop] = U_temp->j[pj];
        U->val[Utop] = U_temp->val[pj];
        ++Utop;
    }

    L->start[i] = Ltop;
    U->start[i] = Utop;

    Transpose_I( U );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
#endif

    Deallocate_Matrix( U_temp );
    Deallocate_Matrix( L_temp );
    Deallocate_Matrix( DAD );
    sfree( D_inv, "ILUT_PAR::D_inv" );
    sfree( D, "ILUT_PAR::D_inv" );

    return Get_Timing_Info( start );
}


#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
/* Compute M^{1} \approx A which minimizes
 *  \sum_{j=1}^{N} ||e_j - Am_j||_2^2
 *  where: e_j is the j-th column of the NxN identify matrix,
 *         m_j is the j-th column of the NxN approximate sparse matrix M
 * 
 * Internally, use LAPACKE to solve the least-squares problems
 *
 * A: symmetric, sparse matrix, stored in full CSR format
 * A_spar_patt: sparse matrix used as template sparsity pattern
 *   for approximating the inverse, stored in full CSR format
 * A_app_inv: approximate inverse to A, stored in full CSR format (result)
 *
 * Reference:
 * Michele Benzi et al.
 * A Comparative Study of Sparse Approximate Inverse
 *  Preconditioners
 * Applied Numerical Mathematics 30, 1999
 * */
real sparse_approx_inverse( const sparse_matrix * const A,
                            const sparse_matrix * const A_spar_patt,
                            sparse_matrix ** A_app_inv )
{
    int i, k, pj, j_temp, identity_pos;
    int N, M, d_i, d_j;
    lapack_int m, n, nrhs, lda, ldb, info;
    int *pos_x, *pos_y;
    real start;
    real *e_j, *dense_matrix;
    char *X, *Y;

    start = Get_Time( );

    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n];

#ifdef _OPENMP
    #pragma omp parallel default(none) \
    private(i, k, pj, j_temp, identity_pos, N, M, d_i, d_j, m, n, \
            nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) \
    shared(A_app_inv, stderr)
#endif
    {
        X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" );
        Y = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::Y" );
        pos_x = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_x" );
        pos_y = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_y" );

        for ( i = 0; i < A->n; ++i )
        {
            X[i] = 0;
            Y[i] = 0;
            pos_x[i] = 0;
            pos_y[i] = 0;
        }

#ifdef _OPENMP
        #pragma omp for schedule(static)
#endif
        for ( i = 0; i < A_spar_patt->n; ++i )
        {
            N = 0;
            M = 0;

            // find column indices of nonzeros (which will be the columns indices of the dense matrix)
            for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->start[i + 1]; ++pj )
            {

                j_temp = A_spar_patt->j[pj];

                Y[j_temp] = 1;
                pos_y[j_temp] = N;
                ++N;

                // for each of those indices
                // search through the row of full A of that index
                for ( k = A->start[j_temp]; k < A->start[j_temp + 1]; ++k )
                {
                    // and accumulate the nonzero column indices to serve as the row indices of the dense matrix
                    X[A->j[k]] = 1;
                }
            }

            // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix
            identity_pos = M;
            for ( k = 0; k < A->n; k++)
            {
                if ( X[k] != 0 )
                {
                    pos_x[M] = k;
                    if ( k == i )
                    {
                        identity_pos = M;
                    }
                    ++M;
                }
            }

            // allocate memory for NxM dense matrix
            dense_matrix = (real *) smalloc( sizeof(real) * N * M,
                                             "Sparse_Approx_Inverse::dense_matrix" );

            // fill in the entries of dense matrix
            for ( d_i = 0; d_i < M; ++d_i)
            {
                // all rows are initialized to zero
                for ( d_j = 0; d_j < N; ++d_j )
                {
                    dense_matrix[d_i * N + d_j] = 0.0;
                }
                // change the value if any of the column indices is seen
                for ( d_j = A->start[pos_x[d_i]];
                        d_j < A->start[pos_x[d_i] + 1]; ++d_j )
                {
                    if ( Y[A->j[d_j]] == 1 )
                    {
                        dense_matrix[d_i * N + pos_y[A->j[d_j]]] = A->val[d_j];
                    }
                }

            }

            /* create the right hand side of the linear equation
               that is the full column of the identity matrix*/
            e_j = (real *) smalloc( sizeof(real) * M,
                                    "Sparse_Approx_Inverse::e_j" );

            for ( k = 0; k < M; ++k )
            {
                e_j[k] = 0.0;
            }
            e_j[identity_pos] = 1.0;

            /* Solve the overdetermined system AX = B through the least-squares problem:
             * min ||B - AX||_2 */
            m = M;
            n = N;
            nrhs = 1;
            lda = N;
            ldb = nrhs;
            info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
                                  e_j, ldb );

            /* Check for the full rank */
            if ( info > 0 )
            {
                fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
                fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
                fprintf( stderr, "the least squares solution could not be computed.\n" );
                exit( INVALID_INPUT );
            }

            /* Print least squares solution */
            // print_matrix( "Least squares solution", n, nrhs, b, ldb );

            // accumulate the resulting vector to build A_app_inv
            (*A_app_inv)->start[i] = A_spar_patt->start[i];
            for ( k = A_spar_patt->start[i]; k < A_spar_patt->start[i + 1]; ++k)
            {
                (*A_app_inv)->j[k] = A_spar_patt->j[k];
                (*A_app_inv)->val[k] = e_j[k - A_spar_patt->start[i]];
            }

            //empty variables that will be used next iteration
            sfree( dense_matrix, "Sparse_Approx_Inverse::dense_matrix" );
            sfree( e_j, "Sparse_Approx_Inverse::e_j"  );
            for ( k = 0; k < A->n; ++k )
            {
                X[k] = 0;
                Y[k] = 0;
                pos_x[k] = 0;
                pos_y[k] = 0;
            }
        }

        sfree( pos_y, "Sparse_Approx_Inverse::pos_y" );
        sfree( pos_x, "Sparse_Approx_Inverse::pos_x" );
        sfree( Y, "Sparse_Approx_Inverse::Y" );
        sfree( X, "Sparse_Approx_Inverse::X" );
    }

    return Get_Timing_Info( start );
}
#endif


/* sparse matrix-vector product Ax = b
 *
 * workspace: storage container for workspace structures
 * A: lower triangular matrix, stored in CSR format
 * x: vector
 * b (output): vector */
static void Sparse_MatVec( const static_storage * const workspace,
        const sparse_matrix * const A, const real * const x, real * const b )
{
    int i, j, k, n, si, ei;
    real H;
#ifdef _OPENMP
    unsigned int tid;
#endif

    n = A->n;
    Vector_MakeZero( b, n );

#ifdef _OPENMP
    tid = omp_get_thread_num( );

    Vector_MakeZero( workspace->b_local, omp_get_num_threads() * n );

    #pragma omp for schedule(static)
#endif
    for ( i = 0; i < n; ++i )
    {
        si = A->start[i];
        ei = A->start[i + 1] - 1;

        for ( k = si; k < ei; ++k )
        {
            j = A->j[k];
            H = A->val[k];
#ifdef _OPENMP
            workspace->b_local[tid * n + j] += H * x[i];
            workspace->b_local[tid * n + i] += H * x[j];
#else
            b[j] += H * x[i];
            b[i] += H * x[j];
#endif
        }

        // the diagonal entry is the last one in
#ifdef _OPENMP
        workspace->b_local[tid * n + i] += A->val[k] * x[i];
#else
        b[i] += A->val[k] * x[i];
#endif
    }

#ifdef _OPENMP
    #pragma omp for schedule(static)
    for ( i = 0; i < n; ++i )
    {
        for ( j = 0; j < omp_get_num_threads(); ++j )
        {
            b[i] += workspace->b_local[j * n + i];
        }
    }
#endif
}


/* sparse matrix-vector product Ax = b
 * where:
 *   A: matrix, stored in CSR format
 *   x: vector
 *   b: vector (result) */
static void Sparse_MatVec_full( const sparse_matrix * const A,
                                const real * const x, real * const b )
{
    int i, pj;

    Vector_MakeZero( b, A->n );

#ifdef _OPENMP
    #pragma omp for schedule(static)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            b[i] += A->val[pj] * x[A->j[pj]];
        }
    }
}


/* Transpose A and copy into A^T
 *
 * A: stored in CSR
 * A_t: stored in CSR
 */
void Transpose( const sparse_matrix * const A, sparse_matrix * const A_t )
{
    unsigned int i, j, pj, *A_t_top;

    A_t_top = (unsigned int*) scalloc( A->n + 1, sizeof(unsigned int),
                                       "Transpose::A_t_top" );

    memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );

    /* count nonzeros in each column of A^T, store one row greater (see next loop) */
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            ++A_t->start[A->j[pj] + 1];
        }
    }

    /* setup the row pointers for A^T */
    for ( i = 1; i <= A->n; ++i )
    {
        A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
    }

    /* fill in A^T */
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            j = A->j[pj];
            A_t->j[A_t_top[j]] = i;
            A_t->val[A_t_top[j]] = A->val[pj];
            ++A_t_top[j];
        }
    }

    sfree( A_t_top, "Transpose::A_t_top" );
}


/* Transpose A in-place
 *
 * A: stored in CSR
 */
void Transpose_I( sparse_matrix * const A )
{
    sparse_matrix * A_t;

    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    Transpose( A, A_t );

    memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
    memcpy( A->j, A_t->j, sizeof(int) * (A_t->start[A_t->n]) );
    memcpy( A->val, A_t->val, sizeof(real) * (A_t->start[A_t->n]) );

    Deallocate_Matrix( A_t );
}


/* Apply diagonal inverse (Jacobi) preconditioner to system residual
 *
 * Hdia_inv: diagonal inverse preconditioner (constructed using H)
 * y: current residual
 * x: preconditioned residual
 * N: dimensions of preconditioner and vectors (# rows in H)
 */
static void diag_pre_app( const real * const Hdia_inv, const real * const y,
                          real * const x, const int N )
{
    unsigned int i;

#ifdef _OPENMP
    #pragma omp for schedule(static)
#endif
    for ( i = 0; i < N; ++i )
    {
        x[i] = y[i] * Hdia_inv[i];
    }
}


/* Solve triangular system LU*x = y using level scheduling
 *
 * LU: lower/upper triangular, stored in CSR
 * y: constants in linear system (RHS)
 * x: solution
 * N: dimensions of matrix and vectors
 * tri: triangularity of LU (lower/upper)
 *
 * Assumptions:
 *   LU has non-zero diagonals
 *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
void tri_solve( const sparse_matrix * const LU, const real * const y,
                real * const x, const int N, const TRIANGULARITY tri )
{
    int i, pj, j, si, ei;
    real val;

#ifdef _OPENMP
    #pragma omp single
#endif
    {
        if ( tri == LOWER )
        {
            for ( i = 0; i < N; ++i )
            {
                x[i] = y[i];
                si = LU->start[i];
                ei = LU->start[i + 1];
                for ( pj = si; pj < ei - 1; ++pj )
                {
                    j = LU->j[pj];
                    val = LU->val[pj];
                    x[i] -= val * x[j];
                }
                x[i] /= LU->val[pj];
            }
        }
        else
        {
            for ( i = N - 1; i >= 0; --i )
            {
                x[i] = y[i];
                si = LU->start[i];
                ei = LU->start[i + 1];
                for ( pj = si + 1; pj < ei; ++pj )
                {
                    j = LU->j[pj];
                    val = LU->val[pj];
                    x[i] -= val * x[j];
                }
                x[i] /= LU->val[si];
            }
        }
    }
}


/* Solve triangular system LU*x = y using level scheduling
 *
 * workspace: storage container for workspace structures
 * LU: lower/upper triangular, stored in CSR
 * y: constants in linear system (RHS)
 * x: solution
 * N: dimensions of matrix and vectors
 * tri: triangularity of LU (lower/upper)
 * find_levels: perform level search if positive, otherwise reuse existing levels
 *
 * Assumptions:
 *   LU has non-zero diagonals
 *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
void tri_solve_level_sched( static_storage * workspace,
        const sparse_matrix * const LU,
        const real * const y, real * const x, const int N,
        const TRIANGULARITY tri, int find_levels )
{
    int i, j, pj, local_row, local_level;
    unsigned int *row_levels, *level_rows, *level_rows_cnt;
    int levels;

    if ( tri == LOWER )
    {
        row_levels = workspace->row_levels_L;
        level_rows = workspace->level_rows_L;
        level_rows_cnt = workspace->level_rows_cnt_L;
    }
    else
    {
        row_levels = workspace->row_levels_U;
        level_rows = workspace->level_rows_U;
        level_rows_cnt = workspace->level_rows_cnt_U;
    }

#ifdef _OPENMP
    #pragma omp single
#endif
    {
        /* find levels (row dependencies in substitutions) */
        if ( find_levels == TRUE )
        {
            memset( row_levels, 0, N * sizeof(unsigned int) );
            memset( level_rows_cnt, 0, N * sizeof(unsigned int) );
            memset( workspace->top, 0, N * sizeof(unsigned int) );
            levels = 1;

            if ( tri == LOWER )
            {
                for ( i = 0; i < N; ++i )
                {
                    local_level = 1;
                    for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
                    {
                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                    }

                    levels = MAX( levels, local_level );
                    row_levels[i] = local_level;
                    ++level_rows_cnt[local_level];
                }

                workspace->levels_L = levels;

                //#if defined(DEBUG)
                fprintf(stderr, "levels(L): %d\n", levels);
                fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
                //#endif
            }
            else
            {
                for ( i = N - 1; i >= 0; --i )
                {
                    local_level = 1;
                    for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
                    {
                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                    }

                    levels = MAX( levels, local_level );
                    row_levels[i] = local_level;
                    ++level_rows_cnt[local_level];
                }

                workspace->levels_U = levels;

                //#if defined(DEBUG)
                fprintf(stderr, "levels(U): %d\n", levels);
                fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
                //#endif
            }

            for ( i = 1; i < levels + 1; ++i )
            {
                level_rows_cnt[i] += level_rows_cnt[i - 1];
                workspace->top[i] = level_rows_cnt[i];
            }

            for ( i = 0; i < N; ++i )
            {
                level_rows[workspace->top[row_levels[i] - 1]] = i;
                ++workspace->top[row_levels[i] - 1];
            }
        }
    }

    if ( tri == LOWER )
    {
        levels = workspace->levels_L;
    }
    else
    {
        levels = workspace->levels_U;
    }

    /* perform substitutions by level */
    if ( tri == LOWER )
    {
        for ( i = 0; i < levels; ++i )
        {
#ifdef _OPENMP
            #pragma omp for schedule(static)
#endif
            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
            {
                local_row = level_rows[j];
                x[local_row] = y[local_row];
                for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                {
                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
                }
                x[local_row] /= LU->val[pj];
            }
        }
    }
    else
    {
        for ( i = 0; i < levels; ++i )
        {
#ifdef _OPENMP
            #pragma omp for schedule(static)
#endif
            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
            {
                local_row = level_rows[j];
                x[local_row] = y[local_row];
                for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                {
                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
                }
                x[local_row] /= LU->val[LU->start[local_row]];
            }
        }
    }
}


/* Iterative greedy shared-memory parallel graph coloring
 *
 * control: container for control info
 * workspace: storage container for workspace structures
 * A: matrix to use for coloring, stored in CSR format;
 *   rows represent vertices, columns of entries within a row represent adjacent vertices
 *   (i.e., dependent rows for elimination during LU factorization)
 * tri: triangularity of LU (lower/upper)
 * color: vertex color (1-based)
 *
 * Reference:
 * Umit V. Catalyurek et al.
 * Graph Coloring Algorithms for Multi-core
 *  and Massively Threaded Architectures
 * Parallel Computing, 2012
 */
void graph_coloring( const control_params * const control,
        static_storage * workspace,
        const sparse_matrix * const A, const TRIANGULARITY tri )
{
#ifdef _OPENMP
    #pragma omp parallel
#endif
    {
        int i, pj, v;
        unsigned int temp, recolor_cnt_local, *conflict_local;
        int tid, *fb_color;
        unsigned int *p_to_color, *p_conflict, *p_temp;

#ifdef _OPENMP
        tid = omp_get_thread_num( );
#else
        tid = 0;
#endif
        p_to_color = workspace->to_color;
        p_conflict = workspace->conflict;

#ifdef _OPENMP
        #pragma omp for schedule(static)
#endif
        for ( i = 0; i < A->n; ++i )
        {
            workspace->color[i] = 0;
        }

#ifdef _OPENMP
        #pragma omp single
#endif
        {
            workspace->recolor_cnt = A->n;
        }

        /* ordering of vertices to color depends on triangularity of factor
         * for which coloring is to be used for */
        if ( tri == LOWER )
        {
#ifdef _OPENMP
            #pragma omp for schedule(static)
#endif
            for ( i = 0; i < A->n; ++i )
            {
                p_to_color[i] = i;
            }
        }
        else
        {
#ifdef _OPENMP
            #pragma omp for schedule(static)
#endif
            for ( i = 0; i < A->n; ++i )
            {
                p_to_color[i] = A->n - 1 - i;
            }
        }

        fb_color = (int*) smalloc( sizeof(int) * A->n,
                "graph_coloring::fb_color" );
        conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n,
                "graph_coloring::fb_color" );

        while ( workspace->recolor_cnt > 0 )
        {
            memset( fb_color, -1, sizeof(int) * A->n );

            /* color vertices */
#ifdef _OPENMP
            #pragma omp for schedule(static)
#endif
            for ( i = 0; i < workspace->recolor_cnt; ++i )
            {
                v = p_to_color[i];

                /* colors of adjacent vertices are forbidden */
                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                {
                    if ( v != A->j[pj] )
                    {
                        fb_color[workspace->color[A->j[pj]]] = v;
                    }
                }

                /* search for min. color which is not in conflict with adjacent vertices;
                 * start at 1 since 0 is default (invalid) color for all vertices */
                for ( pj = 1; fb_color[pj] == v; ++pj )
                    ;

                /* assign discovered color (no conflict in neighborhood of adjacent vertices) */
                workspace->color[v] = pj;
            }

            /* determine if recoloring required */
            temp = workspace->recolor_cnt;
            recolor_cnt_local = 0;

#ifdef _OPENMP
            #pragma omp barrier

            #pragma omp single
#endif
            {
                workspace->recolor_cnt = 0;
            }

#ifdef _OPENMP
            #pragma omp for schedule(static)
#endif
            for ( i = 0; i < temp; ++i )
            {
                v = p_to_color[i];

                /* search for color conflicts with adjacent vertices */
                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                {
                    if ( workspace->color[v] == workspace->color[A->j[pj]] && v > A->j[pj] )
                    {
                        conflict_local[recolor_cnt_local] = v;
                        ++recolor_cnt_local;
                        break;
                    }
                }
            }

            /* count thread-local conflicts and compute offsets for copying into shared buffer */
            workspace->conflict_cnt[tid + 1] = recolor_cnt_local;

#ifdef _OPENMP
            #pragma omp barrier

            #pragma omp single
#endif
            {
                workspace->conflict_cnt[0] = 0;
                for ( i = 1; i < control->num_threads + 1; ++i )
                {
                    workspace->conflict_cnt[i] += workspace->conflict_cnt[i - 1];
                }
                workspace->recolor_cnt = workspace->conflict_cnt[control->num_threads];
            }

            /* copy thread-local conflicts into shared buffer */
            for ( i = 0; i < recolor_cnt_local; ++i )
            {
                p_conflict[workspace->conflict_cnt[tid] + i] = conflict_local[i];
                workspace->color[conflict_local[i]] = 0;
            }

#ifdef _OPENMP
            #pragma omp barrier
#endif
            p_temp = p_to_color;
            p_to_color = p_conflict;
            p_conflict = p_temp;
        }

        sfree( conflict_local, "graph_coloring::conflict_local" );
        sfree( fb_color, "graph_coloring::fb_color" );
    }
}


/* Sort rows by coloring
 *
 * workspace: storage container for workspace structures
 * n: number of entries in coloring
 * tri: coloring to triangular factor to use (lower/upper)
 */
void sort_rows_by_colors( const static_storage * const workspace,
        const unsigned int n, const TRIANGULARITY tri )
{
    unsigned int i;

    memset( workspace->color_top, 0, sizeof(unsigned int) * (n + 1) );

    /* sort vertices by color (ascending within a color)
     *  1) count colors
     *  2) determine offsets of color ranges
     *  3) sort rows by color
     *
     *  note: color is 1-based */
    for ( i = 0; i < n; ++i )
    {
        ++workspace->color_top[workspace->color[i]];
    }
    for ( i = 1; i < n + 1; ++i )
    {
        workspace->color_top[i] += workspace->color_top[i - 1];
    }
    for ( i = 0; i < n; ++i )
    {
        workspace->permuted_row_col[workspace->color_top[workspace->color[i] - 1]] = i;
        ++workspace->color_top[workspace->color[i] - 1];
    }

    /* invert mapping to get map from current row/column to permuted (new) row/column */
    for ( i = 0; i < n; ++i )
    {
        workspace->permuted_row_col_inv[workspace->permuted_row_col[i]] = i;
    }
}


/* Apply permutation Q^T*x or Q*x based on graph coloring
 *
 * workspace: storage container for workspace structures
 * color: vertex color (1-based); vertices represent matrix rows/columns
 * x: vector to permute (in-place)
 * n: number of entries in x
 * invert_map: if TRUE, use Q^T, otherwise use Q
 * tri: coloring to triangular factor to use (lower/upper)
 */
static void permute_vector( const static_storage * const workspace,
        real * const x, const unsigned int n, const int invert_map,
        const TRIANGULARITY tri )
{
    unsigned int i;
    unsigned int *mapping;

    if ( invert_map == TRUE )
    {
        mapping = workspace->permuted_row_col_inv;
    }
    else
    {
        mapping = workspace->permuted_row_col;
    }

#ifdef _OPENMP
    #pragma omp for schedule(static)
#endif
    for ( i = 0; i < n; ++i )
    {
        workspace->x_p[i] = x[mapping[i]];
    }

#ifdef _OPENMP
    #pragma omp for schedule(static)
#endif
    for ( i = 0; i < n; ++i )
    {
        x[i] = workspace->x_p[i];
    }
}


/* Apply permutation Q^T*(LU)*Q based on graph coloring
 *
 * workspace: storage container for workspace structures
 * color: vertex color (1-based); vertices represent matrix rows/columns
 * LU: matrix to permute, stored in CSR format
 * tri: triangularity of LU (lower/upper)
 */
void permute_matrix( const static_storage * const workspace,
        sparse_matrix * const LU, const TRIANGULARITY tri )
{
    int i, pj, nr, nc;
    sparse_matrix *LUtemp;

    if ( Allocate_Matrix( &LUtemp, LU->n, LU->m ) == FAILURE )
    {
        fprintf( stderr, "Not enough space for graph coloring (factor permutation). Terminating...\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    /* count nonzeros in each row of permuted factor (re-use color_top for counting) */
    memset( workspace->color_top, 0, sizeof(unsigned int) * (LU->n + 1) );

    if ( tri == LOWER )
    {
        for ( i = 0; i < LU->n; ++i )
        {
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
            {
                nc = workspace->permuted_row_col_inv[LU->j[pj]];

                if ( nc <= nr )
                {
                    ++workspace->color_top[nr + 1];
                }
                /* correct entries to maintain triangularity (lower) */
                else
                {
                    ++workspace->color_top[nc + 1];
                }
            }
        }
    }
    else
    {
        for ( i = LU->n - 1; i >= 0; --i )
        {
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
            {
                nc = workspace->permuted_row_col_inv[LU->j[pj]];

                if ( nc >= nr )
                {
                    ++workspace->color_top[nr + 1];
                }
                /* correct entries to maintain triangularity (upper) */
                else
                {
                    ++workspace->color_top[nc + 1];
                }
            }
        }
    }

    for ( i = 1; i < LU->n + 1; ++i )
    {
        workspace->color_top[i] += workspace->color_top[i - 1];
    }

    memcpy( LUtemp->start, workspace->color_top, sizeof(unsigned int) * (LU->n + 1) );

    /* permute factor */
    if ( tri == LOWER )
    {
        for ( i = 0; i < LU->n; ++i )
        {
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
            {
                nc = workspace->permuted_row_col_inv[LU->j[pj]];

                if ( nc <= nr )
                {
                    LUtemp->j[workspace->color_top[nr]] = nc;
                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
                    ++workspace->color_top[nr];
                }
                /* correct entries to maintain triangularity (lower) */
                else
                {
                    LUtemp->j[workspace->color_top[nc]] = nr;
                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
                    ++workspace->color_top[nc];
                }
            }
        }
    }
    else
    {
        for ( i = LU->n - 1; i >= 0; --i )
        {
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
            {
                nc = workspace->permuted_row_col_inv[LU->j[pj]];

                if ( nc >= nr )
                {
                    LUtemp->j[workspace->color_top[nr]] = nc;
                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
                    ++workspace->color_top[nr];
                }
                /* correct entries to maintain triangularity (upper) */
                else
                {
                    LUtemp->j[workspace->color_top[nc]] = nr;
                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
                    ++workspace->color_top[nc];
                }
            }
        }
    }

    memcpy( LU->start, LUtemp->start, sizeof(unsigned int) * (LU->n + 1) );
    memcpy( LU->j, LUtemp->j, sizeof(unsigned int) * LU->start[LU->n] );
    memcpy( LU->val, LUtemp->val, sizeof(real) * LU->start[LU->n] );

    Deallocate_Matrix( LUtemp );
}


/* Setup routines to build permuted QEq matrix H (via graph coloring),
 *  used for preconditioning (incomplete factorizations computed based on
 *  permuted H)
 *
 * control: container for control info
 * workspace: storage container for workspace structures
 * H: symmetric, lower triangular portion only, stored in CSR format;
 * H_full: symmetric, stored in CSR format;
 * H_p (output): permuted copy of H based on coloring, lower half stored, CSR format
 */
void setup_graph_coloring( const control_params * const control,
        const static_storage * const workspace, const sparse_matrix * const H,
        sparse_matrix ** H_full, sparse_matrix ** H_p )
{
    if ( *H_p == NULL )
    {
        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
        {
            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }
    else if ( (*H_p)->m < H->m )
    {
        Deallocate_Matrix( *H_p );
        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
        {
            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }

    compute_full_sparse_matrix( H, H_full );

    graph_coloring( control, (static_storage *) workspace, *H_full, LOWER );
    sort_rows_by_colors( workspace, (*H_full)->n, LOWER );

    memcpy( (*H_p)->start, H->start, sizeof(int) * (H->n + 1) );
    memcpy( (*H_p)->j, H->j, sizeof(int) * (H->start[H->n]) );
    memcpy( (*H_p)->val, H->val, sizeof(real) * (H->start[H->n]) );
    permute_matrix( workspace, (*H_p), LOWER );
}


/* Jacobi iteration using truncated Neumann series: x_{k+1} = Gx_k + D^{-1}b
 * where:
 *   G = I - D^{-1}R
 *   R = triangular matrix
 *   D = diagonal matrix, diagonals from R
 *
 * Note: used during the backsolves when applying preconditioners with
 * triangular factors in iterative linear solvers
 *
 * Note: Newmann series arises from series expansion of the inverse of
 * the coefficient matrix in the triangular system
 *
 * workspace: storage container for workspace structures
 * R:
 * Dinv:
 * b:
 * x (output):
 * tri:
 * maxiter:
 * */
void jacobi_iter( const static_storage * const workspace,
        const sparse_matrix * const R, const real * const Dinv,
        const real * const b, real * const x, const TRIANGULARITY tri,
        const unsigned int maxiter )
{
    unsigned int i, k, si, ei, iter;
    real *p1, *p2, *p3;

    si = 0;
    ei = 0;
    iter = 0;
    p1 = workspace->rp;
    p2 = workspace->rp2;

    Vector_MakeZero( p1, R->n );

    /* precompute and cache, as invariant in loop below */
#ifdef _OPENMP
    #pragma omp for schedule(static)
#endif
    for ( i = 0; i < R->n; ++i )
    {
        workspace->Dinv_b[i] = Dinv[i] * b[i];
    }

    do
    {
        /* x_{k+1} = G*x_{k} + Dinv*b */
#ifdef _OPENMP
        #pragma omp for schedule(guided)
#endif
        for ( i = 0; i < R->n; ++i )
        {
            if (tri == LOWER)
            {
                si = R->start[i];
                ei = R->start[i + 1] - 1;
            }
            else
            {

                si = R->start[i] + 1;
                ei = R->start[i + 1];
            }

            p2[i] = 0.;

            for ( k = si; k < ei; ++k )
            {
                p2[i] += R->val[k] * p1[R->j[k]];
            }

            p2[i] *= -Dinv[i];
            p2[i] += workspace->Dinv_b[i];
        }

        p3 = p1;
        p1 = p2;
        p2 = p3;

        ++iter;
    }
    while ( iter < maxiter );

    Vector_Copy( x, p1, R->n );
}


/* Apply left-sided preconditioning while solver M^{-1}Ax = M^{-1}b
 *
 * workspace: data struct containing matrices, stored in CSR
 * control: data struct containing parameters
 * y: vector to which to apply preconditioning,
 *  specific to internals of iterative solver being used
 * x: preconditioned vector (output)
 * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
 * side: used in determining how to apply preconditioner if the preconditioner is
 *  factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU)
 *
 * Assumptions:
 *   Matrices have non-zero diagonals
 *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
                                  const real * const y, real * const x, const int fresh_pre,
                                  const int side )
{
    int i, si;

    /* no preconditioning */
    if ( control->cm_solver_pre_comp_type == NONE_PC )
    {
        if ( x != y )
        {
            Vector_Copy( x, y, workspace->H->n );
        }
    }
    else
    {
        switch ( side )
        {
        case LEFT:
            switch ( control->cm_solver_pre_app_type )
            {
            case TRI_SOLVE_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
                    break;
                case SAI_PC:
                    Sparse_MatVec_full( workspace->H_app_inv, y, x );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            case TRI_SOLVE_LEVEL_SCHED_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                    break;
                case SAI_PC:
                    Sparse_MatVec_full( workspace->H_app_inv, y, x );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            case TRI_SOLVE_GC_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
#ifdef _OPENMP
                    #pragma omp for schedule(static)
#endif
                    for ( i = 0; i < workspace->H->n; ++i )
                    {
                        workspace->y_p[i] = y[i];
                    }

                    permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            case JACOBI_ITER_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    /* construct D^{-1}_L */
                    if ( fresh_pre == TRUE )
                    {
#ifdef _OPENMP
                        #pragma omp for schedule(static)
#endif
                        for ( i = 0; i < workspace->L->n; ++i )
                        {
                            si = workspace->L->start[i + 1] - 1;
                            workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
                        }
                    }

                    jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
                            y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            default:
                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                exit( INVALID_INPUT );
                break;

            }
            break;

        case RIGHT:
            switch ( control->cm_solver_pre_app_type )
            {
            case TRI_SOLVE_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    if ( x != y )
                    {
                        Vector_Copy( x, y, workspace->H->n );
                    }
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            case TRI_SOLVE_LEVEL_SCHED_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    if ( x != y )
                    {
                        Vector_Copy( x, y, workspace->H->n );
                    }
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            case TRI_SOLVE_GC_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
                    permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            case JACOBI_ITER_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    /* construct D^{-1}_U */
                    if ( fresh_pre == TRUE )
                    {
#ifdef _OPENMP
                        #pragma omp for schedule(static)
#endif
                        for ( i = 0; i < workspace->U->n; ++i )
                        {
                            si = workspace->U->start[i];
                            workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
                        }
                    }

                    jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
                            y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            default:
                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                exit( INVALID_INPUT );
                break;

            }
            break;
        }
    }
}


/* generalized minimual residual method with restarting for sparse linear systems */
int GMRES( const static_storage * const workspace, const control_params * const control,
           simulation_data * const data, const sparse_matrix * const H, const real * const b,
           const real tol, real * const x, const int fresh_pre )
{
    int i, j, k, itr, N, g_j, g_itr;
    real cc, tmp1, tmp2, temp, bnorm;
    real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;

    N = H->n;
    g_j = 0;
    g_itr = 0;
    t_ortho = 0.0;
    t_pa = 0.0;
    t_spmv = 0.0;
    t_ts = 0.0;
    t_vops = 0.0;

#ifdef _OPENMP
    #pragma omp parallel default(none) \
    private(i, j, k, itr, bnorm, temp, t_start) \
    shared(N, cc, tmp1, tmp2, g_itr, g_j, stderr) \
    reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
#endif
    {
        j = 0;
        itr = 0;
        t_ortho = 0.0;
        t_pa = 0.0;
        t_spmv = 0.0;
        t_ts = 0.0;
        t_vops = 0.0;

        t_start = Get_Time( );
        bnorm = Norm( b, N );
        t_vops += Get_Timing_Info( t_start );

        /* GMRES outer-loop */
        for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
        {
            /* calculate r0 */
            t_start = Get_Time( );
            Sparse_MatVec( workspace, H, x, workspace->b_prm );
            t_spmv += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            Vector_Sum( workspace->b_prc, 1.0, b, -1.0, workspace->b_prm, N );
            t_vops += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            apply_preconditioner( workspace, control, workspace->b_prc, workspace->b_prm,
                                  itr == 0 ? fresh_pre : FALSE, LEFT );
            apply_preconditioner( workspace, control, workspace->b_prm, workspace->v[0],
                                  itr == 0 ? fresh_pre : FALSE, RIGHT );
            t_pa += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            temp = Norm( workspace->v[0], N );

#ifdef _OPENMP
            #pragma omp single
#endif
            workspace->g[0] = temp;

            Vector_Scale( workspace->v[0], 1.0 / temp, workspace->v[0], N );
            t_vops += Get_Timing_Info( t_start );

            /* GMRES inner-loop */
            for ( j = 0; j < control->cm_solver_restart && FABS(workspace->g[j]) / bnorm > tol; j++ )
            {
                /* matvec */
                t_start = Get_Time( );
                Sparse_MatVec( workspace, H, workspace->v[j], workspace->b_prc );
                t_spmv += Get_Timing_Info( t_start );

                t_start = Get_Time( );
                apply_preconditioner( workspace, control, workspace->b_prc,
                        workspace->b_prm, FALSE, LEFT );
                apply_preconditioner( workspace, control, workspace->b_prm,
                        workspace->v[j + 1], FALSE, RIGHT );
                t_pa += Get_Timing_Info( t_start );

                /* apply modified Gram-Schmidt to orthogonalize the new residual */
                t_start = Get_Time( );
                for ( i = 0; i <= j; i++ )
                {
                    temp = Dot( workspace->v[i], workspace->v[j + 1], N );

#ifdef _OPENMP
                    #pragma omp single
#endif
                    workspace->h[i][j] = temp;

                    Vector_Add( workspace->v[j + 1], -1.0 * temp, workspace->v[i], N );

                }
                t_vops += Get_Timing_Info( t_start );

                t_start = Get_Time( );
                temp = Norm( workspace->v[j + 1], N );

#ifdef _OPENMP
                #pragma omp single
#endif
                workspace->h[j + 1][j] = temp;

                Vector_Scale( workspace->v[j + 1], 1.0 / temp, workspace->v[j + 1], N );
                t_vops += Get_Timing_Info( t_start );

                t_start = Get_Time( );
                /* Givens rotations on the upper-Hessenberg matrix to make it U */
#ifdef _OPENMP
                #pragma omp single
#endif
                {
                    for ( i = 0; i <= j; i++ )
                    {
                        if ( i == j )
                        {
                            cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
                            workspace->hc[j] = workspace->h[j][j] / cc;
                            workspace->hs[j] = workspace->h[j + 1][j] / cc;
                        }

                        tmp1 = workspace->hc[i] * workspace->h[i][j] +
                            workspace->hs[i] * workspace->h[i + 1][j];
                        tmp2 = -1.0 * workspace->hs[i] * workspace->h[i][j] +
                            workspace->hc[i] * workspace->h[i + 1][j];

                        workspace->h[i][j] = tmp1;
                        workspace->h[i + 1][j] = tmp2;
                    }

                    /* apply Givens rotations to the rhs as well */
                    tmp1 = workspace->hc[j] * workspace->g[j];
                    tmp2 = -1.0 * workspace->hs[j] * workspace->g[j];
                    workspace->g[j] = tmp1;
                    workspace->g[j + 1] = tmp2;

                }
                t_ortho += Get_Timing_Info( t_start );
            }

            /* solve Hy = g: H is now upper-triangular, do back-substitution */
            t_start = Get_Time( );
#ifdef _OPENMP
            #pragma omp single
#endif
            {
                for ( i = j - 1; i >= 0; i-- )
                {
                    temp = workspace->g[i];
                    for ( k = j - 1; k > i; k-- )
                    {
                        temp -= workspace->h[i][k] * workspace->y[k];
                    }

                    workspace->y[i] = temp / workspace->h[i][i];
                }
            }
            t_ts += Get_Timing_Info( t_start );

            /* update x = x_0 + Vy */
            t_start = Get_Time( );
            Vector_MakeZero( workspace->p, N );

            for ( i = 0; i < j; i++ )
            {
                Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N );
            }

            Vector_Add( x, 1.0, workspace->p, N );
            t_vops += Get_Timing_Info( t_start );

            /* stopping condition */
            if ( FABS(workspace->g[j]) / bnorm <= tol )
            {
                break;
            }
        }

#ifdef _OPENMP
        #pragma omp single
#endif
        {
            g_j = j;
            g_itr = itr;
        }
    }

    data->timing.cm_solver_orthog += t_ortho / control->num_threads;
    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
    data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;

    if ( g_itr >= control->cm_solver_max_iters )
    {
        fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
        return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
    }

    return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
}


int GMRES_HouseHolder( const static_storage * const workspace,
                       const control_params * const control, simulation_data * const data,
                       const sparse_matrix * const H, const real * const b, real tol,
                       real * const x, const int fresh_pre )
{
    int i, j, k, itr, N, g_j, g_itr;
    real cc, tmp1, tmp2, temp, bnorm;
    real v[10000], z[control->cm_solver_restart + 2][10000], w[control->cm_solver_restart + 2];
    real u[control->cm_solver_restart + 2][10000];
    real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;

    j = 0;
    N = H->n;
    t_ortho = 0.0;
    t_pa = 0.0;
    t_spmv = 0.0;
    t_ts = 0.0;
    t_vops = 0.0;

#ifdef _OPENMP
    #pragma omp parallel default(none) \
    private(i, j, k, itr, bnorm, temp, t_start) \
    shared(v, z, w, u, tol, N, cc, tmp1, tmp2, g_itr, g_j, stderr) \
    reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
#endif
    {
        j = 0;
        t_ortho = 0.0;
        t_pa = 0.0;
        t_spmv = 0.0;
        t_ts = 0.0;
        t_vops = 0.0;

        t_start = Get_Time( );
        bnorm = Norm( b, N );
        t_vops += Get_Timing_Info( t_start );

        // memset( x, 0, sizeof(real) * N );

        /* GMRES outer-loop */
        for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
        {
            /* compute z = r0 */
            t_start = Get_Time( );
            Sparse_MatVec( workspace, H, x, workspace->b_prm );
            t_spmv += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            Vector_Sum( workspace->b_prc, 1.,  workspace->b, -1., workspace->b_prm, N );
            t_vops += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            apply_preconditioner( workspace, control, workspace->b_prc,
                    workspace->b_prm, fresh_pre, LEFT );
            apply_preconditioner( workspace, control, workspace->b_prm,
                    z[0], fresh_pre, RIGHT );
            t_pa += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            Vector_MakeZero( w, control->cm_solver_restart + 1 );
            w[0] = Norm( z[0], N );

            Vector_Copy( u[0], z[0], N );
            u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
            Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );

            w[0] *= ( u[0][0] < 0.0 ?  1 : -1 );
            t_vops += Get_Timing_Info( t_start );

            /* GMRES inner-loop */
            for ( j = 0; j < control->cm_solver_restart && FABS( w[j] ) / bnorm > tol; j++ )
            {
                /* compute v_j */
                t_start = Get_Time( );
                Vector_Scale( z[j], -2 * u[j][j], u[j], N );
                z[j][j] += 1.; /* due to e_j */

                for ( i = j - 1; i >= 0; --i )
                {
                    Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
                }
                t_vops += Get_Timing_Info( t_start );

                /* matvec */
                t_start = Get_Time( );
                Sparse_MatVec( workspace, H, z[j], workspace->b_prc );
                t_spmv += Get_Timing_Info( t_start );

                t_start = Get_Time( );
                apply_preconditioner( workspace, control, workspace->b_prc,
                        workspace->b_prm, fresh_pre, LEFT );
                apply_preconditioner( workspace, control, workspace->b_prm,
                        v, fresh_pre, RIGHT );
                t_pa += Get_Timing_Info( t_start );

                t_start = Get_Time( );
                for ( i = 0; i <= j; ++i )
                {
                    Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
                }

                if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
                {
                    /* compute the HouseHolder unit vector u_j+1 */
                    Vector_MakeZero( u[j + 1], j + 1 );
                    Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
                    temp = Norm( v + (j + 1), N - (j + 1) );
#ifdef _OPENMP
                    #pragma omp single
#endif
                    u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * temp;

#ifdef _OPENMP
                    #pragma omp barrier
#endif

                    Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );

                    /* overwrite v with P_m+1 * v */
                    temp = 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
#ifdef _OPENMP
                    #pragma omp single
#endif
                    v[j + 1] -= temp;

#ifdef _OPENMP
                    #pragma omp barrier
#endif

                    Vector_MakeZero( v + (j + 2), N - (j + 2) );
//                    Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
                }
                t_vops += Get_Timing_Info( t_start );

                /* prev Givens rots on the upper-Hessenberg matrix to make it U */
                t_start = Get_Time( );
#ifdef _OPENMP
                #pragma omp single
#endif
                {
                    for ( i = 0; i < j; i++ )
                    {
                        tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
                        tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];

                        v[i]   = tmp1;
                        v[i + 1] = tmp2;
                    }

                    /* apply the new Givens rotation to H and right-hand side */
                    if ( FABS(v[j + 1]) >= ALMOST_ZERO )
                    {
                        cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
                        workspace->hc[j] = v[j] / cc;
                        workspace->hs[j] = v[j + 1] / cc;

                        tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
                        tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];

                        v[j]   = tmp1;
                        v[j + 1] = tmp2;

                        /* Givens rotations to rhs */
                        tmp1 =  workspace->hc[j] * w[j];
                        tmp2 = -workspace->hs[j] * w[j];
                        w[j]   = tmp1;
                        w[j + 1] = tmp2;
                    }

                    /* extend R */
                    for ( i = 0; i <= j; ++i )
                    {
                        workspace->h[i][j] = v[i];
                    }
                }
                t_ortho += Get_Timing_Info( t_start );
            }


            /* solve Hy = w.
               H is now upper-triangular, do back-substitution */
            t_start = Get_Time( );
#ifdef _OPENMP
            #pragma omp single
#endif
            {
                for ( i = j - 1; i >= 0; i-- )
                {
                    temp = w[i];
                    for ( k = j - 1; k > i; k-- )
                    {
                        temp -= workspace->h[i][k] * workspace->y[k];
                    }

                    workspace->y[i] = temp / workspace->h[i][i];
                }
            }
            t_ts += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            for ( i = j - 1; i >= 0; i-- )
            {
                Vector_Add( x, workspace->y[i], z[i], N );
            }
            t_vops += Get_Timing_Info( t_start );

            /* stopping condition */
            if ( FABS( w[j] ) / bnorm <= tol )
            {
                break;
            }
        }

#ifdef _OPENMP
        #pragma omp single
#endif
        {
            g_j = j;
            g_itr = itr;
        }
    }

    data->timing.cm_solver_orthog += t_ortho / control->num_threads;
    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
    data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;

    if ( g_itr >= control->cm_solver_max_iters )
    {
        fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
        return g_itr * (control->cm_solver_restart + 1) + j + 1;
    }

    return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
}


/* Conjugate Gradient */
int CG( const static_storage * const workspace, const control_params * const control,
        simulation_data * const data, const sparse_matrix * const H, const real * const b,
        const real tol, real * const x, const int fresh_pre )
{
    int i, g_itr, N;
    real tmp, alpha, beta, b_norm, r_norm;
    real *d, *r, *p, *z;
    real sig_old, sig_new;
    real t_start, t_pa, t_spmv, t_vops;

    N = H->n;
    d = workspace->d;
    r = workspace->r;
    p = workspace->q;
    z = workspace->p;
    t_pa = 0.0;
    t_spmv = 0.0;
    t_vops = 0.0;

#ifdef _OPENMP
    #pragma omp parallel default(none) \
    private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new, t_start) \
    reduction(+: t_pa, t_spmv, t_vops) \
    shared(g_itr, N, d, r, p, z)
#endif
    {
        t_pa = 0.0;
        t_spmv = 0.0;
        t_vops = 0.0;

        t_start = Get_Time( );
        b_norm = Norm( b, N );
        t_vops += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        Sparse_MatVec( workspace, H, x, d );
        t_spmv += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        Vector_Sum( r, 1.0,  b, -1.0, d, N );
        r_norm = Norm( r, N );
        t_vops += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        apply_preconditioner( workspace, control, r, d, fresh_pre, LEFT );
        apply_preconditioner( workspace, control, d, z, fresh_pre, RIGHT );
        t_pa += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        Vector_Copy( p, z, N );
        sig_new = Dot( r, z, N );
        t_vops += Get_Timing_Info( t_start );

        for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
        {
            t_start = Get_Time( );
            Sparse_MatVec( workspace, H, p, d );
            t_spmv += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            tmp = Dot( d, p, N );
            alpha = sig_new / tmp;
            Vector_Add( x, alpha, p, N );
            Vector_Add( r, -alpha, d, N );
            r_norm = Norm( r, N );
            t_vops += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            apply_preconditioner( workspace, control, r, d, FALSE, LEFT );
            apply_preconditioner( workspace, control, d, z, FALSE, RIGHT );
            t_pa += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            sig_old = sig_new;
            sig_new = Dot( r, z, N );
            beta = sig_new / sig_old;
            Vector_Sum( p, 1., z, beta, p, N );
            t_vops += Get_Timing_Info( t_start );
        }

#ifdef _OPENMP
        #pragma omp single
#endif
        g_itr = i;
    }

    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;

    if ( g_itr >= control->cm_solver_max_iters )
    {
        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", g_itr );
        return g_itr;
    }

    return g_itr;
}


/* Steepest Descent */
int SDM( const static_storage * const workspace, const control_params * const control,
         simulation_data * const data, const sparse_matrix * const H, const real * const b,
         const real tol, real * const x, const int fresh_pre )
{
    int i, g_itr, N;
    real tmp, alpha, b_norm;
    real sig;
    real t_start, t_pa, t_spmv, t_vops;

    N = H->n;
    t_pa = 0.0;
    t_spmv = 0.0;
    t_vops = 0.0;

#ifdef _OPENMP
    #pragma omp parallel default(none) \
    private(i, tmp, alpha, b_norm, sig, t_start) \
    reduction(+: t_pa, t_spmv, t_vops) \
    shared(g_itr, N)
#endif
    {
        t_pa = 0.0;
        t_spmv = 0.0;
        t_vops = 0.0;

        t_start = Get_Time( );
        b_norm = Norm( b, N );
        t_vops += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        Sparse_MatVec( workspace, H, x, workspace->q );
        t_spmv += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->q, N );
        t_vops += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre, LEFT );
        apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre, RIGHT );
        t_pa += Get_Timing_Info( t_start );

        t_start = Get_Time( );
        sig = Dot( workspace->r, workspace->d, N );
        t_vops += Get_Timing_Info( t_start );

        for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i )
        {
            t_start = Get_Time( );
            Sparse_MatVec( workspace, H, workspace->d, workspace->q );
            t_spmv += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            sig = Dot( workspace->r, workspace->d, N );

            /* ensure each thread gets a local copy of
             * the function return value before proceeding
             * (Dot function has persistent state in the form
             * of a shared global variable for the OpenMP version) */
#ifdef _OPENMP
            #pragma omp barrier
#endif

            tmp = Dot( workspace->d, workspace->q, N );
            alpha = sig / tmp;

            Vector_Add( x, alpha, workspace->d, N );
            Vector_Add( workspace->r, -alpha, workspace->q, N );
            t_vops += Get_Timing_Info( t_start );

            t_start = Get_Time( );
            apply_preconditioner( workspace, control, workspace->r, workspace->d, FALSE, LEFT );
            apply_preconditioner( workspace, control, workspace->r, workspace->d, FALSE, RIGHT );
            t_pa += Get_Timing_Info( t_start );
        }

#ifdef _OPENMP
        #pragma omp single
#endif
        g_itr = i;
    }

    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;

    if ( g_itr >= control->cm_solver_max_iters  )
    {
        fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", g_itr );
        return g_itr;
    }

    return g_itr;
}


/* Estimate the stability of a 2-side preconditioning scheme
 * using the factorization A \approx LU. Specifically, estimate the 1-norm of A^{-1}
 * using the 1-norm of (LU)^{-1}e, with e = [1 1 ... 1]^T through 2 triangular solves:
 *   1) Ly = e
 *   2) Ux = y where y = Ux
 * That is, we seek to solve e = LUx for unknown x
 *
 * Reference: Incomplete LU Preconditioning with the Multilevel Fast Multipole Algorithm
 *   for Electromagnetic Scattering, SIAM J. Sci. Computing, 2007 */
real condest( const sparse_matrix * const L, const sparse_matrix * const U )
{
    unsigned int i, N;
    real *e, c;

    N = L->n;

    e = (real*) smalloc( sizeof(real) * N, "condest::e" );

    memset( e, 1., N * sizeof(real) );

    tri_solve( L, e, e, L->n, LOWER );
    tri_solve( U, e, e, U->n, UPPER );

    /* compute 1-norm of vector e */
    c = FABS(e[0]);
    for ( i = 1; i < N; ++i)
    {
        if ( FABS(e[i]) > c )
        {
            c = FABS(e[i]);
        }

    }

    sfree( e, "condest::e" );

    return c;
}