Skip to content
Snippets Groups Projects
lin_alg.c 112 KiB
Newer Older
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
/*----------------------------------------------------------------------
  SerialReax - Reax Force Field Simulator
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  published by the Free Software Foundation; either version 2 of
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  the License, or (at your option) any later version.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#include "vector.h"
#include "print_utils.h"

/* Intel MKL */
#include "mkl.h"
/* reference LAPACK */
#elif defined(HAVE_LAPACKE)
#include "lapacke.h"
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

typedef struct
{
    unsigned int j;
    real val;
} sparse_matrix_entry;


#if defined(TEST_MAT)
static sparse_matrix * create_test_mat( void )
{
    unsigned int i, n;
    sparse_matrix *H_test;

    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    //3x3, SPD, store lower half
    i = 0;
    n = 0;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = 4.;
    ++i;
    ++n;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = 12.;
    ++i;
    H_test->j[i] = 1;
    H_test->val[i] = 37.;
    ++i;
    ++n;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = -16.;
    ++i;
    H_test->j[i] = 1;
    H_test->val[i] = -43.;
    ++i;
    H_test->j[i] = 2;
    H_test->val[i] = 98.;
    ++i;
    ++n;
    H_test->start[n] = i;

    return H_test;
}
#endif


/* Routine used with qsort for sorting nonzeros within a sparse matrix row
 *
 * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
 */
static int compare_matrix_entry(const void *v1, const void *v2)
{
    /* larger element has larger column index */
    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
}


/* Routine used for sorting nonzeros within a sparse matrix row;
 *  internally, a combination of qsort and manual sorting is utilized
 *  (parallel calls to qsort when multithreading, rows mapped to threads)
 *
 * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
 */
void Sort_Matrix_Rows( sparse_matrix * const A )
{
    unsigned int i, j, si, ei;
    sparse_matrix_entry *temp;

#ifdef _OPENMP
    //    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
        temp = (sparse_matrix_entry *) smalloc( A->n * sizeof(sparse_matrix_entry),

        /* sort each row of A using column indices */
#ifdef _OPENMP
        //        #pragma omp for schedule(guided)
#endif
        for ( i = 0; i < A->n; ++i )
        {
            si = A->start[i];
            ei = A->start[i + 1];

            for ( j = 0; j < (ei - si); ++j )
            {
                (temp + j)->j = A->j[si + j];
                (temp + j)->val = A->val[si + j];
            }

            /* polymorphic sort in standard C library using column indices */
            qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );

            for ( j = 0; j < (ei - si); ++j )
            {
                A->j[si + j] = (temp + j)->j;
                A->val[si + j] = (temp + j)->val;
            }
        }

        sfree( temp, "Sort_Matrix_Rows::temp" );
    }
}


/* Convert a symmetric, half-sored sparse matrix into
 * a full-stored sparse matrix
 *
 * A: symmetric sparse matrix, lower half stored in CSR
 * A_full: resultant full sparse matrix in CSR
 *   If A_full is NULL, allocate space, otherwise do not
 *
 * Assumptions:
 *   A has non-zero diagonals
 *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
static void compute_full_sparse_matrix( const sparse_matrix * const A,
{
    int count, i, pj;
    sparse_matrix *A_t;

    if ( *A_full == NULL )
    {
        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
        {
            fprintf( stderr, "not enough memory for full A. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }
    else if ( (*A_full)->m < 2 * A->m - A->n )
    {
        Deallocate_Matrix( *A_full );
        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
        {
            fprintf( stderr, "not enough memory for full A. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }
    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for full A. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    /* Set up the sparse matrix data structure for A. */
    Transpose( A, A_t );

    count = 0;
    for ( i = 0; i < A->n; ++i )
    {
        (*A_full)->start[i] = count;

        /* A: symmetric, lower triangular portion only stored */
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            (*A_full)->val[count] = A->val[pj];
            (*A_full)->j[count] = A->j[pj];
            ++count;
        }
        /* A^T: symmetric, upper triangular portion only stored;
         * skip diagonal from A^T, as included from A above */
        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
        {
            (*A_full)->val[count] = A_t->val[pj];
            (*A_full)->j[count] = A_t->j[pj];
            ++count;
        }
    }
    (*A_full)->start[i] = count;

    Deallocate_Matrix( A_t );
}


/* Setup routines for sparse approximate inverse preconditioner
 *
 * A: symmetric sparse matrix, lower half stored in CSR
 *
 * Assumptions:
 *   A has non-zero diagonals
 *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix ** A_full,
              sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full,
                    sparse_matrix ** A_app_inv, const real filter )
{
    int i, pj, size;
    int left, right, k, p, turn;
    real pivot, tmp;
    real threshold;
    real *list;
    if ( *A_spar_patt == NULL )
        if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
        {
            fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }
    else if ( ((*A_spar_patt)->m) < (A->m) )
        Deallocate_Matrix( *A_spar_patt );
        if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
        {
            fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
        }
    }

    /* quick-select algorithm for finding the kth greatest element in the matrix*/
    /* list: values from the matrix*/
    /* left-right: search space of the quick-select */

    list = (real *) smalloc( sizeof(real) * (A->start[A->n]),"Sparse_Approx_Inverse::list" );

    left = 0;
    right = A->start[A->n] - 1;
    k = (int)( (A->start[A->n])*filter );
    threshold = 0.0;

    for( i = left; i <= right ; ++i )
        list[i] = abs( A->val[i] );
    }

    turn = 0;
    while( k ) {

        p  = left;
        turn = 1 - turn;
        if( turn == 1)
            pivot = list[right];
        }
        else
        {
            pivot = list[left];
        }
        for( i = left + 1 - turn; i <= right-turn; ++i )
        {
            if( list[i] > pivot )
                tmp = list[i];
                list[i] = list[p];
                list[p] = tmp;
                p++;
        if(turn == 1)
        {
            tmp = list[p];
            list[p] = list[right];
            list[right] = tmp;
        }
        else
        {
            tmp = list[p];
            list[p] = list[left];
            list[left] = tmp;
        }

        if( p == k - 1)
        {
            threshold = list[p];
            break;
        }
        else if( p > k - 1 )
        {
            right = p - 1;
        }
        else
        {
            left = p + 1;
        }
    sfree( list, "setup_sparse_approx_inverse::list" );
    /* diagonal entries are always included */
    for ( size = 0, i = 0; i < A->n; ++i )
    {
        (*A_spar_patt)->start[i] = size;

        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            if ( ( A->val[pj] >= threshold )  || ( A->j[pj] == i ) )
                (*A_spar_patt)->val[size] = A->val[pj];
                (*A_spar_patt)->j[size] = A->j[pj];
    (*A_spar_patt)->start[A->n] = size;
    compute_full_sparse_matrix( A, A_full );
    compute_full_sparse_matrix( *A_spar_patt, A_spar_patt_full );

    /* A_app_inv has the same sparsity pattern
     * * as A_spar_patt_full (omit non-zero values) */
    if ( Allocate_Matrix( A_app_inv, (*A_spar_patt_full)->n, (*A_spar_patt_full)->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }
void Calculate_Droptol( const sparse_matrix * const A,
{
    int i, j, k;
    real val;
#ifdef _OPENMP
    static real *droptol_local;
    unsigned int tid;
#endif

#ifdef _OPENMP
    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
                droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real),
#endif

        /* init droptol to 0 */
        for ( i = 0; i < A->n; ++i )
        {
#ifdef _OPENMP
            droptol_local[tid * A->n + i] = 0.0;
#else
            droptol[i] = 0.0;
#endif
        }

#ifdef _OPENMP
#endif

        /* calculate sqaure of the norm of each row */
#ifdef _OPENMP
#endif
        for ( i = 0; i < A->n; ++i )
        {
            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
            {
                j = A->j[k];
                val = A->val[k];

#ifdef _OPENMP
                droptol_local[tid * A->n + i] += val * val;
                droptol_local[tid * A->n + j] += val * val;
#else
                droptol[i] += val * val;
                droptol[j] += val * val;
#endif
            }

            // diagonal entry
            val = A->val[k];
#ifdef _OPENMP
            droptol_local[tid * A->n + i] += val * val;
#else
            droptol[i] += val * val;
#endif
        }

#ifdef _OPENMP
        for ( i = 0; i < A->n; ++i )
        {
            droptol[i] = 0.0;
            for ( k = 0; k < omp_get_num_threads(); ++k )
            {
                droptol[i] += droptol_local[k * A->n + i];
            }
        }

#endif

        /* calculate local droptol for each row */
        //fprintf( stderr, "droptol: " );
#ifdef _OPENMP
#endif
        for ( i = 0; i < A->n; ++i )
        {
            //fprintf( stderr, "%f-->", droptol[i] );
            droptol[i] = SQRT( droptol[i] ) * dtol;
            //fprintf( stderr, "%f  ", droptol[i] );
        }
        //fprintf( stderr, "\n" );
    }
}


int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
{
    int i, pj;
    int fillin;
    real val;

    fillin = 0;

#ifdef _OPENMP
    default(none) private(i, pj, val) reduction(+: fillin)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            val = A->val[pj];

            if ( FABS(val) > droptol[i] )
            {
                ++fillin;
            }
        }
    }

    return fillin + A->n;
}


#if defined(HAVE_SUPERLU_MT)
real SuperLU_Factorize( const sparse_matrix * const A,
                        sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, pj, count, *Ltop, *Utop, r;
    sparse_matrix *A_t;
    SuperMatrix A_S, AC_S, L_S, U_S;
    NCformat *A_S_store;
    SCPformat *L_S_store;
    NCPformat *U_S_store;
    superlumt_options_t superlumt_options;
    pxgstrf_shared_t pxgstrf_shared;
    pdgstrf_threadarg_t *pdgstrf_threadarg;
    int_t nprocs;
    fact_t fact;
    trans_t trans;
    yes_no_t refact, usepr;
    real u, drop_tol;
    real *a, *at;
    int_t *asub, *atsub, *xa, *xat;
    int_t *perm_c; /* column permutation vector */
    int_t *perm_r; /* row permutations from partial pivoting */
    void *work;
    int_t info, lwork;
    int_t permc_spec, panel_size, relax;
    Gstat_t Gstat;
    flops_t flopcnt;

    /* Default parameters to control factorization. */
#ifdef _OPENMP
    //TODO: set as global parameter and use
    default(none) shared(nprocs)
        {
            /* SuperLU_MT spawns threads internally, so set and pass parameter */
            nprocs = omp_get_num_threads();
        }
    }
#else
    nprocs = 1;
#endif

    //    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
    fact = DOFACT; /* factor from scratch */
    trans = NOTRANS;
    refact = NO; /* first time factorization */
    //TODO: add to control file and use the value there to set these
    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
    u = 1.0; /* diagonal pivoting threshold */
    usepr = NO;
    drop_tol = 0.0;
    work = NULL;
    lwork = 0;

#if defined(DEBUG)
    fprintf( stderr, "nprocs = %d\n", nprocs );
    fprintf( stderr, "Panel size = %d\n", panel_size );
    fprintf( stderr, "Relax = %d\n", relax );
#endif

    if ( !(perm_r = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for perm_r[].");
    }
    if ( !(perm_c = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for perm_c[].");
    }
    if ( !(superlumt_options.etree = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for etree[].");
    }
    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
    }
    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for part_super__h[].");
    }
    a = (real*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(real),
    asub = (int_t*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(int_t),
    xa = (int_t*) smalloc( (A->n + 1) * sizeof(int_t),
    Ltop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
    Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    /* Set up the sparse matrix data structure for A. */
    Transpose( A, A_t );

    count = 0;
    for ( i = 0; i < A->n; ++i )
    {
        xa[i] = count;
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            a[count] = A->entries[pj].val;
            asub[count] = A->entries[pj].j;
            ++count;
        }
        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
        {
            a[count] = A_t->entries[pj].val;
            asub[count] = A_t->entries[pj].j;
            ++count;
        }
    }
    xa[i] = count;

    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,

    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6d", asub[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6.1f", a[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i <= A->n; ++i )
        fprintf( stderr, "%6d", xa[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6d", atsub[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6.1f", at[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i <= A->n; ++i )
        fprintf( stderr, "%6d", xat[i] );
    fprintf( stderr, "\n" );

    A_S.Stype = SLU_NC; /* column-wise, no supernode */
    A_S.Dtype = SLU_D; /* double-precision */
    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
    A_S.nrow = A->n;
    A_S.ncol = A->n;
    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
    A_S_store = (NCformat *) A_S.Store;
    A_S_store->nnz = 2 * A->start[A->n] - A->n;
    A_S_store->nzval = at;
    A_S_store->rowind = atsub;
    A_S_store->colptr = xat;

    /* ------------------------------------------------------------
       Allocate storage and initialize statistics variables.
       ------------------------------------------------------------*/
    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
    StatInit( A->n, nprocs, &Gstat );

    /* ------------------------------------------------------------
       Get column permutation vector perm_c[], according to permc_spec:
       permc_spec = 0: natural ordering
       permc_spec = 1: minimum degree ordering on structure of A'*A
       permc_spec = 2: minimum degree ordering on structure of A'+A
       permc_spec = 3: approximate minimum degree for unsymmetric matrices
       ------------------------------------------------------------*/
    permc_spec = 0;
    get_perm_c( permc_spec, &A_S, perm_c );

    /* ------------------------------------------------------------
       Initialize the option structure superlumt_options using the
       user-input parameters;
       Apply perm_c to the columns of original A to form AC.
       ------------------------------------------------------------*/
    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
                  u, usepr, drop_tol, perm_c, perm_r,
                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );

    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
    fprintf( stderr, "\n" );

    /* ------------------------------------------------------------
       Compute the LU factorization of A.
       The following routine will create nprocs threads.
       ------------------------------------------------------------*/
    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );

    fprintf( stderr, "INFO: %d\n", info );

    flopcnt = 0;
    for (i = 0; i < nprocs; ++i)
    {
        flopcnt += Gstat.procstat[i].fcops;
    }
    Gstat.ops[FACT] = flopcnt;

#if defined(DEBUG)
    printf("\n** Result of sparse LU **\n");
    L_S_store = (SCPformat *) L_S.Store;
    U_S_store = (NCPformat *) U_S.Store;
    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
    fflush( stdout );
#endif

    /* convert L and R from SuperLU formats to CSR */
    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
    memset( Utop, 0, (A->n + 1) * sizeof(int) );
    memset( L->start, 0, (A->n + 1) * sizeof(int) );
    memset( U->start, 0, (A->n + 1) * sizeof(int) );

    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
    fprintf( stderr, "\n" );

    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
    for ( i = 0; i < A->n; ++i )
    {
        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
        //TODO: correct for SCPformat for L?
        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
        //        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
        //        {
        //            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
        //        }
        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
        {
            ++Utop[U_S_store->rowind[pj] + 1];
            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
        }
    }
    for ( i = 1; i <= A->n; ++i )
    {
        //        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
        //        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
        //        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
        //        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
        //        {
        //            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
        //            L->entries[Ltop[r]].j = r;
        //            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
        //            ++Ltop[r];
        //        }
        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
        {
            r = U_S_store->rowind[pj];
            U->entries[Utop[r]].j = i;
            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
            ++Utop[r];
        }
    }

    /* ------------------------------------------------------------
       Deallocate storage after factorization.
       ------------------------------------------------------------*/
    pxgstrf_finalize( &superlumt_options, &AC_S );
    Deallocate_Matrix( A_t );
    sfree( xa, "SuperLU_Factorize::xa" );
    sfree( asub, "SuperLU_Factorize::asub" );
    sfree( a, "SuperLU_Factorize::a" );
    SUPERLU_FREE( perm_r );
    SUPERLU_FREE( perm_c );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
    SUPERLU_FREE( A_S.Store );
    if ( lwork == 0 )
    {
        Destroy_SuperNode_SCP(&L_S);
        Destroy_CompCol_NCP(&U_S);
    }
    else if ( lwork > 0 )
    {
        SUPERLU_FREE(work);
    }
    StatFree(&Gstat);

    sfree( Utop, "SuperLU_Factorize::Utop" );
    sfree( Ltop, "SuperLU_Factorize::Ltop" );

    //TODO: return iters
    return 0.;
}
#endif


/* Diagonal (Jacobi) preconditioner computation */
real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
{
    unsigned int i;
    real start;

    start = Get_Time( );

#ifdef _OPENMP
    default(none) private(i)
#endif
    for ( i = 0; i < H->n; ++i )
    {
        if ( H->val[H->start[i + 1] - 1] != 0.0 )
        {
            Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
        }
        else
        {
            Hdia_inv[i] = 1.0;
        }
    }

    return Get_Timing_Info( start );
}


/* Incomplete Cholesky factorization with dual thresholding */
real ICHOLT( const sparse_matrix * const A, const real * const droptol,
             sparse_matrix * const L, sparse_matrix * const U )
{
    int *tmp_j;
    real *tmp_val;
    int i, j, pj, k1, k2, tmptop, Ltop;
    real val, start;
    unsigned int *Utop;

    start = Get_Time( );

    Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
    tmp_j = (int*) smalloc( A->n * sizeof(int),
    tmp_val = (real*) smalloc( A->n * sizeof(real),

    // clear variables
    Ltop = 0;
    tmptop = 0;
    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
    memset( Utop, 0, A->n * sizeof(unsigned int) );

    for ( i = 0; i < A->n; ++i )
    {
        L->start[i] = Ltop;
        tmptop = 0;

        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            j = A->j[pj];
            val = A->val[pj];

            if ( FABS(val) > droptol[i] )
            {
                k1 = 0;
                k2 = L->start[j];
                while ( k1 < tmptop && k2 < L->start[j + 1] )
                {
                    if ( tmp_j[k1] < L->j[k2] )
                    {
                        ++k1;
                    }
                    else if ( tmp_j[k1] > L->j[k2] )
                    {
                        ++k2;
                    }
                    else
                    {
                        val -= (tmp_val[k1++] * L->val[k2++]);
                    }
                }

                // L matrix is lower triangular,
                // so right before the start of next row comes jth diagonal
                val /= L->val[L->start[j + 1] - 1];

                tmp_j[tmptop] = j;
                tmp_val[tmptop] = val;
                ++tmptop;
            }
        }

        // sanity check
        if ( A->j[pj] != i )
        {
            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
            exit( NUMERIC_BREAKDOWN );
        }

        // compute the ith diagonal in L
        val = A->val[pj];
        for ( k1 = 0; k1 < tmptop; ++k1 )
        {
            val -= (tmp_val[k1] * tmp_val[k1]);
        }

#if defined(DEBUG)
        if ( val < 0.0 )
        {
            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
            exit( NUMERIC_BREAKDOWN );

        }
#endif

        tmp_j[tmptop] = i;
        tmp_val[tmptop] = SQRT( val );

        // apply the dropping rule once again
        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
        //for( k1 = 0; k1<= tmptop; ++k1 )
        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
        //fprintf( stderr, "\n" );
        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
        for ( k1 = 0; k1 < tmptop; ++k1 )
        {
            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
            {
                L->j[Ltop] = tmp_j[k1];
                L->val[Ltop] = tmp_val[k1];
                U->start[tmp_j[k1] + 1]++;
                ++Ltop;
                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
            }
        }
        // keep the diagonal in any case
        L->j[Ltop] = tmp_j[k1];
        L->val[Ltop] = tmp_val[k1];
        ++Ltop;
        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
    }

    L->start[i] = Ltop;
    //    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );

    /* U = L^T (Cholesky factorization) */
    Transpose( L, U );
    //    for ( i = 1; i <= U->n; ++i )
    //    {
    //        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
    //    }
    //    for ( i = 0; i < L->n; ++i )
    //    {
    //        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
    //        {
    //            j = L->j[pj];
    //            U->j[Utop[j]] = i;
    //            U->val[Utop[j]] = L->val[pj];
    //            Utop[j]++;
    //        }
    //    }

    //    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );

    sfree( tmp_val, "ICHOLT::tmp_val" );
    sfree( tmp_j, "ICHOLT::tmp_j" );
    sfree( Utop, "ICHOLT::Utop" );

    return Get_Timing_Info( start );
}


/* Fine-grained (parallel) incomplete Cholesky factorization
 *
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp. */
#if defined(TESTING)
real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
                sparse_matrix * const U_t, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
    real *D, *D_inv, sum, start;
    sparse_matrix *DAD;
    int *Utop;

    start = Get_Time( );

    D = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D" );
    D_inv = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D_inv" );
    Utop = (int*) smalloc( (A->n + 1) * sizeof(int), "ICHOL_PAR::Utop" );
    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

#ifdef _OPENMP
    default(none) shared(D_inv, D) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
        D[i] = 1. / D_inv[i];
    }

    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./SQRT(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
    default(none) shared(DAD, D_inv, D) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
        }
        /* diagonal */
        DAD->j[pj] = A->j[pj];
        DAD->val[pj] = 1.;
    }

    /* initial guesses for U^T,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero */
#ifdef _OPENMP
        default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
#endif
        for ( j = 0; j < A->start[A->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 0; k <= A->n; ++k )
            {
                if ( U_t->start[k] > j )
                {
                    x = U_t->start[k - 1];
                    ei_x = U_t->start[k];
                    break;
                }
            }
            /* column bounds of current nonzero */
            y = U_t->start[U_t->j[j]];
            ei_y = U_t->start[U_t->j[j] + 1];

            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
            while ( U_t->j[x] < U_t->j[j] &&
                    U_t->j[y] < U_t->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( U_t->j[x] == U_t->j[y] )
                {
                    sum += (U_t->val[x] * U_t->val[y]);
                    ++x;
                    ++y;
                }
                else if ( U_t->j[x] < U_t->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            sum = DAD->val[j] - sum;

            /* diagonal entries */
            if ( (k - 1) == U_t->j[j] )
            {
                /* sanity check */
                if ( sum < ZERO )
                {
                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
#if defined(DEBUG_FOCUS)
                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
                    fprintf( stderr, "sum = %10.3f\n", sum);
#endif
                    exit(NUMERIC_BREAKDOWN);
                }

                U_t->val[j] = SQRT( sum );
            }
            /* non-diagonal entries */
            else
            {
                U_t->val[j] = sum / U_t->val[ei_y - 1];
            }
        }
    }

    /* apply inverse transformation D^{-1}U^{T},
     * since DAD \approx U^{T}U, so
     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
#ifdef _OPENMP
    default(none) shared(D_inv) private(i, pj)
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            U_t->val[pj] *= D_inv[i];
        }
    }

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif

    /* transpose U^{T} and copy into U */
    Transpose( U_t, U );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif

    Deallocate_Matrix( DAD );
    sfree( D_inv, "ICHOL_PAR::D_inv" );
    sfree( D, "ICHOL_PAR::D" );
    sfree( Utop, "ICHOL_PAR::Utop" );

    return Get_Timing_Info( start );
}
#endif


/* Fine-grained (parallel) incomplete LU factorization
 *
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp.
 *
 * A: symmetric, half-stored (lower triangular), CSR format
 * sweeps: number of loops over non-zeros for computation
 * L / U: factorized triangular matrices (A \approx LU), CSR format */
real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
              sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
    real *D, *D_inv, sum, start;
    sparse_matrix *DAD;

    start = Get_Time( );

    D = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D" );
    D_inv = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D_inv" );
    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

#ifdef _OPENMP
    default(none) shared(D, D_inv) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
        D[i] = 1.0 / D_inv[i];
        //        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
    }

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
    default(none) shared(DAD, D) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
        }
        /* diagonal */
        DAD->j[pj] = A->j[pj];
        DAD->val[pj] = 1.0;
    }

    /* initial guesses for L and U,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
    /* store U^T in CSR for row-wise access and tranpose later */
    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );

    /* L has unit diagonal, by convention */
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        L->val[L->start[i + 1] - 1] = 1.0;
    }

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero in L */
#ifdef _OPENMP
        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
            while ( L->j[x] < L->j[j] &&
                    L->j[y] < L->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( L->j[x] == L->j[y] )
                {
                    sum += (L->val[x] * U->val[y]);
                    ++x;
                    ++y;
                }
                else if ( L->j[x] < L->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            if ( j != ei_x - 1 )
            {
                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
            }
        }

#ifdef _OPENMP
        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
            while ( U->j[x] < U->j[j] &&
                    U->j[y] < U->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( U->j[x] == U->j[y] )
                {
                    sum += (L->val[y] * U->val[x]);
                    ++x;
                    ++y;
                }
                else if ( U->j[x] < U->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            U->val[j] = DAD->val[j] - sum;
        }
    }

    /* apply inverse transformation:
     * since DAD \approx LU, then
     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
#ifdef _OPENMP
    default(none) shared(DAD, D_inv) private(i, pj)
#endif
    for ( i = 0; i < DAD->n; ++i )
    {
        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
        {
            L->val[pj] = D_inv[i] * L->val[pj];
            /* currently storing U^T, so use row index instead of column index */
            U->val[pj] = U->val[pj] * D_inv[i];
        }
    }

    Transpose_I( U );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif

    Deallocate_Matrix( DAD );
    sfree( D_inv, "ILU_PAR::D_inv" );
    sfree( D, "ILU_PAR::D_inv" );

    return Get_Timing_Info( start );
}


/* Fine-grained (parallel) incomplete LU factorization with thresholding
 *
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp.
 *
 * A: symmetric, half-stored (lower triangular), CSR format
 * droptol: row-wise tolerances used for dropping
 * sweeps: number of loops over non-zeros for computation
 * L / U: factorized triangular matrices (A \approx LU), CSR format */
real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
               const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
    real *D, *D_inv, sum, start;
    sparse_matrix *DAD, *L_temp, *U_temp;

    start = Get_Time( );

    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    D = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D" );
    D_inv = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D_inv" );
    default(none) shared(D, D_inv) private(i)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
        D[i] = 1.0 / D_inv[i];
    }

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./SQRT(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
    default(none) shared(DAD, D) private(i, pj)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
        }
        /* diagonal */
        DAD->j[pj] = A->j[pj];
        DAD->val[pj] = 1.0;
    }

    /* initial guesses for L and U,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
    /* store U^T in CSR for row-wise access and tranpose later */
    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );

    /* L has unit diagonal, by convention */
#ifdef _OPENMP
    default(none) private(i) shared(L_temp)
#endif
    for ( i = 0; i < A->n; ++i )
    {
        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
    }

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero in L */
#ifdef _OPENMP
        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
            while ( L_temp->j[x] < L_temp->j[j] &&
                    L_temp->j[y] < L_temp->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( L_temp->j[x] == L_temp->j[y] )
                {
                    sum += (L_temp->val[x] * U_temp->val[y]);
                    ++x;
                    ++y;
                }
                else if ( L_temp->j[x] < L_temp->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            if ( j != ei_x - 1 )
            {
                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
            }
        }

#ifdef _OPENMP
        default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
#endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */
            y = DAD->start[DAD->j[j]];
            ei_y = DAD->start[DAD->j[j] + 1];

            /* sparse dot product:
             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
            while ( U_temp->j[x] < U_temp->j[j] &&
                    U_temp->j[y] < U_temp->j[j] &&
                    x < ei_x && y < ei_y )
            {
                if ( U_temp->j[x] == U_temp->j[y] )
                {
                    sum += (L_temp->val[y] * U_temp->val[x]);
                    ++x;
                    ++y;
                }
                else if ( U_temp->j[x] < U_temp->j[y] )
                {
                    ++x;
                }
                else
                {
                    ++y;
                }
            }

            U_temp->val[j] = DAD->val[j] - sum;
        }
    }

    /* apply inverse transformation:
     * since DAD \approx LU, then
     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
#ifdef _OPENMP
    default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
#endif
    for ( i = 0; i < DAD->n; ++i )
    {
        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
        {
            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
            /* currently storing U^T, so use row index instead of column index */
            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
        }
    }

    /* apply the dropping rule */
    Ltop = 0;
    Utop = 0;
    for ( i = 0; i < DAD->n; ++i )
    {
        L->start[i] = Ltop;
        U->start[i] = Utop;

        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
        {
            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
            {
                L->j[Ltop] = L_temp->j[pj];
                L->val[Ltop] = L_temp->val[pj];
                ++Ltop;
            }
        }

        /* diagonal */
        L->j[Ltop] = L_temp->j[pj];
        L->val[Ltop] = L_temp->val[pj];
        ++Ltop;

        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
        {
            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
            {
                U->j[Utop] = U_temp->j[pj];
                U->val[Utop] = U_temp->val[pj];
                ++Utop;
            }
        }

        /* diagonal */
        U->j[Utop] = U_temp->j[pj];
        U->val[Utop] = U_temp->val[pj];
        ++Utop;
    }

    L->start[i] = Ltop;
    U->start[i] = Utop;

    Transpose_I( U );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
#endif

    Deallocate_Matrix( U_temp );
    Deallocate_Matrix( L_temp );
    Deallocate_Matrix( DAD );
    sfree( D_inv, "ILUT_PAR::D_inv" );
    sfree( D, "ILUT_PAR::D_inv" );

    return Get_Timing_Info( start );
}
#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
 *  \sum_{j=1}^{N} ||e_j - Am_j||_2^2
 *  where: e_j is the j-th column of the NxN identify matrix,
 *         m_j is the j-th column of the NxN approximate sparse matrix M
 * 
 * Internally, use LAPACKE to solve the least-squares problems
 *
 * A: symmetric, sparse matrix, stored in full CSR format
 * A_spar_patt: sparse matrix used as template sparsity pattern
 *   for approximating the inverse, stored in full CSR format
 * A_app_inv: approximate inverse to A, stored in full CSR format (result)
 *
 * Reference:
 * Michele Benzi et al.
 * A Comparative Study of Sparse Approximate Inverse
 *  Preconditioners
 * Applied Numerical Mathematics 30, 1999
 * */
real sparse_approx_inverse( const sparse_matrix * const A,
                            const sparse_matrix * const A_spar_patt,
                            sparse_matrix ** A_app_inv )
    int i, k, pj, j_temp, identity_pos;
    int N, M, d_i, d_j;
    lapack_int m, n, nrhs, lda, ldb, info;
    int *pos_x, *pos_y;
    real *e_j, *dense_matrix;
    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n];
#ifdef _OPENMP
    #pragma omp parallel default(none) \
    private(i, k, pj, j_temp, identity_pos, N, M, d_i, d_j, m, n, \
            nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) \
    shared(A_app_inv, stderr)
        X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" );
        Y = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::Y" );
        pos_x = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_x" );
        pos_y = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_y" );
#ifdef _OPENMP
        #pragma omp for schedule(static)
#endif
            // find column indices of nonzeros (which will be the columns indices of the dense matrix)
            for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->start[i + 1]; ++pj )

                Y[j_temp] = 1;
                pos_y[j_temp] = N;
                ++N;

                // for each of those indices
                // search through the row of full A of that index
                for ( k = A->start[j_temp]; k < A->start[j_temp + 1]; ++k )
                {
                    // and accumulate the nonzero column indices to serve as the row indices of the dense matrix
            // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix
            identity_pos = M;
            // allocate memory for NxM dense matrix
            dense_matrix = (real *) smalloc( sizeof(real) * N * M,
                                             "Sparse_Approx_Inverse::dense_matrix" );
            // fill in the entries of dense matrix
            for ( d_i = 0; d_i < M; ++d_i)
                // all rows are initialized to zero
                for ( d_j = 0; d_j < N; ++d_j )
                    dense_matrix[d_i * N + d_j] = 0.0;
                }
                // change the value if any of the column indices is seen
                        d_j < A->start[pos_x[d_i] + 1]; ++d_j )
                        dense_matrix[d_i * N + pos_y[A->j[d_j]]] = A->val[d_j];
            /* create the right hand side of the linear equation
               that is the full column of the identity matrix*/
            e_j = (real *) smalloc( sizeof(real) * M,
                                    "Sparse_Approx_Inverse::e_j" );
            for ( k = 0; k < M; ++k )
            {
                e_j[k] = 0.0;
            }
            e_j[identity_pos] = 1.0;

            /* Solve the overdetermined system AX = B through the least-squares problem:
             * min ||B - AX||_2 */
            m = M;
            n = N;
            nrhs = 1;
            lda = N;
            ldb = nrhs;
            info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
                                  e_j, ldb );

            /* Check for the full rank */
            if ( info > 0 )
            {
                fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
                fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
                fprintf( stderr, "the least squares solution could not be computed.\n" );
                exit( INVALID_INPUT );
            }
            /* Print least squares solution */
            // print_matrix( "Least squares solution", n, nrhs, b, ldb );
            // accumulate the resulting vector to build A_app_inv
            (*A_app_inv)->start[i] = A_spar_patt->start[i];
            for ( k = A_spar_patt->start[i]; k < A_spar_patt->start[i + 1]; ++k)
                (*A_app_inv)->j[k] = A_spar_patt->j[k];
                (*A_app_inv)->val[k] = e_j[k - A_spar_patt->start[i]];
            //empty variables that will be used next iteration
            sfree( dense_matrix, "Sparse_Approx_Inverse::dense_matrix" );
            sfree( e_j, "Sparse_Approx_Inverse::e_j"  );
            for ( k = 0; k < A->n; ++k )
            {
                X[k] = 0;
                Y[k] = 0;
                pos_x[k] = 0;
                pos_y[k] = 0;
            }
        sfree( pos_y, "Sparse_Approx_Inverse::pos_y" );
        sfree( pos_x, "Sparse_Approx_Inverse::pos_x" );
        sfree( Y, "Sparse_Approx_Inverse::Y" );
        sfree( X, "Sparse_Approx_Inverse::X" );
    }

    return Get_Timing_Info( start );
}
 *
 * workspace: storage container for workspace structures
 * A: lower triangular matrix, stored in CSR format
 * x: vector
 * b (output): vector */
static void Sparse_MatVec( const static_storage * const workspace,
        const sparse_matrix * const A, const real * const x, real * const b )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, j, k, n, si, ei;
    real H;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    n = A->n;
    Vector_MakeZero( b, n );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    Vector_MakeZero( workspace->b_local, omp_get_num_threads() * n );
    for ( i = 0; i < n; ++i )
    {
        si = A->start[i];
        ei = A->start[i + 1] - 1;
        for ( k = si; k < ei; ++k )
        {
            j = A->j[k];
            H = A->val[k];
            workspace->b_local[tid * n + j] += H * x[i];
            workspace->b_local[tid * n + i] += H * x[j];
            b[j] += H * x[i];
            b[i] += H * x[j];
        // the diagonal entry is the last one in
        workspace->b_local[tid * n + i] += A->val[k] * x[i];
    for ( i = 0; i < n; ++i )
    {
        for ( j = 0; j < omp_get_num_threads(); ++j )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
            b[i] += workspace->b_local[j * n + i];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }
/* sparse matrix-vector product Ax = b
 * where:
 *   A: matrix, stored in CSR format
 *   x: vector
 *   b: vector (result) */
static void Sparse_MatVec_full( const sparse_matrix * const A,
                                const real * const x, real * const b )
{
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
            b[i] += A->val[pj] * x[A->j[pj]];
/* Transpose A and copy into A^T
 *
void Transpose( const sparse_matrix * const A, sparse_matrix * const A_t )
{
    unsigned int i, j, pj, *A_t_top;

    A_t_top = (unsigned int*) scalloc( A->n + 1, sizeof(unsigned int),

    memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );

    /* count nonzeros in each column of A^T, store one row greater (see next loop) */
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            ++A_t->start[A->j[pj] + 1];
        }
    }

    /* setup the row pointers for A^T */
    for ( i = 1; i <= A->n; ++i )
    {
        A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
    }

    /* fill in A^T */
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            j = A->j[pj];
            A_t->j[A_t_top[j]] = i;
            A_t->val[A_t_top[j]] = A->val[pj];
            ++A_t_top[j];
        }
    }

    sfree( A_t_top, "Transpose::A_t_top" );
void Transpose_I( sparse_matrix * const A )
{
    sparse_matrix * A_t;

    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    Transpose( A, A_t );

    memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
    memcpy( A->j, A_t->j, sizeof(int) * (A_t->start[A_t->n]) );
    memcpy( A->val, A_t->val, sizeof(real) * (A_t->start[A_t->n]) );

    Deallocate_Matrix( A_t );
}


/* Apply diagonal inverse (Jacobi) preconditioner to system residual
 *
 * Hdia_inv: diagonal inverse preconditioner (constructed using H)
 * y: current residual
 * x: preconditioned residual
 * N: dimensions of preconditioner and vectors (# rows in H)
static void diag_pre_app( const real * const Hdia_inv, const real * const y,
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

/* Solve triangular system LU*x = y using level scheduling
 *
 * LU: lower/upper triangular, stored in CSR
 * y: constants in linear system (RHS)
 * x: solution
 * Assumptions:
 *   LU has non-zero diagonals
 *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
void tri_solve( const sparse_matrix * const LU, const real * const y,
                real * const x, const int N, const TRIANGULARITY tri )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, pj, j, si, ei;
    real val;

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
                x[i] = y[i];
                si = LU->start[i];
                ei = LU->start[i + 1];
                for ( pj = si; pj < ei - 1; ++pj )
                {
                    j = LU->j[pj];
                    val = LU->val[pj];
                    x[i] -= val * x[j];
                }
                x[i] /= LU->val[pj];
                x[i] = y[i];
                si = LU->start[i];
                ei = LU->start[i + 1];
                for ( pj = si + 1; pj < ei; ++pj )
                {
                    j = LU->j[pj];
                    val = LU->val[pj];
                    x[i] -= val * x[j];
                }
                x[i] /= LU->val[si];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }
/* Solve triangular system LU*x = y using level scheduling
 * workspace: storage container for workspace structures
 * LU: lower/upper triangular, stored in CSR
 * y: constants in linear system (RHS)
 * x: solution
 * tri: triangularity of LU (lower/upper)
 * find_levels: perform level search if positive, otherwise reuse existing levels
 * Assumptions:
 *   LU has non-zero diagonals
 *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
void tri_solve_level_sched( static_storage * workspace,
        const sparse_matrix * const LU,
        const real * const y, real * const x, const int N,
        const TRIANGULARITY tri, int find_levels )
    int i, j, pj, local_row, local_level;
    unsigned int *row_levels, *level_rows, *level_rows_cnt;
    int levels;

    if ( tri == LOWER )
    {
        row_levels = workspace->row_levels_L;
        level_rows = workspace->level_rows_L;
        level_rows_cnt = workspace->level_rows_cnt_L;
    }
    else
    {
        row_levels = workspace->row_levels_U;
        level_rows = workspace->level_rows_U;
        level_rows_cnt = workspace->level_rows_cnt_U;
    }
        /* find levels (row dependencies in substitutions) */
            memset( row_levels, 0, N * sizeof(unsigned int) );
            memset( level_rows_cnt, 0, N * sizeof(unsigned int) );
            memset( workspace->top, 0, N * sizeof(unsigned int) );
                    local_level = 1;
                    for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
                    {
                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                    }

                    levels = MAX( levels, local_level );
                    row_levels[i] = local_level;
                    ++level_rows_cnt[local_level];
                //#if defined(DEBUG)
                fprintf(stderr, "levels(L): %d\n", levels);
                fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
                    local_level = 1;
                    for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
                    {
                        local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                    }

                    levels = MAX( levels, local_level );
                    row_levels[i] = local_level;
                    ++level_rows_cnt[local_level];
                //#if defined(DEBUG)
                fprintf(stderr, "levels(U): %d\n", levels);
                fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
            for ( i = 1; i < levels + 1; ++i )
            {
                level_rows_cnt[i] += level_rows_cnt[i - 1];
                workspace->top[i] = level_rows_cnt[i];
                level_rows[workspace->top[row_levels[i] - 1]] = i;
                ++workspace->top[row_levels[i] - 1];
    if ( tri == LOWER )
    {
        levels = workspace->levels_L;
    }
    else
    {
        levels = workspace->levels_U;
    }

    /* perform substitutions by level */
        for ( i = 0; i < levels; ++i )
            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
                local_row = level_rows[j];
                x[local_row] = y[local_row];
                for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
            for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
                local_row = level_rows[j];
                x[local_row] = y[local_row];
                for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                    x[local_row] -= LU->val[pj] * x[LU->j[pj]];
                x[local_row] /= LU->val[LU->start[local_row]];
/* Iterative greedy shared-memory parallel graph coloring
 *
 * control: container for control info
 * workspace: storage container for workspace structures
 * A: matrix to use for coloring, stored in CSR format;
 *   rows represent vertices, columns of entries within a row represent adjacent vertices
 *   (i.e., dependent rows for elimination during LU factorization)
 * tri: triangularity of LU (lower/upper)
 * color: vertex color (1-based)
 *
 * Reference:
 * Umit V. Catalyurek et al.
 * Graph Coloring Algorithms for Multi-core
 *  and Massively Threaded Architectures
 * Parallel Computing, 2012
 */
void graph_coloring( const control_params * const control,
        static_storage * workspace,
        const sparse_matrix * const A, const TRIANGULARITY tri )
        unsigned int temp, recolor_cnt_local, *conflict_local;
        unsigned int *p_to_color, *p_conflict, *p_temp;
        p_to_color = workspace->to_color;
        p_conflict = workspace->conflict;

#ifdef _OPENMP
        #pragma omp for schedule(static)
#endif
        for ( i = 0; i < A->n; ++i )
        {
            workspace->color[i] = 0;
        }
        /* ordering of vertices to color depends on triangularity of factor
         * for which coloring is to be used for */
        if ( tri == LOWER )
        fb_color = (int*) smalloc( sizeof(int) * A->n,
        conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n,
            memset( fb_color, -1, sizeof(int) * A->n );
            for ( i = 0; i < workspace->recolor_cnt; ++i )

                /* colors of adjacent vertices are forbidden */
                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                        fb_color[workspace->color[A->j[pj]]] = v;
                /* search for min. color which is not in conflict with adjacent vertices;
                 * start at 1 since 0 is default (invalid) color for all vertices */
                /* assign discovered color (no conflict in neighborhood of adjacent vertices) */
                /* search for color conflicts with adjacent vertices */
                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                {
                    if ( workspace->color[v] == workspace->color[A->j[pj]] && v > A->j[pj] )
                        conflict_local[recolor_cnt_local] = v;
                        ++recolor_cnt_local;
                        break;
            }

            /* count thread-local conflicts and compute offsets for copying into shared buffer */
            workspace->conflict_cnt[tid + 1] = recolor_cnt_local;
                workspace->conflict_cnt[0] = 0;
                for ( i = 1; i < control->num_threads + 1; ++i )
                    workspace->conflict_cnt[i] += workspace->conflict_cnt[i - 1];
                workspace->recolor_cnt = workspace->conflict_cnt[control->num_threads];
            /* copy thread-local conflicts into shared buffer */
            for ( i = 0; i < recolor_cnt_local; ++i )
            {
                p_conflict[workspace->conflict_cnt[tid] + i] = conflict_local[i];
                workspace->color[conflict_local[i]] = 0;
            p_temp = p_to_color;
            p_to_color = p_conflict;
            p_conflict = p_temp;
        sfree( conflict_local, "graph_coloring::conflict_local" );
        sfree( fb_color, "graph_coloring::fb_color" );
 * workspace: storage container for workspace structures
 * n: number of entries in coloring
 * tri: coloring to triangular factor to use (lower/upper)
 */
void sort_rows_by_colors( const static_storage * const workspace,
        const unsigned int n, const TRIANGULARITY tri )
    memset( workspace->color_top, 0, sizeof(unsigned int) * (n + 1) );

    /* sort vertices by color (ascending within a color)
     *  1) count colors
     *  2) determine offsets of color ranges
     *
     *  note: color is 1-based */
    for ( i = 0; i < n; ++i )
    {
        ++workspace->color_top[workspace->color[i]];
        workspace->color_top[i] += workspace->color_top[i - 1];
        workspace->permuted_row_col[workspace->color_top[workspace->color[i] - 1]] = i;
        ++workspace->color_top[workspace->color[i] - 1];
    }

    /* invert mapping to get map from current row/column to permuted (new) row/column */
    for ( i = 0; i < n; ++i )
    {
        workspace->permuted_row_col_inv[workspace->permuted_row_col[i]] = i;
 * workspace: storage container for workspace structures
 * color: vertex color (1-based); vertices represent matrix rows/columns
 * x: vector to permute (in-place)
 * n: number of entries in x
 * invert_map: if TRUE, use Q^T, otherwise use Q
 * tri: coloring to triangular factor to use (lower/upper)
 */
static void permute_vector( const static_storage * const workspace,
        real * const x, const unsigned int n, const int invert_map,
        const TRIANGULARITY tri )
        mapping = workspace->permuted_row_col_inv;
    }
    else
    {
        mapping = workspace->permuted_row_col;
/* Apply permutation Q^T*(LU)*Q based on graph coloring
 *
 * workspace: storage container for workspace structures
 * color: vertex color (1-based); vertices represent matrix rows/columns
 * LU: matrix to permute, stored in CSR format
 * tri: triangularity of LU (lower/upper)
 */
void permute_matrix( const static_storage * const workspace,
        sparse_matrix * const LU, const TRIANGULARITY tri )
{
    int i, pj, nr, nc;
    sparse_matrix *LUtemp;

    if ( Allocate_Matrix( &LUtemp, LU->n, LU->m ) == FAILURE )
    {
        fprintf( stderr, "Not enough space for graph coloring (factor permutation). Terminating...\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    /* count nonzeros in each row of permuted factor (re-use color_top for counting) */
    memset( workspace->color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
                nc = workspace->permuted_row_col_inv[LU->j[pj]];
                }
                /* correct entries to maintain triangularity (lower) */
                else
                {
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
                nc = workspace->permuted_row_col_inv[LU->j[pj]];
                }
                /* correct entries to maintain triangularity (upper) */
                else
                {
        workspace->color_top[i] += workspace->color_top[i - 1];
    memcpy( LUtemp->start, workspace->color_top, sizeof(unsigned int) * (LU->n + 1) );
    /* permute factor */
    if ( tri == LOWER )
    {
        for ( i = 0; i < LU->n; ++i )
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
                nc = workspace->permuted_row_col_inv[LU->j[pj]];
                    LUtemp->j[workspace->color_top[nr]] = nc;
                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
                    ++workspace->color_top[nr];
                }
                /* correct entries to maintain triangularity (lower) */
                else
                {
                    LUtemp->j[workspace->color_top[nc]] = nr;
                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
                    ++workspace->color_top[nc];
            nr = workspace->permuted_row_col_inv[i];

            for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
                nc = workspace->permuted_row_col_inv[LU->j[pj]];
                    LUtemp->j[workspace->color_top[nr]] = nc;
                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
                    ++workspace->color_top[nr];
                }
                /* correct entries to maintain triangularity (upper) */
                else
                {
                    LUtemp->j[workspace->color_top[nc]] = nr;
                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
                    ++workspace->color_top[nc];
    memcpy( LU->start, LUtemp->start, sizeof(unsigned int) * (LU->n + 1) );
    memcpy( LU->j, LUtemp->j, sizeof(unsigned int) * LU->start[LU->n] );
    memcpy( LU->val, LUtemp->val, sizeof(real) * LU->start[LU->n] );

/* Setup routines to build permuted QEq matrix H (via graph coloring),
 *  used for preconditioning (incomplete factorizations computed based on
 *  permuted H)
 *
 * control: container for control info
 * workspace: storage container for workspace structures
 * H: symmetric, lower triangular portion only, stored in CSR format;
 * H_full: symmetric, stored in CSR format;
 * H_p (output): permuted copy of H based on coloring, lower half stored, CSR format
void setup_graph_coloring( const control_params * const control,
        const static_storage * const workspace, const sparse_matrix * const H,
        sparse_matrix ** H_full, sparse_matrix ** H_p )
        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
            exit( INSUFFICIENT_MEMORY );
    }
    else if ( (*H_p)->m < H->m )
    {
        Deallocate_Matrix( *H_p );
        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
        {
            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
    compute_full_sparse_matrix( H, H_full );
    graph_coloring( control, (static_storage *) workspace, *H_full, LOWER );
    sort_rows_by_colors( workspace, (*H_full)->n, LOWER );
    memcpy( (*H_p)->start, H->start, sizeof(int) * (H->n + 1) );
    memcpy( (*H_p)->j, H->j, sizeof(int) * (H->start[H->n]) );
    memcpy( (*H_p)->val, H->val, sizeof(real) * (H->start[H->n]) );
    permute_matrix( workspace, (*H_p), LOWER );
/* Jacobi iteration using truncated Neumann series: x_{k+1} = Gx_k + D^{-1}b
 * where:
 *   G = I - D^{-1}R
 *   R = triangular matrix
 *   D = diagonal matrix, diagonals from R
 *
 * Note: used during the backsolves when applying preconditioners with
 * triangular factors in iterative linear solvers
 *
 * Note: Newmann series arises from series expansion of the inverse of
 * the coefficient matrix in the triangular system
 *
 * workspace: storage container for workspace structures
 * R:
 * Dinv:
 * b:
 * x (output):
 * tri:
 * maxiter:
 * */
void jacobi_iter( const static_storage * const workspace,
        const sparse_matrix * const R, const real * const Dinv,
        const real * const b, real * const x, const TRIANGULARITY tri,
        const unsigned int maxiter )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    /* precompute and cache, as invariant in loop below */
        workspace->Dinv_b[i] = Dinv[i] * b[i];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

        for ( i = 0; i < R->n; ++i )
        {
            if (tri == LOWER)
            {
                si = R->start[i];
                ei = R->start[i + 1] - 1;
            }
            else
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

                si = R->start[i] + 1;
                ei = R->start[i + 1];
            p2[i] *= -Dinv[i];
            p2[i] += workspace->Dinv_b[i];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

/* Apply left-sided preconditioning while solver M^{-1}Ax = M^{-1}b
 * workspace: data struct containing matrices, stored in CSR
 * control: data struct containing parameters
 * y: vector to which to apply preconditioning,
 *  specific to internals of iterative solver being used
 * x: preconditioned vector (output)
 * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
 * side: used in determining how to apply preconditioner if the preconditioner is
 *  factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU)
 *   Matrices have non-zero diagonals
 *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
                                  const real * const y, real * const x, const int fresh_pre,
                                  const int side )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
    /* no preconditioning */
    if ( control->cm_solver_pre_comp_type == NONE_PC )
        case LEFT:
            switch ( control->cm_solver_pre_app_type )
            case TRI_SOLVE_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
                    break;
                case SAI_PC:
                    Sparse_MatVec_full( workspace->H_app_inv, y, x );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
            case TRI_SOLVE_LEVEL_SCHED_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                    diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                    break;
                case SAI_PC:
                    Sparse_MatVec_full( workspace->H_app_inv, y, x );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
            case TRI_SOLVE_GC_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
#ifdef _OPENMP
                    for ( i = 0; i < workspace->H->n; ++i )
                    permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
            case JACOBI_ITER_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    if ( fresh_pre == TRUE )
                    {
#ifdef _OPENMP
                        #pragma omp for schedule(static)
#endif
                        for ( i = 0; i < workspace->L->n; ++i )
                        {
                            si = workspace->L->start[i + 1] - 1;
                            workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
                    jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
                            y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            default:
                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                exit( INVALID_INPUT );
                break;

        case RIGHT:
            switch ( control->cm_solver_pre_app_type )
            case TRI_SOLVE_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    if ( x != y )
                    {
                        Vector_Copy( x, y, workspace->H->n );
                    }
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
            case TRI_SOLVE_LEVEL_SCHED_PA:
                switch ( control->cm_solver_pre_comp_type )
                case DIAG_PC:
                case SAI_PC:
                    if ( x != y )
                    {
                        Vector_Copy( x, y, workspace->H->n );
                    }
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
                    break;
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                break;
            case TRI_SOLVE_GC_PA:
                switch ( control->cm_solver_pre_comp_type )
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                    tri_solve_level_sched( (static_storage *) workspace,
                            workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
                    permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                break;
            case JACOBI_ITER_PA:
                switch ( control->cm_solver_pre_comp_type )
                {
                case DIAG_PC:
                case SAI_PC:
                    fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                case ICHOLT_PC:
                case ILU_PAR_PC:
                case ILUT_PAR_PC:
                        for ( i = 0; i < workspace->U->n; ++i )
                        {
                            si = workspace->U->start[i];
                            workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
                    jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
                            y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
                default:
                    fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                    exit( INVALID_INPUT );
                    break;
                }
                break;
            default:
                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                exit( INVALID_INPUT );
                break;
Kurt A. O'Hearn's avatar
Loading
Loading full blame...