Skip to content
Snippets Groups Projects
charges.c 76.4 KiB
Newer Older
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
/*----------------------------------------------------------------------
  SerialReax - Reax Force Field Simulator
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  published by the Free Software Foundation; either version 2 of
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  the License, or (at your option) any later version.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#include "allocate.h"
#include "list.h"
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#include "print_utils.h"
#if defined(HAVE_SUPERLU_MT)
#include "slu_mt_ddefs.h"
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

#if defined(TEST_MAT)
static sparse_matrix * create_test_mat( void )
{
    unsigned int i, n;
    sparse_matrix *H_test;

    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }

    //3x3, SPD, store lower half
    i = 0;
    n = 0;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = 4.;
    ++i;
    ++n;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = 12.;
    ++i;
    H_test->j[i] = 1;
    H_test->val[i] = 37.;
    ++i;
    ++n;
    H_test->start[n] = i;
    H_test->j[i] = 0;
    H_test->val[i] = -16.;
    ++i;
    H_test->j[i] = 1;
    H_test->val[i] = -43.;
    ++i;
    H_test->j[i] = 2;
    H_test->val[i] = 98.;
    ++i;
    ++n;
    H_test->start[n] = i;

    return H_test;
}
#endif


/* Routine used with qsort for sorting nonzeros within a sparse matrix row
 *
 * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
 */
static int compare_matrix_entry(const void *v1, const void *v2)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    /* larger element has larger column index */
    return *(unsigned int *)v1 - *(unsigned int *)v2;
/* Routine used for sorting nonzeros within a sparse matrix row;
 *  internally, a combination of qsort and manual sorting is utilized
 *  (parallel calls to qsort when multithreading, rows mapped to threads)
 *
 * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
 */
static void Sort_Matrix_Rows( sparse_matrix * const A )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
    #pragma omp parallel default(none) private(i, j, k, si, ei, temp_j, temp_val) shared(stderr)
        if ( ( temp_j = (unsigned int*) malloc( A->n * sizeof(unsigned int)) ) == NULL
                || ( temp_val = (real*) malloc( A->n * sizeof(real)) ) == NULL )
        {
            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
            exit( INSUFFICIENT_MEMORY );
        }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

        /* sort each row of A using column indices */
        #pragma omp for schedule(guided)
        for ( i = 0; i < A->n; ++i )
        {
            si = A->start[i];
            ei = A->start[i + 1];
            memcpy( temp_j, A->j + si, sizeof(unsigned int) * (ei - si) );
            memcpy( temp_val, A->val + si, sizeof(real) * (ei - si) );
            //TODO: consider implementing single custom one-pass sort instead of using qsort + manual sort
            /* polymorphic sort in standard C library using column indices */
            qsort( temp_j, ei - si, sizeof(unsigned int), compare_matrix_entry );
            /* manually sort vals */
            for ( j = 0; j < (ei - si); ++j )
                    if ( A->j[si + j] == temp_j[k] )
                    {
                        A->val[si + k] = temp_val[j];
                        break;
                    }
            memcpy( A->j + si, temp_j, sizeof(unsigned int) * (ei - si) );
static void Calculate_Droptol( const sparse_matrix * const A,
        real * const droptol, const real dtol )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, j, k;
    real val;
#ifdef _OPENMP
    static real *droptol_local;
    unsigned int tid;
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
#ifdef _OPENMP
        tid = omp_get_thread_num();

        #pragma omp master
        {
            /* keep b_local for program duration to avoid allocate/free
             * overhead per Sparse_MatVec call*/
            if ( droptol_local == NULL )
            {
                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
                {
                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );

        #pragma omp barrier
#endif

        /* init droptol to 0 */
        for ( i = 0; i < A->n; ++i )
        {
#ifdef _OPENMP
            droptol_local[tid * A->n + i] = 0.0;
#else
            droptol[i] = 0.0;
#endif
        }

        #pragma omp barrier

        /* calculate sqaure of the norm of each row */
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
            {
                j = A->j[k];
                val = A->val[k];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

#ifdef _OPENMP
                droptol_local[tid * A->n + i] += val * val;
                droptol_local[tid * A->n + j] += val * val;
#else
                droptol[i] += val * val;
                droptol[j] += val * val;
#endif
            }

#ifdef _OPENMP
            droptol_local[tid * A->n + i] += val * val;
#else
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
            droptol[i] += val * val;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

        for ( i = 0; i < A->n; ++i )
        {
            droptol[i] = 0.0;
            for ( k = 0; k < omp_get_num_threads(); ++k )
            {
                droptol[i] += droptol_local[k * A->n + i];
            }
        }
#endif

        #pragma omp barrier

        /* calculate local droptol for each row */
        //fprintf( stderr, "droptol: " );
        for ( i = 0; i < A->n; ++i )
        {
            //fprintf( stderr, "%f-->", droptol[i] );
            droptol[i] = SQRT( droptol[i] ) * dtol;
            //fprintf( stderr, "%f  ", droptol[i] );
        }
        //fprintf( stderr, "\n" );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, j, pj;
    int fillin;
    real val;

    fillin = 0;

    #pragma omp parallel for schedule(static) \
    default(none) private(i, j, pj, val) reduction(+: fillin)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                ++fillin;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    return fillin + A->n;
#if defined(HAVE_SUPERLU_MT)
static real SuperLU_Factorize( const sparse_matrix * const A,
                               sparse_matrix * const L, sparse_matrix * const U )
{
    unsigned int i, pj, count, *Ltop, *Utop, r;
    SuperMatrix A_S, AC_S, L_S, U_S;
    SCPformat *L_S_store;
    NCPformat *U_S_store;
    superlumt_options_t superlumt_options;
    pxgstrf_shared_t pxgstrf_shared;
    pdgstrf_threadarg_t *pdgstrf_threadarg;
    int_t nprocs;
    fact_t fact;
    trans_t trans;
    yes_no_t refact, usepr;
    real u, drop_tol;
    int_t *perm_c; /* column permutation vector */
    int_t *perm_r; /* row permutations from partial pivoting */
    void *work;
    int_t permc_spec, panel_size, relax;
    Gstat_t Gstat;
    flops_t flopcnt;

    /* Default parameters to control factorization. */
#ifdef _OPENMP
    //TODO: set as global parameter and use
    #pragma omp parallel \
    {
        #pragma omp master
        {
            /* SuperLU_MT spawns threads internally, so set and pass parameter */
            nprocs = omp_get_num_threads();
        }
    }
#else
    nprocs = 1;
#endif
//    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
    fact = DOFACT; /* factor from scratch */
    trans = NOTRANS;
    refact = NO; /* first time factorization */
    //TODO: add to control file and use the value there to set these
    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
    usepr = NO;
    drop_tol = 0.0;
    work = NULL;
    lwork = 0;

//#if defined(DEBUG)
    fprintf( stderr, "nprocs = %d\n", nprocs );
    fprintf( stderr, "Panel size = %d\n", panel_size );
    fprintf( stderr, "Relax = %d\n", relax );
//#endif

    if ( !(perm_r = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for perm_r[].");
    }
    {
        SUPERLU_ABORT("Malloc fails for perm_c[].");
    }
    if ( !(superlumt_options.etree = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for etree[].");
    }
    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
    }
    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
    {
        SUPERLU_ABORT("Malloc fails for part_super__h[].");
    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
        exit( INSUFFICIENT_MEMORY );
    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
    {
        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
    }

    /* Set up the sparse matrix data structure for A. */
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
            a[count] = A->entries[pj].val;
            asub[count] = A->entries[pj].j;
            ++count;
        }
        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
        {
            a[count] = A_t->entries[pj].val;
            asub[count] = A_t->entries[pj].j;
            ++count;
        }
    xa[i] = count;

    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6d", asub[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6.1f", a[i] );
    fprintf( stderr, "\n" );
        fprintf( stderr, "%6d", xa[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6d", atsub[i] );
    fprintf( stderr, "\n" );
    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
        fprintf( stderr, "%6.1f", at[i] );
    fprintf( stderr, "\n" );
        fprintf( stderr, "%6d", xat[i] );
    fprintf( stderr, "\n" );

    A_S.Stype = SLU_NC; /* column-wise, no supernode */
    A_S.Dtype = SLU_D; /* double-precision */
    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
    A_S.nrow = A->n;
    A_S.ncol = A->n;
    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
    A_S_store = (NCformat *) A_S.Store;
    A_S_store->nnz = 2 * A->start[A->n] - A->n;
    A_S_store->nzval = at;
    A_S_store->rowind = atsub;
    A_S_store->colptr = xat;

    /* ------------------------------------------------------------
       Allocate storage and initialize statistics variables.
       ------------------------------------------------------------*/
    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
    StatInit( A->n, nprocs, &Gstat );

    /* ------------------------------------------------------------
       Get column permutation vector perm_c[], according to permc_spec:
       permc_spec = 1: minimum degree ordering on structure of A'*A
       permc_spec = 2: minimum degree ordering on structure of A'+A
       permc_spec = 3: approximate minimum degree for unsymmetric matrices
       ------------------------------------------------------------*/
    permc_spec = 0;

    /* ------------------------------------------------------------
       Initialize the option structure superlumt_options using the
       user-input parameters;
       Apply perm_c to the columns of original A to form AC.
       ------------------------------------------------------------*/
    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
                  u, usepr, drop_tol, perm_c, perm_r,
                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
    fprintf( stderr, "\n" );

    /* ------------------------------------------------------------
       Compute the LU factorization of A.
       The following routine will create nprocs threads.
       ------------------------------------------------------------*/
    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );

    fprintf( stderr, "INFO: %d\n", info );
    flopcnt = 0;
    for (i = 0; i < nprocs; ++i)
    {
        flopcnt += Gstat.procstat[i].fcops;
    }
    Gstat.ops[FACT] = flopcnt;

    printf("\n** Result of sparse LU **\n");
    L_S_store = (SCPformat *) L_S.Store;
    U_S_store = (NCPformat *) U_S.Store;
    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
    fflush( stdout );
//#endif
    /* convert L and R from SuperLU formats to CSR */
    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
    memset( Utop, 0, (A->n + 1) * sizeof(int) );
    memset( L->start, 0, (A->n + 1) * sizeof(int) );
    memset( U->start, 0, (A->n + 1) * sizeof(int) );

        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
    fprintf( stderr, "\n" );
        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
    fprintf( stderr, "\n" );

    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
        //TODO: correct for SCPformat for L?
        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
//        {
//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
//        }
        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
        {
            ++Utop[U_S_store->rowind[pj] + 1];
            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
//        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
//        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
//        {
//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
//            L->entries[Ltop[r]].j = r;
//            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
//            ++Ltop[r];
//        }
        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
        {
            r = U_S_store->rowind[pj];
            U->entries[Utop[r]].j = i;
            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
    /* ------------------------------------------------------------
      Deallocate storage after factorization.
      ------------------------------------------------------------*/
    pxgstrf_finalize( &superlumt_options, &AC_S );
    Deallocate_Matrix( A_t );
    free( xa );
    free( asub );
    free( a );
    SUPERLU_FREE( perm_r );
    SUPERLU_FREE( perm_c );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
    SUPERLU_FREE( A_S.Store );
    if ( lwork == 0 )
    {
        Destroy_SuperNode_SCP(&L_S);
        Destroy_CompCol_NCP(&U_S);
    }
    else if ( lwork > 0 )
    {
        SUPERLU_FREE(work);
    }
    StatFree(&Gstat);

    //TODO: return iters
    return 0.;
}
#endif


/* Diagonal (Jacobi) preconditioner computation */
static real diag_pre_comp( const reax_system * const system, real * const Hdia_inv )
    #pragma omp parallel for schedule(static) \
    {
        Hdia_inv[i] = 1.0 / system->reaxprm.sbp[system->atoms[i].type].eta;
    }
/* Incomplete Cholesky factorization with dual thresholding */
static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
                    sparse_matrix * const L, sparse_matrix * const U )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i, j, pj, k1, k2, tmptop, Ltop;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int *Utop;

    if ( ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ||
            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
    {
        fprintf( stderr, "not enough memory for ICHOLT preconditioning matrices. terminating.\n" );
        exit( INSUFFICIENT_MEMORY );
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    // clear variables
    Ltop = 0;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    tmptop = 0;
    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
    memset( Utop, 0, A->n * sizeof(unsigned int) );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

//    fprintf( stderr, "n: %d\n", A->n );
//    fflush( stderr );

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    for ( i = 0; i < A->n; ++i )
    {
        L->start[i] = Ltop;
        tmptop = 0;

        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {

//            fprintf( stderr, "  i: %d, j: %d\n", i, j );
//            fflush( stderr );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
            {
                k1 = 0;
                k2 = L->start[j];
                while ( k1 < tmptop && k2 < L->start[j + 1] )
                {
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                        ++k1;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                        ++k2;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                    else
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                }

                // L matrix is lower triangular,
                // so right before the start of next row comes jth diagonal
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                ++tmptop;
            }

//            fprintf( stderr, " -- done\n" );
//            fflush( stderr );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }

        // sanity check
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }

        // compute the ith diagonal in L
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        for ( k1 = 0; k1 < tmptop; ++k1 )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

        tmp_j[tmptop] = i;
        tmp_val[tmptop] = SQRT(val);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

        // apply the dropping rule once again
        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
        //for( k1 = 0; k1<= tmptop; ++k1 )
        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
        //fprintf( stderr, "\n" );
        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
        for ( k1 = 0; k1 < tmptop; ++k1 )
            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
            {
                L->j[Ltop] = tmp_j[k1];
                L->val[Ltop] = tmp_val[k1];
                U->start[tmp_j[k1] + 1]++;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                ++Ltop;
                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
            }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        // keep the diagonal in any case
        L->j[Ltop] = tmp_j[k1];
        L->val[Ltop] = tmp_val[k1];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        ++Ltop;
        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    L->start[i] = Ltop;
//    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    /* U = L^T (Cholesky factorization) */
    Transpose( L, U );
//    for ( i = 1; i <= U->n; ++i )
//    {
//        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
//    }
//    for ( i = 0; i < L->n; ++i )
//    {
//        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
//        {
//            j = L->j[pj];
//            U->j[Utop[j]] = i;
//            U->val[Utop[j]] = L->val[pj];
//            Utop[j]++;
//        }
//    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

//    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
/* Fine-grained (parallel) incomplete Cholesky factorization
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp. */
static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
                       sparse_matrix * const U_t, sparse_matrix * const U )
{
    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
    {
        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
    #pragma omp parallel for schedule(static) \
    default(none) shared(D_inv, D) private(i)
    for ( i = 0; i < A->n; ++i )
    {
        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./sqrt(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
    #pragma omp parallel for schedule(guided) \
    default(none) shared(DAD, D_inv, D) private(i, pj)
    for ( i = 0; i < A->n; ++i )
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
    /* initial guesses for U^T,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero */
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
        for ( j = 0; j < A->start[A->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 0; k <= A->n; ++k )
                if ( U_t->start[k] > j )
                    x = U_t->start[k - 1];
                    ei_x = U_t->start[k];
            /* column bounds of current nonzero */
            y = U_t->start[U_t->j[j]];
            ei_y = U_t->start[U_t->j[j] + 1];
            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
            while ( U_t->j[x] < U_t->j[j] &&
                    U_t->j[y] < U_t->j[j] &&
                    x < ei_x && y < ei_y )
            {
            /* diagonal entries */
                    fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n");
#if defined(DEBUG_FOCUS)
                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
                             k - 1, A->entries[j].j, A->entries[j].val );
                    fprintf( stderr, "sum = %10.3f\n", sum);
#endif
            /* non-diagonal entries */
    /* apply inverse transformation D^{-1}U^{T},
     * since DAD \approx U^{T}U, so
     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
    #pragma omp parallel for schedule(guided) \
    default(none) shared(D_inv) private(i, pj)
    for ( i = 0; i < A->n; ++i )
    {
        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
        {
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif

    /* transpose U^{T} and copy into U */
    Transpose( U_t, U );
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif

    Deallocate_Matrix( DAD );
    free(D_inv);
    free(D);
/* Fine-grained (parallel) incomplete LU factorization
 * Reference:
 * Edmond Chow and Aftab Patel
 * Fine-Grained Parallel Incomplete LU Factorization
 * SIAM J. Sci. Comp.
 * A: symmetric, half-stored (lower triangular), CSR format
 * sweeps: number of loops over non-zeros for computation
 * L / U: factorized triangular matrices (A \approx LU), CSR format */
static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
        sparse_matrix * const L, sparse_matrix * const U )
    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
    {
        fprintf( stderr, "not enough memory for ILU_PAR preconditioning matrices. terminating.\n" );
    #pragma omp parallel for schedule(static) \
    default(none) shared(D, D_inv) private(i)
        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
        D[i] = 1.0 / D_inv[i];
    }

    /* to get convergence, A must have unit diagonal, so apply
     * transformation DAD, where D = D(1./sqrt(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
    #pragma omp parallel for schedule(static) \
    default(none) shared(DAD, D) private(i, pj)
    for ( i = 0; i < A->n; ++i )
    {
        /* non-diagonals */
        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
        {
            DAD->j[pj] = A->j[pj];
            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
    }

    /* initial guesses for L and U,
     * assume: A and DAD symmetric and stored lower triangular */
    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
    /* store U^T in CSR for row-wise access and tranpose later */
    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
    #pragma omp parallel for schedule(static) default(none) private(i)
    }

    for ( i = 0; i < sweeps; ++i )
    {
        /* for each nonzero in L */
        #pragma omp parallel for schedule(static) \
        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
            sum = ZERO;

            /* determine row bounds of current nonzero */
            x = 0;
            ei_x = 0;
            for ( k = 1; k <= DAD->n; ++k )
            {
                if ( DAD->start[k] > j )
                {
                    x = DAD->start[k - 1];
                    ei_x = DAD->start[k];
                    break;
                }
            }
            /* determine column bounds of current nonzero */