diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 5003d28ae68aab663a5ec26640cdd6258de1dc0c..d2f038c43f072b0beec502d44e9900e13ae18026 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -32,1388 +32,6 @@
 #endif
 
 
-typedef struct
-{
-    unsigned int j;
-    real val;
-} sparse_matrix_entry;
-
-
-#if defined(TEST_MAT)
-static sparse_matrix * create_test_mat( void )
-{
-    unsigned int i, n;
-    sparse_matrix *H_test;
-
-    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    //3x3, SPD, store lower half
-    i = 0;
-    n = 0;
-    H_test->start[n] = i;
-    H_test->j[i] = 0;
-    H_test->val[i] = 4.;
-    ++i;
-    ++n;
-    H_test->start[n] = i;
-    H_test->j[i] = 0;
-    H_test->val[i] = 12.;
-    ++i;
-    H_test->j[i] = 1;
-    H_test->val[i] = 37.;
-    ++i;
-    ++n;
-    H_test->start[n] = i;
-    H_test->j[i] = 0;
-    H_test->val[i] = -16.;
-    ++i;
-    H_test->j[i] = 1;
-    H_test->val[i] = -43.;
-    ++i;
-    H_test->j[i] = 2;
-    H_test->val[i] = 98.;
-    ++i;
-    ++n;
-    H_test->start[n] = i;
-
-    return H_test;
-}
-#endif
-
-
-/* Routine used with qsort for sorting nonzeros within a sparse matrix row
- *
- * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
- */
-static int compare_matrix_entry(const void *v1, const void *v2)
-{
-    /* larger element has larger column index */
-    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
-}
-
-
-/* Routine used for sorting nonzeros within a sparse matrix row;
- *  internally, a combination of qsort and manual sorting is utilized
- *  (parallel calls to qsort when multithreading, rows mapped to threads)
- *
- * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
- */
-static void Sort_Matrix_Rows( sparse_matrix * const A )
-{
-    unsigned int i, j, si, ei;
-    sparse_matrix_entry *temp;
-
-#ifdef _OPENMP
-//    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
-#endif
-    {
-        if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL )
-        {
-            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
-            exit( INSUFFICIENT_MEMORY );
-        }
-
-        /* sort each row of A using column indices */
-#ifdef _OPENMP
-//        #pragma omp for schedule(guided)
-#endif
-        for ( i = 0; i < A->n; ++i )
-        {
-            si = A->start[i];
-            ei = A->start[i + 1];
-
-            for ( j = 0; j < (ei - si); ++j )
-            {
-                (temp + j)->j = A->j[si + j];
-                (temp + j)->val = A->val[si + j];
-            }
-
-            /* polymorphic sort in standard C library using column indices */
-            qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );
-
-            for ( j = 0; j < (ei - si); ++j )
-            {
-                A->j[si + j] = (temp + j)->j;
-                A->val[si + j] = (temp + j)->val;
-            }
-        }
-
-        sfree( temp, "Sort_Matrix_Rows::temp" );
-    }
-}
-
-
-static void Calculate_Droptol( const sparse_matrix * const A,
-        real * const droptol, const real dtol )
-{
-    int i, j, k;
-    real val;
-#ifdef _OPENMP
-    static real *droptol_local;
-    unsigned int tid;
-#endif
-
-#ifdef _OPENMP
-    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
-#endif
-    {
-#ifdef _OPENMP
-        tid = omp_get_thread_num();
-
-        #pragma omp master
-        {
-            /* keep b_local for program duration to avoid allocate/free
-             * overhead per Sparse_MatVec call*/
-            if ( droptol_local == NULL )
-            {
-                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
-                {
-                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
-            }
-        }
-
-        #pragma omp barrier
-#endif
-
-        /* init droptol to 0 */
-        for ( i = 0; i < A->n; ++i )
-        {
-#ifdef _OPENMP
-            droptol_local[tid * A->n + i] = 0.0;
-#else
-            droptol[i] = 0.0;
-#endif
-        }
-
-#ifdef _OPENMP
-        #pragma omp barrier
-#endif
-
-        /* calculate sqaure of the norm of each row */
-#ifdef _OPENMP
-        #pragma omp for schedule(static)
-#endif
-        for ( i = 0; i < A->n; ++i )
-        {
-            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
-            {
-                j = A->j[k];
-                val = A->val[k];
-
-#ifdef _OPENMP
-                droptol_local[tid * A->n + i] += val * val;
-                droptol_local[tid * A->n + j] += val * val;
-#else
-                droptol[i] += val * val;
-                droptol[j] += val * val;
-#endif
-            }
-
-            // diagonal entry
-            val = A->val[k];
-#ifdef _OPENMP
-            droptol_local[tid * A->n + i] += val * val;
-#else
-            droptol[i] += val * val;
-#endif
-        }
-
-#ifdef _OPENMP
-        #pragma omp barrier
-
-        #pragma omp for schedule(static)
-        for ( i = 0; i < A->n; ++i )
-        {
-            droptol[i] = 0.0;
-            for ( k = 0; k < omp_get_num_threads(); ++k )
-            {
-                droptol[i] += droptol_local[k * A->n + i];
-            }
-        }
-
-        #pragma omp barrier
-#endif
-
-        /* calculate local droptol for each row */
-        //fprintf( stderr, "droptol: " );
-#ifdef _OPENMP
-        #pragma omp for schedule(static)
-#endif
-        for ( i = 0; i < A->n; ++i )
-        {
-            //fprintf( stderr, "%f-->", droptol[i] );
-            droptol[i] = SQRT( droptol[i] ) * dtol;
-            //fprintf( stderr, "%f  ", droptol[i] );
-        }
-        //fprintf( stderr, "\n" );
-    }
-}
-
-
-static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
-{
-    int i, pj;
-    int fillin;
-    real val;
-
-    fillin = 0;
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i, pj, val) reduction(+: fillin)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            val = A->val[pj];
-
-            if ( FABS(val) > droptol[i] )
-            {
-                ++fillin;
-            }
-        }
-    }
-
-    return fillin + A->n;
-}
-
-
-#if defined(HAVE_SUPERLU_MT)
-static real SuperLU_Factorize( const sparse_matrix * const A,
-        sparse_matrix * const L, sparse_matrix * const U )
-{
-    unsigned int i, pj, count, *Ltop, *Utop, r;
-    sparse_matrix *A_t;
-    SuperMatrix A_S, AC_S, L_S, U_S;
-    NCformat *A_S_store;
-    SCPformat *L_S_store;
-    NCPformat *U_S_store;
-    superlumt_options_t superlumt_options;
-    pxgstrf_shared_t pxgstrf_shared;
-    pdgstrf_threadarg_t *pdgstrf_threadarg;
-    int_t nprocs;
-    fact_t fact;
-    trans_t trans;
-    yes_no_t refact, usepr;
-    real u, drop_tol;
-    real *a, *at;
-    int_t *asub, *atsub, *xa, *xat;
-    int_t *perm_c; /* column permutation vector */
-    int_t *perm_r; /* row permutations from partial pivoting */
-    void *work;
-    int_t info, lwork;
-    int_t permc_spec, panel_size, relax;
-    Gstat_t Gstat;
-    flops_t flopcnt;
-
-    /* Default parameters to control factorization. */
-#ifdef _OPENMP
-    //TODO: set as global parameter and use
-    #pragma omp parallel \
-        default(none) shared(nprocs)
-    {
-        #pragma omp master
-        {
-            /* SuperLU_MT spawns threads internally, so set and pass parameter */
-            nprocs = omp_get_num_threads();
-        }
-    }
-#else
-    nprocs = 1;
-#endif
-
-//    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
-    fact = DOFACT; /* factor from scratch */
-    trans = NOTRANS;
-    refact = NO; /* first time factorization */
-    //TODO: add to control file and use the value there to set these
-    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
-    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
-    u = 1.0; /* diagonal pivoting threshold */
-    usepr = NO;
-    drop_tol = 0.0;
-    work = NULL;
-    lwork = 0;
-
-//#if defined(DEBUG)
-    fprintf( stderr, "nprocs = %d\n", nprocs );
-    fprintf( stderr, "Panel size = %d\n", panel_size );
-    fprintf( stderr, "Relax = %d\n", relax );
-//#endif
-
-    if ( !(perm_r = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for perm_r[].");
-    }
-    if ( !(perm_c = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for perm_c[].");
-    }
-    if ( !(superlumt_options.etree = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for etree[].");
-    }
-    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
-    }
-    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
-    {
-        SUPERLU_ABORT("Malloc fails for part_super__h[].");
-    }
-    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
-            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
-            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
-            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
-            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
-    {
-        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    /* Set up the sparse matrix data structure for A. */
-    Transpose( A, A_t );
-
-    count = 0;
-    for ( i = 0; i < A->n; ++i )
-    {
-        xa[i] = count;
-        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-        {
-            a[count] = A->entries[pj].val;
-            asub[count] = A->entries[pj].j;
-            ++count;
-        }
-        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
-        {
-            a[count] = A_t->entries[pj].val;
-            asub[count] = A_t->entries[pj].j;
-            ++count;
-        }
-    }
-    xa[i] = count;
-
-    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
-                         &at, &atsub, &xat );
-
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6d", asub[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6.1f", a[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i <= A->n; ++i )
-        fprintf( stderr, "%6d", xa[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6d", atsub[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
-        fprintf( stderr, "%6.1f", at[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i <= A->n; ++i )
-        fprintf( stderr, "%6d", xat[i] );
-    fprintf( stderr, "\n" );
-
-    A_S.Stype = SLU_NC; /* column-wise, no supernode */
-    A_S.Dtype = SLU_D; /* double-precision */
-    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
-    A_S.nrow = A->n;
-    A_S.ncol = A->n;
-    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
-    A_S_store = (NCformat *) A_S.Store;
-    A_S_store->nnz = 2 * A->start[A->n] - A->n;
-    A_S_store->nzval = at;
-    A_S_store->rowind = atsub;
-    A_S_store->colptr = xat;
-
-    /* ------------------------------------------------------------
-       Allocate storage and initialize statistics variables.
-       ------------------------------------------------------------*/
-    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
-    StatInit( A->n, nprocs, &Gstat );
-
-    /* ------------------------------------------------------------
-       Get column permutation vector perm_c[], according to permc_spec:
-       permc_spec = 0: natural ordering
-       permc_spec = 1: minimum degree ordering on structure of A'*A
-       permc_spec = 2: minimum degree ordering on structure of A'+A
-       permc_spec = 3: approximate minimum degree for unsymmetric matrices
-       ------------------------------------------------------------*/
-    permc_spec = 0;
-    get_perm_c( permc_spec, &A_S, perm_c );
-
-    /* ------------------------------------------------------------
-       Initialize the option structure superlumt_options using the
-       user-input parameters;
-       Apply perm_c to the columns of original A to form AC.
-       ------------------------------------------------------------*/
-    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
-                  u, usepr, drop_tol, perm_c, perm_r,
-                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
-
-    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
-        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
-    fprintf( stderr, "\n" );
-
-    /* ------------------------------------------------------------
-       Compute the LU factorization of A.
-       The following routine will create nprocs threads.
-       ------------------------------------------------------------*/
-    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
-
-    fprintf( stderr, "INFO: %d\n", info );
-
-    flopcnt = 0;
-    for (i = 0; i < nprocs; ++i)
-    {
-        flopcnt += Gstat.procstat[i].fcops;
-    }
-    Gstat.ops[FACT] = flopcnt;
-
-//#if defined(DEBUG)
-    printf("\n** Result of sparse LU **\n");
-    L_S_store = (SCPformat *) L_S.Store;
-    U_S_store = (NCPformat *) U_S.Store;
-    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
-    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
-    fflush( stdout );
-//#endif
-
-    /* convert L and R from SuperLU formats to CSR */
-    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
-    memset( Utop, 0, (A->n + 1) * sizeof(int) );
-    memset( L->start, 0, (A->n + 1) * sizeof(int) );
-    memset( U->start, 0, (A->n + 1) * sizeof(int) );
-
-    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
-        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
-    fprintf( stderr, "\n" );
-    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
-        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
-    fprintf( stderr, "\n" );
-
-    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
-    for ( i = 0; i < A->n; ++i )
-    {
-        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
-        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
-        //TODO: correct for SCPformat for L?
-        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
-//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
-//        {
-//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
-//        }
-        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
-        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
-        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
-        {
-            ++Utop[U_S_store->rowind[pj] + 1];
-            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
-        }
-    }
-    for ( i = 1; i <= A->n; ++i )
-    {
-//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
-        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
-//        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
-//        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
-    }
-    for ( i = 0; i < A->n; ++i )
-    {
-//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
-//        {
-//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
-//            L->entries[Ltop[r]].j = r;
-//            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
-//            ++Ltop[r];
-//        }
-        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
-        {
-            r = U_S_store->rowind[pj];
-            U->entries[Utop[r]].j = i;
-            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
-            ++Utop[r];
-        }
-    }
-
-    /* ------------------------------------------------------------
-      Deallocate storage after factorization.
-      ------------------------------------------------------------*/
-    pxgstrf_finalize( &superlumt_options, &AC_S );
-    Deallocate_Matrix( A_t );
-    sfree( xa, "SuperLU_Factorize::xa" );
-    sfree( asub, "SuperLU_Factorize::asub" );
-    sfree( a, "SuperLU_Factorize::a" );
-    SUPERLU_FREE( perm_r );
-    SUPERLU_FREE( perm_c );
-    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
-    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
-    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
-    SUPERLU_FREE( A_S.Store );
-    if ( lwork == 0 )
-    {
-        Destroy_SuperNode_SCP(&L_S);
-        Destroy_CompCol_NCP(&U_S);
-    }
-    else if ( lwork > 0 )
-    {
-        SUPERLU_FREE(work);
-    }
-    StatFree(&Gstat);
-
-    sfree( Utop, "SuperLU_Factorize::Utop" );
-    sfree( Ltop, "SuperLU_Factorize::Ltop" );
-
-    //TODO: return iters
-    return 0.;
-}
-#endif
-
-
-/* Diagonal (Jacobi) preconditioner computation */
-static real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
-{
-    unsigned int i;
-    real start;
-
-    start = Get_Time( );
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i)
-#endif
-    for ( i = 0; i < H->n; ++i )
-    {
-        if ( H->val[H->start[i + 1] - 1] != 0.0 )
-        {
-            Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
-        }
-        else
-        {
-            Hdia_inv[i] = 1.0;
-        }
-    }
-
-    return Get_Timing_Info( start );
-}
-
-
-/* Incomplete Cholesky factorization with dual thresholding */
-static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
-        sparse_matrix * const L, sparse_matrix * const U )
-{
-    int *tmp_j;
-    real *tmp_val;
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val, start;
-    unsigned int *Utop;
-
-    start = Get_Time( );
-
-    if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
-            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
-            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    // clear variables
-    Ltop = 0;
-    tmptop = 0;
-    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
-    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
-    memset( Utop, 0, A->n * sizeof(unsigned int) );
-
-    for ( i = 0; i < A->n; ++i )
-    {
-        L->start[i] = Ltop;
-        tmptop = 0;
-
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            j = A->j[pj];
-            val = A->val[pj];
-
-            if ( FABS(val) > droptol[i] )
-            {
-                k1 = 0;
-                k2 = L->start[j];
-                while ( k1 < tmptop && k2 < L->start[j + 1] )
-                {
-                    if ( tmp_j[k1] < L->j[k2] )
-                    {
-                        ++k1;
-                    }
-                    else if ( tmp_j[k1] > L->j[k2] )
-                    {
-                        ++k2;
-                    }
-                    else
-                    {
-                        val -= (tmp_val[k1++] * L->val[k2++]);
-                    }
-                }
-
-                // L matrix is lower triangular,
-                // so right before the start of next row comes jth diagonal
-                val /= L->val[L->start[j + 1] - 1];
-
-                tmp_j[tmptop] = j;
-                tmp_val[tmptop] = val;
-                ++tmptop;
-            }
-        }
-
-        // sanity check
-        if ( A->j[pj] != i )
-        {
-            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
-            exit( NUMERIC_BREAKDOWN );
-        }
-
-        // compute the ith diagonal in L
-        val = A->val[pj];
-        for ( k1 = 0; k1 < tmptop; ++k1 )
-        {
-            val -= (tmp_val[k1] * tmp_val[k1]);
-        }
-
-#if defined(DEBUG)
-        if ( val < 0.0 )
-        {
-            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
-            exit( NUMERIC_BREAKDOWN );
-
-        }
-#endif
-
-        tmp_j[tmptop] = i;
-        tmp_val[tmptop] = SQRT( val );
-
-        // apply the dropping rule once again
-        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-        //for( k1 = 0; k1<= tmptop; ++k1 )
-        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-        //fprintf( stderr, "\n" );
-        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-        for ( k1 = 0; k1 < tmptop; ++k1 )
-        {
-            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
-            {
-                L->j[Ltop] = tmp_j[k1];
-                L->val[Ltop] = tmp_val[k1];
-                U->start[tmp_j[k1] + 1]++;
-                ++Ltop;
-                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-            }
-        }
-        // keep the diagonal in any case
-        L->j[Ltop] = tmp_j[k1];
-        L->val[Ltop] = tmp_val[k1];
-        ++Ltop;
-        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-    }
-
-    L->start[i] = Ltop;
-//    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-
-    /* U = L^T (Cholesky factorization) */
-    Transpose( L, U );
-//    for ( i = 1; i <= U->n; ++i )
-//    {
-//        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
-//    }
-//    for ( i = 0; i < L->n; ++i )
-//    {
-//        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
-//        {
-//            j = L->j[pj];
-//            U->j[Utop[j]] = i;
-//            U->val[Utop[j]] = L->val[pj];
-//            Utop[j]++;
-//        }
-//    }
-
-//    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-
-    sfree( tmp_val, "ICHOLT::tmp_val" );
-    sfree( tmp_j, "ICHOLT::tmp_j" );
-    sfree( Utop, "ICHOLT::Utop" );
-
-    return Get_Timing_Info( start );
-}
-
-
-/* Fine-grained (parallel) incomplete Cholesky factorization
- *
- * Reference:
- * Edmond Chow and Aftab Patel
- * Fine-Grained Parallel Incomplete LU Factorization
- * SIAM J. Sci. Comp. */
-#if defined(TESTING)
-static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
-        sparse_matrix * const U_t, sparse_matrix * const U )
-{
-    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
-    real *D, *D_inv, sum, start;
-    sparse_matrix *DAD;
-    int *Utop;
-
-    start = Get_Time( );
-
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
-    {
-        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(D_inv, D) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
-        D[i] = 1. / D_inv[i];
-    }
-
-    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
-    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
-
-    /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./SQRT(D(A))) */
-    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(guided) \
-        default(none) shared(DAD, D_inv, D) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        /* non-diagonals */
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            DAD->j[pj] = A->j[pj];
-            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
-        }
-        /* diagonal */
-        DAD->j[pj] = A->j[pj];
-        DAD->val[pj] = 1.;
-    }
-
-    /* initial guesses for U^T,
-     * assume: A and DAD symmetric and stored lower triangular */
-    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
-    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
-
-    for ( i = 0; i < sweeps; ++i )
-    {
-        /* for each nonzero */
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
-#endif
-        for ( j = 0; j < A->start[A->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 0; k <= A->n; ++k )
-            {
-                if ( U_t->start[k] > j )
-                {
-                    x = U_t->start[k - 1];
-                    ei_x = U_t->start[k];
-                    break;
-                }
-            }
-            /* column bounds of current nonzero */
-            y = U_t->start[U_t->j[j]];
-            ei_y = U_t->start[U_t->j[j] + 1];
-
-            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
-            while ( U_t->j[x] < U_t->j[j] &&
-                    U_t->j[y] < U_t->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( U_t->j[x] == U_t->j[y] )
-                {
-                    sum += (U_t->val[x] * U_t->val[y]);
-                    ++x;
-                    ++y;
-                }
-                else if ( U_t->j[x] < U_t->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            sum = DAD->val[j] - sum;
-
-            /* diagonal entries */
-            if ( (k - 1) == U_t->j[j] )
-            {
-                /* sanity check */
-                if ( sum < ZERO )
-                {
-                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
-#if defined(DEBUG_FOCUS)
-                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
-                             k - 1, A->entries[j].j, A->entries[j].val );
-                    fprintf( stderr, "sum = %10.3f\n", sum);
-#endif
-                    exit(NUMERIC_BREAKDOWN);
-                }
-
-                U_t->val[j] = SQRT( sum );
-            }
-            /* non-diagonal entries */
-            else
-            {
-                U_t->val[j] = sum / U_t->val[ei_y - 1];
-            }
-        }
-    }
-
-    /* apply inverse transformation D^{-1}U^{T},
-     * since DAD \approx U^{T}U, so
-     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(guided) \
-        default(none) shared(D_inv) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-        {
-            U_t->val[pj] *= D_inv[i];
-        }
-    }
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
-#endif
-
-    /* transpose U^{T} and copy into U */
-    Transpose( U_t, U );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-#endif
-
-    Deallocate_Matrix( DAD );
-    sfree( D_inv, "ICHOL_PAR::D_inv" );
-    sfree( D, "ICHOL_PAR::D" );
-    sfree( Utop, "ICHOL_PAR::Utop" );
-
-    return Get_Timing_Info( start );
-}
-#endif
-
-
-/* Fine-grained (parallel) incomplete LU factorization
- *
- * Reference:
- * Edmond Chow and Aftab Patel
- * Fine-Grained Parallel Incomplete LU Factorization
- * SIAM J. Sci. Comp.
- *
- * A: symmetric, half-stored (lower triangular), CSR format
- * sweeps: number of loops over non-zeros for computation
- * L / U: factorized triangular matrices (A \approx LU), CSR format */
-static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
-        sparse_matrix * const L, sparse_matrix * const U )
-{
-    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
-    real *D, *D_inv, sum, start;
-    sparse_matrix *DAD;
-
-    start = Get_Time( );
-
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(D, D_inv) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
-        D[i] = 1.0 / D_inv[i];
-//        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
-    }
-
-    /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
-    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        /* non-diagonals */
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            DAD->j[pj] = A->j[pj];
-            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
-        }
-        /* diagonal */
-        DAD->j[pj] = A->j[pj];
-        DAD->val[pj] = 1.0;
-    }
-
-    /* initial guesses for L and U,
-     * assume: A and DAD symmetric and stored lower triangular */
-    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-    /* store U^T in CSR for row-wise access and tranpose later */
-    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-
-    /* L has unit diagonal, by convention */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) default(none) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        L->val[L->start[i + 1] - 1] = 1.0;
-    }
-
-    for ( i = 0; i < sweeps; ++i )
-    {
-        /* for each nonzero in L */
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
-            while ( L->j[x] < L->j[j] &&
-                    L->j[y] < L->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( L->j[x] == L->j[y] )
-                {
-                    sum += (L->val[x] * U->val[y]);
-                    ++x;
-                    ++y;
-                }
-                else if ( L->j[x] < L->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            if ( j != ei_x - 1 )
-            {
-                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
-            }
-        }
-
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
-            while ( U->j[x] < U->j[j] &&
-                    U->j[y] < U->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( U->j[x] == U->j[y] )
-                {
-                    sum += (L->val[y] * U->val[x]);
-                    ++x;
-                    ++y;
-                }
-                else if ( U->j[x] < U->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            U->val[j] = DAD->val[j] - sum;
-        }
-    }
-
-    /* apply inverse transformation:
-     * since DAD \approx LU, then
-     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D_inv) private(i, pj)
-#endif
-    for ( i = 0; i < DAD->n; ++i )
-    {
-        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
-        {
-            L->val[pj] = D_inv[i] * L->val[pj];
-            /* currently storing U^T, so use row index instead of column index */
-            U->val[pj] = U->val[pj] * D_inv[i];
-        }
-    }
-
-    Transpose_I( U );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
-    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-#endif
-
-    Deallocate_Matrix( DAD );
-    sfree( D_inv, "ILU_PAR::D_inv" );
-    sfree( D, "ILU_PAR::D_inv" );
-
-    return Get_Timing_Info( start );
-}
-
-
-/* Fine-grained (parallel) incomplete LU factorization with thresholding
- *
- * Reference:
- * Edmond Chow and Aftab Patel
- * Fine-Grained Parallel Incomplete LU Factorization
- * SIAM J. Sci. Comp.
- *
- * A: symmetric, half-stored (lower triangular), CSR format
- * droptol: row-wise tolerances used for dropping
- * sweeps: number of loops over non-zeros for computation
- * L / U: factorized triangular matrices (A \approx LU), CSR format */
-static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
-                      const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
-{
-    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
-    real *D, *D_inv, sum, start;
-    sparse_matrix *DAD, *L_temp, *U_temp;
-
-    start = Get_Time( );
-
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
-            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(D, D_inv) private(i)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
-        D[i] = 1.0 / D_inv[i];
-    }
-
-    /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./SQRT(D(A))) */
-    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D) private(i, pj)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        /* non-diagonals */
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
-        {
-            DAD->j[pj] = A->j[pj];
-            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
-        }
-        /* diagonal */
-        DAD->j[pj] = A->j[pj];
-        DAD->val[pj] = 1.0;
-    }
-
-    /* initial guesses for L and U,
-     * assume: A and DAD symmetric and stored lower triangular */
-    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-    /* store U^T in CSR for row-wise access and tranpose later */
-    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
-    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
-    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
-
-    /* L has unit diagonal, by convention */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i) shared(L_temp)
-#endif
-    for ( i = 0; i < A->n; ++i )
-    {
-        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
-    }
-
-    for ( i = 0; i < sweeps; ++i )
-    {
-        /* for each nonzero in L */
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
-            while ( L_temp->j[x] < L_temp->j[j] &&
-                    L_temp->j[y] < L_temp->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( L_temp->j[x] == L_temp->j[y] )
-                {
-                    sum += (L_temp->val[x] * U_temp->val[y]);
-                    ++x;
-                    ++y;
-                }
-                else if ( L_temp->j[x] < L_temp->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            if ( j != ei_x - 1 )
-            {
-                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
-            }
-        }
-
-#ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
-#endif
-        for ( j = 0; j < DAD->start[DAD->n]; ++j )
-        {
-            sum = ZERO;
-
-            /* determine row bounds of current nonzero */
-            x = 0;
-            ei_x = 0;
-            for ( k = 1; k <= DAD->n; ++k )
-            {
-                if ( DAD->start[k] > j )
-                {
-                    x = DAD->start[k - 1];
-                    ei_x = DAD->start[k];
-                    break;
-                }
-            }
-            /* determine column bounds of current nonzero */
-            y = DAD->start[DAD->j[j]];
-            ei_y = DAD->start[DAD->j[j] + 1];
-
-            /* sparse dot product:
-             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
-            while ( U_temp->j[x] < U_temp->j[j] &&
-                    U_temp->j[y] < U_temp->j[j] &&
-                    x < ei_x && y < ei_y )
-            {
-                if ( U_temp->j[x] == U_temp->j[y] )
-                {
-                    sum += (L_temp->val[y] * U_temp->val[x]);
-                    ++x;
-                    ++y;
-                }
-                else if ( U_temp->j[x] < U_temp->j[y] )
-                {
-                    ++x;
-                }
-                else
-                {
-                    ++y;
-                }
-            }
-
-            U_temp->val[j] = DAD->val[j] - sum;
-        }
-    }
-
-    /* apply inverse transformation:
-     * since DAD \approx LU, then
-     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
-#endif
-    for ( i = 0; i < DAD->n; ++i )
-    {
-        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
-        {
-            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
-            /* currently storing U^T, so use row index instead of column index */
-            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
-        }
-    }
-
-    /* apply the dropping rule */
-    Ltop = 0;
-    Utop = 0;
-    for ( i = 0; i < DAD->n; ++i )
-    {
-        L->start[i] = Ltop;
-        U->start[i] = Utop;
-
-        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
-        {
-            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
-            {
-                L->j[Ltop] = L_temp->j[pj];
-                L->val[Ltop] = L_temp->val[pj];
-                ++Ltop;
-            }
-        }
-
-        /* diagonal */
-        L->j[Ltop] = L_temp->j[pj];
-        L->val[Ltop] = L_temp->val[pj];
-        ++Ltop;
-
-        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
-        {
-            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
-            {
-                U->j[Utop] = U_temp->j[pj];
-                U->val[Utop] = U_temp->val[pj];
-                ++Utop;
-            }
-        }
-
-        /* diagonal */
-        U->j[Utop] = U_temp->j[pj];
-        U->val[Utop] = U_temp->val[pj];
-        ++Utop;
-    }
-
-    L->start[i] = Ltop;
-    U->start[i] = Utop;
-
-    Transpose_I( U );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
-    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
-#endif
-
-    Deallocate_Matrix( U_temp );
-    Deallocate_Matrix( L_temp );
-    Deallocate_Matrix( DAD );
-    sfree( D_inv, "ILUT_PAR::D_inv" );
-    sfree( D, "ILUT_PAR::D_inv" );
-
-    return Get_Timing_Info( start );
-}
-
-
 static void Extrapolate_Charges_QEq( const reax_system * const system,
         const control_params * const control,
         simulation_data * const data, static_storage * const workspace )
@@ -1584,6 +202,10 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 #endif
             break;
 
+        case SAI_PC:
+            //TODO: call function
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -1928,6 +550,10 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
 #endif
             break;
 
+        case SAI_PC:
+            //TODO: call function
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2031,6 +657,10 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
 #endif
             break;
 
+        case SAI_PC:
+            //TODO: call function
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2194,6 +824,10 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
             }
             break;
 
+        case SAI_PC:
+            //TODO: call setup function
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2341,6 +975,10 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
             }
             break;
 
+        case SAI_PC:
+            //TODO: call setup function
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -2490,6 +1128,10 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
             }
             break;
 
+        case SAI_PC:
+            //TODO: call setup function
+            break;
+
         default:
             fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
             exit( INVALID_INPUT );
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 2094d4f48cc9925eae23240ab170b84911f496fc..4a3fdf809c3115ea425c4a15e6e338f3e0428585 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -26,39 +26,1421 @@
 #include "vector.h"
 
 
+typedef struct
+{
+    unsigned int j;
+    real val;
+} sparse_matrix_entry;
+
+
 /* global to make OpenMP shared (Sparse_MatVec) */
 #ifdef _OPENMP
-real *b_local = NULL;
+real *b_local = NULL;
+#endif
+/* global to make OpenMP shared (apply_preconditioner) */
+real *Dinv_L = NULL, *Dinv_U = NULL;
+/* global to make OpenMP shared (tri_solve_level_sched) */
+int levels = 1;
+int levels_L = 1, levels_U = 1;
+unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
+unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
+unsigned int *row_levels, *level_rows, *level_rows_cnt;
+unsigned int *top = NULL;
+/* global to make OpenMP shared (graph_coloring) */
+unsigned int *color = NULL;
+unsigned int *to_color = NULL;
+unsigned int *conflict = NULL;
+unsigned int *conflict_cnt = NULL;
+unsigned int *temp_ptr;
+unsigned int *recolor = NULL;
+unsigned int recolor_cnt;
+unsigned int *color_top = NULL;
+/* global to make OpenMP shared (sort_colors) */
+unsigned int *permuted_row_col = NULL;
+unsigned int *permuted_row_col_inv = NULL;
+real *y_p = NULL;
+/* global to make OpenMP shared (permute_vector) */
+real *x_p = NULL;
+unsigned int *mapping = NULL;
+sparse_matrix *H_full;
+sparse_matrix *H_p;
+/* global to make OpenMP shared (jacobi_iter) */
+real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
+
+
+#if defined(TEST_MAT)
+static sparse_matrix * create_test_mat( void )
+{
+    unsigned int i, n;
+    sparse_matrix *H_test;
+
+    if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    //3x3, SPD, store lower half
+    i = 0;
+    n = 0;
+    H_test->start[n] = i;
+    H_test->j[i] = 0;
+    H_test->val[i] = 4.;
+    ++i;
+    ++n;
+    H_test->start[n] = i;
+    H_test->j[i] = 0;
+    H_test->val[i] = 12.;
+    ++i;
+    H_test->j[i] = 1;
+    H_test->val[i] = 37.;
+    ++i;
+    ++n;
+    H_test->start[n] = i;
+    H_test->j[i] = 0;
+    H_test->val[i] = -16.;
+    ++i;
+    H_test->j[i] = 1;
+    H_test->val[i] = -43.;
+    ++i;
+    H_test->j[i] = 2;
+    H_test->val[i] = 98.;
+    ++i;
+    ++n;
+    H_test->start[n] = i;
+
+    return H_test;
+}
+#endif
+
+
+/* Routine used with qsort for sorting nonzeros within a sparse matrix row
+ *
+ * v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
+ */
+static int compare_matrix_entry(const void *v1, const void *v2)
+{
+    /* larger element has larger column index */
+    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
+}
+
+
+/* Routine used for sorting nonzeros within a sparse matrix row;
+ *  internally, a combination of qsort and manual sorting is utilized
+ *  (parallel calls to qsort when multithreading, rows mapped to threads)
+ *
+ * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
+ */
+void Sort_Matrix_Rows( sparse_matrix * const A )
+{
+    unsigned int i, j, si, ei;
+    sparse_matrix_entry *temp;
+
+#ifdef _OPENMP
+//    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
+#endif
+    {
+        if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL )
+        {
+            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+
+        /* sort each row of A using column indices */
+#ifdef _OPENMP
+//        #pragma omp for schedule(guided)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            si = A->start[i];
+            ei = A->start[i + 1];
+
+            for ( j = 0; j < (ei - si); ++j )
+            {
+                (temp + j)->j = A->j[si + j];
+                (temp + j)->val = A->val[si + j];
+            }
+
+            /* polymorphic sort in standard C library using column indices */
+            qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );
+
+            for ( j = 0; j < (ei - si); ++j )
+            {
+                A->j[si + j] = (temp + j)->j;
+                A->val[si + j] = (temp + j)->val;
+            }
+        }
+
+        sfree( temp, "Sort_Matrix_Rows::temp" );
+    }
+}
+
+
+void Calculate_Droptol( const sparse_matrix * const A,
+        real * const droptol, const real dtol )
+{
+    int i, j, k;
+    real val;
+#ifdef _OPENMP
+    static real *droptol_local;
+    unsigned int tid;
+#endif
+
+#ifdef _OPENMP
+    #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
+#endif
+    {
+#ifdef _OPENMP
+        tid = omp_get_thread_num();
+
+        #pragma omp master
+        {
+            /* keep b_local for program duration to avoid allocate/free
+             * overhead per Sparse_MatVec call*/
+            if ( droptol_local == NULL )
+            {
+                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
+                {
+                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
+                    exit( INSUFFICIENT_MEMORY );
+                }
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* init droptol to 0 */
+        for ( i = 0; i < A->n; ++i )
+        {
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] = 0.0;
+#else
+            droptol[i] = 0.0;
+#endif
+        }
+
+#ifdef _OPENMP
+        #pragma omp barrier
+#endif
+
+        /* calculate sqaure of the norm of each row */
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
+            {
+                j = A->j[k];
+                val = A->val[k];
+
+#ifdef _OPENMP
+                droptol_local[tid * A->n + i] += val * val;
+                droptol_local[tid * A->n + j] += val * val;
+#else
+                droptol[i] += val * val;
+                droptol[j] += val * val;
+#endif
+            }
+
+            // diagonal entry
+            val = A->val[k];
+#ifdef _OPENMP
+            droptol_local[tid * A->n + i] += val * val;
+#else
+            droptol[i] += val * val;
+#endif
+        }
+
+#ifdef _OPENMP
+        #pragma omp barrier
+
+        #pragma omp for schedule(static)
+        for ( i = 0; i < A->n; ++i )
+        {
+            droptol[i] = 0.0;
+            for ( k = 0; k < omp_get_num_threads(); ++k )
+            {
+                droptol[i] += droptol_local[k * A->n + i];
+            }
+        }
+
+        #pragma omp barrier
+#endif
+
+        /* calculate local droptol for each row */
+        //fprintf( stderr, "droptol: " );
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            //fprintf( stderr, "%f-->", droptol[i] );
+            droptol[i] = SQRT( droptol[i] ) * dtol;
+            //fprintf( stderr, "%f  ", droptol[i] );
+        }
+        //fprintf( stderr, "\n" );
+    }
+}
+
+
+int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
+{
+    int i, pj;
+    int fillin;
+    real val;
+
+    fillin = 0;
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i, pj, val) reduction(+: fillin)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            val = A->val[pj];
+
+            if ( FABS(val) > droptol[i] )
+            {
+                ++fillin;
+            }
+        }
+    }
+
+    return fillin + A->n;
+}
+
+
+#if defined(HAVE_SUPERLU_MT)
+real SuperLU_Factorize( const sparse_matrix * const A,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, pj, count, *Ltop, *Utop, r;
+    sparse_matrix *A_t;
+    SuperMatrix A_S, AC_S, L_S, U_S;
+    NCformat *A_S_store;
+    SCPformat *L_S_store;
+    NCPformat *U_S_store;
+    superlumt_options_t superlumt_options;
+    pxgstrf_shared_t pxgstrf_shared;
+    pdgstrf_threadarg_t *pdgstrf_threadarg;
+    int_t nprocs;
+    fact_t fact;
+    trans_t trans;
+    yes_no_t refact, usepr;
+    real u, drop_tol;
+    real *a, *at;
+    int_t *asub, *atsub, *xa, *xat;
+    int_t *perm_c; /* column permutation vector */
+    int_t *perm_r; /* row permutations from partial pivoting */
+    void *work;
+    int_t info, lwork;
+    int_t permc_spec, panel_size, relax;
+    Gstat_t Gstat;
+    flops_t flopcnt;
+
+    /* Default parameters to control factorization. */
+#ifdef _OPENMP
+    //TODO: set as global parameter and use
+    #pragma omp parallel \
+        default(none) shared(nprocs)
+    {
+        #pragma omp master
+        {
+            /* SuperLU_MT spawns threads internally, so set and pass parameter */
+            nprocs = omp_get_num_threads();
+        }
+    }
+#else
+    nprocs = 1;
+#endif
+
+//    fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
+    fact = DOFACT; /* factor from scratch */
+    trans = NOTRANS;
+    refact = NO; /* first time factorization */
+    //TODO: add to control file and use the value there to set these
+    panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
+    relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
+    u = 1.0; /* diagonal pivoting threshold */
+    usepr = NO;
+    drop_tol = 0.0;
+    work = NULL;
+    lwork = 0;
+
+//#if defined(DEBUG)
+    fprintf( stderr, "nprocs = %d\n", nprocs );
+    fprintf( stderr, "Panel size = %d\n", panel_size );
+    fprintf( stderr, "Relax = %d\n", relax );
+//#endif
+
+    if ( !(perm_r = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_r[].");
+    }
+    if ( !(perm_c = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for perm_c[].");
+    }
+    if ( !(superlumt_options.etree = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for etree[].");
+    }
+    if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for colcnt_h[].");
+    }
+    if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
+    {
+        SUPERLU_ABORT("Malloc fails for part_super__h[].");
+    }
+    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
+            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
+            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
+            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
+            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
+    {
+        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+    if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    /* Set up the sparse matrix data structure for A. */
+    Transpose( A, A_t );
+
+    count = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+        xa[i] = count;
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            a[count] = A->entries[pj].val;
+            asub[count] = A->entries[pj].j;
+            ++count;
+        }
+        for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
+        {
+            a[count] = A_t->entries[pj].val;
+            asub[count] = A_t->entries[pj].j;
+            ++count;
+        }
+    }
+    xa[i] = count;
+
+    dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
+                         &at, &atsub, &xat );
+
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", asub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", a[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xa[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6d", atsub[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
+        fprintf( stderr, "%6.1f", at[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i <= A->n; ++i )
+        fprintf( stderr, "%6d", xat[i] );
+    fprintf( stderr, "\n" );
+
+    A_S.Stype = SLU_NC; /* column-wise, no supernode */
+    A_S.Dtype = SLU_D; /* double-precision */
+    A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
+    A_S.nrow = A->n;
+    A_S.ncol = A->n;
+    A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    A_S_store = (NCformat *) A_S.Store;
+    A_S_store->nnz = 2 * A->start[A->n] - A->n;
+    A_S_store->nzval = at;
+    A_S_store->rowind = atsub;
+    A_S_store->colptr = xat;
+
+    /* ------------------------------------------------------------
+       Allocate storage and initialize statistics variables.
+       ------------------------------------------------------------*/
+    StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
+    StatInit( A->n, nprocs, &Gstat );
+
+    /* ------------------------------------------------------------
+       Get column permutation vector perm_c[], according to permc_spec:
+       permc_spec = 0: natural ordering
+       permc_spec = 1: minimum degree ordering on structure of A'*A
+       permc_spec = 2: minimum degree ordering on structure of A'+A
+       permc_spec = 3: approximate minimum degree for unsymmetric matrices
+       ------------------------------------------------------------*/
+    permc_spec = 0;
+    get_perm_c( permc_spec, &A_S, perm_c );
+
+    /* ------------------------------------------------------------
+       Initialize the option structure superlumt_options using the
+       user-input parameters;
+       Apply perm_c to the columns of original A to form AC.
+       ------------------------------------------------------------*/
+    pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
+                  u, usepr, drop_tol, perm_c, perm_r,
+                  work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
+
+    for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    /* ------------------------------------------------------------
+       Compute the LU factorization of A.
+       The following routine will create nprocs threads.
+       ------------------------------------------------------------*/
+    pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
+
+    fprintf( stderr, "INFO: %d\n", info );
+
+    flopcnt = 0;
+    for (i = 0; i < nprocs; ++i)
+    {
+        flopcnt += Gstat.procstat[i].fcops;
+    }
+    Gstat.ops[FACT] = flopcnt;
+
+//#if defined(DEBUG)
+    printf("\n** Result of sparse LU **\n");
+    L_S_store = (SCPformat *) L_S.Store;
+    U_S_store = (NCPformat *) U_S.Store;
+    printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
+    printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
+    fflush( stdout );
+//#endif
+
+    /* convert L and R from SuperLU formats to CSR */
+    memset( Ltop, 0, (A->n + 1) * sizeof(int) );
+    memset( Utop, 0, (A->n + 1) * sizeof(int) );
+    memset( L->start, 0, (A->n + 1) * sizeof(int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(int) );
+
+    for ( i = 0; i < 2 * L_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+    for ( i = 0; i < 2 * U_S_store->nnz; ++i )
+        fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
+    fprintf( stderr, "\n" );
+
+    printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
+    for ( i = 0; i < A->n; ++i )
+    {
+        fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
+        fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
+        //TODO: correct for SCPformat for L?
+        //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
+//        for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
+//        {
+//            ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
+//        }
+        fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
+        fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            ++Utop[U_S_store->rowind[pj] + 1];
+            fprintf( stderr, "Utop[%5d]     = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
+        }
+    }
+    for ( i = 1; i <= A->n; ++i )
+    {
+//        Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
+        Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
+//        fprintf( stderr, "Utop[%5d]     = %d\n", i, Utop[i] );
+//        fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
+    }
+    for ( i = 0; i < A->n; ++i )
+    {
+//        for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
+//        {
+//            r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
+//            L->entries[Ltop[r]].j = r;
+//            L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
+//            ++Ltop[r];
+//        }
+        for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
+        {
+            r = U_S_store->rowind[pj];
+            U->entries[Utop[r]].j = i;
+            U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
+            ++Utop[r];
+        }
+    }
+
+    /* ------------------------------------------------------------
+      Deallocate storage after factorization.
+      ------------------------------------------------------------*/
+    pxgstrf_finalize( &superlumt_options, &AC_S );
+    Deallocate_Matrix( A_t );
+    sfree( xa, "SuperLU_Factorize::xa" );
+    sfree( asub, "SuperLU_Factorize::asub" );
+    sfree( a, "SuperLU_Factorize::a" );
+    SUPERLU_FREE( perm_r );
+    SUPERLU_FREE( perm_c );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
+    SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
+    SUPERLU_FREE( A_S.Store );
+    if ( lwork == 0 )
+    {
+        Destroy_SuperNode_SCP(&L_S);
+        Destroy_CompCol_NCP(&U_S);
+    }
+    else if ( lwork > 0 )
+    {
+        SUPERLU_FREE(work);
+    }
+    StatFree(&Gstat);
+
+    sfree( Utop, "SuperLU_Factorize::Utop" );
+    sfree( Ltop, "SuperLU_Factorize::Ltop" );
+
+    //TODO: return iters
+    return 0.;
+}
+#endif
+
+
+/* Diagonal (Jacobi) preconditioner computation */
+real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
+{
+    unsigned int i;
+    real start;
+
+    start = Get_Time( );
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i)
+#endif
+    for ( i = 0; i < H->n; ++i )
+    {
+        if ( H->val[H->start[i + 1] - 1] != 0.0 )
+        {
+            Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
+        }
+        else
+        {
+            Hdia_inv[i] = 1.0;
+        }
+    }
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Incomplete Cholesky factorization with dual thresholding */
+real ICHOLT( const sparse_matrix * const A, const real * const droptol,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    int *tmp_j;
+    real *tmp_val;
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val, start;
+    unsigned int *Utop;
+
+    start = Get_Time( );
+
+    if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
+            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
+            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
+    memset( Utop, 0, A->n * sizeof(unsigned int) );
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->start[i] = Ltop;
+        tmptop = 0;
+
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            j = A->j[pj];
+            val = A->val[pj];
+
+            if ( FABS(val) > droptol[i] )
+            {
+                k1 = 0;
+                k2 = L->start[j];
+                while ( k1 < tmptop && k2 < L->start[j + 1] )
+                {
+                    if ( tmp_j[k1] < L->j[k2] )
+                    {
+                        ++k1;
+                    }
+                    else if ( tmp_j[k1] > L->j[k2] )
+                    {
+                        ++k2;
+                    }
+                    else
+                    {
+                        val -= (tmp_val[k1++] * L->val[k2++]);
+                    }
+                }
+
+                // L matrix is lower triangular,
+                // so right before the start of next row comes jth diagonal
+                val /= L->val[L->start[j + 1] - 1];
+
+                tmp_j[tmptop] = j;
+                tmp_val[tmptop] = val;
+                ++tmptop;
+            }
+        }
+
+        // sanity check
+        if ( A->j[pj] != i )
+        {
+            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
+            exit( NUMERIC_BREAKDOWN );
+        }
+
+        // compute the ith diagonal in L
+        val = A->val[pj];
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            val -= (tmp_val[k1] * tmp_val[k1]);
+        }
+
+#if defined(DEBUG)
+        if ( val < 0.0 )
+        {
+            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
+            exit( NUMERIC_BREAKDOWN );
+
+        }
+#endif
+
+        tmp_j[tmptop] = i;
+        tmp_val[tmptop] = SQRT( val );
+
+        // apply the dropping rule once again
+        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
+        //for( k1 = 0; k1<= tmptop; ++k1 )
+        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
+        //fprintf( stderr, "\n" );
+        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
+        for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
+            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
+            {
+                L->j[Ltop] = tmp_j[k1];
+                L->val[Ltop] = tmp_val[k1];
+                U->start[tmp_j[k1] + 1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        }
+        // keep the diagonal in any case
+        L->j[Ltop] = tmp_j[k1];
+        L->val[Ltop] = tmp_val[k1];
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+
+    L->start[i] = Ltop;
+//    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+
+    /* U = L^T (Cholesky factorization) */
+    Transpose( L, U );
+//    for ( i = 1; i <= U->n; ++i )
+//    {
+//        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
+//    }
+//    for ( i = 0; i < L->n; ++i )
+//    {
+//        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
+//        {
+//            j = L->j[pj];
+//            U->j[Utop[j]] = i;
+//            U->val[Utop[j]] = L->val[pj];
+//            Utop[j]++;
+//        }
+//    }
+
+//    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+
+    sfree( tmp_val, "ICHOLT::tmp_val" );
+    sfree( tmp_j, "ICHOLT::tmp_j" );
+    sfree( Utop, "ICHOLT::Utop" );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete Cholesky factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp. */
+#if defined(TESTING)
+real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+        sparse_matrix * const U_t, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+    int *Utop;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D_inv, D) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D[i] = 1. / D_inv[i];
+    }
+
+    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
+    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(DAD, D_inv, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.;
+    }
+
+    /* initial guesses for U^T,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
+    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
+#endif
+        for ( j = 0; j < A->start[A->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 0; k <= A->n; ++k )
+            {
+                if ( U_t->start[k] > j )
+                {
+                    x = U_t->start[k - 1];
+                    ei_x = U_t->start[k];
+                    break;
+                }
+            }
+            /* column bounds of current nonzero */
+            y = U_t->start[U_t->j[j]];
+            ei_y = U_t->start[U_t->j[j] + 1];
+
+            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
+            while ( U_t->j[x] < U_t->j[j] &&
+                    U_t->j[y] < U_t->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_t->j[x] == U_t->j[y] )
+                {
+                    sum += (U_t->val[x] * U_t->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_t->j[x] < U_t->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            sum = DAD->val[j] - sum;
+
+            /* diagonal entries */
+            if ( (k - 1) == U_t->j[j] )
+            {
+                /* sanity check */
+                if ( sum < ZERO )
+                {
+                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
+#if defined(DEBUG_FOCUS)
+                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
+                             k - 1, A->entries[j].j, A->entries[j].val );
+                    fprintf( stderr, "sum = %10.3f\n", sum);
+#endif
+                    exit(NUMERIC_BREAKDOWN);
+                }
+
+                U_t->val[j] = SQRT( sum );
+            }
+            /* non-diagonal entries */
+            else
+            {
+                U_t->val[j] = sum / U_t->val[ei_y - 1];
+            }
+        }
+    }
+
+    /* apply inverse transformation D^{-1}U^{T},
+     * since DAD \approx U^{T}U, so
+     * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(D_inv) private(i, pj)
 #endif
-/* global to make OpenMP shared (apply_preconditioner) */
-real *Dinv_L = NULL, *Dinv_U = NULL;
-/* global to make OpenMP shared (tri_solve_level_sched) */
-int levels = 1;
-int levels_L = 1, levels_U = 1;
-unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
-unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
-unsigned int *row_levels, *level_rows, *level_rows_cnt;
-unsigned int *top = NULL;
-/* global to make OpenMP shared (graph_coloring) */
-unsigned int *color = NULL;
-unsigned int *to_color = NULL;
-unsigned int *conflict = NULL;
-unsigned int *conflict_cnt = NULL;
-unsigned int *temp_ptr;
-unsigned int *recolor = NULL;
-unsigned int recolor_cnt;
-unsigned int *color_top = NULL;
-/* global to make OpenMP shared (sort_colors) */
-unsigned int *permuted_row_col = NULL;
-unsigned int *permuted_row_col_inv = NULL;
-real *y_p = NULL;
-/* global to make OpenMP shared (permute_vector) */
-real *x_p = NULL;
-unsigned int *mapping = NULL;
-sparse_matrix *H_full;
-sparse_matrix *H_p;
-/* global to make OpenMP shared (jacobi_iter) */
-real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            U_t->val[pj] *= D_inv[i];
+        }
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
+#endif
+
+    /* transpose U^{T} and copy into U */
+    Transpose( U_t, U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ICHOL_PAR::D_inv" );
+    sfree( D, "ICHOL_PAR::D" );
+    sfree( Utop, "ICHOL_PAR::Utop" );
+
+    return Get_Timing_Info( start );
+}
+#endif
+
+
+/* Fine-grained (parallel) incomplete LU factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+//        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and tranpose later */
+    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) default(none) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->val[L->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L->j[x] < L->j[j] &&
+                    L->j[y] < L->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L->j[x] == L->j[y] )
+                {
+                    sum += (L->val[x] * U->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L->j[x] < L->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if ( j != ei_x - 1 )
+            {
+                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
+            }
+        }
+
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U->j[x] < U->j[j] &&
+                    U->j[y] < U->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U->j[x] == U->j[y] )
+                {
+                    sum += (L->val[y] * U->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U->j[x] < U->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L->val[pj] = D_inv[i] * L->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U->val[pj] = U->val[pj] * D_inv[i];
+        }
+    }
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
+    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ILU_PAR::D_inv" );
+    sfree( D, "ILU_PAR::D_inv" );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Fine-grained (parallel) incomplete LU factorization with thresholding
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * droptol: row-wise tolerances used for dropping
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
+        const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD, *L_temp, *U_temp;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * transformation DAD, where D = D(1./SQRT(D(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and tranpose later */
+    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i) shared(L_temp)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L_temp->j[x] < L_temp->j[j] &&
+                    L_temp->j[y] < L_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L_temp->j[x] == L_temp->j[y] )
+                {
+                    sum += (L_temp->val[x] * U_temp->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L_temp->j[x] < L_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            if ( j != ei_x - 1 )
+            {
+                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
+            }
+        }
+
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             *   dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U_temp->j[x] < U_temp->j[j] &&
+                    U_temp->j[y] < U_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_temp->j[x] == U_temp->j[y] )
+                {
+                    sum += (L_temp->val[y] * U_temp->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_temp->j[x] < U_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U_temp->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply inverse transformation:
+     * since DAD \approx LU, then
+     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
+            /* currently storing U^T, so use row index instead of column index */
+            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
+        }
+    }
+
+    /* apply the dropping rule */
+    Ltop = 0;
+    Utop = 0;
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        L->start[i] = Ltop;
+        U->start[i] = Utop;
+
+        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
+            {
+                L->j[Ltop] = L_temp->j[pj];
+                L->val[Ltop] = L_temp->val[pj];
+                ++Ltop;
+            }
+        }
+
+        /* diagonal */
+        L->j[Ltop] = L_temp->j[pj];
+        L->val[Ltop] = L_temp->val[pj];
+        ++Ltop;
+
+        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
+            {
+                U->j[Utop] = U_temp->j[pj];
+                U->val[Utop] = U_temp->val[pj];
+                ++Utop;
+            }
+        }
+
+        /* diagonal */
+        U->j[Utop] = U_temp->j[pj];
+        U->val[Utop] = U_temp->val[pj];
+        ++Utop;
+    }
+
+    L->start[i] = Ltop;
+    U->start[i] = Utop;
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
+    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
+#endif
+
+    Deallocate_Matrix( U_temp );
+    Deallocate_Matrix( L_temp );
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ILUT_PAR::D_inv" );
+    sfree( D, "ILUT_PAR::D_inv" );
+
+    return Get_Timing_Info( start );
+}
 
 
 /* sparse matrix-vector product Ax=b
@@ -1133,6 +2515,10 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
                 tri_solve( workspace->U, x, x, workspace->U->n, UPPER );
                 break;
+            case SAI_PC:
+                //TODO: add code to compute SAI first
+//                Sparse_MatVec( SAI, y, x );
+                break;
             default:
                 fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
@@ -1151,6 +2537,9 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                 tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
                 break;
+            case SAI_PC:
+                //TODO: add code to compute SAI first
+//                Sparse_MatVec( SAI, y, x );
             default:
                 fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                 exit( INVALID_INPUT );
@@ -1161,6 +2550,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
             switch ( control->cm_solver_pre_comp_type )
             {
             case DIAG_PC:
+            case SAI_PC:
                 fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                 exit( INVALID_INPUT );
                 break;
@@ -1189,6 +2579,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
             switch ( control->cm_solver_pre_comp_type )
             {
             case DIAG_PC:
+            case SAI_PC:
                 fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                 exit( INVALID_INPUT );
                 break;
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 3946f60a7e628475a29485c31f48577ba0a70eee..ceb2a18bfcfb6dc8ddaff3e8c807459a9d401d0d 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -30,6 +30,33 @@ typedef enum
     UPPER = 1,
 } TRIANGULARITY;
 
+void Sort_Matrix_Rows( sparse_matrix * const );
+
+int Estimate_LU_Fill( const sparse_matrix * const, const real * const );
+
+void Calculate_Droptol( const sparse_matrix * const,
+        real * const, const real );
+
+#if defined(HAVE_SUPERLU_MT)
+real SuperLU_Factorize( const sparse_matrix * const,
+        sparse_matrix * const, sparse_matrix * const );
+#endif
+
+real diag_pre_comp( const sparse_matrix * const, real * const );
+
+real ICHOLT( const sparse_matrix * const, const real * const,
+        sparse_matrix * const, sparse_matrix * const );
+
+#if defined(TESTING)
+real ICHOL_PAR( const sparse_matrix * const, const unsigned int,
+        sparse_matrix * const, sparse_matrix * const );
+#endif
+
+real ILU_PAR( const sparse_matrix * const, const unsigned int,
+        sparse_matrix * const, sparse_matrix * const );
+
+real ILUT_PAR( const sparse_matrix * const, const real *,
+        const unsigned int, sparse_matrix * const, sparse_matrix * const );
 
 void Transpose( const sparse_matrix * const, sparse_matrix * const );
 
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index f75ad16891e70fb130233f0d506b1ffb5c05f0fc..e68c830aa6174255183bbac5eb03b483e54d7fcb 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -233,6 +233,7 @@ enum pre_comp
     ILU_PAR_PC = 3,
     ILUT_PAR_PC = 4,
     ILU_SUPERLU_MT_PC = 5,
+    SAI_PC = 6,
 };
 
 enum pre_app