diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c index 5003d28ae68aab663a5ec26640cdd6258de1dc0c..d2f038c43f072b0beec502d44e9900e13ae18026 100644 --- a/sPuReMD/src/charges.c +++ b/sPuReMD/src/charges.c @@ -32,1388 +32,6 @@ #endif -typedef struct -{ - unsigned int j; - real val; -} sparse_matrix_entry; - - -#if defined(TEST_MAT) -static sparse_matrix * create_test_mat( void ) -{ - unsigned int i, n; - sparse_matrix *H_test; - - if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE ) - { - fprintf( stderr, "not enough memory for test matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - //3x3, SPD, store lower half - i = 0; - n = 0; - H_test->start[n] = i; - H_test->j[i] = 0; - H_test->val[i] = 4.; - ++i; - ++n; - H_test->start[n] = i; - H_test->j[i] = 0; - H_test->val[i] = 12.; - ++i; - H_test->j[i] = 1; - H_test->val[i] = 37.; - ++i; - ++n; - H_test->start[n] = i; - H_test->j[i] = 0; - H_test->val[i] = -16.; - ++i; - H_test->j[i] = 1; - H_test->val[i] = -43.; - ++i; - H_test->j[i] = 2; - H_test->val[i] = 98.; - ++i; - ++n; - H_test->start[n] = i; - - return H_test; -} -#endif - - -/* Routine used with qsort for sorting nonzeros within a sparse matrix row - * - * v1/v2: pointers to column indices of nonzeros within a row (unsigned int) - */ -static int compare_matrix_entry(const void *v1, const void *v2) -{ - /* larger element has larger column index */ - return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; -} - - -/* Routine used for sorting nonzeros within a sparse matrix row; - * internally, a combination of qsort and manual sorting is utilized - * (parallel calls to qsort when multithreading, rows mapped to threads) - * - * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format - */ -static void Sort_Matrix_Rows( sparse_matrix * const A ) -{ - unsigned int i, j, si, ei; - sparse_matrix_entry *temp; - -#ifdef _OPENMP -// #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr) -#endif - { - if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL ) - { - fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - /* sort each row of A using column indices */ -#ifdef _OPENMP -// #pragma omp for schedule(guided) -#endif - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - ei = A->start[i + 1]; - - for ( j = 0; j < (ei - si); ++j ) - { - (temp + j)->j = A->j[si + j]; - (temp + j)->val = A->val[si + j]; - } - - /* polymorphic sort in standard C library using column indices */ - qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry ); - - for ( j = 0; j < (ei - si); ++j ) - { - A->j[si + j] = (temp + j)->j; - A->val[si + j] = (temp + j)->val; - } - } - - sfree( temp, "Sort_Matrix_Rows::temp" ); - } -} - - -static void Calculate_Droptol( const sparse_matrix * const A, - real * const droptol, const real dtol ) -{ - int i, j, k; - real val; -#ifdef _OPENMP - static real *droptol_local; - unsigned int tid; -#endif - -#ifdef _OPENMP - #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) -#endif - { -#ifdef _OPENMP - tid = omp_get_thread_num(); - - #pragma omp master - { - /* keep b_local for program duration to avoid allocate/free - * overhead per Sparse_MatVec call*/ - if ( droptol_local == NULL ) - { - if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL ) - { - fprintf( stderr, "Not enough space for droptol. 
Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } - } - } - - #pragma omp barrier -#endif - - /* init droptol to 0 */ - for ( i = 0; i < A->n; ++i ) - { -#ifdef _OPENMP - droptol_local[tid * A->n + i] = 0.0; -#else - droptol[i] = 0.0; -#endif - } - -#ifdef _OPENMP - #pragma omp barrier -#endif - - /* calculate sqaure of the norm of each row */ -#ifdef _OPENMP - #pragma omp for schedule(static) -#endif - for ( i = 0; i < A->n; ++i ) - { - for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) - { - j = A->j[k]; - val = A->val[k]; - -#ifdef _OPENMP - droptol_local[tid * A->n + i] += val * val; - droptol_local[tid * A->n + j] += val * val; -#else - droptol[i] += val * val; - droptol[j] += val * val; -#endif - } - - // diagonal entry - val = A->val[k]; -#ifdef _OPENMP - droptol_local[tid * A->n + i] += val * val; -#else - droptol[i] += val * val; -#endif - } - -#ifdef _OPENMP - #pragma omp barrier - - #pragma omp for schedule(static) - for ( i = 0; i < A->n; ++i ) - { - droptol[i] = 0.0; - for ( k = 0; k < omp_get_num_threads(); ++k ) - { - droptol[i] += droptol_local[k * A->n + i]; - } - } - - #pragma omp barrier -#endif - - /* calculate local droptol for each row */ - //fprintf( stderr, "droptol: " ); -#ifdef _OPENMP - #pragma omp for schedule(static) -#endif - for ( i = 0; i < A->n; ++i ) - { - //fprintf( stderr, "%f-->", droptol[i] ); - droptol[i] = SQRT( droptol[i] ) * dtol; - //fprintf( stderr, "%f ", droptol[i] ); - } - //fprintf( stderr, "\n" ); - } -} - - -static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol ) -{ - int i, pj; - int fillin; - real val; - - fillin = 0; - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i, pj, val) reduction(+: fillin) -#endif - for ( i = 0; i < A->n; ++i ) - { - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - val = A->val[pj]; - - if ( FABS(val) > droptol[i] ) - { - ++fillin; - } - } - } - - return fillin + A->n; -} - - -#if defined(HAVE_SUPERLU_MT) -static real SuperLU_Factorize( const sparse_matrix * const A, - sparse_matrix * const L, sparse_matrix * const U ) -{ - unsigned int i, pj, count, *Ltop, *Utop, r; - sparse_matrix *A_t; - SuperMatrix A_S, AC_S, L_S, U_S; - NCformat *A_S_store; - SCPformat *L_S_store; - NCPformat *U_S_store; - superlumt_options_t superlumt_options; - pxgstrf_shared_t pxgstrf_shared; - pdgstrf_threadarg_t *pdgstrf_threadarg; - int_t nprocs; - fact_t fact; - trans_t trans; - yes_no_t refact, usepr; - real u, drop_tol; - real *a, *at; - int_t *asub, *atsub, *xa, *xat; - int_t *perm_c; /* column permutation vector */ - int_t *perm_r; /* row permutations from partial pivoting */ - void *work; - int_t info, lwork; - int_t permc_spec, panel_size, relax; - Gstat_t Gstat; - flops_t flopcnt; - - /* Default parameters to control factorization. */ -#ifdef _OPENMP - //TODO: set as global parameter and use - #pragma omp parallel \ - default(none) shared(nprocs) - { - #pragma omp master - { - /* SuperLU_MT spawns threads internally, so set and pass parameter */ - nprocs = omp_get_num_threads(); - } - } -#else - nprocs = 1; -#endif - -// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ - fact = DOFACT; /* factor from scratch */ - trans = NOTRANS; - refact = NO; /* first time factorization */ - //TODO: add to control file and use the value there to set these - panel_size = sp_ienv(1); /* # consec. 
cols treated as unit task */ - relax = sp_ienv(2); /* # cols grouped as relaxed supernode */ - u = 1.0; /* diagonal pivoting threshold */ - usepr = NO; - drop_tol = 0.0; - work = NULL; - lwork = 0; - -//#if defined(DEBUG) - fprintf( stderr, "nprocs = %d\n", nprocs ); - fprintf( stderr, "Panel size = %d\n", panel_size ); - fprintf( stderr, "Relax = %d\n", relax ); -//#endif - - if ( !(perm_r = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for perm_r[]."); - } - if ( !(perm_c = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for perm_c[]."); - } - if ( !(superlumt_options.etree = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for etree[]."); - } - if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for colcnt_h[]."); - } - if ( !(superlumt_options.part_super_h = intMalloc(A->n)) ) - { - SUPERLU_ABORT("Malloc fails for part_super__h[]."); - } - if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL ) - || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL ) - || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) - || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) - || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) ) - { - fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } - if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - /* Set up the sparse matrix data structure for A. */ - Transpose( A, A_t ); - - count = 0; - for ( i = 0; i < A->n; ++i ) - { - xa[i] = count; - for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) - { - a[count] = A->entries[pj].val; - asub[count] = A->entries[pj].j; - ++count; - } - for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) - { - a[count] = A_t->entries[pj].val; - asub[count] = A_t->entries[pj].j; - ++count; - } - } - xa[i] = count; - - dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, - &at, &atsub, &xat ); - - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6d", asub[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6.1f", a[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i <= A->n; ++i ) - fprintf( stderr, "%6d", xa[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6d", atsub[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) - fprintf( stderr, "%6.1f", at[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i <= A->n; ++i ) - fprintf( stderr, "%6d", xat[i] ); - fprintf( stderr, "\n" ); - - A_S.Stype = SLU_NC; /* column-wise, no supernode */ - A_S.Dtype = SLU_D; /* double-precision */ - A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */ - A_S.nrow = A->n; - A_S.ncol = A->n; - A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); - A_S_store = (NCformat *) A_S.Store; - A_S_store->nnz = 2 * A->start[A->n] - A->n; - A_S_store->nzval = at; - A_S_store->rowind = atsub; - A_S_store->colptr = xat; - - /* ------------------------------------------------------------ - Allocate storage and initialize statistics variables. 
- ------------------------------------------------------------*/ - StatAlloc( A->n, nprocs, panel_size, relax, &Gstat ); - StatInit( A->n, nprocs, &Gstat ); - - /* ------------------------------------------------------------ - Get column permutation vector perm_c[], according to permc_spec: - permc_spec = 0: natural ordering - permc_spec = 1: minimum degree ordering on structure of A'*A - permc_spec = 2: minimum degree ordering on structure of A'+A - permc_spec = 3: approximate minimum degree for unsymmetric matrices - ------------------------------------------------------------*/ - permc_spec = 0; - get_perm_c( permc_spec, &A_S, perm_c ); - - /* ------------------------------------------------------------ - Initialize the option structure superlumt_options using the - user-input parameters; - Apply perm_c to the columns of original A to form AC. - ------------------------------------------------------------*/ - pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, - u, usepr, drop_tol, perm_c, perm_r, - work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); - - for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) - fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); - fprintf( stderr, "\n" ); - - /* ------------------------------------------------------------ - Compute the LU factorization of A. - The following routine will create nprocs threads. - ------------------------------------------------------------*/ - pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info ); - - fprintf( stderr, "INFO: %d\n", info ); - - flopcnt = 0; - for (i = 0; i < nprocs; ++i) - { - flopcnt += Gstat.procstat[i].fcops; - } - Gstat.ops[FACT] = flopcnt; - -//#if defined(DEBUG) - printf("\n** Result of sparse LU **\n"); - L_S_store = (SCPformat *) L_S.Store; - U_S_store = (NCPformat *) U_S.Store; - printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz ); - printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz ); - fflush( stdout ); -//#endif - - /* convert L and R from SuperLU formats to CSR */ - memset( Ltop, 0, (A->n + 1) * sizeof(int) ); - memset( Utop, 0, (A->n + 1) * sizeof(int) ); - memset( L->start, 0, (A->n + 1) * sizeof(int) ); - memset( U->start, 0, (A->n + 1) * sizeof(int) ); - - for ( i = 0; i < 2 * L_S_store->nnz; ++i ) - fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] ); - fprintf( stderr, "\n" ); - for ( i = 0; i < 2 * U_S_store->nnz; ++i ) - fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] ); - fprintf( stderr, "\n" ); - - printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper ); - for ( i = 0; i < A->n; ++i ) - { - fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] ); - fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] ); - //TODO: correct for SCPformat for L? 
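/* Reading aid for the factor conversions below: building CSR storage from
 * SuperLU's column-oriented factors is a counting sort over row indices:
 * (1) histogram the nonzeros per row (the ++Utop[rowind + 1] pass),
 * (2) prefix-sum the counts so each row knows its insertion offset,
 * (3) scatter the values while walking the columns. A minimal serial sketch
 * of the same pattern follows; csc_to_csr and its parameter names are
 * hypothetical (not part of this patch), and the real type and <string.h>
 * are assumed from the surrounding code: */
static void csc_to_csr( const unsigned int n, const unsigned int nnz,
        const real * const cval, const unsigned int * const crow,
        const unsigned int * const cptr, real * const val,
        unsigned int * const col, unsigned int * const rptr )
{
    unsigned int i, pj, r;

    /* (1) histogram: rptr[r + 1] counts the nonzeros of row r */
    memset( rptr, 0, (n + 1) * sizeof(unsigned int) );
    for ( pj = 0; pj < nnz; ++pj )
    {
        ++rptr[crow[pj] + 1];
    }

    /* (2) prefix sum: rptr[r] becomes the first free slot of row r */
    for ( r = 1; r <= n; ++r )
    {
        rptr[r] += rptr[r - 1];
    }

    /* (3) scatter: append each entry of column i at its row's next free slot;
     * since columns are visited in ascending order, each CSR row ends up
     * sorted by column index */
    for ( i = 0; i < n; ++i )
    {
        for ( pj = cptr[i]; pj < cptr[i + 1]; ++pj )
        {
            r = crow[pj];
            col[rptr[r]] = i;
            val[rptr[r]] = cval[pj];
            ++rptr[r];
        }
    }

    /* the scatter advanced each rptr[r] to the start of row r + 1,
     * so shift right once to restore the CSR row pointers */
    for ( r = n; r > 0; --r )
    {
        rptr[r] = rptr[r - 1];
    }
    rptr[0] = 0;
}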
- //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj ) -// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj ) -// { -// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1]; -// } - fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] ); - fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] ); - for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) - { - ++Utop[U_S_store->rowind[pj] + 1]; - fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] ); - } - } - for ( i = 1; i <= A->n; ++i ) - { -// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; - Utop[i] = U->start[i] = Utop[i] + Utop[i - 1]; -// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); -// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); - } - for ( i = 0; i < A->n; ++i ) - { -// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) -// { -// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj]; -// L->entries[Ltop[r]].j = r; -// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; -// ++Ltop[r]; -// } - for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) - { - r = U_S_store->rowind[pj]; - U->entries[Utop[r]].j = i; - U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj]; - ++Utop[r]; - } - } - - /* ------------------------------------------------------------ - Deallocate storage after factorization. - ------------------------------------------------------------*/ - pxgstrf_finalize( &superlumt_options, &AC_S ); - Deallocate_Matrix( A_t ); - sfree( xa, "SuperLU_Factorize::xa" ); - sfree( asub, "SuperLU_Factorize::asub" ); - sfree( a, "SuperLU_Factorize::a" ); - SUPERLU_FREE( perm_r ); - SUPERLU_FREE( perm_c ); - SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind ); - SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr ); - SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval ); - SUPERLU_FREE( A_S.Store ); - if ( lwork == 0 ) - { - Destroy_SuperNode_SCP(&L_S); - Destroy_CompCol_NCP(&U_S); - } - else if ( lwork > 0 ) - { - SUPERLU_FREE(work); - } - StatFree(&Gstat); - - sfree( Utop, "SuperLU_Factorize::Utop" ); - sfree( Ltop, "SuperLU_Factorize::Ltop" ); - - //TODO: return iters - return 0.; -} -#endif - - -/* Diagonal (Jacobi) preconditioner computation */ -static real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) -{ - unsigned int i; - real start; - - start = Get_Time( ); - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i) -#endif - for ( i = 0; i < H->n; ++i ) - { - if ( H->val[H->start[i + 1] - 1] != 0.0 ) - { - Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1]; - } - else - { - Hdia_inv[i] = 1.0; - } - } - - return Get_Timing_Info( start ); -} - - -/* Incomplete Cholesky factorization with dual thresholding */ -static real ICHOLT( const sparse_matrix * const A, const real * const droptol, - sparse_matrix * const L, sparse_matrix * const U ) -{ - int *tmp_j; - real *tmp_val; - int i, j, pj, k1, k2, tmptop, Ltop; - real val, start; - unsigned int *Utop; - - start = Get_Time( ); - - if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL || - ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL || - ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL ) - { - fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. 
terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - // clear variables - Ltop = 0; - tmptop = 0; - memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) ); - memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) ); - memset( Utop, 0, A->n * sizeof(unsigned int) ); - - for ( i = 0; i < A->n; ++i ) - { - L->start[i] = Ltop; - tmptop = 0; - - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - j = A->j[pj]; - val = A->val[pj]; - - if ( FABS(val) > droptol[i] ) - { - k1 = 0; - k2 = L->start[j]; - while ( k1 < tmptop && k2 < L->start[j + 1] ) - { - if ( tmp_j[k1] < L->j[k2] ) - { - ++k1; - } - else if ( tmp_j[k1] > L->j[k2] ) - { - ++k2; - } - else - { - val -= (tmp_val[k1++] * L->val[k2++]); - } - } - - // L matrix is lower triangular, - // so right before the start of next row comes jth diagonal - val /= L->val[L->start[j + 1] - 1]; - - tmp_j[tmptop] = j; - tmp_val[tmptop] = val; - ++tmptop; - } - } - - // sanity check - if ( A->j[pj] != i ) - { - fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i ); - exit( NUMERIC_BREAKDOWN ); - } - - // compute the ith diagonal in L - val = A->val[pj]; - for ( k1 = 0; k1 < tmptop; ++k1 ) - { - val -= (tmp_val[k1] * tmp_val[k1]); - } - -#if defined(DEBUG) - if ( val < 0.0 ) - { - fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i ); - exit( NUMERIC_BREAKDOWN ); - - } -#endif - - tmp_j[tmptop] = i; - tmp_val[tmptop] = SQRT( val ); - - // apply the dropping rule once again - //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); - //for( k1 = 0; k1<= tmptop; ++k1 ) - // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); - for ( k1 = 0; k1 < tmptop; ++k1 ) - { - if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] ) - { - L->j[Ltop] = tmp_j[k1]; - L->val[Ltop] = tmp_val[k1]; - U->start[tmp_j[k1] + 1]++; - ++Ltop; - //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); - } - } - // keep the diagonal in any case - L->j[Ltop] = tmp_j[k1]; - L->val[Ltop] = tmp_val[k1]; - ++Ltop; - //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); - } - - L->start[i] = Ltop; -// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); - - /* U = L^T (Cholesky factorization) */ - Transpose( L, U ); -// for ( i = 1; i <= U->n; ++i ) -// { -// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; -// } -// for ( i = 0; i < L->n; ++i ) -// { -// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) -// { -// j = L->j[pj]; -// U->j[Utop[j]] = i; -// U->val[Utop[j]] = L->val[pj]; -// Utop[j]++; -// } -// } - -// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); - - sfree( tmp_val, "ICHOLT::tmp_val" ); - sfree( tmp_j, "ICHOLT::tmp_j" ); - sfree( Utop, "ICHOLT::Utop" ); - - return Get_Timing_Info( start ); -} - - -/* Fine-grained (parallel) incomplete Cholesky factorization - * - * Reference: - * Edmond Chow and Aftab Patel - * Fine-Grained Parallel Incomplete LU Factorization - * SIAM J. Sci. Comp. 
*/ -#if defined(TESTING) -static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const U_t, sparse_matrix * const U ) -{ - unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y; - real *D, *D_inv, sum, start; - sparse_matrix *DAD; - int *Utop; - - start = Get_Time( ); - - if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || - ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ) - { - fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D_inv, D) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); - D[i] = 1. / D_inv[i]; - } - - memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) ); - memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) ); - - /* to get convergence, A must have unit diagonal, so apply - * transformation DAD, where D = D(1./SQRT(D(A))) */ - memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); -#ifdef _OPENMP - #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, D_inv, D) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - /* non-diagonals */ - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]]; - } - /* diagonal */ - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = 1.; - } - - /* initial guesses for U^T, - * assume: A and DAD symmetric and stored lower triangular */ - memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) ); - memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) ); - - for ( i = 0; i < sweeps; ++i ) - { - /* for each nonzero */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) -#endif - for ( j = 0; j < A->start[A->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 0; k <= A->n; ++k ) - { - if ( U_t->start[k] > j ) - { - x = U_t->start[k - 1]; - ei_x = U_t->start[k]; - break; - } - } - /* column bounds of current nonzero */ - y = U_t->start[U_t->j[j]]; - ei_y = U_t->start[U_t->j[j] + 1]; - - /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */ - while ( U_t->j[x] < U_t->j[j] && - U_t->j[y] < U_t->j[j] && - x < ei_x && y < ei_y ) - { - if ( U_t->j[x] == U_t->j[y] ) - { - sum += (U_t->val[x] * U_t->val[y]); - ++x; - ++y; - } - else if ( U_t->j[x] < U_t->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - sum = DAD->val[j] - sum; - - /* diagonal entries */ - if ( (k - 1) == U_t->j[j] ) - { - /* sanity check */ - if ( sum < ZERO ) - { - fprintf( stderr, "Numeric breakdown in ICHOL_PAR. 
Terminating.\n"); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "A(%5d,%5d) = %10.3f\n", - k - 1, A->entries[j].j, A->entries[j].val ); - fprintf( stderr, "sum = %10.3f\n", sum); -#endif - exit(NUMERIC_BREAKDOWN); - } - - U_t->val[j] = SQRT( sum ); - } - /* non-diagonal entries */ - else - { - U_t->val[j] = sum / U_t->val[ei_y - 1]; - } - } - } - - /* apply inverse transformation D^{-1}U^{T}, - * since DAD \approx U^{T}U, so - * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */ -#ifdef _OPENMP - #pragma omp parallel for schedule(guided) \ - default(none) shared(D_inv) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) - { - U_t->val[pj] *= D_inv[i]; - } - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 ); -#endif - - /* transpose U^{T} and copy into U */ - Transpose( U_t, U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); -#endif - - Deallocate_Matrix( DAD ); - sfree( D_inv, "ICHOL_PAR::D_inv" ); - sfree( D, "ICHOL_PAR::D" ); - sfree( Utop, "ICHOL_PAR::Utop" ); - - return Get_Timing_Info( start ); -} -#endif - - -/* Fine-grained (parallel) incomplete LU factorization - * - * Reference: - * Edmond Chow and Aftab Patel - * Fine-Grained Parallel Incomplete LU Factorization - * SIAM J. Sci. Comp. - * - * A: symmetric, half-stored (lower triangular), CSR format - * sweeps: number of loops over non-zeros for computation - * L / U: factorized triangular matrices (A \approx LU), CSR format */ -static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const L, sparse_matrix * const U ) -{ - unsigned int i, j, k, pj, x, y, ei_x, ei_y; - real *D, *D_inv, sum, start; - sparse_matrix *DAD; - - start = Get_Time( ); - - if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || - ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) - { - fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. 
Terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D, D_inv) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); - D[i] = 1.0 / D_inv[i]; -// printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] ); - } - - /* to get convergence, A must have unit diagonal, so apply - * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */ - memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - /* non-diagonals */ - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; - } - /* diagonal */ - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = 1.0; - } - - /* initial guesses for L and U, - * assume: A and DAD symmetric and stored lower triangular */ - memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - /* store U^T in CSR for row-wise access and tranpose later */ - memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - - /* L has unit diagonal, by convention */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) default(none) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - L->val[L->start[i + 1] - 1] = 1.0; - } - - for ( i = 0; i < sweeps; ++i ) - { - /* for each nonzero in L */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:j-1), U(1:j-1,j) ) */ - while ( L->j[x] < L->j[j] && - L->j[y] < L->j[j] && - x < ei_x && y < ei_y ) - { - if ( L->j[x] == L->j[y] ) - { - sum += (L->val[x] * U->val[y]); - ++x; - ++y; - } - else if ( L->j[x] < L->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - if ( j != ei_x - 1 ) - { - L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1]; - } - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:i-1), U(1:i-1,j) ) */ - while ( U->j[x] < U->j[j] && - U->j[y] < U->j[j] && - x < ei_x && y < ei_y ) - { - if ( U->j[x] == U->j[y] ) - { - sum += (L->val[y] * U->val[x]); - ++x; - ++y; - } - else if ( U->j[x] < U->j[y] ) - { - ++x; - } - else - { - 
++y; - } - } - - U->val[j] = DAD->val[j] - sum; - } - } - - /* apply inverse transformation: - * since DAD \approx LU, then - * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D_inv) private(i, pj) -#endif - for ( i = 0; i < DAD->n; ++i ) - { - for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) - { - L->val[pj] = D_inv[i] * L->val[pj]; - /* currently storing U^T, so use row index instead of column index */ - U->val[pj] = U->val[pj] * D_inv[i]; - } - } - - Transpose_I( U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 ); - fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); -#endif - - Deallocate_Matrix( DAD ); - sfree( D_inv, "ILU_PAR::D_inv" ); - sfree( D, "ILU_PAR::D_inv" ); - - return Get_Timing_Info( start ); -} - - -/* Fine-grained (parallel) incomplete LU factorization with thresholding - * - * Reference: - * Edmond Chow and Aftab Patel - * Fine-Grained Parallel Incomplete LU Factorization - * SIAM J. Sci. Comp. - * - * A: symmetric, half-stored (lower triangular), CSR format - * droptol: row-wise tolerances used for dropping - * sweeps: number of loops over non-zeros for computation - * L / U: factorized triangular matrices (A \approx LU), CSR format */ -static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, - const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) -{ - unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop; - real *D, *D_inv, sum, start; - sparse_matrix *DAD, *L_temp, *U_temp; - - start = Get_Time( ); - - if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE || - Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE || - Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE ) - { - fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || - ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) - { - fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. 
terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D, D_inv) private(i) -#endif - for ( i = 0; i < A->n; ++i ) - { - D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); - D[i] = 1.0 / D_inv[i]; - } - - /* to get convergence, A must have unit diagonal, so apply - * transformation DAD, where D = D(1./SQRT(D(A))) */ - memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D) private(i, pj) -#endif - for ( i = 0; i < A->n; ++i ) - { - /* non-diagonals */ - for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) - { - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; - } - /* diagonal */ - DAD->j[pj] = A->j[pj]; - DAD->val[pj] = 1.0; - } - - /* initial guesses for L and U, - * assume: A and DAD symmetric and stored lower triangular */ - memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - /* store U^T in CSR for row-wise access and tranpose later */ - memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); - memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); - memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); - - /* L has unit diagonal, by convention */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i) shared(L_temp) -#endif - for ( i = 0; i < A->n; ++i ) - { - L_temp->val[L_temp->start[i + 1] - 1] = 1.0; - } - - for ( i = 0; i < sweeps; ++i ) - { - /* for each nonzero in L */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:j-1), U(1:j-1,j) ) */ - while ( L_temp->j[x] < L_temp->j[j] && - L_temp->j[y] < L_temp->j[j] && - x < ei_x && y < ei_y ) - { - if ( L_temp->j[x] == L_temp->j[y] ) - { - sum += (L_temp->val[x] * U_temp->val[y]); - ++x; - ++y; - } - else if ( L_temp->j[x] < L_temp->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - if ( j != ei_x - 1 ) - { - L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1]; - } - } - -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) -#endif - for ( j = 0; j < DAD->start[DAD->n]; ++j ) - { - sum = ZERO; - - /* determine row bounds of current nonzero */ - x = 0; - ei_x = 0; - for ( k = 1; k <= DAD->n; ++k ) - { - if ( DAD->start[k] > j ) - { - x = DAD->start[k - 1]; - ei_x = DAD->start[k]; - break; - } - } - /* determine column bounds of current nonzero */ - y = DAD->start[DAD->j[j]]; - ei_y = DAD->start[DAD->j[j] + 1]; - - /* sparse dot product: - * dot( L(i,1:i-1), U(1:i-1,j) ) */ - while ( U_temp->j[x] < U_temp->j[j] && - U_temp->j[y] < U_temp->j[j] && - x < ei_x && y < ei_y ) - { - if ( U_temp->j[x] == U_temp->j[y] ) - { - sum += (L_temp->val[y] * U_temp->val[x]); - ++x; - ++y; - } - else if ( 
U_temp->j[x] < U_temp->j[y] ) - { - ++x; - } - else - { - ++y; - } - } - - U_temp->val[j] = DAD->val[j] - sum; - } - } - - /* apply inverse transformation: - * since DAD \approx LU, then - * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ -#ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) -#endif - for ( i = 0; i < DAD->n; ++i ) - { - for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) - { - L_temp->val[pj] = D_inv[i] * L_temp->val[pj]; - /* currently storing U^T, so use row index instead of column index */ - U_temp->val[pj] = U_temp->val[pj] * D_inv[i]; - } - } - - /* apply the dropping rule */ - Ltop = 0; - Utop = 0; - for ( i = 0; i < DAD->n; ++i ) - { - L->start[i] = Ltop; - U->start[i] = Utop; - - for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj ) - { - if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) ) - { - L->j[Ltop] = L_temp->j[pj]; - L->val[Ltop] = L_temp->val[pj]; - ++Ltop; - } - } - - /* diagonal */ - L->j[Ltop] = L_temp->j[pj]; - L->val[Ltop] = L_temp->val[pj]; - ++Ltop; - - for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj ) - { - if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) ) - { - U->j[Utop] = U_temp->j[pj]; - U->val[Utop] = U_temp->val[pj]; - ++Utop; - } - } - - /* diagonal */ - U->j[Utop] = U_temp->j[pj]; - U->val[Utop] = U_temp->val[pj]; - ++Utop; - } - - L->start[i] = Ltop; - U->start[i] = Utop; - - Transpose_I( U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nnz(L): %d\n", L->start[L->n] ); - fprintf( stderr, "nnz(U): %d\n", U->start[U->n] ); -#endif - - Deallocate_Matrix( U_temp ); - Deallocate_Matrix( L_temp ); - Deallocate_Matrix( DAD ); - sfree( D_inv, "ILUT_PAR::D_inv" ); - sfree( D, "ILUT_PAR::D_inv" ); - - return Get_Timing_Info( start ); -} - - static void Extrapolate_Charges_QEq( const reax_system * const system, const control_params * const control, simulation_data * const data, static_storage * const workspace ) @@ -1584,6 +202,10 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, #endif break; + case SAI_PC: + //TODO: call function + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -1928,6 +550,10 @@ static void Compute_Preconditioner_EE( const reax_system * const system, #endif break; + case SAI_PC: + //TODO: call function + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2031,6 +657,10 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, #endif break; + case SAI_PC: + //TODO: call function + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2194,6 +824,10 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, } break; + case SAI_PC: + //TODO: call setup function + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); @@ -2341,6 +975,10 @@ static void Setup_Preconditioner_EE( const reax_system * const system, } break; + case SAI_PC: + //TODO: call setup function + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. 
Terminating...\n" ); exit( INVALID_INPUT ); @@ -2490,6 +1128,10 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, } break; + case SAI_PC: + //TODO: call setup function + break; + default: fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); exit( INVALID_INPUT ); diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index 2094d4f48cc9925eae23240ab170b84911f496fc..4a3fdf809c3115ea425c4a15e6e338f3e0428585 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -26,39 +26,1421 @@ #include "vector.h" +typedef struct +{ + unsigned int j; + real val; +} sparse_matrix_entry; + + /* global to make OpenMP shared (Sparse_MatVec) */ #ifdef _OPENMP -real *b_local = NULL; +real *b_local = NULL; +#endif +/* global to make OpenMP shared (apply_preconditioner) */ +real *Dinv_L = NULL, *Dinv_U = NULL; +/* global to make OpenMP shared (tri_solve_level_sched) */ +int levels = 1; +int levels_L = 1, levels_U = 1; +unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL; +unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL; +unsigned int *row_levels, *level_rows, *level_rows_cnt; +unsigned int *top = NULL; +/* global to make OpenMP shared (graph_coloring) */ +unsigned int *color = NULL; +unsigned int *to_color = NULL; +unsigned int *conflict = NULL; +unsigned int *conflict_cnt = NULL; +unsigned int *temp_ptr; +unsigned int *recolor = NULL; +unsigned int recolor_cnt; +unsigned int *color_top = NULL; +/* global to make OpenMP shared (sort_colors) */ +unsigned int *permuted_row_col = NULL; +unsigned int *permuted_row_col_inv = NULL; +real *y_p = NULL; +/* global to make OpenMP shared (permute_vector) */ +real *x_p = NULL; +unsigned int *mapping = NULL; +sparse_matrix *H_full; +sparse_matrix *H_p; +/* global to make OpenMP shared (jacobi_iter) */ +real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL; + + +#if defined(TEST_MAT) +static sparse_matrix * create_test_mat( void ) +{ + unsigned int i, n; + sparse_matrix *H_test; + + if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE ) + { + fprintf( stderr, "not enough memory for test matrices. 
terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + //3x3, SPD, store lower half + i = 0; + n = 0; + H_test->start[n] = i; + H_test->j[i] = 0; + H_test->val[i] = 4.; + ++i; + ++n; + H_test->start[n] = i; + H_test->j[i] = 0; + H_test->val[i] = 12.; + ++i; + H_test->j[i] = 1; + H_test->val[i] = 37.; + ++i; + ++n; + H_test->start[n] = i; + H_test->j[i] = 0; + H_test->val[i] = -16.; + ++i; + H_test->j[i] = 1; + H_test->val[i] = -43.; + ++i; + H_test->j[i] = 2; + H_test->val[i] = 98.; + ++i; + ++n; + H_test->start[n] = i; + + return H_test; +} +#endif + + +/* Routine used with qsort for sorting nonzeros within a sparse matrix row + * + * v1/v2: pointers to column indices of nonzeros within a row (unsigned int) + */ +static int compare_matrix_entry(const void *v1, const void *v2) +{ + /* larger element has larger column index */ + return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; +} + + +/* Routine used for sorting nonzeros within a sparse matrix row; + * internally, a combination of qsort and manual sorting is utilized + * (parallel calls to qsort when multithreading, rows mapped to threads) + * + * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format + */ +void Sort_Matrix_Rows( sparse_matrix * const A ) +{ + unsigned int i, j, si, ei; + sparse_matrix_entry *temp; + +#ifdef _OPENMP +// #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr) +#endif + { + if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL ) + { + fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* sort each row of A using column indices */ +#ifdef _OPENMP +// #pragma omp for schedule(guided) +#endif + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + ei = A->start[i + 1]; + + for ( j = 0; j < (ei - si); ++j ) + { + (temp + j)->j = A->j[si + j]; + (temp + j)->val = A->val[si + j]; + } + + /* polymorphic sort in standard C library using column indices */ + qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry ); + + for ( j = 0; j < (ei - si); ++j ) + { + A->j[si + j] = (temp + j)->j; + A->val[si + j] = (temp + j)->val; + } + } + + sfree( temp, "Sort_Matrix_Rows::temp" ); + } +} + + +void Calculate_Droptol( const sparse_matrix * const A, + real * const droptol, const real dtol ) +{ + int i, j, k; + real val; +#ifdef _OPENMP + static real *droptol_local; + unsigned int tid; +#endif + +#ifdef _OPENMP + #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) +#endif + { +#ifdef _OPENMP + tid = omp_get_thread_num(); + + #pragma omp master + { + /* keep b_local for program duration to avoid allocate/free + * overhead per Sparse_MatVec call*/ + if ( droptol_local == NULL ) + { + if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL ) + { + fprintf( stderr, "Not enough space for droptol. 
Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + } + + #pragma omp barrier +#endif + + /* init droptol to 0 */ + for ( i = 0; i < A->n; ++i ) + { +#ifdef _OPENMP + droptol_local[tid * A->n + i] = 0.0; +#else + droptol[i] = 0.0; +#endif + } + +#ifdef _OPENMP + #pragma omp barrier +#endif + + /* calculate sqaure of the norm of each row */ +#ifdef _OPENMP + #pragma omp for schedule(static) +#endif + for ( i = 0; i < A->n; ++i ) + { + for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) + { + j = A->j[k]; + val = A->val[k]; + +#ifdef _OPENMP + droptol_local[tid * A->n + i] += val * val; + droptol_local[tid * A->n + j] += val * val; +#else + droptol[i] += val * val; + droptol[j] += val * val; +#endif + } + + // diagonal entry + val = A->val[k]; +#ifdef _OPENMP + droptol_local[tid * A->n + i] += val * val; +#else + droptol[i] += val * val; +#endif + } + +#ifdef _OPENMP + #pragma omp barrier + + #pragma omp for schedule(static) + for ( i = 0; i < A->n; ++i ) + { + droptol[i] = 0.0; + for ( k = 0; k < omp_get_num_threads(); ++k ) + { + droptol[i] += droptol_local[k * A->n + i]; + } + } + + #pragma omp barrier +#endif + + /* calculate local droptol for each row */ + //fprintf( stderr, "droptol: " ); +#ifdef _OPENMP + #pragma omp for schedule(static) +#endif + for ( i = 0; i < A->n; ++i ) + { + //fprintf( stderr, "%f-->", droptol[i] ); + droptol[i] = SQRT( droptol[i] ) * dtol; + //fprintf( stderr, "%f ", droptol[i] ); + } + //fprintf( stderr, "\n" ); + } +} + + +int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol ) +{ + int i, pj; + int fillin; + real val; + + fillin = 0; + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) private(i, pj, val) reduction(+: fillin) +#endif + for ( i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + val = A->val[pj]; + + if ( FABS(val) > droptol[i] ) + { + ++fillin; + } + } + } + + return fillin + A->n; +} + + +#if defined(HAVE_SUPERLU_MT) +real SuperLU_Factorize( const sparse_matrix * const A, + sparse_matrix * const L, sparse_matrix * const U ) +{ + unsigned int i, pj, count, *Ltop, *Utop, r; + sparse_matrix *A_t; + SuperMatrix A_S, AC_S, L_S, U_S; + NCformat *A_S_store; + SCPformat *L_S_store; + NCPformat *U_S_store; + superlumt_options_t superlumt_options; + pxgstrf_shared_t pxgstrf_shared; + pdgstrf_threadarg_t *pdgstrf_threadarg; + int_t nprocs; + fact_t fact; + trans_t trans; + yes_no_t refact, usepr; + real u, drop_tol; + real *a, *at; + int_t *asub, *atsub, *xa, *xat; + int_t *perm_c; /* column permutation vector */ + int_t *perm_r; /* row permutations from partial pivoting */ + void *work; + int_t info, lwork; + int_t permc_spec, panel_size, relax; + Gstat_t Gstat; + flops_t flopcnt; + + /* Default parameters to control factorization. */ +#ifdef _OPENMP + //TODO: set as global parameter and use + #pragma omp parallel \ + default(none) shared(nprocs) + { + #pragma omp master + { + /* SuperLU_MT spawns threads internally, so set and pass parameter */ + nprocs = omp_get_num_threads(); + } + } +#else + nprocs = 1; +#endif + +// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ + fact = DOFACT; /* factor from scratch */ + trans = NOTRANS; + refact = NO; /* first time factorization */ + //TODO: add to control file and use the value there to set these + panel_size = sp_ienv(1); /* # consec. 
cols treated as unit task */ + relax = sp_ienv(2); /* # cols grouped as relaxed supernode */ + u = 1.0; /* diagonal pivoting threshold */ + usepr = NO; + drop_tol = 0.0; + work = NULL; + lwork = 0; + +//#if defined(DEBUG) + fprintf( stderr, "nprocs = %d\n", nprocs ); + fprintf( stderr, "Panel size = %d\n", panel_size ); + fprintf( stderr, "Relax = %d\n", relax ); +//#endif + + if ( !(perm_r = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for perm_r[]."); + } + if ( !(perm_c = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for perm_c[]."); + } + if ( !(superlumt_options.etree = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for etree[]."); + } + if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for colcnt_h[]."); + } + if ( !(superlumt_options.part_super_h = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for part_super__h[]."); + } + if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL ) + || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL ) + || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) + || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) + || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) ) + { + fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } + if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* Set up the sparse matrix data structure for A. */ + Transpose( A, A_t ); + + count = 0; + for ( i = 0; i < A->n; ++i ) + { + xa[i] = count; + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + a[count] = A->entries[pj].val; + asub[count] = A->entries[pj].j; + ++count; + } + for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) + { + a[count] = A_t->entries[pj].val; + asub[count] = A_t->entries[pj].j; + ++count; + } + } + xa[i] = count; + + dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, + &at, &atsub, &xat ); + + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6d", asub[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6.1f", a[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i <= A->n; ++i ) + fprintf( stderr, "%6d", xa[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6d", atsub[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + fprintf( stderr, "%6.1f", at[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i <= A->n; ++i ) + fprintf( stderr, "%6d", xat[i] ); + fprintf( stderr, "\n" ); + + A_S.Stype = SLU_NC; /* column-wise, no supernode */ + A_S.Dtype = SLU_D; /* double-precision */ + A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */ + A_S.nrow = A->n; + A_S.ncol = A->n; + A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); + A_S_store = (NCformat *) A_S.Store; + A_S_store->nnz = 2 * A->start[A->n] - A->n; + A_S_store->nzval = at; + A_S_store->rowind = atsub; + A_S_store->colptr = xat; + + /* ------------------------------------------------------------ + Allocate storage and initialize statistics variables. 
+ ------------------------------------------------------------*/ + StatAlloc( A->n, nprocs, panel_size, relax, &Gstat ); + StatInit( A->n, nprocs, &Gstat ); + + /* ------------------------------------------------------------ + Get column permutation vector perm_c[], according to permc_spec: + permc_spec = 0: natural ordering + permc_spec = 1: minimum degree ordering on structure of A'*A + permc_spec = 2: minimum degree ordering on structure of A'+A + permc_spec = 3: approximate minimum degree for unsymmetric matrices + ------------------------------------------------------------*/ + permc_spec = 0; + get_perm_c( permc_spec, &A_S, perm_c ); + + /* ------------------------------------------------------------ + Initialize the option structure superlumt_options using the + user-input parameters; + Apply perm_c to the columns of original A to form AC. + ------------------------------------------------------------*/ + pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, + u, usepr, drop_tol, perm_c, perm_r, + work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); + + for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) + fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); + fprintf( stderr, "\n" ); + + /* ------------------------------------------------------------ + Compute the LU factorization of A. + The following routine will create nprocs threads. + ------------------------------------------------------------*/ + pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info ); + + fprintf( stderr, "INFO: %d\n", info ); + + flopcnt = 0; + for (i = 0; i < nprocs; ++i) + { + flopcnt += Gstat.procstat[i].fcops; + } + Gstat.ops[FACT] = flopcnt; + +//#if defined(DEBUG) + printf("\n** Result of sparse LU **\n"); + L_S_store = (SCPformat *) L_S.Store; + U_S_store = (NCPformat *) U_S.Store; + printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz ); + printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz ); + fflush( stdout ); +//#endif + + /* convert L and R from SuperLU formats to CSR */ + memset( Ltop, 0, (A->n + 1) * sizeof(int) ); + memset( Utop, 0, (A->n + 1) * sizeof(int) ); + memset( L->start, 0, (A->n + 1) * sizeof(int) ); + memset( U->start, 0, (A->n + 1) * sizeof(int) ); + + for ( i = 0; i < 2 * L_S_store->nnz; ++i ) + fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] ); + fprintf( stderr, "\n" ); + for ( i = 0; i < 2 * U_S_store->nnz; ++i ) + fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] ); + fprintf( stderr, "\n" ); + + printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper ); + for ( i = 0; i < A->n; ++i ) + { + fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] ); + fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] ); + //TODO: correct for SCPformat for L? 
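/* Reading aid: compressed-column (CSC) storage of U is exactly compressed-row
 * (CSR) storage of U^T, which is why the conversion below can walk U_S_store
 * column by column and scatter entries by row index. As a concrete CSR
 * example, the 3x3 lower-triangular test matrix assembled in
 * create_test_mat( ) above is laid out as
 *
 *   start = { 0, 1, 3, 6 }
 *   j     = { 0,  0, 1,  0, 1, 2 }
 *   val   = { 4., 12., 37., -16., -43., 98. },
 *
 * i.e., row i occupies slots start[i] .. start[i + 1] - 1 of j/val, and the
 * diagonal entry sits last in each row -- a convention that diag_pre_comp,
 * ICHOLT, and the fine-grained factorizations below all rely on. */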
+ //for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj ) +// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj ) +// { +// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1]; +// } + fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] ); + fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] ); + for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + { + ++Utop[U_S_store->rowind[pj] + 1]; + fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] ); + } + } + for ( i = 1; i <= A->n; ++i ) + { +// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; + Utop[i] = U->start[i] = Utop[i] + Utop[i - 1]; +// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); +// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); + } + for ( i = 0; i < A->n; ++i ) + { +// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) +// { +// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj]; +// L->entries[Ltop[r]].j = r; +// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; +// ++Ltop[r]; +// } + for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + { + r = U_S_store->rowind[pj]; + U->entries[Utop[r]].j = i; + U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj]; + ++Utop[r]; + } + } + + /* ------------------------------------------------------------ + Deallocate storage after factorization. + ------------------------------------------------------------*/ + pxgstrf_finalize( &superlumt_options, &AC_S ); + Deallocate_Matrix( A_t ); + sfree( xa, "SuperLU_Factorize::xa" ); + sfree( asub, "SuperLU_Factorize::asub" ); + sfree( a, "SuperLU_Factorize::a" ); + SUPERLU_FREE( perm_r ); + SUPERLU_FREE( perm_c ); + SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind ); + SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr ); + SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval ); + SUPERLU_FREE( A_S.Store ); + if ( lwork == 0 ) + { + Destroy_SuperNode_SCP(&L_S); + Destroy_CompCol_NCP(&U_S); + } + else if ( lwork > 0 ) + { + SUPERLU_FREE(work); + } + StatFree(&Gstat); + + sfree( Utop, "SuperLU_Factorize::Utop" ); + sfree( Ltop, "SuperLU_Factorize::Ltop" ); + + //TODO: return iters + return 0.; +} +#endif + + +/* Diagonal (Jacobi) preconditioner computation */ +real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) +{ + unsigned int i; + real start; + + start = Get_Time( ); + +#ifdef _OPENMP + #pragma omp parallel for schedule(static) \ + default(none) private(i) +#endif + for ( i = 0; i < H->n; ++i ) + { + if ( H->val[H->start[i + 1] - 1] != 0.0 ) + { + Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1]; + } + else + { + Hdia_inv[i] = 1.0; + } + } + + return Get_Timing_Info( start ); +} + + +/* Incomplete Cholesky factorization with dual thresholding */ +real ICHOLT( const sparse_matrix * const A, const real * const droptol, + sparse_matrix * const L, sparse_matrix * const U ) +{ + int *tmp_j; + real *tmp_val; + int i, j, pj, k1, k2, tmptop, Ltop; + real val, start; + unsigned int *Utop; + + start = Get_Time( ); + + if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL || + ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL || + ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. 
terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + // clear variables + Ltop = 0; + tmptop = 0; + memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) ); + memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) ); + memset( Utop, 0, A->n * sizeof(unsigned int) ); + + for ( i = 0; i < A->n; ++i ) + { + L->start[i] = Ltop; + tmptop = 0; + + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + j = A->j[pj]; + val = A->val[pj]; + + if ( FABS(val) > droptol[i] ) + { + k1 = 0; + k2 = L->start[j]; + while ( k1 < tmptop && k2 < L->start[j + 1] ) + { + if ( tmp_j[k1] < L->j[k2] ) + { + ++k1; + } + else if ( tmp_j[k1] > L->j[k2] ) + { + ++k2; + } + else + { + val -= (tmp_val[k1++] * L->val[k2++]); + } + } + + // L matrix is lower triangular, + // so right before the start of next row comes jth diagonal + val /= L->val[L->start[j + 1] - 1]; + + tmp_j[tmptop] = j; + tmp_val[tmptop] = val; + ++tmptop; + } + } + + // sanity check + if ( A->j[pj] != i ) + { + fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i ); + exit( NUMERIC_BREAKDOWN ); + } + + // compute the ith diagonal in L + val = A->val[pj]; + for ( k1 = 0; k1 < tmptop; ++k1 ) + { + val -= (tmp_val[k1] * tmp_val[k1]); + } + +#if defined(DEBUG) + if ( val < 0.0 ) + { + fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i ); + exit( NUMERIC_BREAKDOWN ); + + } +#endif + + tmp_j[tmptop] = i; + tmp_val[tmptop] = SQRT( val ); + + // apply the dropping rule once again + //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); + //for( k1 = 0; k1<= tmptop; ++k1 ) + // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); + for ( k1 = 0; k1 < tmptop; ++k1 ) + { + if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] ) + { + L->j[Ltop] = tmp_j[k1]; + L->val[Ltop] = tmp_val[k1]; + U->start[tmp_j[k1] + 1]++; + ++Ltop; + //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); + } + } + // keep the diagonal in any case + L->j[Ltop] = tmp_j[k1]; + L->val[Ltop] = tmp_val[k1]; + ++Ltop; + //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); + } + + L->start[i] = Ltop; +// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + + /* U = L^T (Cholesky factorization) */ + Transpose( L, U ); +// for ( i = 1; i <= U->n; ++i ) +// { +// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; +// } +// for ( i = 0; i < L->n; ++i ) +// { +// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) +// { +// j = L->j[pj]; +// U->j[Utop[j]] = i; +// U->val[Utop[j]] = L->val[pj]; +// Utop[j]++; +// } +// } + +// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); + + sfree( tmp_val, "ICHOLT::tmp_val" ); + sfree( tmp_j, "ICHOLT::tmp_j" ); + sfree( Utop, "ICHOLT::Utop" ); + + return Get_Timing_Info( start ); +} + + +/* Fine-grained (parallel) incomplete Cholesky factorization + * + * Reference: + * Edmond Chow and Aftab Patel + * Fine-Grained Parallel Incomplete LU Factorization + * SIAM J. Sci. Comp. 
+
+
+/* Fine-grained (parallel) incomplete Cholesky factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp. */
+#if defined(TESTING)
+real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+        sparse_matrix * const U_t, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+    int *Utop;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D_inv, D) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D[i] = 1. / D_inv[i];
+    }
+
+    memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
+    memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * the transformation DAD, where D = diag(1./SQRT(diag(A))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(DAD, D_inv, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.;
+    }
+
+    /* initial guess for U^T,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
+    memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
+#endif
+        for ( j = 0; j < A->start[A->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 0; k <= A->n; ++k )
+            {
+                if ( U_t->start[k] > j )
+                {
+                    x = U_t->start[k - 1];
+                    ei_x = U_t->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = U_t->start[U_t->j[j]];
+            ei_y = U_t->start[U_t->j[j] + 1];
+
+            /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
+            while ( U_t->j[x] < U_t->j[j] &&
+                    U_t->j[y] < U_t->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_t->j[x] == U_t->j[y] )
+                {
+                    sum += (U_t->val[x] * U_t->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_t->j[x] < U_t->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            sum = DAD->val[j] - sum;
+
+            /* diagonal entries */
+            if ( (k - 1) == U_t->j[j] )
+            {
+                /* sanity check */
+                if ( sum < ZERO )
+                {
+                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n" );
+#if defined(DEBUG_FOCUS)
+                    fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
+                            k - 1, A->j[j], A->val[j] );
+                    fprintf( stderr, "sum = %10.3f\n", sum );
+#endif
+                    exit( NUMERIC_BREAKDOWN );
+                }
+
+                U_t->val[j] = SQRT( sum );
+            }
+            /* non-diagonal entries */
+            else
+            {
+                U_t->val[j] = sum / U_t->val[ei_y - 1];
+            }
+        }
+    }
+
+    /* apply the inverse transformation D^{-1}U^{T}:
+     * since DAD \approx U^{T}U, we have
+     * A = D^{-1}(DAD)D^{-1} \approx D^{-1}U^{T}UD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(guided) \
+        default(none) shared(D_inv) private(i, pj)
 #endif
-/* global to make OpenMP shared (apply_preconditioner) */
-real *Dinv_L = NULL, *Dinv_U = NULL;
-/* global to make OpenMP shared (tri_solve_level_sched) */
-int levels = 1;
-int levels_L = 1, levels_U = 1;
-unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
-unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
-unsigned int *row_levels, *level_rows, *level_rows_cnt;
-unsigned int *top = NULL;
-/* global to make OpenMP shared (graph_coloring) */
-unsigned int *color = NULL;
-unsigned int *to_color = NULL;
-unsigned int *conflict = NULL;
-unsigned int *conflict_cnt = NULL;
-unsigned int *temp_ptr;
-unsigned int *recolor = NULL;
-unsigned int recolor_cnt;
-unsigned int *color_top = NULL;
-/* global to make OpenMP shared (sort_colors) */
-unsigned int *permuted_row_col = NULL;
-unsigned int *permuted_row_col_inv = NULL;
-real *y_p = NULL;
-/* global to make OpenMP shared (permute_vector) */
-real *x_p = NULL;
-unsigned int *mapping = NULL;
-sparse_matrix *H_full;
-sparse_matrix *H_p;
-/* global to make OpenMP shared (jacobi_iter) */
-real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            U_t->val[pj] *= D_inv[i];
+        }
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
+#endif
+
+    /* transpose U^{T} and copy into U */
+    Transpose( U_t, U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(U): %d, max: %d\n", U->start[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ICHOL_PAR::D_inv" );
+    sfree( D, "ICHOL_PAR::D" );
+    sfree( Utop, "ICHOL_PAR::Utop" );
+
+    return Get_Timing_Info( start );
+}
+#endif
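+/* Editorial worked example of the DAD scaling used by these
+ * factorizations: for A = [[4, 12], [12, 37]] (stored lower triangular),
+ * D = diag(1/SQRT(4), 1/SQRT(37)), so DAD has unit diagonal and
+ * off-diagonal 12 / (2.0 * SQRT(37.0)) ~= 0.986; the sweeps iterate on
+ * this scaled system, and D^{-1} is applied to the factors afterwards. */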
+
+
+/* Fine-grained (parallel) incomplete LU factorization
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
+        sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+//        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * the transformation DAD, where D = diag(1./SQRT(abs(diag(A)))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and transpose later */
+    memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) default(none) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L->val[L->start[i + 1] - 1] = 1.0;
+    }
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             * dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L->j[x] < L->j[j] &&
+                    L->j[y] < L->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L->j[x] == L->j[y] )
+                {
+                    sum += (L->val[x] * U->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L->j[x] < L->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            /* update L (its unit diagonal is never overwritten) */
+            if ( j != ei_x - 1 )
+            {
+                L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
+            }
+        }
+
+        /* for each nonzero in U */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             * dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U->j[x] < U->j[j] &&
+                    U->j[y] < U->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U->j[x] == U->j[y] )
+                {
+                    sum += (L->val[y] * U->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U->j[x] < U->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U->val[j] = DAD->val[j] - sum;
+        }
+    }
+
+    /* apply the inverse transformation:
+     * since DAD \approx LU, we have
+     * A = D^{-1}(DAD)D^{-1} \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L->val[pj] = D_inv[i] * L->val[pj];
+            /* currently storing U^T, so use the row index for scaling */
+            U->val[pj] = U->val[pj] * D_inv[i];
+        }
+    }
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
+    fprintf( stderr, "nnz(U): %d, max: %d\n", U->start[U->n], U->n * 50 );
+#endif
+
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ILU_PAR::D_inv" );
+    sfree( D, "ILU_PAR::D" );
+
+    return Get_Timing_Info( start );
+}
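+/* Editorial sketch: applying the factors from ILU_PAR amounts to two
+ * triangular solves per preconditioner application (the sweep count of 3
+ * is an assumed value):
+ *
+ *     t_pc = ILU_PAR( H, 3, L, U );
+ *     tri_solve( L, y, x, L->n, LOWER );
+ *     tri_solve( U, x, x, U->n, UPPER );
+ */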
+
+
+/* Fine-grained (parallel) incomplete LU factorization with thresholding
+ *
+ * Reference:
+ * Edmond Chow and Aftab Patel
+ * Fine-Grained Parallel Incomplete LU Factorization
+ * SIAM J. Sci. Comp.
+ *
+ * A: symmetric, half-stored (lower triangular), CSR format
+ * droptol: row-wise tolerances used for dropping
+ * sweeps: number of loops over non-zeros for computation
+ * L / U: factorized triangular matrices (A \approx LU), CSR format */
+real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
+        const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
+{
+    unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
+    real *D, *D_inv, sum, start;
+    sparse_matrix *DAD, *L_temp, *U_temp;
+
+    start = Get_Time( );
+
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
+            Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
+            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    {
+        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(D, D_inv) private(i)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
+        D[i] = 1.0 / D_inv[i];
+    }
+
+    /* to get convergence, A must have unit diagonal, so apply
+     * the transformation DAD, where D = diag(1./SQRT(abs(diag(A)))) */
+    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, D) private(i, pj)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* non-diagonals */
+        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        {
+            DAD->j[pj] = A->j[pj];
+            DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
+        }
+        /* diagonal */
+        DAD->j[pj] = A->j[pj];
+        DAD->val[pj] = 1.0;
+    }
+
+    /* initial guesses for L and U,
+     * assume: A and DAD symmetric and stored lower triangular */
+    memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+    /* store U^T in CSR for row-wise access and transpose later */
+    memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
+    memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
+    memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
+
+    /* L has unit diagonal, by convention */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i) shared(L_temp)
+#endif
+    for ( i = 0; i < A->n; ++i )
+    {
+        L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
+    }
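+    /* Editorial note: the sweeps below iterate the fixed-point equations
+     * from the Chow-Patel reference, applied to the scaled system DAD:
+     *
+     *     l_ij = ( a_ij - sum_{k < j} l_ik * u_kj ) / u_jj,   i > j
+     *     u_ij =   a_ij - sum_{k < i} l_ik * u_kj,            i <= j
+     */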
+
+    for ( i = 0; i < sweeps; ++i )
+    {
+        /* for each nonzero in L */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             * dot( L(i,1:j-1), U(1:j-1,j) ) */
+            while ( L_temp->j[x] < L_temp->j[j] &&
+                    L_temp->j[y] < L_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( L_temp->j[x] == L_temp->j[y] )
+                {
+                    sum += (L_temp->val[x] * U_temp->val[y]);
+                    ++x;
+                    ++y;
+                }
+                else if ( L_temp->j[x] < L_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            /* update L (its unit diagonal is never overwritten) */
+            if ( j != ei_x - 1 )
+            {
+                L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
+            }
+        }
+
+        /* for each nonzero in U */
+#ifdef _OPENMP
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#endif
+        for ( j = 0; j < DAD->start[DAD->n]; ++j )
+        {
+            sum = ZERO;
+
+            /* determine row bounds of current nonzero */
+            x = 0;
+            ei_x = 0;
+            for ( k = 1; k <= DAD->n; ++k )
+            {
+                if ( DAD->start[k] > j )
+                {
+                    x = DAD->start[k - 1];
+                    ei_x = DAD->start[k];
+                    break;
+                }
+            }
+            /* determine column bounds of current nonzero */
+            y = DAD->start[DAD->j[j]];
+            ei_y = DAD->start[DAD->j[j] + 1];
+
+            /* sparse dot product:
+             * dot( L(i,1:i-1), U(1:i-1,j) ) */
+            while ( U_temp->j[x] < U_temp->j[j] &&
+                    U_temp->j[y] < U_temp->j[j] &&
+                    x < ei_x && y < ei_y )
+            {
+                if ( U_temp->j[x] == U_temp->j[y] )
+                {
+                    sum += (L_temp->val[y] * U_temp->val[x]);
+                    ++x;
+                    ++y;
+                }
+                else if ( U_temp->j[x] < U_temp->j[y] )
+                {
+                    ++x;
+                }
+                else
+                {
+                    ++y;
+                }
+            }
+
+            U_temp->val[j] = DAD->val[j] - sum;
+        }
+    }
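+    /* Editorial note: the parallel loops above update nonzeros in place,
+     * using whichever neighboring values are currently available -- the
+     * asynchronous fixed-point iteration of the reference, which reports
+     * that a small number of sweeps typically suffices. */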
+
+    /* apply the inverse transformation:
+     * since DAD \approx LU, we have
+     * A = D^{-1}(DAD)D^{-1} \approx D^{-1}LUD^{-1} */
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(static) \
+        default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
+#endif
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
+        {
+            L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
+            /* currently storing U^T, so use the row index for scaling */
+            U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
+        }
+    }
+
+    /* apply the dropping rule */
+    Ltop = 0;
+    Utop = 0;
+    for ( i = 0; i < DAD->n; ++i )
+    {
+        L->start[i] = Ltop;
+        U->start[i] = Utop;
+
+        for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
+            {
+                L->j[Ltop] = L_temp->j[pj];
+                L->val[Ltop] = L_temp->val[pj];
+                ++Ltop;
+            }
+        }
+
+        /* diagonal */
+        L->j[Ltop] = L_temp->j[pj];
+        L->val[Ltop] = L_temp->val[pj];
+        ++Ltop;
+
+        for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
+        {
+            if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
+            {
+                U->j[Utop] = U_temp->j[pj];
+                U->val[Utop] = U_temp->val[pj];
+                ++Utop;
+            }
+        }
+
+        /* diagonal */
+        U->j[Utop] = U_temp->j[pj];
+        U->val[Utop] = U_temp->val[pj];
+        ++Utop;
+    }
+
+    L->start[i] = Ltop;
+    U->start[i] = Utop;
+
+    Transpose_I( U );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
+    fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
+#endif
+
+    Deallocate_Matrix( U_temp );
+    Deallocate_Matrix( L_temp );
+    Deallocate_Matrix( DAD );
+    sfree( D_inv, "ILUT_PAR::D_inv" );
+    sfree( D, "ILUT_PAR::D" );
+
+    return Get_Timing_Info( start );
+}
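+/* Editorial note on the dropping rule above: an off-diagonal magnitude
+ * survives when it exceeds droptol[i] divided by the magnitude of its
+ * row's diagonal, i.e. the row tolerance is applied relative to the
+ * diagonal, matching the rule used in ICHOLT. */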
 /* sparse matrix-vector product Ax=b
@@ -1133,6 +2515,10 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
                     tri_solve( workspace->U, x, x, workspace->U->n, UPPER );
                     break;
+                case SAI_PC:
+                    //TODO: add code to compute SAI first
+//                    Sparse_MatVec( SAI, y, x );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -1151,6 +2537,10 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                     tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
                     break;
+                case SAI_PC:
+                    //TODO: add code to compute SAI first
+//                    Sparse_MatVec( SAI, y, x );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
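+/* Editorial sketch of the intended SAI path (an assumption, consistent
+ * with the commented-out calls above): once an approximate inverse
+ * M ~= H^{-1} is computed, applying the preconditioner reduces to a
+ * single sparse matrix-vector product,
+ *
+ *     Sparse_MatVec( SAI, y, x );
+ *
+ * which is why the SAI_PC branches carry no triangular solves. */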
@@ -1161,6 +2550,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
             switch ( control->cm_solver_pre_comp_type )
             {
                 case DIAG_PC:
+                case SAI_PC:
                     fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
@@ -1189,6 +2579,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
             switch ( control->cm_solver_pre_comp_type )
            {
                 case DIAG_PC:
+                case SAI_PC:
                     fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                     exit( INVALID_INPUT );
                     break;
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 3946f60a7e628475a29485c31f48577ba0a70eee..ceb2a18bfcfb6dc8ddaff3e8c807459a9d401d0d 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -30,6 +30,33 @@ typedef enum
     UPPER = 1,
 } TRIANGULARITY;
 
+void Sort_Matrix_Rows( sparse_matrix * const );
+
+int Estimate_LU_Fill( const sparse_matrix * const, const real * const );
+
+void Calculate_Droptol( const sparse_matrix * const,
+        real * const, const real );
+
+#if defined(HAVE_SUPERLU_MT)
+real SuperLU_Factorize( const sparse_matrix * const,
+        sparse_matrix * const, sparse_matrix * const );
+#endif
+
+real diag_pre_comp( const sparse_matrix * const, real * const );
+
+real ICHOLT( const sparse_matrix * const, const real * const,
+        sparse_matrix * const, sparse_matrix * const );
+
+#if defined(TESTING)
+real ICHOL_PAR( const sparse_matrix * const, const unsigned int,
+        sparse_matrix * const, sparse_matrix * const );
+#endif
+
+real ILU_PAR( const sparse_matrix * const, const unsigned int,
+        sparse_matrix * const, sparse_matrix * const );
+
+real ILUT_PAR( const sparse_matrix * const, const real *,
+        const unsigned int, sparse_matrix * const, sparse_matrix * const );
 void Transpose( const sparse_matrix * const, sparse_matrix * const );
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index f75ad16891e70fb130233f0d506b1ffb5c05f0fc..e68c830aa6174255183bbac5eb03b483e54d7fcb 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -233,6 +233,7 @@ enum pre_comp
     ILU_PAR_PC = 3,
     ILUT_PAR_PC = 4,
     ILU_SUPERLU_MT_PC = 5,
+    SAI_PC = 6,
 };
 
 enum pre_app