diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c index 9037d314162cc56ffb09778d630835181097d17f..c686ce887919d686c09665b140d9e5bb8ce174fe 100644 --- a/sPuReMD/src/GMRES.c +++ b/sPuReMD/src/GMRES.c @@ -617,6 +617,7 @@ void apply_preconditioner( const static_storage * const workspace, real * const break; case ICHOLT_PC: case ILU_PAR_PC: + case ILUT_PAR_PC: tri_solve( LU, y, x, tri ); break; default: @@ -631,6 +632,7 @@ void apply_preconditioner( const static_storage * const workspace, real * const break; case ICHOLT_PC: case ILU_PAR_PC: + case ILUT_PAR_PC: tri_solve_level_sched( LU, y, x, tri, extra ); break; default: @@ -646,6 +648,7 @@ void apply_preconditioner( const static_storage * const workspace, real * const break; case ICHOLT_PC: case ILU_PAR_PC: + case ILUT_PAR_PC: if ( tri == LOWER ) { if ( Dinv_L == NULL ) diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c index eca3d5516afba1184a61091c98798cadba6b4a57..6e6cc539531ac4243a2c50a0affe9e556aeef225 100644 --- a/sPuReMD/src/QEq.c +++ b/sPuReMD/src/QEq.c @@ -24,10 +24,12 @@ #include "GMRES.h" #include "list.h" #include "print_utils.h" +#include "tool_box.h" #if defined(HAVE_SUPERLU_MT) #include "slu_mt_ddefs.h" #endif + static int compare_matrix_entry(const void *v1, const void *v2) { /* larger element has larger column index */ @@ -147,40 +149,103 @@ static void Transpose_I( sparse_matrix * const A ) } -static void Calculate_Droptol( const sparse_matrix * const A, real * const droptol, real dtol ) +static void Calculate_Droptol( const sparse_matrix * const A, real * const droptol, + const real dtol ) { int i, j, k; real val; +#ifdef _OPENMP + static real *droptol_local; + unsigned int tid; +#endif - /* init droptol to 0 */ - for ( i = 0; i < A->n; ++i ) - droptol[i] = 0; - - /* calculate sqaure of the norm of each row */ - for ( i = 0; i < A->n; ++i ) + #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local) { - for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) +#ifdef _OPENMP + tid = omp_get_thread_num(); + + #pragma omp master + { + /* keep b_local for program duration to avoid allocate/free + * overhead per Sparse_MatVec call*/ + if ( droptol_local == NULL ) + { + if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL ) + { + exit( INSUFFICIENT_MEMORY ); + } + } + } + + #pragma omp barrier +#endif + + /* init droptol to 0 */ + for ( i = 0; i < A->n; ++i ) { - j = A->j[k]; - val = A->val[k]; +#ifdef _OPENMP + droptol_local[tid * A->n + i] = 0.0; +#else + droptol[i] = 0.0; +#endif + } + + #pragma omp barrier + + /* calculate sqaure of the norm of each row */ + #pragma omp parallel for schedule(guided) \ + default(none) private(i, j, k, val, tid) shared(droptol_local) + for ( i = 0; i < A->n; ++i ) + { + for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) + { + j = A->j[k]; + val = A->val[k]; +#ifdef _OPENMP + droptol_local[tid * A->n + i] += val * val; + droptol_local[tid * A->n + j] += val * val; +#else + droptol[i] += val * val; + droptol[j] += val * val; +#endif + } + + val = A->val[k]; // diagonal entry +#ifdef _OPENMP + droptol_local[tid * A->n + i] += val * val; +#else droptol[i] += val * val; - droptol[j] += val * val; +#endif } - val = A->val[k]; // diagonal entry - droptol[i] += val * val; - } + #pragma omp barrier - /* calculate local droptol for each row */ - //fprintf( stderr, "droptol: " ); - for ( i = 0; i < A->n; ++i ) - { - //fprintf( stderr, "%f-->", droptol[i] ); - droptol[i] = SQRT( droptol[i] ) * dtol; - //fprintf( stderr, "%f ", droptol[i] ); +#ifdef _OPENMP + #pragma omp parallel for schedule(guided) default(none) private(i, k) shared(droptol_local) + for ( i = 0; i < A->n; ++i ) + { + droptol[i] = 0.0; + for ( k = 0; k < omp_get_num_threads(); ++k ) + { + droptol[i] += droptol_local[k * A->n + i]; + } + } +#endif + + #pragma omp barrier + + /* calculate local droptol for each row */ + //fprintf( stderr, "droptol: " ); + #pragma omp parallel for schedule(guided) default(none) private(i) + for ( i = 0; i < A->n; ++i ) + { + //fprintf( stderr, "%f-->", droptol[i] ); + droptol[i] = SQRT( droptol[i] ) * dtol; + //fprintf( stderr, "%f ", droptol[i] ); + } + //fprintf( stderr, "\n" ); } - //fprintf( stderr, "\n" ); } @@ -192,17 +257,21 @@ static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const d fillin = 0; - //fprintf( stderr, "n: %d\n", A->n ); + #pragma omp parallel for schedule(guided) \ + default(none) private(i, j, pj, val) reduction(+: fillin) for ( i = 0; i < A->n; ++i ) + { for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) { j = A->j[pj]; val = A->val[pj]; - //fprintf( stderr, "i: %d, j: %d", i, j ); - if ( fabs(val) > droptol[i] ) + if ( FABS(val) > droptol[i] ) + { ++fillin; + } } + } return fillin + A->n; } @@ -510,9 +579,9 @@ static real SuperLU_Factorize( const sparse_matrix * const A, static real diag_pre_comp( const reax_system * const system, real * const Hdia_inv ) { unsigned int i; - struct timeval start, stop; + real start; - gettimeofday( &start, NULL ); + start = Get_Time( ); #pragma omp parallel for schedule(guided) \ default(none) private(i) @@ -521,24 +590,22 @@ static real diag_pre_comp( const reax_system * const system, real * const Hdia_i Hdia_inv[i] = 1.0 / system->reaxprm.sbp[system->atoms[i].type].eta; } - gettimeofday( &stop, NULL ); - return (stop.tv_sec + stop.tv_usec / 1000000.0) - - (start.tv_sec + start.tv_usec / 1000000.0); + return Get_Timing_Info( start ); } -/* Incomplete Cholesky factorization with thresholding */ +/* Incomplete Cholesky factorization with dual thresholding */ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, sparse_matrix * const L, sparse_matrix * const U ) { + //TODO: either add compilation parameter or dynamically allocate int tmp_j[1000]; real tmp_val[1000]; int i, j, pj, k1, k2, tmptop, Ltop; - real val; + real val, start; int *Utop; - struct timeval start, stop; - gettimeofday( &start, NULL ); + start = Get_Time( ); Utop = (int*) malloc((A->n + 1) * sizeof(int)); @@ -567,7 +634,7 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, val = A->val[pj]; //fprintf( stderr, "i: %d, j: %d", i, j ); - if ( fabs(val) > droptol[i] ) + if ( FABS(val) > droptol[i] ) { k1 = 0; k2 = L->start[j]; @@ -603,7 +670,7 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, if ( A->j[pj] != i ) { fprintf( stderr, "i=%d, badly built A matrix!\n", i ); - exit(999); + exit( NUMERIC_BREAKDOWN ); } val = A->val[pj]; @@ -623,7 +690,7 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); for ( k1 = 0; k1 < tmptop; ++k1 ) { - if ( fabs(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] ) + if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] ) { L->j[Ltop] = tmp_j[k1]; L->val[Ltop] = tmp_val[k1]; @@ -662,9 +729,7 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, free(Utop); - gettimeofday( &stop, NULL ); - return (stop.tv_sec + stop.tv_usec / 1000000.0) - - (start.tv_sec + start.tv_usec / 1000000.0); + return Get_Timing_Info( start ); } @@ -678,12 +743,11 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, sparse_matrix * const U_t, sparse_matrix * const U ) { unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y; - real *D, *D_inv, sum; + real *D, *D_inv, sum, start; sparse_matrix *DAD; int *Utop; - struct timeval start, stop; - gettimeofday( &start, NULL ); + start = Get_Time( ); if ( Allocate_Matrix( &DAD, A->n, A->m ) == 0 ) { @@ -691,9 +755,13 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, exit( INSUFFICIENT_MEMORY ); } - D = (real*) malloc(A->n * sizeof(real)); - D_inv = (real*) malloc(A->n * sizeof(real)); - Utop = (int*) malloc((A->n + 1) * sizeof(int)); + if( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } for ( i = 0; i < A->n; ++i ) { @@ -856,9 +924,7 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, free(D); free(Utop); - gettimeofday( &stop, NULL ); - return (stop.tv_sec + stop.tv_usec / 1000000.0) - - (start.tv_sec + start.tv_usec / 1000000.0); + return Get_Timing_Info( start ); } @@ -869,18 +935,17 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, * Fine-Grained Parallel Incomplete LU Factorization * SIAM J. Sci. Comp. * - * A: symmetric, half-format (lower triangular) - * sweeps: number of ILUL_PAR loops over non-zeros - * L / U: factorized matrices (A \approx LU) */ + * A: symmetric, half-stored (lower triangular), CSR format + * sweeps: number of loops over non-zeros for computation + * L / U: factorized triangular matrices (A \approx LU), CSR format */ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, j, k, pj, x, y, ei_x, ei_y; - real *D, *D_inv, sum; + real *D, *D_inv, sum, start; sparse_matrix *DAD; - struct timeval start, stop; - gettimeofday( &start, NULL ); + start = Get_Time( ); if ( Allocate_Matrix( &DAD, A->n, A->m ) == 0 ) { @@ -888,9 +953,12 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, exit( INSUFFICIENT_MEMORY ); } - /*TODO: check malloc return status*/ - D = (real*) malloc(A->n * sizeof(real)); - D_inv = (real*) malloc(A->n * sizeof(real)); + if( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } #pragma omp parallel for schedule(guided) \ default(none) shared(D, D_inv) private(i) @@ -1063,9 +1131,261 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, free( D_inv ); free( D ); - gettimeofday( &stop, NULL ); - return (stop.tv_sec + stop.tv_usec / 1000000.0) - - (start.tv_sec + start.tv_usec / 1000000.0); + return Get_Timing_Info( start ); +} + + +/* Fine-grained (parallel) incomplete LU factorization with thresholding + * + * Reference: + * Edmond Chow and Aftab Patel + * Fine-Grained Parallel Incomplete LU Factorization + * SIAM J. Sci. Comp. + * + * A: symmetric, half-stored (lower triangular), CSR format + * droptol: row-wise tolerances used for dropping + * sweeps: number of loops over non-zeros for computation + * L / U: factorized triangular matrices (A \approx LU), CSR format */ +static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, + const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) +{ + unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop; + real *D, *D_inv, sum, start; + sparse_matrix *DAD, *L_temp, *U_temp; + + start = Get_Time( ); + + if ( Allocate_Matrix( &DAD, A->n, A->m ) == 0 || + Allocate_Matrix( &L_temp, A->n, A->m ) == 0 || + Allocate_Matrix( &U_temp, A->n, A->m ) == 0 ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + if( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + #pragma omp parallel for schedule(guided) \ + default(none) shared(D, D_inv) private(i) + for ( i = 0; i < A->n; ++i ) + { + D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); + D[i] = 1.0 / D_inv[i]; + } + + /* to get convergence, A must have unit diagonal, so apply + * transformation DAD, where D = D(1./sqrt(D(A))) */ + memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); + #pragma omp parallel for schedule(guided) \ + default(none) shared(DAD, D) private(i, pj) + for ( i = 0; i < A->n; ++i ) + { + /* non-diagonals */ + for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) + { + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]]; + } + /* diagonal */ + DAD->j[pj] = A->j[pj]; + DAD->val[pj] = 1.0; + } + + /* initial guesses for L and U, + * assume: A and DAD symmetric and stored lower triangular */ + memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); + memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); + /* store U^T in CSR for row-wise access and tranpose later */ + memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) ); + memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) ); + memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) ); + + /* L has unit diagonal, by convention */ + #pragma omp parallel for schedule(guided) \ + default(none) private(i) shared(L_temp) + for ( i = 0; i < A->n; ++i ) + { + L_temp->val[L_temp->start[i + 1] - 1] = 1.0; + } + + for ( i = 0; i < sweeps; ++i ) + { + /* for each nonzero in L */ + #pragma omp parallel for schedule(guided) \ + default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) + for ( j = 0; j < DAD->start[DAD->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 1; k <= DAD->n; ++k ) + { + if ( DAD->start[k] > j ) + { + x = DAD->start[k - 1]; + ei_x = DAD->start[k]; + break; + } + } + /* determine column bounds of current nonzero */ + y = DAD->start[DAD->j[j]]; + ei_y = DAD->start[DAD->j[j] + 1]; + + /* sparse dot product: + * dot( L(i,1:j-1), U(1:j-1,j) ) */ + while ( L_temp->j[x] < L_temp->j[j] && + L_temp->j[y] < L_temp->j[j] && + x < ei_x && y < ei_y ) + { + if ( L_temp->j[x] == L_temp->j[y] ) + { + sum += (L_temp->val[x] * U_temp->val[y]); + ++x; + ++y; + } + else if ( L_temp->j[x] < L_temp->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + if( j != ei_x - 1 ) + { + L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1]; + } + } + + #pragma omp parallel for schedule(guided) \ + default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) + for ( j = 0; j < DAD->start[DAD->n]; ++j ) + { + sum = ZERO; + + /* determine row bounds of current nonzero */ + x = 0; + ei_x = 0; + for ( k = 1; k <= DAD->n; ++k ) + { + if ( DAD->start[k] > j ) + { + x = DAD->start[k - 1]; + ei_x = DAD->start[k]; + break; + } + } + /* determine column bounds of current nonzero */ + y = DAD->start[DAD->j[j]]; + ei_y = DAD->start[DAD->j[j] + 1]; + + /* sparse dot product: + * dot( L(i,1:i-1), U(1:i-1,j) ) */ + while ( U_temp->j[x] < U_temp->j[j] && + U_temp->j[y] < U_temp->j[j] && + x < ei_x && y < ei_y ) + { + if ( U_temp->j[x] == U_temp->j[y] ) + { + sum += (L_temp->val[y] * U_temp->val[x]); + ++x; + ++y; + } + else if ( U_temp->j[x] < U_temp->j[y] ) + { + ++x; + } + else + { + ++y; + } + } + + U_temp->val[j] = DAD->val[j] - sum; + } + } + + /* apply inverse transformation: + * since DAD \approx LU, then + * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ + #pragma omp parallel for schedule(guided) \ + default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) + for ( i = 0; i < DAD->n; ++i ) + { + for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) + { + L_temp->val[pj] = D_inv[i] * L_temp->val[pj]; + /* currently storing U^T, so use row index instead of column index */ + U_temp->val[pj] = U_temp->val[pj] * D_inv[i]; + } + } + + /* apply the dropping rule */ + Ltop = 0; + Utop = 0; + for ( i = 0; i < DAD->n; ++i ) + { + L->start[i] = Ltop; + U->start[i] = Utop; + + for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj ) + { + if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) ) + { + L->j[Ltop] = L_temp->j[pj]; + L->val[Ltop] = L_temp->val[pj]; + ++Ltop; + } + } + + /* diagonal */ + L->j[Ltop] = L_temp->j[pj]; + L->val[Ltop] = L_temp->val[pj]; + ++Ltop; + + for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj ) + { + if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) ) + { + U->j[Utop] = U_temp->j[pj]; + U->val[Utop] = U_temp->val[pj]; + ++Utop; + } + } + + /* diagonal */ + U->j[Utop] = U_temp->j[pj]; + U->val[Utop] = U_temp->val[pj]; + ++Utop; + } + + L->start[i] = Ltop; + U->start[i] = Utop; + + Transpose_I( U ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nnz(L): %d\n", L->start[L->n] ); + fprintf( stderr, "nnz(U): %d\n", U->start[U->n] ); +#endif + + Deallocate_Matrix( U_temp ); + Deallocate_Matrix( L_temp ); + Deallocate_Matrix( DAD ); + free( D_inv ); + free( D ); + + return Get_Timing_Info( start ); } @@ -1179,6 +1499,26 @@ static void Init_MatVec( const reax_system * const system, const control_params // Print_Sparse_Matrix2( U_test, "U_SLU.out" ); // exit( 0 ); break; + case ILUT_PAR_PC: + Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol ); +#if defined(DEBUG_FOCUS) + fprintf( stderr, "drop tolerances calculated\n" ); +#endif + + if ( workspace->L == NULL ) + { + /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */ + if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || + Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) + { + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + + data->timing.pre_comp += ILUT_PAR( workspace->H, workspace->droptol, control->pre_comp_sweeps, + workspace->L, workspace->U ); + break; case ILU_SUPERLU_MT_PC: if ( workspace->L == NULL ) { diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c index d70f830979c6be19a65ade21da2d263150072ecd..62311c5934d5da95ee7e5473f799c5ec506e0c20 100644 --- a/sPuReMD/src/control.c +++ b/sPuReMD/src/control.c @@ -72,11 +72,11 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control, control->qeq_solver_type = GMRES_S; control->qeq_solver_q_err = 0.000001; control->pre_comp_type = ICHOLT_PC; - control->pre_comp_sweeps = ICHOLT_PC; + control->pre_comp_sweeps = 3; control->pre_comp_refactor = 100; control->pre_comp_droptol = 0.01; control->pre_app_type = TRI_SOLVE_PA; - control->pre_app_jacobi_iters = 10; + control->pre_app_jacobi_iters = 50; control->T_init = 0.; control->T_final = 300.; @@ -274,8 +274,8 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control, } else if ( strcmp(tmp[0], "pre_app_jacobi_iters") == 0 ) { - val = atof( tmp[1] ); - control->pre_app_jacobi_iters = val; + ival = atoi( tmp[1] ); + control->pre_app_jacobi_iters = ival; } else if ( strcmp(tmp[0], "temp_init") == 0 ) { diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index 7dd51a7b949005993c94288f548f265d2eef2162..3202c622bb842356ca453284573da12887234391 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -58,6 +58,7 @@ #define COS cos #define SIN sin #define TAN tan +#define FABS fabs #define FMOD fmod #define SQR(x) ((x)*(x)) @@ -195,7 +196,7 @@ enum solver enum pre_comp { - DIAG_PC = 0, ICHOLT_PC = 1, ILU_PAR_PC = 2, ILU_SUPERLU_MT_PC = 3, + DIAG_PC = 0, ICHOLT_PC = 1, ILU_PAR_PC = 2, ILUT_PAR_PC = 3, ILU_SUPERLU_MT_PC = 4, }; enum pre_app