From fe3c2d58fa9b06ec1228258bb1386ad1bd1c3ea4 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Fri, 1 Jul 2016 09:50:49 -0400 Subject: [PATCH] sPuReMD: add support for SuperLU MT. --- PG-PuReMD/src/parallelreax.c | 1 - configure.ac | 10 ++ sPuReMD/configure.ac | 7 ++ sPuReMD/src/QEq.c | 234 ++++++++++++++++++++++++++++++++++- sPuReMD/src/mytypes.h | 7 +- 5 files changed, 255 insertions(+), 4 deletions(-) diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index 7057b836..15f03bb3 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -290,7 +290,6 @@ int main( int argc, char* argv[] ) #endif //END OF FIRST STEP - fprintf(stderr, "===>HERE\n"); // compute f_0 Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 ); Sync_Atoms ( system ); diff --git a/configure.ac b/configure.ac index 6a04a17f..035c0720 100644 --- a/configure.ac +++ b/configure.ac @@ -127,6 +127,16 @@ then export BUILD_TIMING="yes" fi +AC_ARG_WITH([superlu-mt], + [AS_HELP_STRING([--with-superlu-mt], + [enable usage of SuperLU MT for QEq preconditioner computation @<:@default: no@:>@])], + [package_superlu_mt=${withval}], [package_superlu_mt=no]) + +if test "x$package_superlu_mt" != "xno" +then + export BUILD_SUPERLU_MT="${package_superlu_mt}" +fi + AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac index cd7e0eb1..be425c8a 100644 --- a/sPuReMD/configure.ac +++ b/sPuReMD/configure.ac @@ -57,6 +57,13 @@ if test "x$BUILD_OPENMP" = "xyes"; then fi fi +if test "x$BUILD_SUPERLU_MT" != "xno" +then + AC_DEFINE([HAVE_SUPERLU_MT], [1], [Define to 1 if you have SuperLU_MT support enabled.]) + AC_SUBST(AM_CPPFLAGS, "-I${BUILD_SUPERLU_MT}/include") + AC_SUBST(AM_LDFLAGS, "-L${BUILD_SUPERLU_MT}/lib") +fi + AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c index 807715e8..7db80839 100644 --- a/sPuReMD/src/QEq.c +++ b/sPuReMD/src/QEq.c @@ -24,6 +24,9 @@ #include "GMRES.h" #include "list.h" #include "print_utils.h" +#if defined(HAVE_SUPERLU_MT) +#include "slu_mt_ddefs.h" +#endif int compare_matrix_entry(const void *v1, const void *v2) { @@ -109,6 +112,229 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) } +#if defined(HAVE_SUPERLU_MT) +static real SuperLU_Factorize( const sparse_matrix * const A, + sparse_matrix * const L, sparse_matrix * const U ) +{ + unsigned int i, pj, count, *Ltop, *Utop, r; + SuperMatrix A_S, AC_S, L_S, U_S; + SCPformat *L_S_store; + NCPformat *U_S_store; + superlumt_options_t superlumt_options; +// pxgstrf_shared_t pxgstrf_shared; +// pdgstrf_threadarg_t *pdgstrf_threadarg; + int_t nprocs; + fact_t fact; + trans_t trans; + yes_no_t refact, usepr; + real u, drop_tol; + real *a; + int_t *asub, *xa; + int_t *perm_c; /* column permutation vector */ + int_t *perm_r; /* row permutations from partial pivoting */ + void *work; + int_t info, lwork; + int_t permc_spec, panel_size, relax; + Gstat_t Gstat; + flops_t flopcnt; + + /* Default parameters to control factorization. */ +#ifdef _OPENMP + //TODO: set as global parameter and use + #pragma omp parallel \ + default(none) shared(nprocs) + { + #pragma omp master + { + /* SuperLU_MT spawns threads internally, so set and pass parameter */ + nprocs = omp_get_num_threads(); + } + } +#else + nprocs = 1; +#endif +// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ + fact = DOFACT; /* factor from scratch */ + trans = NOTRANS; + refact = NO; /* first time factorization */ + //TODO: add to control file and use the value there to set these + panel_size = sp_ienv(1); /* # consec. cols treated as unit task */ + relax = sp_ienv(2); /* # cols grouped as relaxed supernode */ + u = 1.0; + usepr = NO; + drop_tol = 0.0; + work = NULL; + lwork = 0; + + if (!(perm_r = intMalloc(A->n))) + { + SUPERLU_ABORT("Malloc fails for perm_r[]."); + } + if (!(perm_c = intMalloc(A->n))) + { + SUPERLU_ABORT("Malloc fails for perm_c[]."); + } + if ( !(superlumt_options.etree = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for etree[]."); + } + if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for colcnt_h[]."); + } + if ( !(superlumt_options.part_super_h = intMalloc(A->n)) ) + { + SUPERLU_ABORT("Malloc fails for colcnt_h[]."); + } + if ( (a = (real*) malloc( A->start[A->n] * sizeof(real))) == NULL + || (asub = (int_t*) malloc( A->start[A->n] * sizeof(int_t))) == NULL + || (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) + { + + } + + /* Set up the sparse matrix data structure for A. */ + count = 0; + for( i = 0; i < A->n; ++i ) + { + for( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + a[count] = A->entries[pj].val; + asub[count] = A->entries[pj].j; + ++count; + } + } + memcpy( xa, A->start, sizeof(int) * (A->n + 1) ); + dCreate_CompRow_Matrix(&A_S, A->n, A->n, A->start[A->n], a, asub, xa, + SLU_NR, /* row-wise, no supernode */ + SLU_D, /* double-precision */ + SLU_SYL /* symmetric, store lower half */ + ); + + + /******************************** + * THE FIRST TIME FACTORIZATION * + ********************************/ + + /* ------------------------------------------------------------ + Allocate storage and initialize statistics variables. + ------------------------------------------------------------*/ + StatAlloc(A->n, nprocs, panel_size, relax, &Gstat); + StatInit(A->n, nprocs, &Gstat); + + /* ------------------------------------------------------------ + Get column permutation vector perm_c[], according to permc_spec: + permc_spec = 0: natural ordering + permc_spec = 1: minimum degree ordering on structure of A'*A + permc_spec = 2: minimum degree ordering on structure of A'+A + permc_spec = 3: approximate minimum degree for unsymmetric matrices + ------------------------------------------------------------*/ + permc_spec = 0; + get_perm_c(permc_spec, &A_S, perm_c); + + /* ------------------------------------------------------------ + Initialize the option structure superlumt_options using the + user-input parameters; + Apply perm_c to the columns of original A to form AC. + ------------------------------------------------------------*/ + pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax, + u, usepr, drop_tol, perm_c, perm_r, + work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat); + + /* ------------------------------------------------------------ + Compute the LU factorization of A. + The following routine will create nprocs threads. + ------------------------------------------------------------*/ + pdgstrf(&superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info); + + flopcnt = 0; + for (i = 0; i < nprocs; ++i) + { + flopcnt += Gstat.procstat[i].fcops; + } + Gstat.ops[FACT] = flopcnt; + + /* Free extra arrays in AC_S. */ + Destroy_CompRow_Permuted(&AC_S); + + /* ------------------------------------------------------------ + Deallocate storage after factorization. + ------------------------------------------------------------*/ + pxgstrf_finalize(&superlumt_options, &AC_S); + + printf("\n** Result of sparse LU **\n"); + L_S_store = (SCPformat *) L_S.Store; + U_S_store = (NCPformat *) U_S.Store; + printf("No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz); + printf("No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz); + printf("No of nonzeros in L+U = " IFMT "\n", L_S_store->nnz + U_S_store->nnz - A->n); + fflush(stdout); + + /* convert L and R from SuperMatrix to CSR */ + memset( Ltop, 0, (A->n + 1) * sizeof(int) ); + memset( Utop, 0, (A->n + 1) * sizeof(int) ); + for( i = 0; i < A->n; ++i ) + { + //TODO: correct for SCPformat for L? + for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj ) + { + ++Ltop[L_S_store->rowind[pj] + 1]; + } + for( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + { + ++Utop[U_S_store->rowind[pj] + 1]; + } + } + for( i = 1; i <= A->n; ++i ) + { + Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; + Utop[i] = U->start[i] = Ltop[i] + Utop[i - 1]; + } + for( i = 0; i < A->n; ++i ) + { + //TODO: correct for SCPformat for L? + for( pj = L_S_store->nzval_colbeg[i]; pj < L_S_store->nzval_colend[i]; ++pj ) + { + r = L_S_store->rowind[pj]; + L->entries[Ltop[r]].j = i; + L->entries[Ltop[r]].val = L_S_store->nzval[pj]; + ++Ltop[r]; + } + for( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + { + r = U_S_store->rowind[pj]; + U->entries[Utop[r]].j = i; + U->entries[Utop[r]].val = U_S_store->nzval[pj]; + ++Utop[r]; + } + } + + free( xa ); + free( asub ); + free( a ); + SUPERLU_FREE (perm_r); + SUPERLU_FREE (perm_c); + SUPERLU_FREE (superlumt_options.etree); + SUPERLU_FREE (superlumt_options.colcnt_h); + SUPERLU_FREE (superlumt_options.part_super_h); + Destroy_CompRow_Matrix(&A_S); + if ( lwork == 0 ) + { + Destroy_SuperNode_SCP(&L_S); + Destroy_CompCol_NCP(&U_S); + } + else if ( lwork > 0 ) + { + SUPERLU_FREE(work); + } + StatFree(&Gstat); + + //TODO: return iters + return 0.; +} +#endif + + /* Incomplete Cholesky factorization with thresholding */ real ICHOLT( sparse_matrix *A, real *droptol, sparse_matrix *L, sparse_matrix *U ) @@ -458,12 +684,13 @@ void Init_MatVec( reax_system *system, control_params *control, if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ) { - fprintf( stderr, "not enough memory for LU matrices. terminating.\n" ); + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); exit(INSUFFICIENT_SPACE); } break; case ICHOL_PAR_PC: - /* ICHOL_PAR: factors have sparsity pattern as H */ + case ILU_SUPERLU_MT_PC: + /* factors have sparsity pattern as H */ if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) { @@ -490,6 +717,9 @@ void Init_MatVec( reax_system *system, control_params *control, case ICHOL_PAR_PC: data->timing.pre_comp += ICHOL_PAR( workspace->H, 1, workspace->L, workspace->U ); break; + case ILU_SUPERLU_MT_PC: + data->timing.pre_comp += SuperLU_Factorize( workspace->H, workspace->L, workspace->U ); + break; default: break; } diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index 3f98f74b..b3bb106c 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -22,6 +22,11 @@ #ifndef __MYTYPES_H_ #define __MYTYPES_H_ +#if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_)) +#define __CONFIG_H_ +#include "config.h" +#endif + #include "math.h" #include "random.h" #include "stdio.h" @@ -157,7 +162,7 @@ enum solver { PGMRES_J_S = 3, CG_S = 4, PCG_S = 5, SDM_S = 6, }; enum pre_comp { - DIAG_PC = 0, ICHOLT_PC = 1, ICHOL_PAR_PC = 2, + DIAG_PC = 0, ICHOLT_PC = 1, ICHOL_PAR_PC = 2, ILU_SUPERLU_MT_PC = 3, }; -- GitLab