From 6514f449c939f6ce637ae6ca22e4f18312bd49b6 Mon Sep 17 00:00:00 2001 From: Abdullah Alperen <alperena@msu.edu> Date: Wed, 24 Jan 2018 23:04:40 -0500 Subject: [PATCH] first draft of SAI preconditioner --- sPuReMD/src/charges.c | 12 +- sPuReMD/src/lin_alg.c | 1085 ++++++++++++++++++++++++++--------------- 2 files changed, 692 insertions(+), 405 deletions(-) diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c index d2f038c4..9276c82d 100644 --- a/sPuReMD/src/charges.c +++ b/sPuReMD/src/charges.c @@ -204,6 +204,8 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, case SAI_PC: //TODO: call function + data->timing.cm_solver_pre_comp += + SAI_PAR( Hptr, HSPptr, H_AppInv ); break; default: @@ -552,6 +554,8 @@ static void Compute_Preconditioner_EE( const reax_system * const system, case SAI_PC: //TODO: call function + data->timing.cm_solver_pre_comp += + SAI_PAR( Hptr, HSPptr, H_AppInv ); break; default: @@ -659,6 +663,8 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, case SAI_PC: //TODO: call function + data->timing.cm_solver_pre_comp += + SAI_PAR( Hptr, HSPptr, H_AppInv ); break; default: @@ -687,8 +693,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, } -/* Setup routines before computing the preconditioner for QEq - */ + static void Setup_Preconditioner_QEq( const reax_system * const system, const control_params * const control, simulation_data * const data, static_storage * const workspace, @@ -826,6 +831,7 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, case SAI_PC: //TODO: call setup function + Setup_Sparsity_Pattern( Hptr, workspace->saifilter, HSPptr ); break; default: @@ -977,6 +983,7 @@ static void Setup_Preconditioner_EE( const reax_system * const system, case SAI_PC: //TODO: call setup function + Setup_Sparsity_Pattern( Hptr, workspace->saifilter, HSPptr ); break; default: @@ -1130,6 +1137,7 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, case SAI_PC: //TODO: call setup function + Setup_Sparsity_Pattern( Hptr, workspace->saifilter, HSPptr ); break; default: diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index 4a3fdf80..bb17a65b 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -24,7 +24,7 @@ #include "allocate.h" #include "tool_box.h" #include "vector.h" - +#include "mkl_lapacke.h" typedef struct { @@ -137,7 +137,7 @@ void Sort_Matrix_Rows( sparse_matrix * const A ) sparse_matrix_entry *temp; #ifdef _OPENMP -// #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr) + // #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr) #endif { if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL ) @@ -148,7 +148,7 @@ void Sort_Matrix_Rows( sparse_matrix * const A ) /* sort each row of A using column indices */ #ifdef _OPENMP -// #pragma omp for schedule(guided) + // #pragma omp for schedule(guided) #endif for ( i = 0; i < A->n; ++i ) { @@ -176,6 +176,87 @@ void Sort_Matrix_Rows( sparse_matrix * const A ) } +void Setup_Sparsity_Pattern(const sparse_matrix * const A, + real * const saifilter, sparse_matrix * A_SP) +{ + int i, pj, size; + real mn, mx, threshold, val; + + if( A_SP == NULL ) + { + if( Allocate_Matrix( &A_SP, A->n, A->m ) == NULL ) + { + fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. 
terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + else if( (A_SP->m) < (A->m) ) + { + Deallocate_Matrix( A_SP ); + if( Allocate_Matrix( &A_SP, A->n, A->m ) == NULL ) + { + fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + // find min and max element of the matrix + for ( i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + val = A->val[pj]; + if ( pj == 0 ) + { + mn = mx = val; + } + else + { + if ( mn > val ) + mn = val; + if ( mx < val ) + mx = val; + } + } + } + + threshold = mn + ( mx - mn ) * saifilter; + + // calculate the nnz of the sparsity pattern + /* for ( size = 0, i = 0; i < A->n; ++i ) + { + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + if ( threshold <= A->val[pj] ) + size++; + } + }*/ + + /*if( Allocate_Matrix( &A_SP, A->n, size ) == NULL ) + { + fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + }*/ + + //A_SP->start[A_SP->n] = size; + + // fill the sparsity pattern + for ( size = 0, i = 0; i < A->n; ++i ) + { + A_SP->start[i] = size; + + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + if ( threshold <= A->val[pj] ) + { + A_SP->val[size] = A->val[pj]; + A_SP->j[size] = A->j[pj]; + size++; + } + } + } + A_SP->start[A->n] = size; +} + void Calculate_Droptol( const sparse_matrix * const A, real * const droptol, const real dtol ) { @@ -187,13 +268,13 @@ void Calculate_Droptol( const sparse_matrix * const A, #endif #ifdef _OPENMP - #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) +#pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) #endif { #ifdef _OPENMP tid = omp_get_thread_num(); - #pragma omp master +#pragma omp master { /* keep b_local for program duration to avoid allocate/free * overhead per Sparse_MatVec call*/ @@ -207,7 +288,7 @@ void Calculate_Droptol( const sparse_matrix * const A, } } - #pragma omp barrier +#pragma omp barrier #endif /* init droptol to 0 */ @@ -221,12 +302,12 @@ void Calculate_Droptol( const sparse_matrix * const A, } #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier #endif /* calculate sqaure of the norm of each row */ #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -254,9 +335,9 @@ void Calculate_Droptol( const sparse_matrix * const A, } #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier - #pragma omp for schedule(static) +#pragma omp for schedule(static) for ( i = 0; i < A->n; ++i ) { droptol[i] = 0.0; @@ -266,13 +347,13 @@ void Calculate_Droptol( const sparse_matrix * const A, } } - #pragma omp barrier +#pragma omp barrier #endif /* calculate local droptol for each row */ //fprintf( stderr, "droptol: " ); #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -294,8 +375,8 @@ int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol fillin = 0; #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i, pj, val) reduction(+: fillin) +#pragma omp parallel for schedule(static) \ + default(none) private(i, pj, val) reduction(+: fillin) #endif for ( i = 0; i < A->n; ++i ) { @@ -345,10 +426,10 @@ real SuperLU_Factorize( const sparse_matrix * const A, /* Default parameters to control factorization. 
*/ #ifdef _OPENMP //TODO: set as global parameter and use - #pragma omp parallel \ - default(none) shared(nprocs) +#pragma omp parallel \ + default(none) shared(nprocs) { - #pragma omp master +#pragma omp master { /* SuperLU_MT spawns threads internally, so set and pass parameter */ nprocs = omp_get_num_threads(); @@ -358,7 +439,7 @@ real SuperLU_Factorize( const sparse_matrix * const A, nprocs = 1; #endif -// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ + // fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */ fact = DOFACT; /* factor from scratch */ trans = NOTRANS; refact = NO; /* first time factorization */ @@ -371,11 +452,11 @@ real SuperLU_Factorize( const sparse_matrix * const A, work = NULL; lwork = 0; -//#if defined(DEBUG) + //#if defined(DEBUG) fprintf( stderr, "nprocs = %d\n", nprocs ); fprintf( stderr, "Panel size = %d\n", panel_size ); fprintf( stderr, "Relax = %d\n", relax ); -//#endif + //#endif if ( !(perm_r = intMalloc(A->n)) ) { @@ -435,7 +516,7 @@ real SuperLU_Factorize( const sparse_matrix * const A, xa[i] = count; dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, - &at, &atsub, &xat ); + &at, &atsub, &xat ); for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) fprintf( stderr, "%6d", asub[i] ); @@ -490,8 +571,8 @@ real SuperLU_Factorize( const sparse_matrix * const A, Apply perm_c to the columns of original A to form AC. ------------------------------------------------------------*/ pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, - u, usepr, drop_tol, perm_c, perm_r, - work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); + u, usepr, drop_tol, perm_c, perm_r, + work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); @@ -512,14 +593,14 @@ real SuperLU_Factorize( const sparse_matrix * const A, } Gstat.ops[FACT] = flopcnt; -//#if defined(DEBUG) + //#if defined(DEBUG) printf("\n** Result of sparse LU **\n"); L_S_store = (SCPformat *) L_S.Store; U_S_store = (NCPformat *) U_S.Store; printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz ); printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz ); fflush( stdout ); -//#endif + //#endif /* convert L and R from SuperLU formats to CSR */ memset( Ltop, 0, (A->n + 1) * sizeof(int) ); @@ -541,10 +622,10 @@ real SuperLU_Factorize( const sparse_matrix * const A, fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] ); //TODO: correct for SCPformat for L? 
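 //      (L is returned in SuperLU_MT's supernodal SCPformat, so a CSR
 //       conversion would need to expand supernodes via the rowind_colbeg /
 //       rowind_colend arrays used in the sketch below)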
//for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj ) -// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj ) -// { -// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1]; -// } + // for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj ) + // { + // ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1]; + // } fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] ); fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] ); for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) @@ -555,20 +636,20 @@ real SuperLU_Factorize( const sparse_matrix * const A, } for ( i = 1; i <= A->n; ++i ) { -// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; + // Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; Utop[i] = U->start[i] = Utop[i] + Utop[i - 1]; -// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); -// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); + // fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); + // fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); } for ( i = 0; i < A->n; ++i ) { -// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) -// { -// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj]; -// L->entries[Ltop[r]].j = r; -// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; -// ++Ltop[r]; -// } + // for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) + // { + // r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj]; + // L->entries[Ltop[r]].j = r; + // L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; + // ++Ltop[r]; + // } for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) { r = U_S_store->rowind[pj]; @@ -579,8 +660,8 @@ real SuperLU_Factorize( const sparse_matrix * const A, } /* ------------------------------------------------------------ - Deallocate storage after factorization. - ------------------------------------------------------------*/ + Deallocate storage after factorization. 
+ ------------------------------------------------------------*/ pxgstrf_finalize( &superlumt_options, &AC_S ); Deallocate_Matrix( A_t ); sfree( xa, "SuperLU_Factorize::xa" ); @@ -621,8 +702,8 @@ real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) start = Get_Time( ); #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) private(i) +#pragma omp parallel for schedule(static) \ + default(none) private(i) #endif for ( i = 0; i < H->n; ++i ) { @@ -758,26 +839,26 @@ real ICHOLT( const sparse_matrix * const A, const real * const droptol, } L->start[i] = Ltop; -// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + // fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); /* U = L^T (Cholesky factorization) */ Transpose( L, U ); -// for ( i = 1; i <= U->n; ++i ) -// { -// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; -// } -// for ( i = 0; i < L->n; ++i ) -// { -// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) -// { -// j = L->j[pj]; -// U->j[Utop[j]] = i; -// U->val[Utop[j]] = L->val[pj]; -// Utop[j]++; -// } -// } - -// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); + // for ( i = 1; i <= U->n; ++i ) + // { + // Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; + // } + // for ( i = 0; i < L->n; ++i ) + // { + // for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) + // { + // j = L->j[pj]; + // U->j[Utop[j]] = i; + // U->val[Utop[j]] = L->val[pj]; + // Utop[j]++; + // } + // } + + // fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); sfree( tmp_val, "ICHOLT::tmp_val" ); sfree( tmp_j, "ICHOLT::tmp_j" ); @@ -814,8 +895,8 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D_inv, D) private(i) +#pragma omp parallel for schedule(static) \ + default(none) shared(D_inv, D) private(i) #endif for ( i = 0; i < A->n; ++i ) { @@ -830,8 +911,8 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, * transformation DAD, where D = D(1./SQRT(D(A))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #ifdef _OPENMP - #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, D_inv, D) private(i, pj) +#pragma omp parallel for schedule(guided) \ + default(none) shared(DAD, D_inv, D) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) { @@ -856,8 +937,8 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero */ #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) +#pragma omp parallel for schedule(static) \ + default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) #endif for ( j = 0; j < A->start[A->n]; ++j ) { @@ -911,7 +992,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, fprintf( stderr, "Numeric breakdown in ICHOL_PAR. 
Terminating.\n"); #if defined(DEBUG_FOCUS) fprintf( stderr, "A(%5d,%5d) = %10.3f\n", - k - 1, A->entries[j].j, A->entries[j].val ); + k - 1, A->entries[j].j, A->entries[j].val ); fprintf( stderr, "sum = %10.3f\n", sum); #endif exit(NUMERIC_BREAKDOWN); @@ -931,8 +1012,8 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, * since DAD \approx U^{T}U, so * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */ #ifdef _OPENMP - #pragma omp parallel for schedule(guided) \ - default(none) shared(D_inv) private(i, pj) +#pragma omp parallel for schedule(guided) \ + default(none) shared(D_inv) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) { @@ -991,22 +1072,22 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D, D_inv) private(i) +#pragma omp parallel for schedule(static) \ + default(none) shared(D, D_inv) private(i) #endif for ( i = 0; i < A->n; ++i ) { D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) ); D[i] = 1.0 / D_inv[i]; -// printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] ); + // printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] ); } /* to get convergence, A must have unit diagonal, so apply * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D) private(i, pj) +#pragma omp parallel for schedule(static) \ + default(none) shared(DAD, D) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) { @@ -1033,7 +1114,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, /* L has unit diagonal, by convention */ #ifdef _OPENMP - #pragma omp parallel for schedule(static) default(none) private(i) +#pragma omp parallel for schedule(static) default(none) private(i) #endif for ( i = 0; i < A->n; ++i ) { @@ -1044,8 +1125,8 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero in L */ #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) +#pragma omp parallel for schedule(static) \ + default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) #endif for ( j = 0; j < DAD->start[DAD->n]; ++j ) { @@ -1096,8 +1177,8 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) +#pragma omp parallel for schedule(static) \ + default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) #endif for ( j = 0; j < DAD->start[DAD->n]; ++j ) { @@ -1149,8 +1230,8 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, * since DAD \approx LU, then * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(DAD, D_inv) private(i, pj) +#pragma omp parallel for schedule(static) \ + default(none) shared(DAD, D_inv) private(i, pj) #endif for ( i = 0; i < DAD->n; ++i ) { @@ -1213,8 +1294,8 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, } #ifdef _OPENMP - #pragma omp parallel for schedule(static) \ - default(none) shared(D, D_inv) private(i) +#pragma omp parallel for schedule(static) \ + 
default(none) shared(D, D_inv) private(i)
 #endif
    for ( i = 0; i < A->n; ++i )
    {
@@ -1226,8 +1307,8 @@
     * transformation DAD, where D = D(1./SQRT(D(A))) */
    memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
 #ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, D) private(i, pj)
+#pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, D) private(i, pj)
 #endif
    for ( i = 0; i < A->n; ++i )
    {
@@ -1254,8 +1335,8 @@
 
    /* L has unit diagonal, by convention */
 #ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) private(i) shared(L_temp)
+#pragma omp parallel for schedule(static) \
+    default(none) private(i) shared(L_temp)
 #endif
    for ( i = 0; i < A->n; ++i )
    {
@@ -1266,8 +1347,8 @@
    {
        /* for each nonzero in L */
 #ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
 #endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
@@ -1318,8 +1399,8 @@
        }
 
 #ifdef _OPENMP
-        #pragma omp parallel for schedule(static) \
-            default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
+#pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
 #endif
        for ( j = 0; j < DAD->start[DAD->n]; ++j )
        {
@@ -1371,8 +1452,8 @@
     * since DAD \approx LU, then
     * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
 #ifdef _OPENMP
-    #pragma omp parallel for schedule(static) \
-        default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
+#pragma omp parallel for schedule(static) \
+    default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
 #endif
    for ( i = 0; i < DAD->n; ++i )
    {
@@ -1442,6 +1523,164 @@
    return Get_Timing_Info( start );
 }
 
+real SAI_PAR( const sparse_matrix * const A, const sparse_matrix * const A_SP,
+        sparse_matrix * A_AppInv )
+{
+    int i, j, k, pj, j_temp, identity_pos;
+    int N, M, d_i, d_j;
+    MKL_INT m, n, nrhs, lda, ldb, info;
+    int * pos_i;
+    int * pos_j;
+    real start;
+    real * e_j;
+    real * dense_matrix;
+    sparse_matrix * A_full;
+    sparse_matrix * A_SP_full;
+    char * I;
+    char * J;
+
+    start = Get_Time( );
+
+
+    if ( (I = (char *) malloc(sizeof(char) * A->n)) == NULL ||
+            (J = (char *) malloc(sizeof(char) * A->n)) == NULL ||
+            (pos_i = (int *) malloc(sizeof(int) * A->n)) == NULL ||
+            (pos_j = (int *) malloc(sizeof(int) * A->n)) == NULL )
+    {
+        exit( INSUFFICIENT_MEMORY );
+    }
+
+    for ( i = 0; i < A->n; ++i )
+        I[i] = J[i] = pos_i[i] = pos_j[i] = 0;
+
+    // get full A and A_SP (FIXME: these calls and the Allocate_Matrix below store into local pointer copies; the parameters must become sparse_matrix ** for the results to reach the caller)
+    compute_H_full_for_SAI( A, A_full );
+    compute_H_full_for_SAI( A_SP, A_SP_full );
+
+    // A_AppInv will be the same as A_SP_full except the val array
+    if ( Allocate_Matrix( &A_AppInv, A_SP_full->n, A_SP_full->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + A_AppInv->start[A_AppInv->n] = A_SP_full->start[A_SP_full->n]; + + // For each row of full A_SP + for( i = 0; i < A_SP_full->n; ++i ) + { + + // N = A_SP_full->start[i + 1] - A_SP_full->start[i]; + N = M = 0; + + // find column indices of nonzeros (which will be the columns indices of the dense matrix) + for( pj = A_SP_full->start[i]; pj < A_SP_full->start[i + 1]; ++pj ) + { + j_temp = A_SP_full->j[pj]; + + J[j_temp] = 1; + pos_j[j_temp] = N; + ++N; + + // for each of those indices + // search through the row of full A of that index + for( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k ) + { + // and accumulate the nonzero column indices to serve as the row indices of the dense matrix + I[A_full->j[k]] = 1; + } + } + + // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix + identity_pos = M; + for( k = 0; k < A_full->n; k++) + { + if( I[k] != 0 ) + { + pos_i[M] = k; + if( k == i ) + identity_pos = M; + ++M; + } + } + + // allocate memory for NxM dense matrix + if( (dense_matrix = (real *) malloc(sizeof(real) * N * M)) == NULL ) + { + exit( INSUFFICIENT_MEMORY ); + } + + // fill in the entries of dense matrix + for ( d_i = 0; d_i < M; ++d_i) + { + // all rows are initialized to zero + for( d_j = 0; d_j < N; ++d_j ) + dense_matrix[d_i][d_j] = 0.0; + + // change the value if any of the column indices is seen + for ( d_j = A_full->start_[pos_i[d_i]]; d_j < A_full->start[pos_i[d_i + 1]]; ++d_j) + { + if( J[A_full->j[d_j]] == 1 ) + { + dense_matrix[d_i][pos_j[d_j]] = A_full->val[d_j]; + } + } + } + + + /* create the right hand side of the linear equation + that is the full column of the identity matrix*/ + if( (e_j = (int *) malloc(sizeof(char) * M)) == NULL ) + { + exit( INSUFFICIENT_MEMORY ); + } + for( k = 0; k < M; ++k ) + { + e_j[k] = 0; + } + e_j[identity_pos] = 1.0; + + // call QR-decompostion from LAPACK + m = M, n = N, nrhs = 1, lda = N, ldb = 1; + /* Executable statements */ + // printf( "LAPACKE_dgels (row-major, high-level) Example Program Results\n" ); + /* Solve the equations A*X = B */ + info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda, + e_j, ldb ); + /* Check for the full rank */ + if( info > 0 ) { + fprintf( stderr, "The diagonal element %i of the triangular factor ", info ); + fprintf( stderr, "of A is zero, so that A does not have full rank;\n" ); + fprintf( stderr, "the least squares solution could not be computed.\n" ); + exit( 1 ); + } + /* Print least squares solution */ + // print_matrix( "Least squares solution", n, nrhs, b, ldb ); + + // accumulate the resulting vector to build A_AppInv + A_AppInv->start[i] = A_SP_full->start[i]; + for( k = A_SP_full->start[i]; k < A_SP_full->start[i + 1]; ++k) + { + A_Appinv->j[k] = A_SP_full->j[k]; + A_Appinv->val[k] = e_j[k - A_SP_full->start[i]]; + } + + //empty variables that will be used next iteration + realloc( dense_matrix, 0); + realloc( e_j, 0 ); + for( i = 0; i < A->n; ++i ) + I[i] = J[i] = pos_i[i] = pos_j[i] = 0; + } + + // Deallocate? 
+ Deallocate_Matrix ( A_full ); + Deallocate_Matrix ( A_SP_full ); + realloc( I, 0 ); + realloc( J, 0 ); + realloc( pos_i, 0 ); + realloc( pos_j, 0 ); + + return Get_Timing_Info( start ); +} /* sparse matrix-vector product Ax=b * where: @@ -1463,7 +1702,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, #ifdef _OPENMP tid = omp_get_thread_num( ); - #pragma omp single +#pragma omp single { /* keep b_local for program duration to avoid allocate/free * overhead per Sparse_MatVec call*/ @@ -1478,7 +1717,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n ); - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < n; ++i ) { @@ -1507,7 +1746,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, } #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) for ( i = 0; i < n; ++i ) { for ( j = 0; j < omp_get_num_threads(); ++j ) @@ -1604,7 +1843,7 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y, unsigned int i; #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < N; ++i ) { @@ -1631,7 +1870,7 @@ void tri_solve( const sparse_matrix * const LU, const real * const y, real val; #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { if ( tri == LOWER ) @@ -1689,7 +1928,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, int i, j, pj, local_row, local_level; #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { if ( tri == LOWER ) @@ -1750,10 +1989,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU, ++level_rows_cnt[local_level]; } -//#if defined(DEBUG) + //#if defined(DEBUG) fprintf(stderr, "levels(L): %d\n", levels); fprintf(stderr, "NNZ(L): %d\n", LU->start[N]); -//#endif + //#endif } else { @@ -1770,10 +2009,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU, ++level_rows_cnt[local_level]; } -//#if defined(DEBUG) + //#if defined(DEBUG) fprintf(stderr, "levels(U): %d\n", levels); fprintf(stderr, "NNZ(U): %d\n", LU->start[N]); -//#endif + //#endif } for ( i = 1; i < levels + 1; ++i ) @@ -1796,7 +2035,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, for ( i = 0; i < levels; ++i ) { #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) { @@ -1816,7 +2055,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, for ( i = 0; i < levels; ++i ) { #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) { @@ -1833,7 +2072,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, } #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { /* save level info for re-use if performing repeated triangular solves via preconditioning */ @@ -1895,6 +2134,46 @@ static void compute_H_full( const sparse_matrix * const H ) Deallocate_Matrix( H_t ); } +static void compute_H_full_for_SAI( const sparse_matrix * const H , sparse_matrix * H_new) +{ + int count, i, pj; + sparse_matrix *H_t; + + if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE || + Allocate_Matrix( &H_new, H->n, 2 * H->m - H->n ) == FAILURE ) + { + fprintf( stderr, "not enough memory for full H. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* Set up the sparse matrix data structure for A. 
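+     * H stores only its lower triangular half: the loop below assembles the
+     * full symmetric matrix row by row, taking row i's entries at or below the
+     * diagonal from H itself and its strict upper triangle from row i of H^T.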
*/ + Transpose( H, H_t ); + + count = 0; + for ( i = 0; i < H->n; ++i ) + { + H_new->start[i] = count; + + /* H: symmetric, lower triangular portion only stored */ + for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj ) + { + H_new->val[count] = H->val[pj]; + H_new->j[count] = H->j[pj]; + ++count; + } + /* H^T: symmetric, upper triangular portion only stored; + * skip diagonal from H^T, as included from H above */ + for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj ) + { + H_new->val[count] = H_t->val[pj]; + H_new->j[count] = H_t->j[pj]; + ++count; + } + } + H_new->start[i] = count; + + Deallocate_Matrix( H_t ); +} /* Iterative greedy shared-memory parallel graph coloring * @@ -1913,7 +2192,7 @@ static void compute_H_full( const sparse_matrix * const H ) void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) { #ifdef _OPENMP - #pragma omp parallel +#pragma omp parallel #endif { #define MAX_COLOR (500) @@ -1930,7 +2209,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) #endif #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { memset( color, 0, sizeof(unsigned int) * A->n ); @@ -1942,7 +2221,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) if ( tri == LOWER ) { #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -1952,7 +2231,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) else { #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -1968,7 +2247,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier #endif while ( recolor_cnt > 0 ) @@ -1977,7 +2256,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) /* color vertices */ #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < recolor_cnt; ++i ) { @@ -2005,14 +2284,14 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) recolor_cnt_local = 0; #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { recolor_cnt = 0; } #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < temp; ++i ) { @@ -2034,9 +2313,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) conflict_cnt[tid + 1] = recolor_cnt_local; #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier - #pragma omp master +#pragma omp master #endif { conflict_cnt[0] = 0; @@ -2048,7 +2327,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier #endif /* copy thread-local conflicts into shared buffer */ @@ -2059,9 +2338,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier - #pragma omp single +#pragma omp single #endif { temp_ptr = to_color; @@ -2073,18 +2352,18 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) sfree( conflict_local, "graph_coloring::conflict_local" ); sfree( fb_color, "graph_coloring::fb_color" ); -//#if defined(DEBUG) -//#ifdef _OPENMP -// #pragma omp master -//#endif -// { -// for ( i = 0; i < A->n; ++i ) -// printf("Vertex: %5d, Color: %5d\n", i, color[i] ); -// } -//#endif + //#if defined(DEBUG) + 
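+    // (dump of the final vertex coloring computed above)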
//#ifdef _OPENMP + // #pragma omp master + //#endif + // { + // for ( i = 0; i < A->n; ++i ) + // printf("Vertex: %5d, Color: %5d\n", i, color[i] ); + // } + //#endif #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier #endif } } @@ -2138,12 +2417,12 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri ) * tri: coloring to triangular factor to use (lower/upper) */ static void permute_vector( real * const x, const unsigned int n, const int invert_map, - const TRIANGULARITY tri ) + const TRIANGULARITY tri ) { unsigned int i; #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { if ( x_p == NULL ) @@ -2166,7 +2445,7 @@ static void permute_vector( real * const x, const unsigned int n, const int inve } #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < n; ++i ) { @@ -2174,7 +2453,7 @@ static void permute_vector( real * const x, const unsigned int n, const int inve } #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { memcpy( x, x_p, sizeof(real) * n ); @@ -2330,7 +2609,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) if ( color == NULL ) { #ifdef _OPENMP - #pragma omp parallel +#pragma omp parallel { num_thread = omp_get_num_threads(); } @@ -2360,7 +2639,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) graph_coloring( H_full, LOWER ); sort_colors( H_full->n, LOWER ); - + memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) ); memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) ); memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) ); @@ -2390,7 +2669,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, iter = 0; #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { if ( Dinv_b == NULL ) @@ -2423,7 +2702,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, /* precompute and cache, as invariant in loop below */ #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif for ( i = 0; i < R->n; ++i ) { @@ -2434,7 +2713,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, { // x_{k+1} = G*x_{k} + Dinv*b; #ifdef _OPENMP - #pragma omp for schedule(guided) +#pragma omp for schedule(guided) #endif for ( i = 0; i < R->n; ++i ) { @@ -2462,7 +2741,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, } #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { rp3 = rp; @@ -2503,157 +2782,157 @@ static void apply_preconditioner( const static_storage * const workspace, const { switch ( control->cm_solver_pre_app_type ) { - case TRI_SOLVE_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); - break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: - tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); - tri_solve( workspace->U, x, x, workspace->U->n, UPPER ); - break; - case SAI_PC: - //TODO: add code to compute SAI first -// Sparse_MatVec( SAI, y, x ); - break; - default: - fprintf( stderr, "Unrecognized preconditioner application method. 
Terminating...\n" ); - exit( INVALID_INPUT ); - break; - } - break; - case TRI_SOLVE_LEVEL_SCHED_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); - break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: - tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); - tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); - break; - case SAI_PC: - //TODO: add code to compute SAI first -// Sparse_MatVec( SAI, y, x ); - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); + case TRI_SOLVE_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: + tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); + tri_solve( workspace->U, x, x, workspace->U->n, UPPER ); + break; + case SAI_PC: + //TODO: add code to compute SAI first + // Sparse_MatVec( SAI, y, x ); + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } break; - } - break; - case TRI_SOLVE_GC_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - case SAI_PC: - fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); - exit( INVALID_INPUT ); + case TRI_SOLVE_LEVEL_SCHED_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: + tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); + tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); + break; + case SAI_PC: + //TODO: add code to compute SAI first + // Sparse_MatVec( SAI, y, x ); + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: + case TRI_SOLVE_GC_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif - { - memcpy( y_p, y, sizeof(real) * workspace->H->n ); - } + { + memcpy( y_p, y, sizeof(real) * workspace->H->n ); + } - permute_vector( y_p, workspace->H->n, FALSE, LOWER ); - tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre ); - tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); - permute_vector( x, workspace->H->n, TRUE, UPPER ); - break; - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - } - break; - case JACOBI_ITER_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - case SAI_PC: - fprintf( stderr, "Unsupported preconditioner computation/application method combination. 
Terminating...\n" ); - exit( INVALID_INPUT ); + permute_vector( y_p, workspace->H->n, FALSE, LOWER ); + tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre ); + tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); + permute_vector( x, workspace->H->n, TRUE, UPPER ); + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: + case JACOBI_ITER_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif - { - if ( Dinv_L == NULL ) - { - if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); + if ( Dinv_L == NULL ) + { + if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } } - } - } - /* construct D^{-1}_L */ - if ( fresh_pre == TRUE ) - { + /* construct D^{-1}_L */ + if ( fresh_pre == TRUE ) + { #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif - for ( i = 0; i < workspace->L->n; ++i ) - { - si = workspace->L->start[i + 1] - 1; - Dinv_L[i] = 1. / workspace->L->val[si]; - } - } + for ( i = 0; i < workspace->L->n; ++i ) + { + si = workspace->L->start[i + 1] - 1; + Dinv_L[i] = 1. / workspace->L->val[si]; + } + } - jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); + jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif - { - if ( Dinv_U == NULL ) - { - if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); + if ( Dinv_U == NULL ) + { + if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } } - } - } - /* construct D^{-1}_U */ - if ( fresh_pre == TRUE ) - { + /* construct D^{-1}_U */ + if ( fresh_pre == TRUE ) + { #ifdef _OPENMP - #pragma omp for schedule(static) +#pragma omp for schedule(static) #endif - for ( i = 0; i < workspace->U->n; ++i ) - { - si = workspace->U->start[i]; - Dinv_U[i] = 1. / workspace->U->val[si]; - } - } + for ( i = 0; i < workspace->U->n; ++i ) + { + si = workspace->U->start[i]; + Dinv_U[i] = 1. / workspace->U->val[si]; + } + } - jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); + jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } break; default: fprintf( stderr, "Unrecognized preconditioner application method. 
Terminating...\n" ); exit( INVALID_INPUT ); break; - } - break; - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; } } @@ -2671,22 +2950,22 @@ int GMRES( const static_storage * const workspace, const control_params * const N = H->n; #ifdef _OPENMP - #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \ - shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr) +#pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \ + shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr) #endif { j = 0; itr = 0; #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } bnorm = Norm( b, N ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -2696,14 +2975,14 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* apply preconditioner to residual */ #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -2715,14 +2994,14 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* calculate r0 */ #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } Sparse_MatVec( H, x, workspace->b_prm ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); @@ -2731,14 +3010,14 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( control->cm_solver_pre_comp_type == DIAG_PC ) { #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -2748,14 +3027,14 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( control->cm_solver_pre_comp_type == DIAG_PC ) { #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -2764,14 +3043,14 @@ int GMRES( const static_storage * const workspace, const control_params * const else { #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -2781,7 +3060,7 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( control->cm_solver_pre_comp_type != DIAG_PC ) { #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); @@ -2789,7 +3068,7 @@ int GMRES( const static_storage * const workspace, const control_params * const apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0], itr == 0 ? 
fresh_pre : FALSE ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -2797,14 +3076,14 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); } ret_temp = Norm( workspace->v[0], N ); #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { workspace->g[0] = ret_temp; @@ -2812,7 +3091,7 @@ int GMRES( const static_storage * const workspace, const control_params * const Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -2823,7 +3102,7 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* matvec */ #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); @@ -2831,14 +3110,14 @@ int GMRES( const static_storage * const workspace, const control_params * const Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); } #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); @@ -2846,91 +3125,91 @@ int GMRES( const static_storage * const workspace, const control_params * const apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], FALSE ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); } -// if ( control->cm_solver_pre_comp_type == DIAG_PC ) -// { - /* apply modified Gram-Schmidt to orthogonalize the new residual */ + // if ( control->cm_solver_pre_comp_type == DIAG_PC ) + // { + /* apply modified Gram-Schmidt to orthogonalize the new residual */ #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif - { - time_start = Get_Time( ); - } - for ( i = 0; i <= j; i++ ) - { - ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); + { + time_start = Get_Time( ); + } + for ( i = 0; i <= j; i++ ) + { + ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif - { - workspace->h[i][j] = ret_temp; - } + { + workspace->h[i][j] = ret_temp; + } - Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); + Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); - } + } #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif - { - data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); - } -// } -// else -// { -// //TODO: investigate correctness of not explicitly orthogonalizing first few vectors -// /* apply modified Gram-Schmidt to orthogonalize the new residual */ -//#ifdef _OPENMP -// #pragma omp master -//#endif -// { -// time_start = Get_Time( ); -// } -//#ifdef _OPENMP -// #pragma omp single -//#endif -// { -// for ( i = 0; i < j - 1; i++ ) -// { -// workspace->h[i][j] = 0.0; -// } -// } -// -// for ( i = MAX(j - 1, 0); i <= j; i++ ) -// { -// ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); -//#ifdef _OPENMP -// #pragma omp single -//#endif -// { -// workspace->h[i][j] = ret_temp; -// } -// -// Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); -// } -//#ifdef _OPENMP -// #pragma omp master -//#endif -// { -// 
data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); -// } -// } - -#ifdef _OPENMP - #pragma omp master + { + data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); + } + // } + // else + // { + // //TODO: investigate correctness of not explicitly orthogonalizing first few vectors + // /* apply modified Gram-Schmidt to orthogonalize the new residual */ + //#ifdef _OPENMP + // #pragma omp master + //#endif + // { + // time_start = Get_Time( ); + // } + //#ifdef _OPENMP + // #pragma omp single + //#endif + // { + // for ( i = 0; i < j - 1; i++ ) + // { + // workspace->h[i][j] = 0.0; + // } + // } + // + // for ( i = MAX(j - 1, 0); i <= j; i++ ) + // { + // ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); + //#ifdef _OPENMP + // #pragma omp single + //#endif + // { + // workspace->h[i][j] = ret_temp; + // } + // + // Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); + // } + //#ifdef _OPENMP + // #pragma omp master + //#endif + // { + // data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); + // } + // } + +#ifdef _OPENMP +#pragma omp master #endif { time_start = Get_Time( ); } ret_temp = Norm( workspace->v[j + 1], N ); #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif { workspace->h[j + 1][j] = ret_temp; @@ -2940,61 +3219,61 @@ int GMRES( const static_storage * const workspace, const control_params * const 1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); } #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); -// if ( control->cm_solver_pre_comp_type == NONE_PC || -// control->cm_solver_pre_comp_type == DIAG_PC ) -// { - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for ( i = 0; i <= j; i++ ) + // if ( control->cm_solver_pre_comp_type == NONE_PC || + // control->cm_solver_pre_comp_type == DIAG_PC ) + // { + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for ( i = 0; i <= j; i++ ) + { + if ( i == j ) { - if ( i == j ) - { - cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); - workspace->hc[j] = workspace->h[j][j] / cc; - workspace->hs[j] = workspace->h[j + 1][j] / cc; - } + cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); + workspace->hc[j] = workspace->h[j][j] / cc; + workspace->hs[j] = workspace->h[j + 1][j] / cc; + } - tmp1 = workspace->hc[i] * workspace->h[i][j] + - workspace->hs[i] * workspace->h[i + 1][j]; - tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; + tmp1 = workspace->hc[i] * workspace->h[i][j] + + workspace->hs[i] * workspace->h[i + 1][j]; + tmp2 = -workspace->hs[i] * workspace->h[i][j] + + workspace->hc[i] * workspace->h[i + 1][j]; - workspace->h[i][j] = tmp1; - workspace->h[i + 1][j] = tmp2; - } -// } -// else -// { -// //TODO: investigate correctness of not explicitly orthogonalizing first few vectors -// /* Givens rotations on the upper-Hessenberg matrix to make it U */ -// for ( i = MAX(j - 1, 0); i <= j; i++ ) -// { -// if ( i == j ) -// { -// cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); -// workspace->hc[j] = workspace->h[j][j] / cc; -// workspace->hs[j] = workspace->h[j + 1][j] / cc; -// } -// -// tmp1 = workspace->hc[i] * workspace->h[i][j] + -// workspace->hs[i] * workspace->h[i + 1][j]; -// tmp2 = -workspace->hs[i] * workspace->h[i][j] + -// workspace->hc[i] 
* workspace->h[i + 1][j]; -// -// workspace->h[i][j] = tmp1; -// workspace->h[i + 1][j] = tmp2; -// } -// } + workspace->h[i][j] = tmp1; + workspace->h[i + 1][j] = tmp2; + } + // } + // else + // { + // //TODO: investigate correctness of not explicitly orthogonalizing first few vectors + // /* Givens rotations on the upper-Hessenberg matrix to make it U */ + // for ( i = MAX(j - 1, 0); i <= j; i++ ) + // { + // if ( i == j ) + // { + // cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) ); + // workspace->hc[j] = workspace->h[j][j] / cc; + // workspace->hs[j] = workspace->h[j + 1][j] / cc; + // } + // + // tmp1 = workspace->hc[i] * workspace->h[i][j] + + // workspace->hs[i] * workspace->h[i + 1][j]; + // tmp2 = -workspace->hs[i] * workspace->h[i][j] + + // workspace->hc[i] * workspace->h[i + 1][j]; + // + // workspace->h[i][j] = tmp1; + // workspace->h[i + 1][j] = tmp2; + // } + // } /* apply Givens rotations to the rhs as well */ tmp1 = workspace->hc[j] * workspace->g[j]; @@ -3006,13 +3285,13 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier #endif } /* solve Hy = g: H is now upper-triangular, do back-substitution */ #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { time_start = Get_Time( ); @@ -3044,7 +3323,7 @@ int GMRES( const static_storage * const workspace, const control_params * const Vector_Add( x, 1., workspace->p, N ); #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -3058,7 +3337,7 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP - #pragma omp master +#pragma omp master #endif { g_itr = itr; @@ -3286,8 +3565,8 @@ int CG( const static_storage * const workspace, const control_params * const con z = workspace->p; #ifdef _OPENMP - #pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \ - shared(itr, N, d, r, p, z) +#pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \ + shared(itr, N, d, r, p, z) #endif { b_norm = Norm( b, N ); @@ -3322,7 +3601,7 @@ int CG( const static_storage * const workspace, const control_params * const con } #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif itr = i; } @@ -3349,8 +3628,8 @@ int SDM( const static_storage * const workspace, const control_params * const co N = H->n; #ifdef _OPENMP - #pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \ - shared(itr, N) +#pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \ + shared(itr, N) #endif { b_norm = Norm( b, N ); @@ -3373,7 +3652,7 @@ int SDM( const static_storage * const workspace, const control_params * const co * (Dot function has persistent state in the form * of a shared global variable for the OpenMP version) */ #ifdef _OPENMP - #pragma omp barrier +#pragma omp barrier #endif tmp = Dot( workspace->d, workspace->q, N ); @@ -3386,7 +3665,7 @@ int SDM( const static_storage * const workspace, const control_params * const co } #ifdef _OPENMP - #pragma omp single +#pragma omp single #endif itr = i; } -- GitLab
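For reference, a minimal standalone sketch of the least-squares step that
SAI_PAR delegates to LAPACKE_dgels. This is an illustration only: it assumes
an MKL (or other LAPACKE-providing) installation, and the 3x2 system and its
values are made up. Each row of the approximate inverse is obtained the same
way: minimize ||A_dense * x - e_i||_2 via QR, with the solution returned in
the leading n entries of the right-hand-side array.

/* sketch: solve min ||A x - e||_2 for a small row-major system via dgels */
#include <stdio.h>
#include <mkl_lapacke.h>

int main( void )
{
    /* 3x2 row-major matrix, as in SAI_PAR's dense_matrix (lda = #cols) */
    double a[3 * 2] = { 1.0, 0.0,
                        0.0, 2.0,
                        1.0, 1.0 };
    /* right-hand side: a column of the identity (identity_pos = 0) */
    double e[3] = { 1.0, 0.0, 0.0 };
    MKL_INT m = 3, n = 2, nrhs = 1, lda = 2, ldb = 1, info;

    info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, a, lda, e, ldb );
    if ( info != 0 )
    {
        fprintf( stderr, "dgels failed: %lld\n", (long long) info );
        return 1;
    }
    /* the first n entries of e now hold the solution: x ~ (0.5556, -0.1111) */
    printf( "x = (%f, %f)\n", e[0], e[1] );
    return 0;
}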