/*----------------------------------------------------------------------
SerialReax - Reax Force Field Simulator
Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
#include "lin_alg.h"
#include "allocate.h"
#include "tool_box.h"
#include "vector.h"
#include "print_utils.h"
/* Intel MKL */
#if defined(HAVE_LAPACKE_MKL)
#include "mkl.h"
/* reference LAPACK */
#elif defined(HAVE_LAPACKE)
#include "lapacke.h"
#endif
typedef struct
{
unsigned int j;
real val;
} sparse_matrix_entry;
/* global to make OpenMP shared (Sparse_MatVec) */
#ifdef _OPENMP
real *b_local = NULL;
#endif
/* global to make OpenMP shared (apply_preconditioner) */
real *Dinv_L = NULL, *Dinv_U = NULL;
/* global to make OpenMP shared (tri_solve_level_sched) */
int levels = 1;
int levels_L = 1, levels_U = 1;
unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
unsigned int *row_levels, *level_rows, *level_rows_cnt;
unsigned int *top = NULL;
/* global to make OpenMP shared (graph_coloring) */
unsigned int *color = NULL;
unsigned int *to_color = NULL;
unsigned int *conflict = NULL;
unsigned int *conflict_cnt = NULL;
unsigned int *temp_ptr;
unsigned int *recolor = NULL;
unsigned int recolor_cnt;
unsigned int *color_top = NULL;
/* global to make OpenMP shared (sort_colors) */
unsigned int *permuted_row_col = NULL;
unsigned int *permuted_row_col_inv = NULL;
real *y_p = NULL;
/* global to make OpenMP shared (permute_vector) */
real *x_p = NULL;
unsigned int *mapping = NULL;
sparse_matrix *H_full;
sparse_matrix *H_p;
/* global to make OpenMP shared (jacobi_iter) */
real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
#if defined(TEST_MAT)
static sparse_matrix * create_test_mat( void )
{
unsigned int i, n;
sparse_matrix *H_test;
if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
{
fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
//3x3, SPD, store lower half
i = 0;
n = 0;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = 4.;
++i;
++n;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = 12.;
++i;
H_test->j[i] = 1;
H_test->val[i] = 37.;
++i;
++n;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = -16.;
++i;
H_test->j[i] = 1;
H_test->val[i] = -43.;
++i;
H_test->j[i] = 2;
H_test->val[i] = 98.;
++i;
++n;
H_test->start[n] = i;
return H_test;
}
#endif
/* Routine used with qsort for sorting nonzeros within a sparse matrix row
*
* v1/v2: pointers to sparse matrix entries within a row (sparse_matrix_entry)
*/
static int compare_matrix_entry(const void *v1, const void *v2)
{
const sparse_matrix_entry * const e1 = (const sparse_matrix_entry *) v1;
const sparse_matrix_entry * const e2 = (const sparse_matrix_entry *) v2;
/* larger element has larger column index; compare rather than subtract
* to avoid wrap-around when the unsigned difference overflows int */
return (e1->j > e2->j) - (e1->j < e2->j);
}
/* Routine used for sorting nonzeros within a sparse matrix row;
* internally, a combination of qsort and manual sorting is utilized
* (parallel calls to qsort when multithreading, rows mapped to threads)
*
* A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
*/
void Sort_Matrix_Rows( sparse_matrix * const A )
{
unsigned int i, j, si, ei;
sparse_matrix_entry *temp;
#ifdef _OPENMP
// #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
#endif
{
temp = (sparse_matrix_entry *) smalloc( A->n * sizeof(sparse_matrix_entry),
"Sort_Matrix_Rows::temp" );
/* sort each row of A using column indices */
#ifdef _OPENMP
// #pragma omp for schedule(guided)
#endif
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
ei = A->start[i + 1];
for ( j = 0; j < (ei - si); ++j )
{
(temp + j)->j = A->j[si + j];
(temp + j)->val = A->val[si + j];
}
/* polymorphic sort in standard C library using column indices */
qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );
for ( j = 0; j < (ei - si); ++j )
{
A->j[si + j] = (temp + j)->j;
A->val[si + j] = (temp + j)->val;
}
}
sfree( temp, "Sort_Matrix_Rows::temp" );
}
}
/* Convert a symmetric, half-stored sparse matrix into
* a full-stored sparse matrix
*
* A: symmetric sparse matrix, lower half stored in CSR
* A_full: resultant full sparse matrix in CSR
* If A_full is NULL, allocate space, otherwise do not
*
* Assumptions:
* A has non-zero diagonals
* Each row of A has at least one non-zero (i.e., no rows with all zeros) */
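/* Note on sizing: the full matrix has 2 * A->m - A->n nonzeros, since every
* strictly-lower stored entry gains a mirrored upper entry while the A->n
* diagonal entries appear only once; each full row i is emitted as the
* stored lower portion (from A) followed by the upper portion (from A^T). */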
static void compute_full_sparse_matrix( const sparse_matrix * const A,
sparse_matrix ** A_full )
{
int count, i, pj;
sparse_matrix *A_t;
if ( *A_full == NULL )
{
if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
{
fprintf( stderr, "not enough memory for full A. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
}
if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for full A. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
/* compute A^T (the upper triangular half of the full matrix) */
Transpose( A, A_t );
count = 0;
for ( i = 0; i < A->n; ++i )
{
(*A_full)->start[i] = count;
/* A: symmetric, lower triangular portion only stored */
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
(*A_full)->val[count] = A->val[pj];
(*A_full)->j[count] = A->j[pj];
++count;
}
/* A^T: symmetric, upper triangular portion only stored;
* skip diagonal from A^T, as included from A above */
for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
{
(*A_full)->val[count] = A_t->val[pj];
(*A_full)->j[count] = A_t->j[pj];
++count;
}
}
(*A_full)->start[i] = count;
Deallocate_Matrix( A_t );
}
/* Setup routines for sparse approximate inverse preconditioner
*
* A: symmetric sparse matrix, lower half stored in CSR
* filter: fraction in (0,1] controlling how aggressively entries of A are
*    thresholded when building the sparsity pattern (1.0 keeps all entries)
* A_spar_patt: resulting sparsity pattern for the approximate inverse (output)
*
* Assumptions:
* A has non-zero diagonals
* Each row of A has at least one non-zero (i.e., no rows with all zeros) */
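/* Pattern selection by value thresholding: with matrix extrema min and max,
* entries satisfying A(i,j) >= min + (max - min) * (1.0 - filter) are kept,
* and diagonal entries are always retained. For example, filter = 1.0 keeps
* every entry, while filter near 0.0 keeps only values close to the maximum. */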
void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix ** A_full,
sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full,
sparse_matrix ** A_app_inv, const real filter )
{
int i, pj, size;
real min, max, threshold, val;
min = 0.0;
max = 0.0;
if ( *A_spar_patt == NULL )
{
if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
}
else if ( ((*A_spar_patt)->m) < (A->m) )
{
Deallocate_Matrix( *A_spar_patt );
if ( Allocate_Matrix( A_spar_patt, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
}
/* find min and max elements of matrix */
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
val = A->val[pj];
if ( pj == 0 )
{
min = val;
max = val;
}
else
{
if ( min > val )
{
min = val;
}
if ( max < val )
{
max = val;
}
}
}
}
threshold = min + (max - min) * (1.0 - filter);
/* fill sparsity pattern */
for ( size = 0, i = 0; i < A->n; ++i )
{
(*A_spar_patt)->start[i] = size;
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
if ( ( A->val[pj] >= threshold ) || ( A->j[pj] == i ) )
{
(*A_spar_patt)->val[size] = A->val[pj];
(*A_spar_patt)->j[size] = A->j[pj];
size++;
}
}
}
(*A_spar_patt)->start[A->n] = size;
compute_full_sparse_matrix( A, A_full );
compute_full_sparse_matrix( *A_spar_patt, A_spar_patt_full );
/* A_app_inv has the same sparsity pattern
* as A_spar_patt_full (omit non-zero values) */
if ( Allocate_Matrix( A_app_inv, (*A_spar_patt_full)->n, (*A_spar_patt_full)->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
}
void Calculate_Droptol( const sparse_matrix * const A,
real * const droptol, const real dtol )
{
int i, j, k;
real val;
#ifdef _OPENMP
static real *droptol_local;
unsigned int tid;
#endif
#ifdef _OPENMP
#pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
#endif
{
#ifdef _OPENMP
tid = omp_get_thread_num();
#pragma omp master
{
/* keep droptol_local for program duration to avoid allocate/free
* overhead per Calculate_Droptol call */
if ( droptol_local == NULL )
{
droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real),
"Calculate_Droptol::droptol_local" );
}
}
#pragma omp barrier
#endif
/* init droptol to 0 */
for ( i = 0; i < A->n; ++i )
{
#ifdef _OPENMP
droptol_local[tid * A->n + i] = 0.0;
#else
droptol[i] = 0.0;
#endif
}
#ifdef _OPENMP
#pragma omp barrier
#endif
/* calculate square of the norm of each row */
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < A->n; ++i )
{
for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
{
j = A->j[k];
val = A->val[k];
#ifdef _OPENMP
droptol_local[tid * A->n + i] += val * val;
droptol_local[tid * A->n + j] += val * val;
#else
droptol[i] += val * val;
droptol[j] += val * val;
#endif
}
// diagonal entry
val = A->val[k];
#ifdef _OPENMP
droptol_local[tid * A->n + i] += val * val;
#else
droptol[i] += val * val;
#endif
}
#ifdef _OPENMP
#pragma omp barrier
#pragma omp for schedule(static)
for ( i = 0; i < A->n; ++i )
{
droptol[i] = 0.0;
for ( k = 0; k < omp_get_num_threads(); ++k )
{
droptol[i] += droptol_local[k * A->n + i];
}
}
#pragma omp barrier
#endif
/* calculate local droptol for each row */
//fprintf( stderr, "droptol: " );
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < A->n; ++i )
{
//fprintf( stderr, "%f-->", droptol[i] );
droptol[i] = SQRT( droptol[i] ) * dtol;
//fprintf( stderr, "%f ", droptol[i] );
}
//fprintf( stderr, "\n" );
}
}
int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
{
int i, pj;
int fillin;
real val;
fillin = 0;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) private(i, pj, val) reduction(+: fillin)
#endif
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
val = A->val[pj];
if ( FABS(val) > droptol[i] )
{
++fillin;
}
}
}
return fillin + A->n;
}
#if defined(HAVE_SUPERLU_MT)
real SuperLU_Factorize( const sparse_matrix * const A,
sparse_matrix * const L, sparse_matrix * const U )
{
unsigned int i, pj, count, *Ltop, *Utop, r;
sparse_matrix *A_t;
SuperMatrix A_S, AC_S, L_S, U_S;
NCformat *A_S_store;
SCPformat *L_S_store;
NCPformat *U_S_store;
superlumt_options_t superlumt_options;
pxgstrf_shared_t pxgstrf_shared;
pdgstrf_threadarg_t *pdgstrf_threadarg;
int_t nprocs;
fact_t fact;
trans_t trans;
yes_no_t refact, usepr;
real u, drop_tol;
real *a, *at;
int_t *asub, *atsub, *xa, *xat;
int_t *perm_c; /* column permutation vector */
int_t *perm_r; /* row permutations from partial pivoting */
void *work;
int_t info, lwork;
int_t permc_spec, panel_size, relax;
Gstat_t Gstat;
flops_t flopcnt;
/* Default parameters to control factorization. */
#ifdef _OPENMP
//TODO: set as global parameter and use
#pragma omp parallel \
default(none) shared(nprocs)
{
#pragma omp master
{
/* SuperLU_MT spawns threads internally, so set and pass parameter */
nprocs = omp_get_num_threads();
}
}
#else
nprocs = 1;
#endif
// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
fact = DOFACT; /* factor from scratch */
trans = NOTRANS;
refact = NO; /* first time factorization */
//TODO: add to control file and use the value there to set these
panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
u = 1.0; /* diagonal pivoting threshold */
usepr = NO;
drop_tol = 0.0;
work = NULL;
lwork = 0;
#if defined(DEBUG)
fprintf( stderr, "nprocs = %d\n", nprocs );
fprintf( stderr, "Panel size = %d\n", panel_size );
fprintf( stderr, "Relax = %d\n", relax );
#endif
if ( !(perm_r = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for perm_r[].");
}
if ( !(perm_c = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for perm_c[].");
}
if ( !(superlumt_options.etree = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for etree[].");
}
if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for colcnt_h[].");
}
if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for part_super__h[].");
}
a = (real*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(real),
"SuperLU_Factorize::a" );
asub = (int_t*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(int_t),
"SuperLU_Factorize::asub" );
xa = (int_t*) smalloc( (A->n + 1) * sizeof(int_t),
"SuperLU_Factorize::xa" );
Ltop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
"SuperLU_Factorize::Ltop" );
Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
"SuperLU_Factorize::Utop" );
if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
/* Set up the sparse matrix data structure for A. */
Transpose( A, A_t );
count = 0;
for ( i = 0; i < A->n; ++i )
{
xa[i] = count;
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
a[count] = A->val[pj];
asub[count] = A->j[pj];
++count;
}
for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
{
a[count] = A_t->val[pj];
asub[count] = A_t->j[pj];
++count;
}
}
xa[i] = count;
dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
&at, &atsub, &xat );
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
fprintf( stderr, "%6d", asub[i] );
fprintf( stderr, "\n" );
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
fprintf( stderr, "%6.1f", a[i] );
fprintf( stderr, "\n" );
for ( i = 0; i <= A->n; ++i )
fprintf( stderr, "%6d", xa[i] );
fprintf( stderr, "\n" );
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
fprintf( stderr, "%6d", atsub[i] );
fprintf( stderr, "\n" );
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
fprintf( stderr, "%6.1f", at[i] );
fprintf( stderr, "\n" );
for ( i = 0; i <= A->n; ++i )
fprintf( stderr, "%6d", xat[i] );
fprintf( stderr, "\n" );
A_S.Stype = SLU_NC; /* column-wise, no supernode */
A_S.Dtype = SLU_D; /* double-precision */
A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
A_S.nrow = A->n;
A_S.ncol = A->n;
A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
A_S_store = (NCformat *) A_S.Store;
A_S_store->nnz = 2 * A->start[A->n] - A->n;
A_S_store->nzval = at;
A_S_store->rowind = atsub;
A_S_store->colptr = xat;
/* ------------------------------------------------------------
Allocate storage and initialize statistics variables.
------------------------------------------------------------*/
StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
StatInit( A->n, nprocs, &Gstat );
/* ------------------------------------------------------------
Get column permutation vector perm_c[], according to permc_spec:
permc_spec = 0: natural ordering
permc_spec = 1: minimum degree ordering on structure of A'*A
permc_spec = 2: minimum degree ordering on structure of A'+A
permc_spec = 3: approximate minimum degree for unsymmetric matrices
------------------------------------------------------------*/
permc_spec = 0;
get_perm_c( permc_spec, &A_S, perm_c );
/* ------------------------------------------------------------
Initialize the option structure superlumt_options using the
user-input parameters;
Apply perm_c to the columns of original A to form AC.
------------------------------------------------------------*/
pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
u, usepr, drop_tol, perm_c, perm_r,
work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
fprintf( stderr, "\n" );
/* ------------------------------------------------------------
Compute the LU factorization of A.
The following routine will create nprocs threads.
------------------------------------------------------------*/
pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
fprintf( stderr, "INFO: %d\n", info );
flopcnt = 0;
for (i = 0; i < nprocs; ++i)
{
flopcnt += Gstat.procstat[i].fcops;
}
Gstat.ops[FACT] = flopcnt;
/* store pointers needed below for format conversion (not only for debugging) */
L_S_store = (SCPformat *) L_S.Store;
U_S_store = (NCPformat *) U_S.Store;
#if defined(DEBUG)
printf("\n** Result of sparse LU **\n");
printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
fflush( stdout );
#endif
/* convert L and R from SuperLU formats to CSR */
memset( Ltop, 0, (A->n + 1) * sizeof(int) );
memset( Utop, 0, (A->n + 1) * sizeof(int) );
memset( L->start, 0, (A->n + 1) * sizeof(int) );
memset( U->start, 0, (A->n + 1) * sizeof(int) );
for ( i = 0; i < 2 * L_S_store->nnz; ++i )
fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
fprintf( stderr, "\n" );
for ( i = 0; i < 2 * U_S_store->nnz; ++i )
fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
fprintf( stderr, "\n" );
printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
for ( i = 0; i < A->n; ++i )
{
fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
//TODO: correct for SCPformat for L?
//for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
// {
// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
// }
fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
{
++Utop[U_S_store->rowind[pj] + 1];
fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
}
}
for ( i = 1; i <= A->n; ++i )
{
// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] );
// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
}
for ( i = 0; i < A->n; ++i )
{
// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
// {
// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
// L->entries[Ltop[r]].j = r;
// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
// ++Ltop[r];
// }
for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
{
r = U_S_store->rowind[pj];
U->j[Utop[r]] = i;
U->val[Utop[r]] = ((real*)U_S_store->nzval)[pj];
++Utop[r];
}
}
/* ------------------------------------------------------------
Deallocate storage after factorization.
------------------------------------------------------------*/
pxgstrf_finalize( &superlumt_options, &AC_S );
Deallocate_Matrix( A_t );
sfree( xa, "SuperLU_Factorize::xa" );
sfree( asub, "SuperLU_Factorize::asub" );
sfree( a, "SuperLU_Factorize::a" );
SUPERLU_FREE( perm_r );
SUPERLU_FREE( perm_c );
SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
SUPERLU_FREE( A_S.Store );
if ( lwork == 0 )
{
Destroy_SuperNode_SCP(&L_S);
Destroy_CompCol_NCP(&U_S);
}
else if ( lwork > 0 )
{
SUPERLU_FREE(work);
}
StatFree(&Gstat);
sfree( Utop, "SuperLU_Factorize::Utop" );
sfree( Ltop, "SuperLU_Factorize::Ltop" );
//TODO: return iters
return 0.;
}
#endif
/* Diagonal (Jacobi) preconditioner computation */
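/* Here the preconditioner is M = diag(H), so applying M^{-1} reduces to the
* elementwise scaling x[i] = y[i] / H(i,i). Since H is half-stored lower
* triangular in CSR, the diagonal of row i is the last entry of that row,
* H->val[H->start[i + 1] - 1]; a zero diagonal falls back to 1.0. */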
real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
{
unsigned int i;
real start;
start = Get_Time( );
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) private(i)
#endif
for ( i = 0; i < H->n; ++i )
{
if ( H->val[H->start[i + 1] - 1] != 0.0 )
{
Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
}
else
{
Hdia_inv[i] = 1.0;
}
}
return Get_Timing_Info( start );
}
/* Incomplete Cholesky factorization with dual thresholding */
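/* Dual dropping rule: an entry of row i enters the elimination only if
* |A(i,j)| > droptol[i], and after row i of L is computed, off-diagonals are
* kept only if |L(i,j)| > droptol[i] / L(i,i); the diagonal is always kept. */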
real ICHOLT( const sparse_matrix * const A, const real * const droptol,
sparse_matrix * const L, sparse_matrix * const U )
{
int *tmp_j;
real *tmp_val;
int i, j, pj, k1, k2, tmptop, Ltop;
real val, start;
unsigned int *Utop;
start = Get_Time( );
Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
"ICHOLT::Utop" );
tmp_j = (int*) smalloc( A->n * sizeof(int),
"ICHOLT::tmp_j" );
tmp_val = (real*) smalloc( A->n * sizeof(real),
"ICHOLT::tmp_val" );
// clear variables
Ltop = 0;
tmptop = 0;
memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
memset( Utop, 0, (A->n + 1) * sizeof(unsigned int) );
for ( i = 0; i < A->n; ++i )
{
L->start[i] = Ltop;
tmptop = 0;
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
j = A->j[pj];
val = A->val[pj];
if ( FABS(val) > droptol[i] )
{
k1 = 0;
k2 = L->start[j];
while ( k1 < tmptop && k2 < L->start[j + 1] )
{
if ( tmp_j[k1] < L->j[k2] )
{
++k1;
}
else if ( tmp_j[k1] > L->j[k2] )
{
++k2;
}
else
{
val -= (tmp_val[k1++] * L->val[k2++]);
}
}
// L matrix is lower triangular,
// so right before the start of next row comes jth diagonal
val /= L->val[L->start[j + 1] - 1];
tmp_j[tmptop] = j;
tmp_val[tmptop] = val;
++tmptop;
}
}
// sanity check
if ( A->j[pj] != i )
{
fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
exit( NUMERIC_BREAKDOWN );
}
// compute the ith diagonal in L
val = A->val[pj];
for ( k1 = 0; k1 < tmptop; ++k1 )
{
val -= (tmp_val[k1] * tmp_val[k1]);
}
#if defined(DEBUG)
if ( val < 0.0 )
{
fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
exit( NUMERIC_BREAKDOWN );
}
#endif
tmp_j[tmptop] = i;
tmp_val[tmptop] = SQRT( val );
// apply the dropping rule once again
//fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
//for( k1 = 0; k1<= tmptop; ++k1 )
// fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val );
//fprintf( stderr, "\n" );
//fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
for ( k1 = 0; k1 < tmptop; ++k1 )
{
if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
{
L->j[Ltop] = tmp_j[k1];
L->val[Ltop] = tmp_val[k1];
U->start[tmp_j[k1] + 1]++;
++Ltop;
//fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val );
}
}
// keep the diagonal in any case
L->j[Ltop] = tmp_j[k1];
L->val[Ltop] = tmp_val[k1];
++Ltop;
//fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val );
}
L->start[i] = Ltop;
// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
/* U = L^T (Cholesky factorization) */
Transpose( L, U );
// for ( i = 1; i <= U->n; ++i )
// {
// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
// }
// for ( i = 0; i < L->n; ++i )
// {
// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
// {
// j = L->j[pj];
// U->j[Utop[j]] = i;
// U->val[Utop[j]] = L->val[pj];
// Utop[j]++;
// }
// }
// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
sfree( tmp_val, "ICHOLT::tmp_val" );
sfree( tmp_j, "ICHOLT::tmp_j" );
sfree( Utop, "ICHOLT::Utop" );
return Get_Timing_Info( start );
}
/* Fine-grained (parallel) incomplete Cholesky factorization
*
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp. */
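/* Fixed-point formulation: after scaling A to unit diagonal, each sweep
* updates every nonzero of U^T in parallel using values from the previous
* sweep (asynchronous, Jacobi-style):
* U^T(i,j) = ( A(i,j) - dot(U^T(i,1:j-1), U^T(j,1:j-1)) ) / U^T(j,j), i > j
* U^T(j,j) = SQRT( A(j,j) - dot(U^T(j,1:j-1), U^T(j,1:j-1)) ) */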
#if defined(TESTING)
real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
sparse_matrix * const U_t, sparse_matrix * const U )
{
unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
real *D, *D_inv, sum, start;
sparse_matrix *DAD;
int *Utop;
start = Get_Time( );
D = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D" );
D_inv = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D_inv" );
Utop = (int*) smalloc( (A->n + 1) * sizeof(int), "ICHOL_PAR::Utop" );
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(D_inv, D) private(i)
#endif
for ( i = 0; i < A->n; ++i )
{
D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
D[i] = 1. / D_inv[i];
}
memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
/* to get convergence, A must have unit diagonal, so apply
* transformation DAD, where D = D(1./SQRT(D(A))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
#pragma omp parallel for schedule(guided) \
default(none) shared(DAD, D_inv, D) private(i, pj)
#endif
for ( i = 0; i < A->n; ++i )
{
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
DAD->j[pj] = A->j[pj];
DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
}
/* diagonal */
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.;
}
/* initial guesses for U^T,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
#endif
for ( j = 0; j < A->start[A->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 0; k <= A->n; ++k )
{
if ( U_t->start[k] > j )
{
x = U_t->start[k - 1];
ei_x = U_t->start[k];
break;
}
}
/* column bounds of current nonzero */
y = U_t->start[U_t->j[j]];
ei_y = U_t->start[U_t->j[j] + 1];
/* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
while ( U_t->j[x] < U_t->j[j] &&
U_t->j[y] < U_t->j[j] &&
x < ei_x && y < ei_y )
{
if ( U_t->j[x] == U_t->j[y] )
{
sum += (U_t->val[x] * U_t->val[y]);
++x;
++y;
}
else if ( U_t->j[x] < U_t->j[y] )
{
++x;
}
else
{
++y;
}
}
sum = DAD->val[j] - sum;
/* diagonal entries */
if ( (k - 1) == U_t->j[j] )
{
/* sanity check */
if ( sum < ZERO )
{
fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
#if defined(DEBUG_FOCUS)
fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
k - 1, A->entries[j].j, A->entries[j].val );
fprintf( stderr, "sum = %10.3f\n", sum);
#endif
exit(NUMERIC_BREAKDOWN);
}
U_t->val[j] = SQRT( sum );
}
/* non-diagonal entries */
else
{
U_t->val[j] = sum / U_t->val[ei_y - 1];
}
}
}
/* apply inverse transformation D^{-1}U^{T},
* since DAD \approx U^{T}U, so
* D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
#ifdef _OPENMP
#pragma omp parallel for schedule(guided) \
default(none) shared(D_inv) private(i, pj)
#endif
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
U_t->val[pj] *= D_inv[i];
}
}
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif
/* transpose U^{T} and copy into U */
Transpose( U_t, U );
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif
Deallocate_Matrix( DAD );
sfree( D_inv, "ICHOL_PAR::D_inv" );
sfree( D, "ICHOL_PAR::D" );
sfree( Utop, "ICHOL_PAR::Utop" );
return Get_Timing_Info( start );
}
#endif
/* Fine-grained (parallel) incomplete LU factorization
*
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp.
*
* A: symmetric, half-stored (lower triangular), CSR format
* sweeps: number of loops over non-zeros for computation
* L / U: factorized triangular matrices (A \approx LU), CSR format */
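/* Fixed-point formulation: after scaling A to unit diagonal, each sweep
* updates all nonzeros in parallel using values from the previous sweep:
* L(i,j) = ( A(i,j) - dot(L(i,1:j-1), U(1:j-1,j)) ) / U(j,j), for i > j
* U(i,j) = A(i,j) - dot(L(i,1:i-1), U(1:i-1,j)), for i <= j
* with L(i,i) = 1 fixed by convention. */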
real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
sparse_matrix * const L, sparse_matrix * const U )
{
unsigned int i, j, k, pj, x, y, ei_x, ei_y;
real *D, *D_inv, sum, start;
sparse_matrix *DAD;
start = Get_Time( );
D = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D" );
D_inv = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D_inv" );
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(D, D_inv) private(i)
#endif
for ( i = 0; i < A->n; ++i )
{
D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
D[i] = 1.0 / D_inv[i];
// printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
}
/* to get convergence, A must have unit diagonal, so apply
* transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, D) private(i, pj)
#endif
for ( i = 0; i < A->n; ++i )
{
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
DAD->j[pj] = A->j[pj];
DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
}
/* diagonal */
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.0;
}
/* initial guesses for L and U,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
/* store U^T in CSR for row-wise access and transpose later */
memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
/* L has unit diagonal, by convention */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) private(i)
#endif
for ( i = 0; i < A->n; ++i )
{
L->val[L->start[i + 1] - 1] = 1.0;
}
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero in L */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
#endif
for ( j = 0; j < DAD->start[DAD->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 1; k <= DAD->n; ++k )
{
if ( DAD->start[k] > j )
{
x = DAD->start[k - 1];
ei_x = DAD->start[k];
break;
}
}
/* determine column bounds of current nonzero */
y = DAD->start[DAD->j[j]];
ei_y = DAD->start[DAD->j[j] + 1];
/* sparse dot product:
* dot( L(i,1:j-1), U(1:j-1,j) ) */
while ( L->j[x] < L->j[j] &&
L->j[y] < L->j[j] &&
x < ei_x && y < ei_y )
{
if ( L->j[x] == L->j[y] )
{
sum += (L->val[x] * U->val[y]);
++x;
++y;
}
else if ( L->j[x] < L->j[y] )
{
++x;
}
else
{
++y;
}
}
if ( j != ei_x - 1 )
{
L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1];
}
}
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
#endif
for ( j = 0; j < DAD->start[DAD->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 1; k <= DAD->n; ++k )
{
if ( DAD->start[k] > j )
{
x = DAD->start[k - 1];
ei_x = DAD->start[k];
break;
}
}
/* determine column bounds of current nonzero */
y = DAD->start[DAD->j[j]];
ei_y = DAD->start[DAD->j[j] + 1];
/* sparse dot product:
* dot( L(i,1:i-1), U(1:i-1,j) ) */
while ( U->j[x] < U->j[j] &&
U->j[y] < U->j[j] &&
x < ei_x && y < ei_y )
{
if ( U->j[x] == U->j[y] )
{
sum += (L->val[y] * U->val[x]);
++x;
++y;
}
else if ( U->j[x] < U->j[y] )
{
++x;
}
else
{
++y;
}
}
U->val[j] = DAD->val[j] - sum;
}
}
/* apply inverse transformation:
* since DAD \approx LU, then
* D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, D_inv) private(i, pj)
#endif
for ( i = 0; i < DAD->n; ++i )
{
for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
{
L->val[pj] = D_inv[i] * L->val[pj];
/* currently storing U^T, so use row index instead of column index */
U->val[pj] = U->val[pj] * D_inv[i];
}
}
Transpose_I( U );
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(L): %d, max: %d\n", L->start[L->n], L->n * 50 );
fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif
Deallocate_Matrix( DAD );
sfree( D_inv, "ILU_PAR::D_inv" );
sfree( D, "ILU_PAR::D_inv" );
return Get_Timing_Info( start );
}
/* Fine-grained (parallel) incomplete LU factorization with thresholding
*
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp.
*
* A: symmetric, half-stored (lower triangular), CSR format
* droptol: row-wise tolerances used for dropping
* sweeps: number of loops over non-zeros for computation
* L / U: factorized triangular matrices (A \approx LU), CSR format */
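/* Performs the same sweeps as ILU_PAR on temporaries (L_temp, U_temp), then
* applies an a posteriori dropping rule: an off-diagonal is kept only if
* |val| > |droptol[i] / diag|, where diag is the (last-stored) diagonal
* entry of the corresponding row; diagonals are always kept. */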
real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U )
{
unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop;
real *D, *D_inv, sum, start;
sparse_matrix *DAD, *L_temp, *U_temp;
start = Get_Time( );
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
Allocate_Matrix( &L_temp, A->n, A->m ) == FAILURE ||
Allocate_Matrix( &U_temp, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
D = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D" );
D_inv = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D_inv" );
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(D, D_inv) private(i)
#endif
for ( i = 0; i < A->n; ++i )
{
D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
D[i] = 1.0 / D_inv[i];
}
/* to get convergence, A must have unit diagonal, so apply
* transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, D) private(i, pj)
#endif
for ( i = 0; i < A->n; ++i )
{
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
DAD->j[pj] = A->j[pj];
DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
}
/* diagonal */
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.0;
}
/* initial guesses for L and U,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( L_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
memcpy( L_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( L_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
/* store U^T in CSR for row-wise access and transpose later */
memcpy( U_temp->start, DAD->start, sizeof(int) * (DAD->n + 1) );
memcpy( U_temp->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
/* L has unit diagonal, by convention */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) private(i) shared(L_temp)
#endif
for ( i = 0; i < A->n; ++i )
{
L_temp->val[L_temp->start[i + 1] - 1] = 1.0;
}
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero in L */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
#endif
for ( j = 0; j < DAD->start[DAD->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 1; k <= DAD->n; ++k )
{
if ( DAD->start[k] > j )
{
x = DAD->start[k - 1];
ei_x = DAD->start[k];
break;
}
}
/* determine column bounds of current nonzero */
y = DAD->start[DAD->j[j]];
ei_y = DAD->start[DAD->j[j] + 1];
/* sparse dot product:
* dot( L(i,1:j-1), U(1:j-1,j) ) */
while ( L_temp->j[x] < L_temp->j[j] &&
L_temp->j[y] < L_temp->j[j] &&
x < ei_x && y < ei_y )
{
if ( L_temp->j[x] == L_temp->j[y] )
{
sum += (L_temp->val[x] * U_temp->val[y]);
++x;
++y;
}
else if ( L_temp->j[x] < L_temp->j[y] )
{
++x;
}
else
{
++y;
}
}
if ( j != ei_x - 1 )
{
L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1];
}
}
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
#endif
for ( j = 0; j < DAD->start[DAD->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 1; k <= DAD->n; ++k )
{
if ( DAD->start[k] > j )
{
x = DAD->start[k - 1];
ei_x = DAD->start[k];
break;
}
}
/* determine column bounds of current nonzero */
y = DAD->start[DAD->j[j]];
ei_y = DAD->start[DAD->j[j] + 1];
/* sparse dot product:
* dot( L(i,1:i-1), U(1:i-1,j) ) */
while ( U_temp->j[x] < U_temp->j[j] &&
U_temp->j[y] < U_temp->j[j] &&
x < ei_x && y < ei_y )
{
if ( U_temp->j[x] == U_temp->j[y] )
{
sum += (L_temp->val[y] * U_temp->val[x]);
++x;
++y;
}
else if ( U_temp->j[x] < U_temp->j[y] )
{
++x;
}
else
{
++y;
}
}
U_temp->val[j] = DAD->val[j] - sum;
}
}
/* apply inverse transformation:
* since DAD \approx LU, then
* D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
#endif
for ( i = 0; i < DAD->n; ++i )
{
for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj )
{
L_temp->val[pj] = D_inv[i] * L_temp->val[pj];
/* currently storing U^T, so use row index instead of column index */
U_temp->val[pj] = U_temp->val[pj] * D_inv[i];
}
}
/* apply the dropping rule */
Ltop = 0;
Utop = 0;
for ( i = 0; i < DAD->n; ++i )
{
L->start[i] = Ltop;
U->start[i] = Utop;
for ( pj = L_temp->start[i]; pj < L_temp->start[i + 1] - 1; ++pj )
{
if ( FABS( L_temp->val[pj] ) > FABS( droptol[i] / L_temp->val[L_temp->start[i + 1] - 1] ) )
{
L->j[Ltop] = L_temp->j[pj];
L->val[Ltop] = L_temp->val[pj];
++Ltop;
}
}
/* diagonal */
L->j[Ltop] = L_temp->j[pj];
L->val[Ltop] = L_temp->val[pj];
++Ltop;
for ( pj = U_temp->start[i]; pj < U_temp->start[i + 1] - 1; ++pj )
{
if ( FABS( U_temp->val[pj] ) > FABS( droptol[i] / U_temp->val[U_temp->start[i + 1] - 1] ) )
{
U->j[Utop] = U_temp->j[pj];
U->val[Utop] = U_temp->val[pj];
++Utop;
}
}
/* diagonal */
U->j[Utop] = U_temp->j[pj];
U->val[Utop] = U_temp->val[pj];
++Utop;
}
L->start[i] = Ltop;
U->start[i] = Utop;
Transpose_I( U );
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(L): %d\n", L->start[L->n] );
fprintf( stderr, "nnz(U): %d\n", U->start[U->n] );
#endif
Deallocate_Matrix( U_temp );
Deallocate_Matrix( L_temp );
Deallocate_Matrix( DAD );
sfree( D_inv, "ILUT_PAR::D_inv" );
sfree( D, "ILUT_PAR::D_inv" );
return Get_Timing_Info( start );
}
#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
/* Compute a sparse approximate inverse M \approx A^{-1} by minimizing
* ||AM - I||_F over a prescribed sparsity pattern, which decouples into
* independent dense least-squares problems (one per row/column of M)
*
* A: symmetric, sparse matrix, stored in full CSR format
* A_spar_patt: sparse matrix used as template sparsity pattern
* for approximating the inverse, stored in full CSR format
* A_app_inv: approximate inverse to A, stored in full CSR format (result) */
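/* Per row i: the nonzero columns of A_spar_patt row i select the N unknowns;
* the union of A's nonzero columns over those rows selects the M equations
* (M >= N). The dense M x N system is solved against e_i (a column of the
* identity) by LAPACKE_dgels in the least-squares sense, and the N solution
* values become the nonzeros of row i of A_app_inv. */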
real sparse_approx_inverse( const sparse_matrix * const A,
const sparse_matrix * const A_spar_patt,
sparse_matrix ** A_app_inv )
{
int i, k, pj, j_temp, identity_pos;
int N, M, d_i, d_j;
lapack_int m, n, nrhs, lda, ldb, info;
int *pos_x, *pos_y;
real start;
real *e_j, *dense_matrix;
char *X, *Y;
start = Get_Time( );
(*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n];
#ifdef _OPENMP
#pragma omp parallel default(none) \
private(i, k, pj, j_temp, identity_pos, N, M, d_i, d_j, m, n, \
nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) \
shared(A_app_inv, stderr)
#endif
{
X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" );
Y = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::Y" );
pos_x = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_x" );
pos_y = (int *) smalloc( sizeof(int) * A->n, "Sparse_Approx_Inverse::pos_y" );
for ( i = 0; i < A->n; ++i )
{
X[i] = 0;
Y[i] = 0;
pos_x[i] = 0;
pos_y[i] = 0;
}
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < A_spar_patt->n; ++i )
{
N = 0;
M = 0;
// find column indices of nonzeros (which will be the column indices of the dense matrix)
for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->start[i + 1]; ++pj )
{
j_temp = A_spar_patt->j[pj];
Y[j_temp] = 1;
pos_y[j_temp] = N;
++N;
// for each of those indices
// search through the row of full A of that index
for ( k = A->start[j_temp]; k < A->start[j_temp + 1]; ++k )
{
// and accumulate the nonzero column indices to serve as the row indices of the dense matrix
X[A->j[k]] = 1;
}
}
// enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix
identity_pos = M;
for ( k = 0; k < A->n; k++)
{
if ( X[k] != 0 )
{
pos_x[M] = k;
if ( k == i )
{
identity_pos = M;
}
++M;
}
}
// allocate memory for NxM dense matrix
dense_matrix = (real *) smalloc( sizeof(real) * N * M,
"Sparse_Approx_Inverse::dense_matrix" );
// fill in the entries of dense matrix
for ( d_i = 0; d_i < M; ++d_i)
{
// all rows are initialized to zero
for ( d_j = 0; d_j < N; ++d_j )
{
dense_matrix[d_i * N + d_j] = 0.0;
}
// change the value if any of the column indices is seen
for ( d_j = A->start[pos_x[d_i]];
d_j < A->start[pos_x[d_i] + 1]; ++d_j )
{
if ( Y[A->j[d_j]] == 1 )
{
dense_matrix[d_i * N + pos_y[A->j[d_j]]] = A->val[d_j];
}
}
}
/* create the right hand side of the linear equation
that is the full column of the identity matrix*/
e_j = (real *) smalloc( sizeof(real) * M,
"Sparse_Approx_Inverse::e_j" );
for ( k = 0; k < M; ++k )
{
e_j[k] = 0.0;
}
e_j[identity_pos] = 1.0;
/* Solve the overdetermined system AX = B through the least-squares problem:
* min ||B - AX||_2 */
m = M;
n = N;
nrhs = 1;
lda = N;
ldb = nrhs;
info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
e_j, ldb );
/* Check for the full rank */
if ( info > 0 )
{
fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
fprintf( stderr, "the least squares solution could not be computed.\n" );
exit( INVALID_INPUT );
}
/* Print least squares solution */
// print_matrix( "Least squares solution", n, nrhs, b, ldb );
// accumulate the resulting vector to build A_app_inv
(*A_app_inv)->start[i] = A_spar_patt->start[i];
for ( k = A_spar_patt->start[i]; k < A_spar_patt->start[i + 1]; ++k)
{
(*A_app_inv)->j[k] = A_spar_patt->j[k];
(*A_app_inv)->val[k] = e_j[k - A_spar_patt->start[i]];
}
//empty variables that will be used next iteration
sfree( dense_matrix, "Sparse_Approx_Inverse::dense_matrix" );
sfree( e_j, "Sparse_Approx_Inverse::e_j" );
for ( k = 0; k < A->n; ++k )
{
X[k] = 0;
Y[k] = 0;
pos_x[k] = 0;
pos_y[k] = 0;
}
}
sfree( pos_y, "Sparse_Approx_Inverse::pos_y" );
sfree( pos_x, "Sparse_Approx_Inverse::pos_x" );
sfree( Y, "Sparse_Approx_Inverse::Y" );
sfree( X, "Sparse_Approx_Inverse::X" );
}
return Get_Timing_Info( start );
}
#endif
/* sparse matrix-vector product Ax = b
* where:
* A: lower triangular matrix, stored in CSR format
* x: vector
* b: vector (result) */
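/* Since A stores only the lower half of a symmetric matrix, each stored
* off-diagonal H = A(i,j) contributes twice: H * x[j] to b[i] and H * x[i]
* to b[j]. The scatter into b[j] would race across threads, so under OpenMP
* each thread accumulates into its own slice of b_local, and the slices are
* reduced into b afterwards. */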
static void Sparse_MatVec( const sparse_matrix * const A,
const real * const x, real * const b )
{
int i, j, k, n, si, ei;
real H;
#ifdef _OPENMP
unsigned int tid;
#endif
n = A->n;
Vector_MakeZero( b, n );
#ifdef _OPENMP
tid = omp_get_thread_num( );
#pragma omp single
{
/* keep b_local for program duration to avoid allocate/free
* overhead per Sparse_MatVec call*/
if ( b_local == NULL )
{
b_local = (real*) smalloc( omp_get_num_threads() * n * sizeof(real),
"Sparse_MatVec::b_local" );
}
}
Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n );
#pragma omp for schedule(static)
#endif
for ( i = 0; i < n; ++i )
{
si = A->start[i];
ei = A->start[i + 1] - 1;
for ( k = si; k < ei; ++k )
{
j = A->j[k];
H = A->val[k];
#ifdef _OPENMP
b_local[tid * n + j] += H * x[i];
b_local[tid * n + i] += H * x[j];
#else
b[j] += H * x[i];
b[i] += H * x[j];
#endif
}
// the diagonal entry is the last one stored in each row
#ifdef _OPENMP
b_local[tid * n + i] += A->val[k] * x[i];
#else
b[i] += A->val[k] * x[i];
#endif
}
#ifdef _OPENMP
#pragma omp for schedule(static)
for ( i = 0; i < n; ++i )
{
for ( j = 0; j < omp_get_num_threads(); ++j )
{
b[i] += b_local[j * n + i];
}
}
#endif
}
/* sparse matrix-vector product Ax = b
* where:
* A: matrix, stored in CSR format
* x: vector
* b: vector (result) */
static void Sparse_MatVec_full( const sparse_matrix * const A,
const real * const x, real * const b )
{
int i, pj;
Vector_MakeZero( b, A->n );
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
b[i] += A->val[pj] * x[A->j[pj]];
}
}
}
/* Transpose A and copy into A^T
*
* A: stored in CSR
* A_t: stored in CSR
*/
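/* Counting-sort style transpose, two passes over A: first count the nonzeros
* destined for each row of A^T (stored shifted up one index), convert counts
* to row pointers via a running sum, then scatter each entry A(i,j) into
* position A_t_top[j] of A^T, advancing that pointer as entries land. */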
void Transpose( const sparse_matrix * const A, sparse_matrix * const A_t )
{
unsigned int i, j, pj, *A_t_top;
A_t_top = (unsigned int*) scalloc( A->n + 1, sizeof(unsigned int),
"Transpose::A_t_top" );
memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );
/* count nonzeros in each column of A^T, store one row greater (see next loop) */
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
++A_t->start[A->j[pj] + 1];
}
}
/* setup the row pointers for A^T */
for ( i = 1; i <= A->n; ++i )
{
A_t_top[i] = A_t->start[i] = A_t->start[i] + A_t->start[i - 1];
}
/* fill in A^T */
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
j = A->j[pj];
A_t->j[A_t_top[j]] = i;
A_t->val[A_t_top[j]] = A->val[pj];
++A_t_top[j];
}
}
sfree( A_t_top, "Transpose::A_t_top" );
}
/* Transpose A in-place
*
* A: stored in CSR
*/
void Transpose_I( sparse_matrix * const A )
{
sparse_matrix * A_t;
if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for transposing matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
Transpose( A, A_t );
memcpy( A->start, A_t->start, sizeof(int) * (A_t->n + 1) );
memcpy( A->j, A_t->j, sizeof(int) * (A_t->start[A_t->n]) );
memcpy( A->val, A_t->val, sizeof(real) * (A_t->start[A_t->n]) );
Deallocate_Matrix( A_t );
}
/* Apply diagonal inverse (Jacobi) preconditioner to system residual
*
* Hdia_inv: diagonal inverse preconditioner (constructed using H)
* y: current residual
* x: preconditioned residual
* N: dimensions of preconditioner and vectors (# rows in H)
*/
static void diag_pre_app( const real * const Hdia_inv, const real * const y,
real * const x, const int N )
{
unsigned int i;
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < N; ++i )
{
x[i] = y[i] * Hdia_inv[i];
}
}
/* Solve triangular system LU*x = y via forward (lower) or backward (upper) substitution
*
* LU: lower/upper triangular, stored in CSR
* y: constants in linear system (RHS)
* x: solution
* N: dimensions of matrix and vectors
* tri: triangularity of LU (lower/upper)
*
* Assumptions:
* LU has non-zero diagonals
* Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
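/* Forward substitution (LOWER, diagonal stored last in each row):
* x[i] = ( y[i] - sum_{j < i} L(i,j) * x[j] ) / L(i,i), for i = 0..N-1;
* backward substitution (UPPER, diagonal stored first) runs i = N-1..0.
* The recurrence is inherently sequential, hence the omp single below. */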
void tri_solve( const sparse_matrix * const LU, const real * const y,
real * const x, const int N, const TRIANGULARITY tri )
{
int i, pj, j, si, ei;
real val;
#ifdef _OPENMP
#pragma omp single
#endif
{
if ( tri == LOWER )
{
for ( i = 0; i < N; ++i )
{
x[i] = y[i];
si = LU->start[i];
ei = LU->start[i + 1];
for ( pj = si; pj < ei - 1; ++pj )
{
j = LU->j[pj];
val = LU->val[pj];
x[i] -= val * x[j];
}
x[i] /= LU->val[pj];
}
}
else
{
for ( i = N - 1; i >= 0; --i )
{
x[i] = y[i];
si = LU->start[i];
ei = LU->start[i + 1];
for ( pj = si + 1; pj < ei; ++pj )
{
j = LU->j[pj];
val = LU->val[pj];
x[i] -= val * x[j];
}
x[i] /= LU->val[si];
}
}
}
}
/* Solve triangular system LU*x = y using level scheduling
*
* LU: lower/upper triangular, stored in CSR
* y: constants in linear system (RHS)
* x: solution
* N: dimensions of matrix and vectors
* tri: triangularity of LU (lower/upper)
* find_levels: if TRUE, perform level search, otherwise reuse existing levels
*
* Assumptions:
* LU has non-zero diagonals
* Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
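/* Level scheduling: level(i) = 1 + max over the levels of the rows that
* row i depends upon (its off-diagonal column indices). Rows within a level
* have no mutual dependencies, so their substitutions run in parallel,
* while the levels themselves are processed in order. */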
void tri_solve_level_sched( const sparse_matrix * const LU,
const real * const y, real * const x, const int N,
const TRIANGULARITY tri, int find_levels )
{
int i, j, pj, local_row, local_level;
#ifdef _OPENMP
#pragma omp single
#endif
{
if ( tri == LOWER )
{
row_levels = row_levels_L;
level_rows = level_rows_L;
level_rows_cnt = level_rows_cnt_L;
levels = levels_L;
}
else
{
row_levels = row_levels_U;
level_rows = level_rows_U;
level_rows_cnt = level_rows_cnt_U;
levels = levels_U;
}
if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
{
row_levels = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int),
"tri_solve_level_sched::row_levels" );
level_rows = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int),
"tri_solve_level_sched::level_rows" );
level_rows_cnt = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int),
"tri_solve_level_sched::level_rows_cnt" );
}
if ( top == NULL )
{
top = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int),
"tri_solve_level_sched::top" );
}
/* find levels (row dependencies in substitutions) */
if ( find_levels == TRUE )
{
memset( row_levels, 0, N * sizeof(unsigned int) );
memset( level_rows_cnt, 0, (N + 1) * sizeof(unsigned int) );
memset( top, 0, (N + 1) * sizeof(unsigned int) );
levels = 1;
if ( tri == LOWER )
{
for ( i = 0; i < N; ++i )
{
local_level = 1;
for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
{
local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
}
levels = MAX( levels, local_level );
row_levels[i] = local_level;
++level_rows_cnt[local_level];
}
//#if defined(DEBUG)
fprintf(stderr, "levels(L): %d\n", levels);
fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
//#endif
}
else
{
for ( i = N - 1; i >= 0; --i )
{
local_level = 1;
for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
{
local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
}
levels = MAX( levels, local_level );
row_levels[i] = local_level;
++level_rows_cnt[local_level];
}
//#if defined(DEBUG)
fprintf(stderr, "levels(U): %d\n", levels);
fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
//#endif
}
for ( i = 1; i < levels + 1; ++i )
{
level_rows_cnt[i] += level_rows_cnt[i - 1];
top[i] = level_rows_cnt[i];
}
for ( i = 0; i < N; ++i )
{
level_rows[top[row_levels[i] - 1]] = i;
++top[row_levels[i] - 1];
}
}
}
/* perform substitutions by level */
if ( tri == LOWER )
{
for ( i = 0; i < levels; ++i )
{
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
{
local_row = level_rows[j];
x[local_row] = y[local_row];
for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
{
x[local_row] -= LU->val[pj] * x[LU->j[pj]];
}
x[local_row] /= LU->val[pj];
}
}
}
else
{
for ( i = 0; i < levels; ++i )
{
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
{
local_row = level_rows[j];
x[local_row] = y[local_row];
for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
{
x[local_row] -= LU->val[pj] * x[LU->j[pj]];
}
x[local_row] /= LU->val[LU->start[local_row]];
}
}
}
#ifdef _OPENMP
#pragma omp single
#endif
{
/* save level info for re-use if performing repeated triangular solves via preconditioning */
if ( tri == LOWER )
{
row_levels_L = row_levels;
level_rows_L = level_rows;
level_rows_cnt_L = level_rows_cnt;
levels_L = levels;
}
else
{
row_levels_U = row_levels;
level_rows_U = level_rows;
level_rows_cnt_U = level_rows_cnt;
levels_U = levels;
}
}
}
static void compute_H_full( const sparse_matrix * const H )
{
int count, i, pj;
sparse_matrix *H_t;
if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE )
{
fprintf( stderr, "not enough memory for full H. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
/* compute H^T (the upper triangular half of the full matrix) */
Transpose( H, H_t );
count = 0;
for ( i = 0; i < H->n; ++i )
{
H_full->start[i] = count;
/* H: symmetric, lower triangular portion only stored */
for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
{
H_full->val[count] = H->val[pj];
H_full->j[count] = H->j[pj];
++count;
}
/* H^T: symmetric, upper triangular portion only stored;
* skip diagonal from H^T, as included from H above */
for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
{
H_full->val[count] = H_t->val[pj];
H_full->j[count] = H_t->j[pj];
++count;
}
}
H_full->start[i] = count;
Deallocate_Matrix( H_t );
}
/* Iterative greedy shared-memory parallel graph coloring
*
* A: matrix to use for coloring, stored in CSR format;
* rows represent vertices, columns of entries within a row represent adjacent vertices
* (i.e., dependent rows for elimination during LU factorization)
* tri: triangularity of LU (lower/upper)
* color: vertex color (1-based)
*
* Reference:
* Umit V. Catalyurek et al.
* Graph Coloring Algorithms for Multi-core
* and Massively Threaded Architectures
* Parallel Computing, 2012
*/
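/* Speculative coloring: all currently uncolored vertices are assigned, in
* parallel, the smallest color not used by any adjacent vertex; since
* neighbors may be colored concurrently, a verification pass collects
* conflicting vertices (same color as a neighbor, with the higher index
* losing), uncolors them, and repeats until no conflicts remain. */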
void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
{
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#define MAX_COLOR (500)
int i, pj, v;
unsigned int temp, recolor_cnt_local, *conflict_local;
int tid, num_thread, *fb_color;
#ifdef _OPENMP
tid = omp_get_thread_num();
num_thread = omp_get_num_threads();
#else
tid = 0;
num_thread = 1;
#endif
#ifdef _OPENMP
#pragma omp single
#endif
{
memset( color, 0, sizeof(unsigned int) * A->n );
recolor_cnt = A->n;
}
/* ordering of vertices to color depends on the triangularity of
* the factor for which the coloring will be used */
if ( tri == LOWER )
{
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < A->n; ++i )
{
to_color[i] = i;
}
}
else
{
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < A->n; ++i )
{
to_color[i] = A->n - 1 - i;
}
}
fb_color = (int*) smalloc( sizeof(int) * MAX_COLOR,
"graph_coloring::fb_color" );
conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n,
"graph_coloring::conflict_local" );
#ifdef _OPENMP
#pragma omp barrier
#endif
while ( recolor_cnt > 0 )
{
memset( fb_color, -1, sizeof(int) * MAX_COLOR );
/* color vertices */
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < recolor_cnt; ++i )
{
v = to_color[i];
/* colors of adjacent vertices are forbidden */
for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
{
if ( v != A->j[pj] )
{
fb_color[color[A->j[pj]]] = v;
}
}
/* search for min. color which is not in conflict with adjacent vertices;
* start at 1 since 0 is default (invalid) color for all vertices */
for ( pj = 1; fb_color[pj] == v; ++pj );
/* assign discovered color (no conflict in neighborhood of adjacent vertices) */
color[v] = pj;
}
/* determine if recoloring required */
temp = recolor_cnt;
recolor_cnt_local = 0;
#ifdef _OPENMP
#pragma omp single
#endif
{
recolor_cnt = 0;
}
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < temp; ++i )
{
v = to_color[i];
/* search for color conflicts with adjacent vertices */
for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
{
if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
{
conflict_local[recolor_cnt_local] = v;
++recolor_cnt_local;
break;
}
}
}
/* count thread-local conflicts and compute offsets for copying into shared buffer */
conflict_cnt[tid + 1] = recolor_cnt_local;
#ifdef _OPENMP
#pragma omp barrier
#pragma omp master
#endif
{
conflict_cnt[0] = 0;
for ( i = 1; i < num_thread + 1; ++i )
{
conflict_cnt[i] += conflict_cnt[i - 1];
}
recolor_cnt = conflict_cnt[num_thread];
}
#ifdef _OPENMP
#pragma omp barrier
#endif
/* copy thread-local conflicts into shared buffer */
for ( i = 0; i < recolor_cnt_local; ++i )
{
conflict[conflict_cnt[tid] + i] = conflict_local[i];
color[conflict_local[i]] = 0;
}
#ifdef _OPENMP
#pragma omp barrier
#pragma omp single
#endif
{
temp_ptr = to_color;
to_color = conflict;
conflict = temp_ptr;
}
}
sfree( conflict_local, "graph_coloring::conflict_local" );
sfree( fb_color, "graph_coloring::fb_color" );
//#if defined(DEBUG)
//#ifdef _OPENMP
// #pragma omp master
//#endif
// {
// for ( i = 0; i < A->n; ++i )
// printf("Vertex: %5d, Color: %5d\n", i, color[i] );
// }
//#endif
#ifdef _OPENMP
#pragma omp barrier
#endif
}
}
/* Sort coloring
*
* n: number of entries in coloring
* tri: triangular factor for which the coloring is used (lower/upper)
*/
void sort_colors( const unsigned int n, const TRIANGULARITY tri )
{
unsigned int i;
memset( color_top, 0, sizeof(unsigned int) * (n + 1) );
/* sort vertices by color (ascending within a color)
* 1) count colors
* 2) determine offsets of color ranges
* 3) sort by color
*
* note: color is 1-based */
for ( i = 0; i < n; ++i )
{
++color_top[color[i]];
}
for ( i = 1; i < n + 1; ++i )
{
color_top[i] += color_top[i - 1];
}
for ( i = 0; i < n; ++i )
{
permuted_row_col[color_top[color[i] - 1]] = i;
++color_top[color[i] - 1];
}
/* invert mapping to get map from current row/column to permuted (new) row/column */
for ( i = 0; i < n; ++i )
{
permuted_row_col_inv[permuted_row_col[i]] = i;
}
}
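/* Illustrative sketch (not part of the solver): the counting sort performed
 * by sort_colors() on a toy coloring. Given 1-based colors {2, 1, 2, 1} for
 * vertices 0..3, prefix sums over the color counts give the offset of each
 * color block, and a final pass scatters vertex ids into color order. All
 * names are local to the example, and TEST_SORT_COLORS is a hypothetical
 * guard macro. */
#if defined(TEST_SORT_COLORS)
static void test_sort_by_color( void )
{
const unsigned int n = 4;
const unsigned int col[4] = { 2, 1, 2, 1 }; /* 1-based colors */
unsigned int cnt[5] = { 0, 0, 0, 0, 0 };
unsigned int perm[4];
unsigned int i;
/* 1) count vertices per color */
for ( i = 0; i < n; ++i )
{
++cnt[col[i]];
}
/* 2) prefix sums: cnt[c] = first offset past the block of color c */
for ( i = 1; i < n + 1; ++i )
{
cnt[i] += cnt[i - 1];
}
/* 3) scatter: vertices of color c start at cnt[c - 1] */
for ( i = 0; i < n; ++i )
{
perm[cnt[col[i] - 1]] = i;
++cnt[col[i] - 1];
}
/* perm is now { 1, 3, 0, 2 }: color-1 vertices first, then color-2 */
for ( i = 0; i < n; ++i )
{
fprintf( stderr, "perm[%u] = %u\n", i, perm[i] );
}
}
#endif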
/* Apply permutation Q^T*x or Q*x based on graph coloring,
 * using the global permutation maps computed by sort_colors()
 *
 * x: vector to permute (in-place)
 * n: number of entries in x
 * invert_map: if TRUE, apply Q^T (the inverse permutation), otherwise apply Q
 * tri: triangularity of the factor for which the coloring was computed (lower/upper)
 */
static void permute_vector( real * const x, const unsigned int n, const int invert_map,
const TRIANGULARITY tri )
{
unsigned int i;
#ifdef _OPENMP
#pragma omp single
#endif
{
if ( x_p == NULL )
{
x_p = (real*) smalloc( sizeof(real) * n, "permute_vector::x_p" );
}
if ( invert_map == TRUE )
{
mapping = permuted_row_col_inv;
}
else
{
mapping = permuted_row_col;
}
}
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < n; ++i )
{
x_p[i] = x[mapping[i]];
}
#ifdef _OPENMP
#pragma omp single
#endif
{
memcpy( x, x_p, sizeof(real) * n );
}
}
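/* Illustrative sketch (not part of the solver): a permutation round trip on
 * a small vector, as permute_vector() performs with permuted_row_col and
 * permuted_row_col_inv. Gathering through the forward map and then through
 * the inverse map restores the original ordering. All names are local to
 * the example, and TEST_PERMUTE_VECTOR is a hypothetical guard macro. */
#if defined(TEST_PERMUTE_VECTOR)
static void test_permute_roundtrip( void )
{
const unsigned int n = 4;
const unsigned int map[4] = { 2, 0, 3, 1 }; /* forward map */
unsigned int inv[4];
real x[4] = { 10., 20., 30., 40. }, tmp[4];
unsigned int i;
/* invert the map: inv[map[i]] = i */
for ( i = 0; i < n; ++i )
{
inv[map[i]] = i;
}
/* permute: tmp[i] = x[map[i]] */
for ( i = 0; i < n; ++i )
{
tmp[i] = x[map[i]];
}
/* un-permute: x[i] = tmp[inv[i]] recovers the original x */
for ( i = 0; i < n; ++i )
{
x[i] = tmp[inv[i]];
}
for ( i = 0; i < n; ++i )
{
fprintf( stderr, "x[%u] = %f\n", i, x[i] );
}
}
#endif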
/* Apply permutation Q^T*(LU)*Q based on graph coloring,
 * using the global permutation maps computed by sort_colors()
 *
 * LU: matrix to permute, stored in CSR format; permuted in-place
 * tri: triangularity of LU (lower/upper)
 */
void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
{
int i, pj, nr, nc;
sparse_matrix *LUtemp;
if ( Allocate_Matrix( &LUtemp, LU->n, LU->m ) == FAILURE )
{
fprintf( stderr, "Not enough space for graph coloring (factor permutation). Terminating...\n" );
exit( INSUFFICIENT_MEMORY );
}
/* count nonzeros in each row of permuted factor (re-use color_top for counting) */
memset( color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
if ( tri == LOWER )
{
for ( i = 0; i < LU->n; ++i )
{
nr = permuted_row_col_inv[i];
for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
{
nc = permuted_row_col_inv[LU->j[pj]];
if ( nc <= nr )
{
++color_top[nr + 1];
}
/* correct entries to maintain triangularity (lower) */
else
{
++color_top[nc + 1];
}
}
}
}
else
{
for ( i = LU->n - 1; i >= 0; --i )
{
nr = permuted_row_col_inv[i];
for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
{
nc = permuted_row_col_inv[LU->j[pj]];
if ( nc >= nr )
{
++color_top[nr + 1];
}
/* correct entries to maintain triangularity (upper) */
else
{
++color_top[nc + 1];
}
}
}
}
for ( i = 1; i < LU->n + 1; ++i )
{
color_top[i] += color_top[i - 1];
}
memcpy( LUtemp->start, color_top, sizeof(unsigned int) * (LU->n + 1) );
/* permute factor */
if ( tri == LOWER )
{
for ( i = 0; i < LU->n; ++i )
{
nr = permuted_row_col_inv[i];
for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
{
nc = permuted_row_col_inv[LU->j[pj]];
if ( nc <= nr )
{
LUtemp->j[color_top[nr]] = nc;
LUtemp->val[color_top[nr]] = LU->val[pj];
++color_top[nr];
}
/* correct entries to maintain triangularity (lower) */
else
{
LUtemp->j[color_top[nc]] = nr;
LUtemp->val[color_top[nc]] = LU->val[pj];
++color_top[nc];
}
}
}
}
else
{
for ( i = LU->n - 1; i >= 0; --i )
{
nr = permuted_row_col_inv[i];
for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
{
nc = permuted_row_col_inv[LU->j[pj]];
if ( nc >= nr )
{
LUtemp->j[color_top[nr]] = nc;
LUtemp->val[color_top[nr]] = LU->val[pj];
++color_top[nr];
}
/* correct entries to maintain triangularity (upper) */
else
{
LUtemp->j[color_top[nc]] = nr;
LUtemp->val[color_top[nc]] = LU->val[pj];
++color_top[nc];
}
}
}
}
memcpy( LU->start, LUtemp->start, sizeof(unsigned int) * (LU->n + 1) );
memcpy( LU->j, LUtemp->j, sizeof(unsigned int) * LU->start[LU->n] );
memcpy( LU->val, LUtemp->val, sizeof(real) * LU->start[LU->n] );
Deallocate_Matrix( LUtemp );
}
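/* Illustrative sketch (not part of the solver): the index remapping at the
 * heart of permute_matrix() for a lower factor. Under a symmetric
 * permutation, entry (r, c) moves to (inv[r], inv[c]); if that position
 * lies above the diagonal, the entry is stored transposed at
 * (inv[c], inv[r]) instead, which is valid because the underlying matrix is
 * symmetric. All names are local to the example, and TEST_PERMUTE_MATRIX is
 * a hypothetical guard macro. */
#if defined(TEST_PERMUTE_MATRIX)
static void test_remap_entry( const unsigned int r, const unsigned int c,
const unsigned int * const inv, unsigned int * const nr,
unsigned int * const nc )
{
*nr = inv[r];
*nc = inv[c];
/* keep the entry in the lower triangle (row >= column) */
if ( *nc > *nr )
{
const unsigned int t = *nr;
*nr = *nc;
*nc = t;
}
}
#endif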
/* Setup routine to build the permuted QEq matrix H (via graph coloring),
 * used for preconditioning (incomplete factorizations are computed from
 * the permuted matrix)
 *
 * H: symmetric, lower triangular portion only, stored in CSR format;
 * a permuted copy of H is returned (H itself is not modified)
 */
sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
{
int num_thread;
if ( color == NULL )
{
#ifdef _OPENMP
#pragma omp parallel
{
num_thread = omp_get_num_threads();
}
#else
num_thread = 1;
#endif
/* internal storage for graph coloring (global to facilitate simultaneous access by OpenMP threads) */
color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
"setup_graph_coloring::color" );
to_color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
"setup_graph_coloring::to_color" );
conflict = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
"setup_graph_coloring::conflict" );
conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (num_thread + 1),
"setup_graph_coloring::conflict_cnt" );
recolor = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
"setup_graph_coloring::recolor" );
color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (H->n + 1),
"setup_graph_coloring::color_top" );
permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
"setup_graph_coloring::permuted_row_col" );
permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
"setup_graph_coloring::permuted_row_col_inv" );
y_p = (real*) smalloc( sizeof(real) * H->n,
"setup_graph_coloring::y_p" );
if ( (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) ||
(Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) )
{
fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
}
compute_H_full( H );
graph_coloring( H_full, LOWER );
sort_colors( H_full->n, LOWER );
memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) );
memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) );
permute_matrix( H_p, LOWER );
return H_p;
}
/* Jacobi iteration using truncated Neumann series: x_{k+1} = Gx_k + D^{-1}b
* where:
* G = I - D^{-1}R
* R = triangular matrix
* D = diagonal matrix, diagonals from R
*
* Note: used during the backsolves when applying preconditioners with
* triangular factors in iterative linear solvers
*
* Note: the Neumann series arises from the series expansion of the inverse of
* the coefficient matrix in the triangular system */
void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
const real * const b, real * const x, const TRIANGULARITY tri, const
unsigned int maxiter )
{
unsigned int i, k, si = 0, ei = 0, iter;
iter = 0;
#ifdef _OPENMP
#pragma omp single
#endif
{
if ( Dinv_b == NULL )
{
Dinv_b = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::Dinv_b" );
}
if ( rp == NULL )
{
rp = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp" );
}
if ( rp2 == NULL )
{
rp2 = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp2" );
}
}
Vector_MakeZero( rp, R->n );
/* precompute and cache D^{-1}b, as it is invariant in the loop below */
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < R->n; ++i )
{
Dinv_b[i] = Dinv[i] * b[i];
}
do
{
// x_{k+1} = G*x_{k} + Dinv*b;
#ifdef _OPENMP
#pragma omp for schedule(guided)
#endif
for ( i = 0; i < R->n; ++i )
{
if (tri == LOWER)
{
si = R->start[i];
ei = R->start[i + 1] - 1;
}
else
{
si = R->start[i] + 1;
ei = R->start[i + 1];
}
rp2[i] = 0.;
for ( k = si; k < ei; ++k )
{
rp2[i] += R->val[k] * rp[R->j[k]];
}
rp2[i] *= -Dinv[i];
rp2[i] += Dinv_b[i];
}
#ifdef _OPENMP
#pragma omp single
#endif
{
rp3 = rp;
rp = rp2;
rp2 = rp3;
}
++iter;
}
while ( iter < maxiter );
Vector_Copy( x, rp, R->n );
}
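/* Illustrative sketch (not part of the solver): two Jacobi sweeps for the
 * lower triangular system R*x = b with R = [2 0; 1 4] and b = [2; 9],
 * mirroring the update x_{k+1} = D^{-1}(b - (R - D)*x_k) used in
 * jacobi_iter() above. The iteration matrix G = I - D^{-1}R is strictly
 * lower triangular and hence nilpotent, so the iteration reaches the exact
 * solution x = [1; 2] in at most n sweeps. All names are local to the
 * example, and TEST_JACOBI_ITER is a hypothetical guard macro. */
#if defined(TEST_JACOBI_ITER)
static void test_jacobi_sweeps( void )
{
const real Dinv_ex[2] = { 1. / 2., 1. / 4. };
const real b_ex[2] = { 2., 9. };
real xk[2] = { 0., 0. }, xk1[2];
unsigned int sweep;
for ( sweep = 0; sweep < 2; ++sweep )
{
/* row 0 has no strictly-lower entries */
xk1[0] = Dinv_ex[0] * b_ex[0];
/* row 1: strictly-lower entry R(1,0) = 1 */
xk1[1] = Dinv_ex[1] * (b_ex[1] - 1. * xk[0]);
xk[0] = xk1[0];
xk[1] = xk1[1];
}
fprintf( stderr, "x = (%f, %f)\n", xk[0], xk[1] ); /* (1, 2) */
}
#endif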
/* Apply the preconditioner, i.e., compute x \approx M^{-1}y,
 * where M is the preconditioner (for factored preconditioners, M = LU)
 *
 * workspace: data struct containing matrices, lower/upper triangular, stored in CSR
 * control: data struct containing parameters
 * y: vector to which the preconditioner is applied (RHS of the preconditioning system)
 * x: solution
 * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
 *
 * Assumptions:
 * Matrices have non-zero diagonals
 * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
static void apply_preconditioner( const static_storage * const workspace, const control_params * const control,
const real * const y, real * const x, const int fresh_pre )
{
int i, si;
/* no preconditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( x, y, workspace->H->n );
}
else
{
switch ( control->cm_solver_pre_app_type )
{
case TRI_SOLVE_PA:
switch ( control->cm_solver_pre_comp_type )
{
case DIAG_PC:
diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
break;
case ICHOLT_PC:
case ILU_PAR_PC:
case ILUT_PAR_PC:
tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
tri_solve( workspace->U, x, x, workspace->U->n, UPPER );
break;
case SAI_PC:
Sparse_MatVec_full( workspace->H_app_inv, y, x );
break;
default:
fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
exit( INVALID_INPUT );
break;
}
break;
case TRI_SOLVE_LEVEL_SCHED_PA:
switch ( control->cm_solver_pre_comp_type )
{
case DIAG_PC:
diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n );
break;
case ICHOLT_PC:
case ILU_PAR_PC:
case ILUT_PAR_PC:
tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
break;
case SAI_PC:
Sparse_MatVec_full( workspace->H_app_inv, y, x );
break;
default:
fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
exit( INVALID_INPUT );
break;
}
break;
case TRI_SOLVE_GC_PA:
switch ( control->cm_solver_pre_comp_type )
{
case DIAG_PC:
case SAI_PC:
fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
exit( INVALID_INPUT );
break;
case ICHOLT_PC:
case ILU_PAR_PC:
case ILUT_PAR_PC:
#ifdef _OPENMP
#pragma omp single
#endif
{
memcpy( y_p, y, sizeof(real) * workspace->H->n );
}
permute_vector( y_p, workspace->H->n, FALSE, LOWER );
tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre );
tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre );
permute_vector( x, workspace->H->n, TRUE, UPPER );
break;
default:
fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
exit( INVALID_INPUT );
break;
}
break;
case JACOBI_ITER_PA:
switch ( control->cm_solver_pre_comp_type )
{
case DIAG_PC:
case SAI_PC:
fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
exit( INVALID_INPUT );
break;
case ICHOLT_PC:
case ILU_PAR_PC:
case ILUT_PAR_PC:
#ifdef _OPENMP
#pragma omp single
#endif
{
if ( Dinv_L == NULL )
{
Dinv_L = (real*) smalloc( sizeof(real) * workspace->L->n,
"apply_preconditioner::Dinv_L" );
}
}
/* construct D^{-1}_L */
if ( fresh_pre == TRUE )
{
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < workspace->L->n; ++i )
{
si = workspace->L->start[i + 1] - 1;
Dinv_L[i] = 1. / workspace->L->val[si];
}
}
jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
#ifdef _OPENMP
#pragma omp single
#endif
{
if ( Dinv_U == NULL )
{
Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n,
"apply_preconditioner::Dinv_U" );
}
}
/* construct D^{-1}_U */
if ( fresh_pre == TRUE )
{
#ifdef _OPENMP
#pragma omp for schedule(static)
#endif
for ( i = 0; i < workspace->U->n; ++i )
{
si = workspace->U->start[i];
Dinv_U[i] = 1. / workspace->U->val[si];
}
}
jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
break;
default:
fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
exit( INVALID_INPUT );
break;
}
break;
default:
fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
exit( INVALID_INPUT );
break;
}
}
}
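/* Illustrative sketch (not part of the solver): applying a factored
 * preconditioner M = LU to a residual r via the two triangular solves
 * L*z = r followed by U*x = z, which is the tri_solve /
 * tri_solve_level_sched pairing dispatched above. For L = [1 0; 2 1],
 * U = [2 1; 0 3], and r = [2; 7], the forward solve gives z = (2, 3)^T and
 * the backward solve gives x = (1/2, 1)^T. All names are local to the
 * example, and TEST_APPLY_PRE is a hypothetical guard macro. */
#if defined(TEST_APPLY_PRE)
static void test_lu_apply( void )
{
/* L = [1 0; 2 1] (unit lower), U = [2 1; 0 3], r = [2; 7] */
const real r_ex[2] = { 2., 7. };
real z[2], x[2];
/* forward solve L*z = r */
z[0] = r_ex[0];
z[1] = r_ex[1] - 2. * z[0];
/* backward solve U*x = z */
x[1] = z[1] / 3.;
x[0] = (z[0] - 1. * x[1]) / 2.;
fprintf( stderr, "x = (%f, %f)\n", x[0], x[1] ); /* (0.5, 1) */
}
#endif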
/* generalized minimal residual (GMRES) method with restarting for sparse linear systems */
int GMRES( const static_storage * const workspace, const control_params * const control,
simulation_data * const data, const sparse_matrix * const H, const real * const b,
const real tol, real * const x, const int fresh_pre )
{
int i, j, k, itr, N, g_j, g_itr;
real cc, tmp1, tmp2, temp, ret_temp, bnorm;
real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;
N = H->n;
g_j = 0;
g_itr = 0;
#ifdef _OPENMP
#pragma omp parallel default(none) \
private(i, j, k, itr, bnorm, ret_temp, t_start) \
shared(N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) \
reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
#endif
{
j = 0;
itr = 0;
t_ortho = 0.0;
t_pa = 0.0;
t_spmv = 0.0;
t_ts = 0.0;
t_vops = 0.0;
t_start = Get_Time( );
bnorm = Norm( b, N );
t_vops += Get_Timing_Info( t_start );
/* GMRES outer-loop */
for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
{
/* calculate r0 */
t_start = Get_Time( );
Sparse_MatVec( H, x, workspace->b_prm );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
Vector_Sum( workspace->b_prc, 1., b, -1., workspace->b_prm, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[0],
itr == 0 ? fresh_pre : FALSE );
t_pa += Get_Timing_Info( t_start );
t_start = Get_Time( );
ret_temp = Norm( workspace->v[0], N );
#ifdef _OPENMP
#pragma omp single
#endif
{
workspace->g[0] = ret_temp;
}
Vector_Scale( workspace->v[0], 1. / ret_temp, workspace->v[0], N );
t_vops += Get_Timing_Info( t_start );
/* GMRES inner-loop */
for ( j = 0; j < control->cm_solver_restart && FABS(workspace->g[j]) / bnorm > tol; j++ )
{
/* matvec */
t_start = Get_Time( );
Sparse_MatVec( H, workspace->v[j], workspace->b_prc );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[j + 1], FALSE );
t_pa += Get_Timing_Info( t_start );
// if ( control->cm_solver_pre_comp_type == DIAG_PC )
// {
/* apply modified Gram-Schmidt to orthogonalize the new residual */
t_start = Get_Time( );
for ( i = 0; i <= j; i++ )
{
ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N );
#ifdef _OPENMP
#pragma omp single
#endif
{
workspace->h[i][j] = ret_temp;
}
Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
}
t_vops += Get_Timing_Info( t_start );
// }
// else
// {
// //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
// /* apply modified Gram-Schmidt to orthogonalize the new residual */
//
//
//
// {
// t_start = Get_Time( );
// }
//
// #pragma omp single
//
// {
// for ( i = 0; i < j - 1; i++ )
// {
// workspace->h[i][j] = 0.0;
// }
// }
//
// for ( i = MAX(j - 1, 0); i <= j; i++ )
// {
// ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N );
//
// #pragma omp single
//
// {
// workspace->h[i][j] = ret_temp;
// }
//
// Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
// }
//
//
//
// {
// t_vops += Get_Timing_Info( t_start );
// }
// }
t_start = Get_Time( );
ret_temp = Norm( workspace->v[j + 1], N );
#ifdef _OPENMP
#pragma omp single
#endif
{
workspace->h[j + 1][j] = ret_temp;
}
Vector_Scale( workspace->v[j + 1],
1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
// if ( control->cm_solver_pre_comp_type == NONE_PC ||
// control->cm_solver_pre_comp_type == DIAG_PC )
// {
/* Givens rotations on the upper-Hessenberg matrix to make it U */
#ifdef _OPENMP
#pragma omp single
#endif
{
for ( i = 0; i <= j; i++ )
{
if ( i == j )
{
cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
workspace->hc[j] = workspace->h[j][j] / cc;
workspace->hs[j] = workspace->h[j + 1][j] / cc;
}
tmp1 = workspace->hc[i] * workspace->h[i][j] +
workspace->hs[i] * workspace->h[i + 1][j];
tmp2 = -workspace->hs[i] * workspace->h[i][j] +
workspace->hc[i] * workspace->h[i + 1][j];
workspace->h[i][j] = tmp1;
workspace->h[i + 1][j] = tmp2;
}
// }
// else
// {
// //TODO: investigate correctness of not explicitly orthogonalizing first few vectors
// /* Givens rotations on the upper-Hessenberg matrix to make it U */
// for ( i = MAX(j - 1, 0); i <= j; i++ )
// {
// if ( i == j )
// {
// cc = SQRT( SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]) );
// workspace->hc[j] = workspace->h[j][j] / cc;
// workspace->hs[j] = workspace->h[j + 1][j] / cc;
// }
//
// tmp1 = workspace->hc[i] * workspace->h[i][j] +
// workspace->hs[i] * workspace->h[i + 1][j];
// tmp2 = -workspace->hs[i] * workspace->h[i][j] +
// workspace->hc[i] * workspace->h[i + 1][j];
//
// workspace->h[i][j] = tmp1;
// workspace->h[i + 1][j] = tmp2;
// }
// }
/* apply Givens rotations to the rhs as well */
tmp1 = workspace->hc[j] * workspace->g[j];
tmp2 = -workspace->hs[j] * workspace->g[j];
workspace->g[j] = tmp1;
workspace->g[j + 1] = tmp2;
}
t_ortho += Get_Timing_Info( t_start );
#ifdef _OPENMP
#pragma omp barrier
#endif
}
/* solve Hy = g: H is now upper-triangular, do back-substitution */
t_start = Get_Time( );
#ifdef _OPENMP
#pragma omp single
#endif
{
for ( i = j - 1; i >= 0; i-- )
{
temp = workspace->g[i];
for ( k = j - 1; k > i; k-- )
{
temp -= workspace->h[i][k] * workspace->y[k];
}
workspace->y[i] = temp / workspace->h[i][i];
}
}
t_ts += Get_Timing_Info( t_start );
/* update x = x_0 + Vy */
t_start = Get_Time( );
Vector_MakeZero( workspace->p, N );
for ( i = 0; i < j; i++ )
{
Vector_Add( workspace->p, workspace->y[i], workspace->v[i], N );
}
Vector_Add( x, 1., workspace->p, N );
t_vops += Get_Timing_Info( t_start );
/* stopping condition */
if ( FABS(workspace->g[j]) / bnorm <= tol )
{
break;
}
}
#ifdef _OPENMP
#pragma omp single
#endif
{
g_j = j;
g_itr = itr;
}
}
#ifdef _OPENMP
data->timing.cm_solver_orthog += t_ortho / control->num_threads;
data->timing.cm_solver_pre_app += t_pa / control->num_threads;
data->timing.cm_solver_spmv += t_spmv / control->num_threads;
data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
#else
data->timing.cm_solver_orthog += t_ortho;
data->timing.cm_solver_pre_app += t_pa;
data->timing.cm_solver_spmv += t_spmv;
data->timing.cm_solver_tri_solve += t_ts;
data->timing.cm_solver_vector_ops += t_vops;
#endif
if ( g_itr >= control->cm_solver_max_iters )
{
fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
}
return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
}
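/* Illustrative sketch (not part of the solver): a single Givens rotation of
 * the kind GMRES uses above to zero the subdiagonal entry h[j+1][j] of the
 * upper-Hessenberg matrix. For a column (a, b)^T, choosing
 * c = a / sqrt(a^2 + b^2) and s = b / sqrt(a^2 + b^2) gives
 * c*a + s*b = sqrt(a^2 + b^2) and -s*a + c*b = 0. All names are local to
 * the example, and TEST_GIVENS is a hypothetical guard macro. */
#if defined(TEST_GIVENS)
static void test_givens_rotation( void )
{
const real a = 3., b = 4.;
const real r = SQRT( SQR(a) + SQR(b) ); /* r = 5 */
const real c = a / r, s = b / r;
const real top = c * a + s * b; /* 5: new diagonal entry */
const real bottom = -s * a + c * b; /* 0: subdiagonal annihilated */
fprintf( stderr, "rotated column: (%f, %f)\n", top, bottom );
}
#endif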
int GMRES_HouseHolder( const static_storage * const workspace,
const control_params * const control, simulation_data * const data,
const sparse_matrix * const H, const real * const b, real tol,
real * const x, const int fresh_pre )
{
int i, j, k, itr, N, g_j, g_itr;
real cc, tmp1, tmp2, temp, bnorm, ret_temp;
/* note: stack-allocated workspace; assumes the system dimension N <= 10000 */
real v[10000], z[control->cm_solver_restart + 2][10000], w[control->cm_solver_restart + 2];
real u[control->cm_solver_restart + 2][10000];
real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;
j = 0;
N = H->n;
#ifdef _OPENMP
#pragma omp parallel default(none) \
private(i, j, k, itr, bnorm, ret_temp, t_start) \
shared(v, z, w, u, tol, N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) \
reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
#endif
{
t_start = Get_Time( );
bnorm = Norm( b, N );
t_vops += Get_Timing_Info( t_start );
// memset( x, 0, sizeof(real) * N );
/* GMRES outer-loop */
for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
{
/* compute z = r0 */
t_start = Get_Time( );
Sparse_MatVec( H, x, workspace->b_prm );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
Vector_Sum( workspace->b_prc, 1., b, -1., workspace->b_prm, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, workspace->b_prc, z[0], fresh_pre );
t_pa += Get_Timing_Info( t_start );
t_start = Get_Time( );
Vector_MakeZero( w, control->cm_solver_restart + 1 );
w[0] = Norm( z[0], N );
Vector_Copy( u[0], z[0], N );
u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
w[0] *= ( u[0][0] < 0.0 ? 1 : -1 );
t_vops += Get_Timing_Info( t_start );
/* GMRES inner-loop */
for ( j = 0; j < control->cm_solver_restart && FABS( w[j] ) / bnorm > tol; j++ )
{
/* compute v_j */
t_start = Get_Time( );
Vector_Scale( z[j], -2 * u[j][j], u[j], N );
z[j][j] += 1.; /* due to e_j */
for ( i = j - 1; i >= 0; --i )
{
Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
}
t_vops += Get_Timing_Info( t_start );
/* matvec */
t_start = Get_Time( );
Sparse_MatVec( H, z[j], workspace->b_prc );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, workspace->b_prc, v, fresh_pre );
t_pa += Get_Timing_Info( t_start );
t_start = Get_Time( );
for ( i = 0; i <= j; ++i )
{
Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
}
if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
{
/* compute the HouseHolder unit vector u_j+1 */
Vector_MakeZero( u[j + 1], j + 1 );
Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
ret_temp = Norm( v + (j + 1), N - (j + 1) );
#ifdef _OPENMP
#pragma omp single
#endif
u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * ret_temp;
#ifdef _OPENMP
#pragma omp barrier
#endif
Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
/* overwrite v with P_m+1 * v */
ret_temp = 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
#ifdef _OPENMP
#pragma omp single
#endif
v[j + 1] -= ret_temp;
#ifdef _OPENMP
#pragma omp barrier
#endif
Vector_MakeZero( v + (j + 2), N - (j + 2) );
// Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
}
t_vops += Get_Timing_Info( t_start );
/* prev Givens rots on the upper-Hessenberg matrix to make it U */
t_start = Get_Time( );
#ifdef _OPENMP
#pragma omp single
#endif
{
for ( i = 0; i < j; i++ )
{
tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
v[i] = tmp1;
v[i + 1] = tmp2;
}
/* apply the new Givens rotation to H and right-hand side */
if ( FABS(v[j + 1]) >= ALMOST_ZERO )
{
cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
workspace->hc[j] = v[j] / cc;
workspace->hs[j] = v[j + 1] / cc;
tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
v[j] = tmp1;
v[j + 1] = tmp2;
/* Givens rotations to rhs */
tmp1 = workspace->hc[j] * w[j];
tmp2 = -workspace->hs[j] * w[j];
w[j] = tmp1;
w[j + 1] = tmp2;
}
/* extend R */
for ( i = 0; i <= j; ++i )
{
workspace->h[i][j] = v[i];
}
}
t_ortho += Get_Timing_Info( t_start );
}
/* solve Hy = w.
H is now upper-triangular, do back-substitution */
t_start = Get_Time( );
#ifdef _OPENMP
#pragma omp single
#endif
{
for ( i = j - 1; i >= 0; i-- )
{
temp = w[i];
for ( k = j - 1; k > i; k-- )
{
temp -= workspace->h[i][k] * workspace->y[k];
}
workspace->y[i] = temp / workspace->h[i][i];
}
}
t_ts += Get_Timing_Info( t_start );
t_start = Get_Time( );
for ( i = j - 1; i >= 0; i-- )
{
Vector_Add( x, workspace->y[i], z[i], N );
}
t_vops += Get_Timing_Info( t_start );
/* stopping condition */
if ( FABS( w[j] ) / bnorm <= tol )
{
break;
}
}
#ifdef _OPENMP
#pragma omp single
#endif
{
g_j = j;
g_itr = itr;
}
}
#ifdef _OPENMP
data->timing.cm_solver_orthog += t_ortho / control->num_threads;
data->timing.cm_solver_pre_app += t_pa / control->num_threads;
data->timing.cm_solver_spmv += t_spmv / control->num_threads;
data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
#else
data->timing.cm_solver_orthog += t_ortho;
data->timing.cm_solver_pre_app += t_pa;
data->timing.cm_solver_spmv += t_spmv;
data->timing.cm_solver_tri_solve += t_ts;
data->timing.cm_solver_vector_ops += t_vops;
#endif
if ( g_itr >= control->cm_solver_max_iters )
{
fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
}
return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
}
/* Conjugate Gradient */
int CG( const static_storage * const workspace, const control_params * const control,
simulation_data * const data, const sparse_matrix * const H, const real * const b,
const real tol, real * const x, const int fresh_pre )
{
int i, g_itr, N;
real tmp, alpha, beta, b_norm, r_norm;
real *d, *r, *p, *z;
real sig_old, sig_new;
real t_start, t_pa, t_spmv, t_vops;
N = H->n;
d = workspace->d;
r = workspace->r;
/* note: CG's search direction p and preconditioned residual z reuse
 * the workspace->q and workspace->p buffers, respectively */
p = workspace->q;
z = workspace->p;
#ifdef _OPENMP
#pragma omp parallel default(none) \
private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new, t_start) \
reduction(+: t_pa, t_spmv, t_vops) \
shared(g_itr, N, d, r, p, z)
#endif
{
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_start = Get_Time( );
b_norm = Norm( b, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
Sparse_MatVec( H, x, d );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
Vector_Sum( r, 1.0, b, -1.0, d, N );
r_norm = Norm( r, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, r, z, fresh_pre );
t_pa += Get_Timing_Info( t_start );
t_start = Get_Time( );
Vector_Copy( p, z, N );
sig_new = Dot( r, z, N );
t_vops += Get_Timing_Info( t_start );
for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
{
t_start = Get_Time( );
Sparse_MatVec( H, p, d );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
tmp = Dot( d, p, N );
alpha = sig_new / tmp;
Vector_Add( x, alpha, p, N );
Vector_Add( r, -alpha, d, N );
r_norm = Norm( r, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, r, z, FALSE );
t_pa += Get_Timing_Info( t_start );
t_start = Get_Time( );
sig_old = sig_new;
sig_new = Dot( r, z, N );
beta = sig_new / sig_old;
Vector_Sum( p, 1., z, beta, p, N );
t_vops += Get_Timing_Info( t_start );
}
#ifdef _OPENMP
#pragma omp single
#endif
g_itr = i;
}
#ifdef _OPENMP
data->timing.cm_solver_pre_app += t_pa / control->num_threads;
data->timing.cm_solver_spmv += t_spmv / control->num_threads;
data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
#else
data->timing.cm_solver_pre_app += t_pa;
data->timing.cm_solver_spmv += t_spmv;
data->timing.cm_solver_vector_ops += t_vops;
#endif
if ( g_itr >= control->cm_solver_max_iters )
{
fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", g_itr );
return g_itr;
}
return g_itr;
}
/* Steepest Descent */
int SDM( const static_storage * const workspace, const control_params * const control,
simulation_data * const data, const sparse_matrix * const H, const real * const b,
const real tol, real * const x, const int fresh_pre )
{
int i, g_itr, N;
real tmp, alpha, b_norm;
real sig;
real t_start, t_pa, t_spmv, t_vops;
N = H->n;
#ifdef _OPENMP
#pragma omp parallel default(none) \
private(i, tmp, alpha, b_norm, sig, t_start) \
reduction(+: t_pa, t_spmv, t_vops) \
shared(g_itr, N)
#endif
{
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_start = Get_Time( );
b_norm = Norm( b, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
Sparse_MatVec( H, x, workspace->q );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->q, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre );
t_pa += Get_Timing_Info( t_start );
t_start = Get_Time( );
sig = Dot( workspace->r, workspace->d, N );
t_vops += Get_Timing_Info( t_start );
for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i )
{
t_start = Get_Time( );
Sparse_MatVec( H, workspace->d, workspace->q );
t_spmv += Get_Timing_Info( t_start );
t_start = Get_Time( );
sig = Dot( workspace->r, workspace->d, N );
/* ensure each thread gets a local copy of
* the function return value before proceeding
* (Dot function has persistent state in the form
* of a shared global variable for the OpenMP version) */
#ifdef _OPENMP
#pragma omp barrier
#endif
tmp = Dot( workspace->d, workspace->q, N );
alpha = sig / tmp;
Vector_Add( x, alpha, workspace->d, N );
Vector_Add( workspace->r, -alpha, workspace->q, N );
t_vops += Get_Timing_Info( t_start );
t_start = Get_Time( );
apply_preconditioner( workspace, control, workspace->r, workspace->d, FALSE );
t_pa += Get_Timing_Info( t_start );
}
#ifdef _OPENMP
#pragma omp single
#endif
g_itr = i;
}
#ifdef _OPENMP
data->timing.cm_solver_pre_app += t_pa / control->num_threads;
data->timing.cm_solver_spmv += t_spmv / control->num_threads;
data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
#else
data->timing.cm_solver_pre_app += t_pa;
data->timing.cm_solver_spmv += t_spmv;
data->timing.cm_solver_vector_ops += t_vops;
#endif
if ( g_itr >= control->cm_solver_max_iters )
{
fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", g_itr );
return g_itr;
}
return g_itr;
}
/* Estimate the stability of a two-sided preconditioning scheme
 * based on the factorization A \approx LU. Specifically, estimate
 * \|A^{-1}\| via the max-norm of (LU)^{-1}e, with e = [1 1 ... 1]^T,
 * computed through two triangular solves:
 * 1) Ly = e
 * 2) Ux = y
 * That is, solve LUx = e for the unknown x (where y = Ux) and take the
 * largest absolute entry of x
 *
 * Reference: Incomplete LU Preconditioning with the Multilevel Fast Multipole Algorithm
 * for Electromagnetic Scattering, SIAM J. Sci. Computing, 2007 */
real condest( const sparse_matrix * const L, const sparse_matrix * const U )
{
unsigned int i, N;
real *e, c;
N = L->n;
e = (real*) smalloc( sizeof(real) * N, "condest::e" );
/* memset cannot initialize floating-point values to 1.0; fill explicitly */
for ( i = 0; i < N; ++i )
{
e[i] = 1.;
}
tri_solve( L, e, e, L->n, LOWER );
tri_solve( U, e, e, U->n, UPPER );
/* compute the max-norm (largest absolute entry) of e */
c = FABS(e[0]);
for ( i = 1; i < N; ++i)
{
if ( FABS(e[i]) > c )
{
c = FABS(e[i]);
}
}
sfree( e, "condest::e" );
return c;
}
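/* Illustrative sketch (not part of the solver): typical use of condest()
 * after computing an incomplete factorization; a large estimate signals an
 * unstable pair of triangular factors. The threshold below is a
 * hypothetical value chosen for the example, not a tuned constant from this
 * code base, and TEST_CONDEST is a hypothetical guard macro. */
#if defined(TEST_CONDEST)
static void check_factors( const sparse_matrix * const L,
const sparse_matrix * const U )
{
const real est = condest( L, U );
if ( est > 1.0e10 )
{
fprintf( stderr, "[WARNING] ill-conditioned factors (condest = %e)\n", est );
}
}
#endif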