Newer
Older
/*----------------------------------------------------------------------
SerialReax - Reax Force Field Simulator
Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
Kurt A. O'Hearn
committed
#include "charges.h"
#include "lin_alg.h"
Kurt A. O'Hearn
committed
#include "tool_box.h"
#include "vector.h"
#include "slu_mt_ddefs.h"
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
typedef struct
{
unsigned int j;
real val;
} sparse_matrix_entry;
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#if defined(TEST_MAT)
static sparse_matrix * create_test_mat( void )
{
unsigned int i, n;
sparse_matrix *H_test;
if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
{
fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
//3x3, SPD, store lower half
i = 0;
n = 0;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = 4.;
++i;
++n;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = 12.;
++i;
H_test->j[i] = 1;
H_test->val[i] = 37.;
++i;
++n;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = -16.;
++i;
H_test->j[i] = 1;
H_test->val[i] = -43.;
++i;
H_test->j[i] = 2;
H_test->val[i] = 98.;
++i;
++n;
H_test->start[n] = i;
return H_test;
}
#endif
Kurt A. O'Hearn
committed
/* Routine used with qsort for sorting nonzeros within a sparse matrix row
*
* v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
*/
Kurt A. O'Hearn
committed
static int compare_matrix_entry(const void *v1, const void *v2)
Kurt A. O'Hearn
committed
return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
Kurt A. O'Hearn
committed
/* Routine used for sorting nonzeros within a sparse matrix row;
* internally, a combination of qsort and manual sorting is utilized
* (parallel calls to qsort when multithreading, rows mapped to threads)
*
* A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
*/
Kurt A. O'Hearn
committed
static void Sort_Matrix_Rows( sparse_matrix * const A )
Kurt A. O'Hearn
committed
unsigned int i, j, si, ei;
sparse_matrix_entry *temp;
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
// #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL )
Kurt A. O'Hearn
committed
{
fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
/* sort each row of A using column indices */
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
// #pragma omp for schedule(guided)
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
ei = A->start[i + 1];
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( j = 0; j < (ei - si); ++j )
{
(temp + j)->j = A->j[si + j];
(temp + j)->val = A->val[si + j];
}
Kurt A. O'Hearn
committed
/* polymorphic sort in standard C library using column indices */
Kurt A. O'Hearn
committed
qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( j = 0; j < (ei - si); ++j )
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
A->j[si + j] = (temp + j)->j;
A->val[si + j] = (temp + j)->val;
Kurt A. O'Hearn
committed
}
}
sfree( temp, "Sort_Matrix_Rows::temp" );
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
static void Calculate_Droptol( const sparse_matrix * const A,
real * const droptol, const real dtol )
Kurt A. O'Hearn
committed
#ifdef _OPENMP
static real *droptol_local;
unsigned int tid;
#endif
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
#ifdef _OPENMP
tid = omp_get_thread_num();
#pragma omp master
{
/* keep b_local for program duration to avoid allocate/free
* overhead per Sparse_MatVec call*/
if ( droptol_local == NULL )
{
if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
}
}
Kurt A. O'Hearn
committed
#pragma omp barrier
#endif
/* init droptol to 0 */
for ( i = 0; i < A->n; ++i )
{
#ifdef _OPENMP
droptol_local[tid * A->n + i] = 0.0;
#else
droptol[i] = 0.0;
#endif
}
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp barrier
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
/* calculate sqaure of the norm of each row */
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp for schedule(static)
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
{
j = A->j[k];
val = A->val[k];
Kurt A. O'Hearn
committed
#ifdef _OPENMP
droptol_local[tid * A->n + i] += val * val;
droptol_local[tid * A->n + j] += val * val;
#else
droptol[i] += val * val;
droptol[j] += val * val;
#endif
}
Kurt A. O'Hearn
committed
// diagonal entry
val = A->val[k];
Kurt A. O'Hearn
committed
#ifdef _OPENMP
droptol_local[tid * A->n + i] += val * val;
#else
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp barrier
Kurt A. O'Hearn
committed
#pragma omp for schedule(static)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
droptol[i] = 0.0;
for ( k = 0; k < omp_get_num_threads(); ++k )
{
droptol[i] += droptol_local[k * A->n + i];
}
}
#pragma omp barrier
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
/* calculate local droptol for each row */
//fprintf( stderr, "droptol: " );
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp for schedule(static)
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
//fprintf( stderr, "%f-->", droptol[i] );
droptol[i] = SQRT( droptol[i] ) * dtol;
//fprintf( stderr, "%f ", droptol[i] );
}
//fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
Kurt A. O'Hearn
committed
int i, pj;
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) private(i, pj, val) reduction(+: fillin)
#endif
Kurt A. O'Hearn
committed
{
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
Kurt A. O'Hearn
committed
val = A->val[pj];
Kurt A. O'Hearn
committed
if ( FABS(val) > droptol[i] )
{
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
}
#if defined(HAVE_SUPERLU_MT)
static real SuperLU_Factorize( const sparse_matrix * const A,
sparse_matrix * const L, sparse_matrix * const U )
{
unsigned int i, pj, count, *Ltop, *Utop, r;
Kurt A. O'Hearn
committed
sparse_matrix *A_t;
Kurt A. O'Hearn
committed
NCformat *A_S_store;
SCPformat *L_S_store;
NCPformat *U_S_store;
superlumt_options_t superlumt_options;
Kurt A. O'Hearn
committed
pxgstrf_shared_t pxgstrf_shared;
pdgstrf_threadarg_t *pdgstrf_threadarg;
int_t nprocs;
fact_t fact;
trans_t trans;
yes_no_t refact, usepr;
real u, drop_tol;
Kurt A. O'Hearn
committed
real *a, *at;
int_t *asub, *atsub, *xa, *xat;
int_t *perm_c; /* column permutation vector */
int_t *perm_r; /* row permutations from partial pivoting */
void *work;
Kurt A. O'Hearn
committed
int_t info, lwork;
int_t permc_spec, panel_size, relax;
Gstat_t Gstat;
flops_t flopcnt;
/* Default parameters to control factorization. */
#ifdef _OPENMP
//TODO: set as global parameter and use
#pragma omp parallel \
Kurt A. O'Hearn
committed
default(none) shared(nprocs)
{
#pragma omp master
{
/* SuperLU_MT spawns threads internally, so set and pass parameter */
nprocs = omp_get_num_threads();
}
}
#else
nprocs = 1;
#endif
Kurt A. O'Hearn
committed
// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
fact = DOFACT; /* factor from scratch */
trans = NOTRANS;
refact = NO; /* first time factorization */
//TODO: add to control file and use the value there to set these
panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
Kurt A. O'Hearn
committed
u = 1.0; /* diagonal pivoting threshold */
usepr = NO;
drop_tol = 0.0;
work = NULL;
lwork = 0;
Kurt A. O'Hearn
committed
//#if defined(DEBUG)
fprintf( stderr, "nprocs = %d\n", nprocs );
fprintf( stderr, "Panel size = %d\n", panel_size );
fprintf( stderr, "Relax = %d\n", relax );
//#endif
if ( !(perm_r = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for perm_r[].");
}
Kurt A. O'Hearn
committed
if ( !(perm_c = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for perm_c[].");
}
if ( !(superlumt_options.etree = intMalloc(A->n)) )
{
Kurt A. O'Hearn
committed
SUPERLU_ABORT("Malloc fails for etree[].");
}
if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
{
Kurt A. O'Hearn
committed
SUPERLU_ABORT("Malloc fails for colcnt_h[].");
}
if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
{
Kurt A. O'Hearn
committed
SUPERLU_ABORT("Malloc fails for part_super__h[].");
Kurt A. O'Hearn
committed
if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
Kurt A. O'Hearn
committed
|| ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
|| ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
|| ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
|| ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
Kurt A. O'Hearn
committed
fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
exit( INSUFFICIENT_MEMORY );
Kurt A. O'Hearn
committed
}
if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
Kurt A. O'Hearn
committed
{
fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
/* Set up the sparse matrix data structure for A. */
Kurt A. O'Hearn
committed
Transpose( A, A_t );
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
xa[i] = count;
Kurt A. O'Hearn
committed
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
a[count] = A->entries[pj].val;
asub[count] = A->entries[pj].j;
++count;
}
Kurt A. O'Hearn
committed
for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
Kurt A. O'Hearn
committed
{
a[count] = A_t->entries[pj].val;
asub[count] = A_t->entries[pj].j;
++count;
}
Kurt A. O'Hearn
committed
xa[i] = count;
dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
Kurt A. O'Hearn
committed
&at, &atsub, &xat );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", asub[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", a[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i <= A->n; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", xa[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", atsub[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", at[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i <= A->n; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", xat[i] );
fprintf( stderr, "\n" );
A_S.Stype = SLU_NC; /* column-wise, no supernode */
A_S.Dtype = SLU_D; /* double-precision */
A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
A_S.nrow = A->n;
A_S.ncol = A->n;
A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
A_S_store = (NCformat *) A_S.Store;
A_S_store->nnz = 2 * A->start[A->n] - A->n;
A_S_store->nzval = at;
A_S_store->rowind = atsub;
A_S_store->colptr = xat;
/* ------------------------------------------------------------
Kurt A. O'Hearn
committed
Allocate storage and initialize statistics variables.
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
StatInit( A->n, nprocs, &Gstat );
/* ------------------------------------------------------------
Get column permutation vector perm_c[], according to permc_spec:
Kurt A. O'Hearn
committed
permc_spec = 0: natural ordering
permc_spec = 1: minimum degree ordering on structure of A'*A
permc_spec = 2: minimum degree ordering on structure of A'+A
permc_spec = 3: approximate minimum degree for unsymmetric matrices
Kurt A. O'Hearn
committed
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
get_perm_c( permc_spec, &A_S, perm_c );
/* ------------------------------------------------------------
Initialize the option structure superlumt_options using the
user-input parameters;
Apply perm_c to the columns of original A to form AC.
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
Kurt A. O'Hearn
committed
u, usepr, drop_tol, perm_c, perm_r,
work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
fprintf( stderr, "\n" );
/* ------------------------------------------------------------
Compute the LU factorization of A.
The following routine will create nprocs threads.
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
fprintf( stderr, "INFO: %d\n", info );
Kurt A. O'Hearn
committed
flopcnt = 0;
for (i = 0; i < nprocs; ++i)
{
flopcnt += Gstat.procstat[i].fcops;
}
Gstat.ops[FACT] = flopcnt;
Kurt A. O'Hearn
committed
//#if defined(DEBUG)
printf("\n** Result of sparse LU **\n");
L_S_store = (SCPformat *) L_S.Store;
U_S_store = (NCPformat *) U_S.Store;
Kurt A. O'Hearn
committed
printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
fflush( stdout );
//#endif
Kurt A. O'Hearn
committed
/* convert L and R from SuperLU formats to CSR */
memset( Ltop, 0, (A->n + 1) * sizeof(int) );
memset( Utop, 0, (A->n + 1) * sizeof(int) );
Kurt A. O'Hearn
committed
memset( L->start, 0, (A->n + 1) * sizeof(int) );
memset( U->start, 0, (A->n + 1) * sizeof(int) );
Kurt A. O'Hearn
committed
for ( i = 0; i < 2 * L_S_store->nnz; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < 2 * U_S_store->nnz; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
fprintf( stderr, "\n" );
printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
//TODO: correct for SCPformat for L?
Kurt A. O'Hearn
committed
//for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
// {
// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
// }
fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
Kurt A. O'Hearn
committed
for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
{
++Utop[U_S_store->rowind[pj] + 1];
Kurt A. O'Hearn
committed
fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
Kurt A. O'Hearn
committed
for ( i = 1; i <= A->n; ++i )
Kurt A. O'Hearn
committed
// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] );
// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
// {
// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
// L->entries[Ltop[r]].j = r;
// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
// ++Ltop[r];
// }
Kurt A. O'Hearn
committed
for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
{
r = U_S_store->rowind[pj];
U->entries[Utop[r]].j = i;
Kurt A. O'Hearn
committed
U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
Kurt A. O'Hearn
committed
/* ------------------------------------------------------------
Deallocate storage after factorization.
------------------------------------------------------------*/
pxgstrf_finalize( &superlumt_options, &AC_S );
Deallocate_Matrix( A_t );
sfree( xa, "SuperLU_Factorize::xa" );
sfree( asub, "SuperLU_Factorize::asub" );
sfree( a, "SuperLU_Factorize::a" );
Kurt A. O'Hearn
committed
SUPERLU_FREE( perm_r );
SUPERLU_FREE( perm_c );
SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
SUPERLU_FREE( A_S.Store );
if ( lwork == 0 )
{
Destroy_SuperNode_SCP(&L_S);
Destroy_CompCol_NCP(&U_S);
}
else if ( lwork > 0 )
{
SUPERLU_FREE(work);
}
StatFree(&Gstat);
sfree( Utop, "SuperLU_Factorize::Utop" );
sfree( Ltop, "SuperLU_Factorize::Ltop" );
Kurt A. O'Hearn
committed
//TODO: return iters
return 0.;
}
#endif
Kurt A. O'Hearn
committed
/* Diagonal (Jacobi) preconditioner computation */
Kurt A. O'Hearn
committed
static real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv )
Kurt A. O'Hearn
committed
{
unsigned int i;
Kurt A. O'Hearn
committed
real start;
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
start = Get_Time( );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) private(i)
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
for ( i = 0; i < H->n; ++i )
Kurt A. O'Hearn
committed
{
if ( H->val[H->start[i + 1] - 1] != 0.0 )
{
Hdia_inv[i] = 1.0 / H->val[H->start[i + 1] - 1];
}
else
{
Hdia_inv[i] = 1.0;
}
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
return Get_Timing_Info( start );
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
/* Incomplete Cholesky factorization with dual thresholding */
Kurt A. O'Hearn
committed
static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
Kurt A. O'Hearn
committed
sparse_matrix * const L, sparse_matrix * const U )
Kurt A. O'Hearn
committed
int *tmp_j;
real *tmp_val;
Kurt A. O'Hearn
committed
real val, start;
Kurt A. O'Hearn
committed
unsigned int *Utop;
Kurt A. O'Hearn
committed
start = Get_Time( );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
Kurt A. O'Hearn
committed
( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
memset( Utop, 0, A->n * sizeof(unsigned int) );
for ( i = 0; i < A->n; ++i )
{
L->start[i] = Ltop;
tmptop = 0;
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
Kurt A. O'Hearn
committed
j = A->j[pj];
val = A->val[pj];
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
if ( FABS(val) > droptol[i] )
{
k1 = 0;
k2 = L->start[j];
while ( k1 < tmptop && k2 < L->start[j + 1] )
{
Kurt A. O'Hearn
committed
if ( tmp_j[k1] < L->j[k2] )
{
Kurt A. O'Hearn
committed
}
else if ( tmp_j[k1] > L->j[k2] )
{
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
{
val -= (tmp_val[k1++] * L->val[k2++]);
}
}
// L matrix is lower triangular,
// so right before the start of next row comes jth diagonal
Kurt A. O'Hearn
committed
val /= L->val[L->start[j + 1] - 1];
Kurt A. O'Hearn
committed
tmp_j[tmptop] = j;
tmp_val[tmptop] = val;
Kurt A. O'Hearn
committed
if ( A->j[pj] != i )
Kurt A. O'Hearn
committed
fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
Kurt A. O'Hearn
committed
exit( NUMERIC_BREAKDOWN );
// compute the ith diagonal in L
Kurt A. O'Hearn
committed
val = A->val[pj];
Kurt A. O'Hearn
committed
{
val -= (tmp_val[k1] * tmp_val[k1]);
}
Kurt A. O'Hearn
committed
#if defined(DEBUG)
if ( val < 0.0 )
{
fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
exit( NUMERIC_BREAKDOWN );
}
#endif
Kurt A. O'Hearn
committed
tmp_j[tmptop] = i;
Kurt A. O'Hearn
committed
tmp_val[tmptop] = SQRT( val );
// apply the dropping rule once again
//fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
//for( k1 = 0; k1<= tmptop; ++k1 )
// fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val );
//fprintf( stderr, "\n" );
//fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
for ( k1 = 0; k1 < tmptop; ++k1 )
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
Kurt A. O'Hearn
committed
L->j[Ltop] = tmp_j[k1];
L->val[Ltop] = tmp_val[k1];
U->start[tmp_j[k1] + 1]++;
++Ltop;
//fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val );
}
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
L->j[Ltop] = tmp_j[k1];
L->val[Ltop] = tmp_val[k1];
++Ltop;
//fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val );
// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
Kurt A. O'Hearn
committed
Transpose( L, U );
// for ( i = 1; i <= U->n; ++i )
// {
// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
// }
// for ( i = 0; i < L->n; ++i )
// {
// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
// {
// j = L->j[pj];
// U->j[Utop[j]] = i;
// U->val[Utop[j]] = L->val[pj];
// Utop[j]++;
// }
// }
// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
sfree( tmp_val, "ICHOLT::tmp_val" );
sfree( tmp_j, "ICHOLT::tmp_j" );
sfree( Utop, "ICHOLT::Utop" );
Kurt A. O'Hearn
committed
return Get_Timing_Info( start );
/* Fine-grained (parallel) incomplete Cholesky factorization
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp. */
Kurt A. O'Hearn
committed
#if defined(TESTING)
static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
sparse_matrix * const U_t, sparse_matrix * const U )
{
unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
Kurt A. O'Hearn
committed
real *D, *D_inv, sum, start;
Kurt A. O'Hearn
committed
start = Get_Time( );
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
Kurt A. O'Hearn
committed
( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
#ifdef _OPENMP
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(D_inv, D) private(i)
#endif
for ( i = 0; i < A->n; ++i )
{
Kurt A. O'Hearn
committed
D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
D[i] = 1. / D_inv[i];
}
memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
/* to get convergence, A must have unit diagonal, so apply
Kurt A. O'Hearn
committed
* transformation DAD, where D = D(1./SQRT(D(A))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
Kurt A. O'Hearn
committed
#ifdef _OPENMP
#pragma omp parallel for schedule(guided) \
Kurt A. O'Hearn
committed
default(none) shared(DAD, D_inv, D) private(i, pj)
#endif
for ( i = 0; i < A->n; ++i )
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.;
/* initial guesses for U^T,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
Kurt A. O'Hearn
committed
memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero */
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
#endif
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 0; k <= A->n; ++k )
if ( U_t->start[k] > j )
x = U_t->start[k - 1];
ei_x = U_t->start[k];
/* column bounds of current nonzero */
Kurt A. O'Hearn
committed
y = U_t->start[U_t->j[j]];
ei_y = U_t->start[U_t->j[j] + 1];
/* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
Kurt A. O'Hearn
committed
while ( U_t->j[x] < U_t->j[j] &&
U_t->j[y] < U_t->j[j] &&
Kurt A. O'Hearn
committed
if ( U_t->j[x] == U_t->j[y] )
Kurt A. O'Hearn
committed
sum += (U_t->val[x] * U_t->val[y]);
Kurt A. O'Hearn
committed
else if ( U_t->j[x] < U_t->j[y] )
{
++x;
}
else
{
++y;
}
}
Kurt A. O'Hearn
committed
sum = DAD->val[j] - sum;
/* diagonal entries */
Kurt A. O'Hearn
committed
if ( (k - 1) == U_t->j[j] )
Kurt A. O'Hearn
committed
fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
#if defined(DEBUG_FOCUS)
fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
Kurt A. O'Hearn
committed
k - 1, A->entries[j].j, A->entries[j].val );
fprintf( stderr, "sum = %10.3f\n", sum);
#endif
exit(NUMERIC_BREAKDOWN);
}
Kurt A. O'Hearn
committed
U_t->val[j] = SQRT( sum );
/* non-diagonal entries */
Kurt A. O'Hearn
committed
U_t->val[j] = sum / U_t->val[ei_y - 1];
/* apply inverse transformation D^{-1}U^{T},
* since DAD \approx U^{T}U, so
* D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
Kurt A. O'Hearn
committed
#ifdef _OPENMP
#pragma omp parallel for schedule(guided) \
Kurt A. O'Hearn
committed
default(none) shared(D_inv) private(i, pj)
#endif
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
Kurt A. O'Hearn
committed
U_t->val[pj] *= D_inv[i];
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif
/* transpose U^{T} and copy into U */
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif
Deallocate_Matrix( DAD );
sfree( D_inv, "ICHOL_PAR::D_inv" );
sfree( D, "ICHOL_PAR::D" );
sfree( Utop, "ICHOL_PAR::Utop" );
Kurt A. O'Hearn
committed
return Get_Timing_Info( start );
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
/* Fine-grained (parallel) incomplete LU factorization
Kurt A. O'Hearn
committed
*
Kurt A. O'Hearn
committed
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp.
Kurt A. O'Hearn
committed
*
Kurt A. O'Hearn
committed
* A: symmetric, half-stored (lower triangular), CSR format
* sweeps: number of loops over non-zeros for computation
* L / U: factorized triangular matrices (A \approx LU), CSR format */
Kurt A. O'Hearn
committed
static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
sparse_matrix * const L, sparse_matrix * const U )
Kurt A. O'Hearn
committed
{
unsigned int i, j, k, pj, x, y, ei_x, ei_y;
Kurt A. O'Hearn
committed
real *D, *D_inv, sum, start;
Kurt A. O'Hearn
committed
sparse_matrix *DAD;
Kurt A. O'Hearn
committed
start = Get_Time( );
Kurt A. O'Hearn
committed
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
Kurt A. O'Hearn
committed
( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(D, D_inv) private(i)
#endif
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
Kurt A. O'Hearn
committed
D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
Kurt A. O'Hearn
committed
D[i] = 1.0 / D_inv[i];
Kurt A. O'Hearn
committed
// printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
Kurt A. O'Hearn
committed
}
/* to get convergence, A must have unit diagonal, so apply
Kurt A. O'Hearn
committed
* transformation DAD, where D = D(1./SQRT(abs(D(A)))) */
Kurt A. O'Hearn
committed
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(DAD, D) private(i, pj)
#endif
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
Kurt A. O'Hearn
committed
}
/* diagonal */
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.0;
Kurt A. O'Hearn
committed
}
/* initial guesses for L and U,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
Kurt A. O'Hearn
committed
memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
Kurt A. O'Hearn
committed
/* store U^T in CSR for row-wise access and tranpose later */