Newer
Older
/*----------------------------------------------------------------------
SerialReax - Reax Force Field Simulator
Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
Kurt A. O'Hearn
committed
#include "charges.h"
#include "lin_alg.h"
Kurt A. O'Hearn
committed
#include "tool_box.h"
#include "vector.h"
#if defined(HAVE_SUPERLU_MT)
#include "slu_mt_ddefs.h"
#endif
Kurt A. O'Hearn
committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#if defined(TEST_MAT)
static sparse_matrix * create_test_mat( void )
{
unsigned int i, n;
sparse_matrix *H_test;
if ( Allocate_Matrix( &H_test, 3, 6 ) == FAILURE )
{
fprintf( stderr, "not enough memory for test matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
//3x3, SPD, store lower half
i = 0;
n = 0;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = 4.;
++i;
++n;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = 12.;
++i;
H_test->j[i] = 1;
H_test->val[i] = 37.;
++i;
++n;
H_test->start[n] = i;
H_test->j[i] = 0;
H_test->val[i] = -16.;
++i;
H_test->j[i] = 1;
H_test->val[i] = -43.;
++i;
H_test->j[i] = 2;
H_test->val[i] = 98.;
++i;
++n;
H_test->start[n] = i;
return H_test;
}
#endif
Kurt A. O'Hearn
committed
/* Routine used with qsort for sorting nonzeros within a sparse matrix row
*
* v1/v2: pointers to column indices of nonzeros within a row (unsigned int)
*/
Kurt A. O'Hearn
committed
static int compare_matrix_entry(const void *v1, const void *v2)
Kurt A. O'Hearn
committed
return *(unsigned int *)v1 - *(unsigned int *)v2;
Kurt A. O'Hearn
committed
/* Routine used for sorting nonzeros within a sparse matrix row;
* internally, a combination of qsort and manual sorting is utilized
* (parallel calls to qsort when multithreading, rows mapped to threads)
*
* A: sparse matrix for which to sort nonzeros within a row, stored in CSR format
*/
Kurt A. O'Hearn
committed
static void Sort_Matrix_Rows( sparse_matrix * const A )
Kurt A. O'Hearn
committed
unsigned int i, j, k, si, ei, *temp_j;
Kurt A. O'Hearn
committed
real *temp_val;
Kurt A. O'Hearn
committed
#pragma omp parallel default(none) private(i, j, k, si, ei, temp_j, temp_val) shared(stderr)
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
if ( ( temp_j = (unsigned int*) malloc( A->n * sizeof(unsigned int)) ) == NULL
Kurt A. O'Hearn
committed
|| ( temp_val = (real*) malloc( A->n * sizeof(real)) ) == NULL )
{
fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
/* sort each row of A using column indices */
#pragma omp for schedule(guided)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
ei = A->start[i + 1];
Kurt A. O'Hearn
committed
memcpy( temp_j, A->j + si, sizeof(unsigned int) * (ei - si) );
Kurt A. O'Hearn
committed
memcpy( temp_val, A->val + si, sizeof(real) * (ei - si) );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
//TODO: consider implementing single custom one-pass sort instead of using qsort + manual sort
/* polymorphic sort in standard C library using column indices */
Kurt A. O'Hearn
committed
qsort( temp_j, ei - si, sizeof(unsigned int), compare_matrix_entry );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
/* manually sort vals */
for ( j = 0; j < (ei - si); ++j )
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
for ( k = 0; k < (ei - si); ++k )
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
if ( A->j[si + j] == temp_j[k] )
{
A->val[si + k] = temp_val[j];
break;
}
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
/* copy sorted column indices */
Kurt A. O'Hearn
committed
memcpy( A->j + si, temp_j, sizeof(unsigned int) * (ei - si) );
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
free( temp_val );
free( temp_j );
}
Kurt A. O'Hearn
committed
static void Calculate_Droptol( const sparse_matrix * const A,
real * const droptol, const real dtol )
Kurt A. O'Hearn
committed
#ifdef _OPENMP
static real *droptol_local;
unsigned int tid;
#endif
Kurt A. O'Hearn
committed
#pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr)
Kurt A. O'Hearn
committed
#ifdef _OPENMP
tid = omp_get_thread_num();
#pragma omp master
{
/* keep b_local for program duration to avoid allocate/free
* overhead per Sparse_MatVec call*/
if ( droptol_local == NULL )
{
if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
}
}
Kurt A. O'Hearn
committed
#pragma omp barrier
#endif
/* init droptol to 0 */
for ( i = 0; i < A->n; ++i )
{
#ifdef _OPENMP
droptol_local[tid * A->n + i] = 0.0;
#else
droptol[i] = 0.0;
#endif
}
#pragma omp barrier
/* calculate sqaure of the norm of each row */
Kurt A. O'Hearn
committed
#pragma omp for schedule(static)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
{
j = A->j[k];
val = A->val[k];
Kurt A. O'Hearn
committed
#ifdef _OPENMP
droptol_local[tid * A->n + i] += val * val;
droptol_local[tid * A->n + j] += val * val;
#else
droptol[i] += val * val;
droptol[j] += val * val;
#endif
}
Kurt A. O'Hearn
committed
// diagonal entry
val = A->val[k];
Kurt A. O'Hearn
committed
#ifdef _OPENMP
droptol_local[tid * A->n + i] += val * val;
#else
Kurt A. O'Hearn
committed
#endif
Kurt A. O'Hearn
committed
#pragma omp barrier
Kurt A. O'Hearn
committed
#ifdef _OPENMP
Kurt A. O'Hearn
committed
#pragma omp for schedule(static)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
droptol[i] = 0.0;
for ( k = 0; k < omp_get_num_threads(); ++k )
{
droptol[i] += droptol_local[k * A->n + i];
}
}
#endif
#pragma omp barrier
/* calculate local droptol for each row */
//fprintf( stderr, "droptol: " );
Kurt A. O'Hearn
committed
#pragma omp for schedule(static)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
//fprintf( stderr, "%f-->", droptol[i] );
droptol[i] = SQRT( droptol[i] ) * dtol;
//fprintf( stderr, "%f ", droptol[i] );
}
//fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol )
int i, j, pj;
int fillin;
real val;
fillin = 0;
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
default(none) private(i, j, pj, val) reduction(+: fillin)
Kurt A. O'Hearn
committed
{
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
Kurt A. O'Hearn
committed
j = A->j[pj];
val = A->val[pj];
Kurt A. O'Hearn
committed
if ( FABS(val) > droptol[i] )
{
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
}
#if defined(HAVE_SUPERLU_MT)
static real SuperLU_Factorize( const sparse_matrix * const A,
Kurt A. O'Hearn
committed
sparse_matrix * const L, sparse_matrix * const U )
{
unsigned int i, pj, count, *Ltop, *Utop, r;
Kurt A. O'Hearn
committed
sparse_matrix *A_t;
Kurt A. O'Hearn
committed
NCformat *A_S_store;
SCPformat *L_S_store;
NCPformat *U_S_store;
superlumt_options_t superlumt_options;
Kurt A. O'Hearn
committed
pxgstrf_shared_t pxgstrf_shared;
pdgstrf_threadarg_t *pdgstrf_threadarg;
int_t nprocs;
fact_t fact;
trans_t trans;
yes_no_t refact, usepr;
real u, drop_tol;
Kurt A. O'Hearn
committed
real *a, *at;
int_t *asub, *atsub, *xa, *xat;
int_t *perm_c; /* column permutation vector */
int_t *perm_r; /* row permutations from partial pivoting */
void *work;
Kurt A. O'Hearn
committed
int_t info, lwork;
int_t permc_spec, panel_size, relax;
Gstat_t Gstat;
flops_t flopcnt;
/* Default parameters to control factorization. */
#ifdef _OPENMP
//TODO: set as global parameter and use
#pragma omp parallel \
Kurt A. O'Hearn
committed
default(none) shared(nprocs)
{
#pragma omp master
{
/* SuperLU_MT spawns threads internally, so set and pass parameter */
nprocs = omp_get_num_threads();
}
}
#else
nprocs = 1;
#endif
Kurt A. O'Hearn
committed
// fact = EQUILIBRATE; /* equilibrate A (i.e., scale rows & cols to have unit norm), then factorize */
fact = DOFACT; /* factor from scratch */
trans = NOTRANS;
refact = NO; /* first time factorization */
//TODO: add to control file and use the value there to set these
panel_size = sp_ienv(1); /* # consec. cols treated as unit task */
relax = sp_ienv(2); /* # cols grouped as relaxed supernode */
Kurt A. O'Hearn
committed
u = 1.0; /* diagonal pivoting threshold */
usepr = NO;
drop_tol = 0.0;
work = NULL;
lwork = 0;
Kurt A. O'Hearn
committed
//#if defined(DEBUG)
fprintf( stderr, "nprocs = %d\n", nprocs );
fprintf( stderr, "Panel size = %d\n", panel_size );
fprintf( stderr, "Relax = %d\n", relax );
//#endif
if ( !(perm_r = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for perm_r[].");
}
Kurt A. O'Hearn
committed
if ( !(perm_c = intMalloc(A->n)) )
{
SUPERLU_ABORT("Malloc fails for perm_c[].");
}
if ( !(superlumt_options.etree = intMalloc(A->n)) )
{
Kurt A. O'Hearn
committed
SUPERLU_ABORT("Malloc fails for etree[].");
}
if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) )
{
Kurt A. O'Hearn
committed
SUPERLU_ABORT("Malloc fails for colcnt_h[].");
}
if ( !(superlumt_options.part_super_h = intMalloc(A->n)) )
{
Kurt A. O'Hearn
committed
SUPERLU_ABORT("Malloc fails for part_super__h[].");
Kurt A. O'Hearn
committed
if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
Kurt A. O'Hearn
committed
|| ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
|| ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
|| ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
|| ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
Kurt A. O'Hearn
committed
fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
exit( INSUFFICIENT_MEMORY );
Kurt A. O'Hearn
committed
}
if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
Kurt A. O'Hearn
committed
{
fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
/* Set up the sparse matrix data structure for A. */
Kurt A. O'Hearn
committed
Transpose( A, A_t );
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
xa[i] = count;
Kurt A. O'Hearn
committed
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
a[count] = A->entries[pj].val;
asub[count] = A->entries[pj].j;
++count;
}
Kurt A. O'Hearn
committed
for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj )
Kurt A. O'Hearn
committed
{
a[count] = A_t->entries[pj].val;
asub[count] = A_t->entries[pj].j;
++count;
}
Kurt A. O'Hearn
committed
xa[i] = count;
dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa,
Kurt A. O'Hearn
committed
&at, &atsub, &xat );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", asub[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", a[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i <= A->n; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", xa[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", atsub[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", at[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i <= A->n; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6d", xat[i] );
fprintf( stderr, "\n" );
A_S.Stype = SLU_NC; /* column-wise, no supernode */
A_S.Dtype = SLU_D; /* double-precision */
A_S.Mtype = SLU_GE; /* full (general) matrix -- required for parallel factorization */
A_S.nrow = A->n;
A_S.ncol = A->n;
A_S.Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
A_S_store = (NCformat *) A_S.Store;
A_S_store->nnz = 2 * A->start[A->n] - A->n;
A_S_store->nzval = at;
A_S_store->rowind = atsub;
A_S_store->colptr = xat;
/* ------------------------------------------------------------
Kurt A. O'Hearn
committed
Allocate storage and initialize statistics variables.
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
StatAlloc( A->n, nprocs, panel_size, relax, &Gstat );
StatInit( A->n, nprocs, &Gstat );
/* ------------------------------------------------------------
Get column permutation vector perm_c[], according to permc_spec:
Kurt A. O'Hearn
committed
permc_spec = 0: natural ordering
permc_spec = 1: minimum degree ordering on structure of A'*A
permc_spec = 2: minimum degree ordering on structure of A'+A
permc_spec = 3: approximate minimum degree for unsymmetric matrices
Kurt A. O'Hearn
committed
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
get_perm_c( permc_spec, &A_S, perm_c );
/* ------------------------------------------------------------
Initialize the option structure superlumt_options using the
user-input parameters;
Apply perm_c to the columns of original A to form AC.
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax,
Kurt A. O'Hearn
committed
u, usepr, drop_tol, perm_c, perm_r,
work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] );
fprintf( stderr, "\n" );
/* ------------------------------------------------------------
Compute the LU factorization of A.
The following routine will create nprocs threads.
------------------------------------------------------------*/
Kurt A. O'Hearn
committed
pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info );
fprintf( stderr, "INFO: %d\n", info );
Kurt A. O'Hearn
committed
flopcnt = 0;
for (i = 0; i < nprocs; ++i)
{
flopcnt += Gstat.procstat[i].fcops;
}
Gstat.ops[FACT] = flopcnt;
Kurt A. O'Hearn
committed
//#if defined(DEBUG)
printf("\n** Result of sparse LU **\n");
L_S_store = (SCPformat *) L_S.Store;
U_S_store = (NCPformat *) U_S.Store;
Kurt A. O'Hearn
committed
printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz );
printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz );
fflush( stdout );
//#endif
Kurt A. O'Hearn
committed
/* convert L and R from SuperLU formats to CSR */
memset( Ltop, 0, (A->n + 1) * sizeof(int) );
memset( Utop, 0, (A->n + 1) * sizeof(int) );
Kurt A. O'Hearn
committed
memset( L->start, 0, (A->n + 1) * sizeof(int) );
memset( U->start, 0, (A->n + 1) * sizeof(int) );
Kurt A. O'Hearn
committed
for ( i = 0; i < 2 * L_S_store->nnz; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] );
fprintf( stderr, "\n" );
Kurt A. O'Hearn
committed
for ( i = 0; i < 2 * U_S_store->nnz; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] );
fprintf( stderr, "\n" );
printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper );
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] );
fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] );
//TODO: correct for SCPformat for L?
Kurt A. O'Hearn
committed
//for( pj = L_S_store->rowind_colbeg[i]; pj < L_S_store->rowind_colend[i]; ++pj )
// for( pj = 0; pj < L_S_store->rowind_colend[i] - L_S_store->rowind_colbeg[i]; ++pj )
// {
// ++Ltop[L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj] + 1];
// }
fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] );
fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] );
Kurt A. O'Hearn
committed
for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
{
++Utop[U_S_store->rowind[pj] + 1];
Kurt A. O'Hearn
committed
fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] );
Kurt A. O'Hearn
committed
for ( i = 1; i <= A->n; ++i )
Kurt A. O'Hearn
committed
// Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1];
Utop[i] = U->start[i] = Utop[i] + Utop[i - 1];
// fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] );
// fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] );
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
Kurt A. O'Hearn
committed
// for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj )
// {
// r = L_S_store->rowind[L_S_store->rowind_colbeg[i] + pj];
// L->entries[Ltop[r]].j = r;
// L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj];
// ++Ltop[r];
// }
Kurt A. O'Hearn
committed
for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj )
{
r = U_S_store->rowind[pj];
U->entries[Utop[r]].j = i;
Kurt A. O'Hearn
committed
U->entries[Utop[r]].val = ((real*)U_S_store->nzval)[pj];
Kurt A. O'Hearn
committed
/* ------------------------------------------------------------
Deallocate storage after factorization.
------------------------------------------------------------*/
pxgstrf_finalize( &superlumt_options, &AC_S );
Deallocate_Matrix( A_t );
free( xa );
free( asub );
free( a );
Kurt A. O'Hearn
committed
SUPERLU_FREE( perm_r );
SUPERLU_FREE( perm_c );
SUPERLU_FREE( ((NCformat *)A_S.Store)->rowind );
SUPERLU_FREE( ((NCformat *)A_S.Store)->colptr );
SUPERLU_FREE( ((NCformat *)A_S.Store)->nzval );
SUPERLU_FREE( A_S.Store );
if ( lwork == 0 )
{
Destroy_SuperNode_SCP(&L_S);
Destroy_CompCol_NCP(&U_S);
}
else if ( lwork > 0 )
{
SUPERLU_FREE(work);
}
StatFree(&Gstat);
Kurt A. O'Hearn
committed
free( Utop );
free( Ltop );
//TODO: return iters
return 0.;
}
#endif
Kurt A. O'Hearn
committed
/* Diagonal (Jacobi) preconditioner computation */
static real diag_pre_comp( const reax_system * const system, real * const Hdia_inv )
Kurt A. O'Hearn
committed
{
unsigned int i;
Kurt A. O'Hearn
committed
real start;
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
start = Get_Time( );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) private(i)
Kurt A. O'Hearn
committed
for ( i = 0; i < system->N; ++i )
Kurt A. O'Hearn
committed
{
Hdia_inv[i] = 1.0 / system->reaxprm.sbp[system->atoms[i].type].eta;
}
Kurt A. O'Hearn
committed
Hdia_inv[system->N_cm - 1] = 1.0;
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
return Get_Timing_Info( start );
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
/* Incomplete Cholesky factorization with dual thresholding */
Kurt A. O'Hearn
committed
static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
Kurt A. O'Hearn
committed
sparse_matrix * const L, sparse_matrix * const U )
Kurt A. O'Hearn
committed
int *tmp_j;
real *tmp_val;
Kurt A. O'Hearn
committed
real val, start;
Kurt A. O'Hearn
committed
start = Get_Time( );
Kurt A. O'Hearn
committed
if ( ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ||
Kurt A. O'Hearn
committed
( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
{
fprintf( stderr, "not enough memory for ICHOLT preconditioning matrices. terminating.\n" );
exit( INSUFFICIENT_MEMORY );
}
memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
memset( Utop, 0, A->n * sizeof(unsigned int) );
Kurt A. O'Hearn
committed
// fprintf( stderr, "n: %d\n", A->n );
// fflush( stderr );
for ( i = 0; i < A->n; ++i )
{
L->start[i] = Ltop;
tmptop = 0;
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
Kurt A. O'Hearn
committed
j = A->j[pj];
val = A->val[pj];
Kurt A. O'Hearn
committed
// fprintf( stderr, " i: %d, j: %d\n", i, j );
// fflush( stderr );
Kurt A. O'Hearn
committed
if ( FABS(val) > droptol[i] )
{
k1 = 0;
k2 = L->start[j];
while ( k1 < tmptop && k2 < L->start[j + 1] )
{
Kurt A. O'Hearn
committed
if ( tmp_j[k1] < L->j[k2] )
{
Kurt A. O'Hearn
committed
}
else if ( tmp_j[k1] > L->j[k2] )
{
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
{
val -= (tmp_val[k1++] * L->val[k2++]);
}
}
// L matrix is lower triangular,
// so right before the start of next row comes jth diagonal
Kurt A. O'Hearn
committed
val /= L->val[L->start[j + 1] - 1];
Kurt A. O'Hearn
committed
tmp_j[tmptop] = j;
tmp_val[tmptop] = val;
Kurt A. O'Hearn
committed
// fprintf( stderr, " -- done\n" );
// fflush( stderr );
Kurt A. O'Hearn
committed
if ( A->j[pj] != i )
{
fprintf( stderr, "i=%d, badly built A matrix!\n", i );
Kurt A. O'Hearn
committed
exit( NUMERIC_BREAKDOWN );
// compute the ith diagonal in L
Kurt A. O'Hearn
committed
val = A->val[pj];
Kurt A. O'Hearn
committed
{
val -= (tmp_val[k1] * tmp_val[k1]);
}
Kurt A. O'Hearn
committed
tmp_j[tmptop] = i;
tmp_val[tmptop] = SQRT(val);
// apply the dropping rule once again
//fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
//for( k1 = 0; k1<= tmptop; ++k1 )
// fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val );
//fprintf( stderr, "\n" );
//fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
for ( k1 = 0; k1 < tmptop; ++k1 )
Kurt A. O'Hearn
committed
{
Kurt A. O'Hearn
committed
if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
Kurt A. O'Hearn
committed
L->j[Ltop] = tmp_j[k1];
L->val[Ltop] = tmp_val[k1];
U->start[tmp_j[k1] + 1]++;
++Ltop;
//fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val );
}
Kurt A. O'Hearn
committed
}
Kurt A. O'Hearn
committed
L->j[Ltop] = tmp_j[k1];
L->val[Ltop] = tmp_val[k1];
++Ltop;
//fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val );
// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
Kurt A. O'Hearn
committed
Transpose( L, U );
// for ( i = 1; i <= U->n; ++i )
// {
// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
// }
// for ( i = 0; i < L->n; ++i )
// {
// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
// {
// j = L->j[pj];
// U->j[Utop[j]] = i;
// U->val[Utop[j]] = L->val[pj];
// Utop[j]++;
// }
// }
// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
Kurt A. O'Hearn
committed
free( tmp_val );
free( tmp_j );
free( Utop );
Kurt A. O'Hearn
committed
return Get_Timing_Info( start );
/* Fine-grained (parallel) incomplete Cholesky factorization
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp. */
static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
Kurt A. O'Hearn
committed
sparse_matrix * const U_t, sparse_matrix * const U )
{
unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
Kurt A. O'Hearn
committed
real *D, *D_inv, sum, start;
Kurt A. O'Hearn
committed
start = Get_Time( );
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
Kurt A. O'Hearn
committed
( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(D_inv, D) private(i)
for ( i = 0; i < A->n; ++i )
{
Kurt A. O'Hearn
committed
D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
D[i] = 1. / D_inv[i];
}
memset( U->start, 0, sizeof(unsigned int) * (A->n + 1) );
memset( Utop, 0, sizeof(unsigned int) * (A->n + 1) );
/* to get convergence, A must have unit diagonal, so apply
* transformation DAD, where D = D(1./sqrt(D(A))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
#pragma omp parallel for schedule(guided) \
Kurt A. O'Hearn
committed
default(none) shared(DAD, D_inv, D) private(i, pj)
for ( i = 0; i < A->n; ++i )
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = A->val[pj] * D[i] * D[A->j[pj]];
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.;
/* initial guesses for U^T,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
Kurt A. O'Hearn
committed
memcpy( U_t->j, DAD->j, sizeof(int) * (DAD->m) );
memcpy( U_t->val, DAD->val, sizeof(real) * (DAD->m) );
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero */
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 0; k <= A->n; ++k )
if ( U_t->start[k] > j )
x = U_t->start[k - 1];
ei_x = U_t->start[k];
/* column bounds of current nonzero */
Kurt A. O'Hearn
committed
y = U_t->start[U_t->j[j]];
ei_y = U_t->start[U_t->j[j] + 1];
/* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
Kurt A. O'Hearn
committed
while ( U_t->j[x] < U_t->j[j] &&
U_t->j[y] < U_t->j[j] &&
Kurt A. O'Hearn
committed
if ( U_t->j[x] == U_t->j[y] )
Kurt A. O'Hearn
committed
sum += (U_t->val[x] * U_t->val[y]);
Kurt A. O'Hearn
committed
else if ( U_t->j[x] < U_t->j[y] )
{
++x;
}
else
{
++y;
}
}
Kurt A. O'Hearn
committed
sum = DAD->val[j] - sum;
/* diagonal entries */
Kurt A. O'Hearn
committed
if ( (k - 1) == U_t->j[j] )
fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n");
#if defined(DEBUG_FOCUS)
fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
Kurt A. O'Hearn
committed
k - 1, A->entries[j].j, A->entries[j].val );
fprintf( stderr, "sum = %10.3f\n", sum);
#endif
exit(NUMERIC_BREAKDOWN);
}
Kurt A. O'Hearn
committed
U_t->val[j] = SQRT( sum );
/* non-diagonal entries */
Kurt A. O'Hearn
committed
U_t->val[j] = sum / U_t->val[ei_y - 1];
/* apply inverse transformation D^{-1}U^{T},
* since DAD \approx U^{T}U, so
* D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
#pragma omp parallel for schedule(guided) \
Kurt A. O'Hearn
committed
default(none) shared(D_inv) private(i, pj)
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
Kurt A. O'Hearn
committed
U_t->val[pj] *= D_inv[i];
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif
/* transpose U^{T} and copy into U */
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif
Deallocate_Matrix( DAD );
free(D_inv);
free(D);
Kurt A. O'Hearn
committed
return Get_Timing_Info( start );
Kurt A. O'Hearn
committed
/* Fine-grained (parallel) incomplete LU factorization
Kurt A. O'Hearn
committed
*
Kurt A. O'Hearn
committed
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp.
Kurt A. O'Hearn
committed
*
Kurt A. O'Hearn
committed
* A: symmetric, half-stored (lower triangular), CSR format
* sweeps: number of loops over non-zeros for computation
* L / U: factorized triangular matrices (A \approx LU), CSR format */
Kurt A. O'Hearn
committed
static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
sparse_matrix * const L, sparse_matrix * const U )
Kurt A. O'Hearn
committed
{
unsigned int i, j, k, pj, x, y, ei_x, ei_y;
Kurt A. O'Hearn
committed
real *D, *D_inv, sum, start;
Kurt A. O'Hearn
committed
sparse_matrix *DAD;
Kurt A. O'Hearn
committed
start = Get_Time( );
Kurt A. O'Hearn
committed
if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
Kurt A. O'Hearn
committed
( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
{
Kurt A. O'Hearn
committed
fprintf( stderr, "not enough memory for ILU_PAR preconditioning matrices. terminating.\n" );
Kurt A. O'Hearn
committed
exit( INSUFFICIENT_MEMORY );
}
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(D, D_inv) private(i)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
Kurt A. O'Hearn
committed
D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
Kurt A. O'Hearn
committed
D[i] = 1.0 / D_inv[i];
}
/* to get convergence, A must have unit diagonal, so apply
* transformation DAD, where D = D(1./sqrt(D(A))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(DAD, D) private(i, pj)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = D[i] * A->val[pj] * D[A->j[pj]];
Kurt A. O'Hearn
committed
}
/* diagonal */
Kurt A. O'Hearn
committed
DAD->j[pj] = A->j[pj];
DAD->val[pj] = 1.0;
Kurt A. O'Hearn
committed
}
/* initial guesses for L and U,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( L->start, DAD->start, sizeof(int) * (DAD->n + 1) );
Kurt A. O'Hearn
committed
memcpy( L->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( L->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
Kurt A. O'Hearn
committed
/* store U^T in CSR for row-wise access and tranpose later */
memcpy( U->start, DAD->start, sizeof(int) * (DAD->n + 1) );
Kurt A. O'Hearn
committed
memcpy( U->j, DAD->j, sizeof(int) * (DAD->start[DAD->n]) );
memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
Kurt A. O'Hearn
committed
/* L has unit diagonal, by convention */
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) default(none) private(i)
Kurt A. O'Hearn
committed
for ( i = 0; i < A->n; ++i )
{
Kurt A. O'Hearn
committed
L->val[L->start[i + 1] - 1] = 1.0;
Kurt A. O'Hearn
committed
}
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero in L */
Kurt A. O'Hearn
committed
#pragma omp parallel for schedule(static) \
Kurt A. O'Hearn
committed
default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
Kurt A. O'Hearn
committed
for ( j = 0; j < DAD->start[DAD->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 1; k <= DAD->n; ++k )
{
if ( DAD->start[k] > j )
{
x = DAD->start[k - 1];
ei_x = DAD->start[k];
break;
}
}
/* determine column bounds of current nonzero */