-
Kurt A. O'Hearn authoredKurt A. O'Hearn authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
QEq.c 20.12 KiB
/*----------------------------------------------------------------------
SerialReax - Reax Force Field Simulator
Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
#include "QEq.h"
#include "allocate.h"
#include "GMRES.h"
#include "list.h"
#include "print_utils.h"
int compare_matrix_entry(const void *v1, const void *v2)
{
/* larger element has larger column index */
return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
}
void Sort_Matrix_Rows( sparse_matrix *A )
{
int i, si, ei;
/* sort each row of A using column indices */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
ei = A->start[i + 1];
/* polymorphic sort in standard C library */
qsort( &(A->entries[si]), ei - si,
sizeof(sparse_matrix_entry), compare_matrix_entry );
}
}
void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
{
int i, j, k;
real val;
/* init droptol to 0 */
for ( i = 0; i < A->n; ++i )
droptol[i] = 0;
/* calculate sqaure of the norm of each row */
for ( i = 0; i < A->n; ++i )
{
for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
{
j = A->entries[k].j;
val = A->entries[k].val;
droptol[i] += val * val;
droptol[j] += val * val;
}
val = A->entries[k].val; // diagonal entry
droptol[i] += val * val;
}
/* calculate local droptol for each row */
//fprintf( stderr, "droptol: " );
for ( i = 0; i < A->n; ++i )
{
//fprintf( stderr, "%f-->", droptol[i] );
droptol[i] = SQRT( droptol[i] ) * dtol;
//fprintf( stderr, "%f ", droptol[i] );
}
//fprintf( stderr, "\n" );
}
int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
{
int i, j, pj;
int fillin;
real val;
fillin = 0;
//fprintf( stderr, "n: %d\n", A->n );
for ( i = 0; i < A->n; ++i )
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
j = A->entries[pj].j;
val = A->entries[pj].val;
//fprintf( stderr, "i: %d, j: %d", i, j );
if ( fabs(val) > droptol[i] )
++fillin;
}
return fillin + A->n;
}
/* Incomplete Cholesky factorization with thresholding */
real ICHOLT( sparse_matrix *A, real *droptol,
sparse_matrix *L, sparse_matrix *U )
{
sparse_matrix_entry tmp[1000];
int i, j, pj, k1, k2, tmptop, Ltop;
real val;
int *Utop;
struct timeval start, stop;
gettimeofday( &start, NULL );
Utop = (int*) malloc((A->n + 1) * sizeof(int));
// clear variables
Ltop = 0;
tmptop = 0;
for ( i = 0; i <= A->n; ++i )
L->start[i] = U->start[i] = 0;
for ( i = 0; i < A->n; ++i )
Utop[i] = 0;
//fprintf( stderr, "n: %d\n", A->n );
for ( i = 0; i < A->n; ++i )
{
L->start[i] = Ltop;
tmptop = 0;
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
j = A->entries[pj].j;
val = A->entries[pj].val;
//fprintf( stderr, "i: %d, j: %d", i, j );
if ( fabs(val) > droptol[i] )
{
k1 = 0;
k2 = L->start[j];
while ( k1 < tmptop && k2 < L->start[j + 1] )
{
if ( tmp[k1].j < L->entries[k2].j )
++k1;
else if ( tmp[k1].j > L->entries[k2].j )
++k2;
else
val -= (tmp[k1++].val * L->entries[k2++].val);
}
// L matrix is lower triangular,
// so right before the start of next row comes jth diagonal
val /= L->entries[L->start[j + 1] - 1].val;
tmp[tmptop].j = j;
tmp[tmptop].val = val;
++tmptop;
}
//fprintf( stderr, " -- done\n" );
}
// compute the ith diagonal in L
// sanity check
if ( A->entries[pj].j != i )
{
fprintf( stderr, "i=%d, badly built A matrix!\n", i );
exit(999);
}
val = A->entries[pj].val;
for ( k1 = 0; k1 < tmptop; ++k1 )
val -= (tmp[k1].val * tmp[k1].val);
tmp[tmptop].j = i;
tmp[tmptop].val = SQRT(val);
// apply the dropping rule once again
//fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
//for( k1 = 0; k1<= tmptop; ++k1 )
// fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val );
//fprintf( stderr, "\n" );
//fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
for ( k1 = 0; k1 < tmptop; ++k1 )
if ( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val )
{
L->entries[Ltop].j = tmp[k1].j;
L->entries[Ltop].val = tmp[k1].val;
U->start[tmp[k1].j + 1]++;
++Ltop;
//fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val );
}
// keep the diagonal in any case
L->entries[Ltop].j = tmp[k1].j;
L->entries[Ltop].val = tmp[k1].val;
++Ltop;
//fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val );
}
L->start[i] = Ltop;
// fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
/* U = L^T (Cholesky factorization) */
for ( i = 1; i <= U->n; ++i )
{
Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
}
for ( i = 0; i < L->n; ++i )
{
for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
{
j = L->entries[pj].j;
U->entries[Utop[j]].j = i;
U->entries[Utop[j]].val = L->entries[pj].val;
Utop[j]++;
}
}
// fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
free(Utop);
gettimeofday( &stop, NULL );
return (stop.tv_sec + stop.tv_usec / 1000000.0)
- (start.tv_sec + start.tv_usec / 1000000.0);
}
/* Fine-grained (parallel) incomplete Cholesky factorization
*
* Reference:
* Edmond Chow and Aftab Patel
* Fine-Grained Parallel Incomplete LU Factorization
* SIAM J. Sci. Comp. */
static void ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
sparse_matrix * const U_t, sparse_matrix * const U )
{
unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y;
real *D, *D_inv, sum;
sparse_matrix *DAD;
int *Utop;
if ( Allocate_Matrix( &DAD, A->n, A->m ) == 0 )
{
fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
exit(INSUFFICIENT_SPACE);
}
D = (real*) malloc(A->n * sizeof(real));
D_inv = (real*) malloc(A->n * sizeof(real));
Utop = (int*) malloc((A->n + 1) * sizeof(int));
for ( i = 0; i < A->n; ++i )
{
D_inv[i] = SQRT( A->entries[A->start[i + 1] - 1].val );
D[i] = 1. / D_inv[i];
}
for ( i = 0; i <= A->n; ++i )
{
U->start[i] = 0;
Utop[i] = 0;
}
/* to get convergence, A must have unit diagonal, so apply
* transformation DAD, where D = D(1./sqrt(D(A))) */
memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
for ( i = 0; i < A->n; ++i )
{
/* non-diagonals */
for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
{
DAD->entries[pj].j = A->entries[pj].j;
DAD->entries[pj].val =
A->entries[pj].val * D[i] * D[A->entries[pj].j];
}
/* diagonal */
DAD->entries[pj].j = A->entries[pj].j;
DAD->entries[pj].val = 1.;
}
/* initial guesses for U^T,
* assume: A and DAD symmetric and stored lower triangular */
memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) );
memcpy( U_t->entries, DAD->entries, sizeof(sparse_matrix_entry) * (DAD->m) );
for ( i = 0; i < sweeps; ++i )
{
/* for each nonzero */
#pragma omp parallel for schedule(guided) \
default(none) shared(DAD) private(sum, ei_x, ei_y, k) firstprivate(x, y)
for ( j = 0; j < A->start[A->n]; ++j )
{
sum = ZERO;
/* determine row bounds of current nonzero */
x = 0;
ei_x = 0;
for ( k = 0; k <= A->n; ++k )
{
if ( U_t->start[k] > j )
{
x = U_t->start[k - 1];
ei_x = U_t->start[k];
break;
}
}
/* column bounds of current nonzero */
y = U_t->start[U_t->entries[j].j];
ei_y = U_t->start[U_t->entries[j].j + 1];
/* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */
while ( U_t->entries[x].j < U_t->entries[j].j &&
U_t->entries[y].j < U_t->entries[j].j &&
x < ei_x && y < ei_y )
{
if ( U_t->entries[x].j == U_t->entries[y].j )
{
sum += (U_t->entries[x].val * U_t->entries[y].val);
++x;
++y;
}
else if ( U_t->entries[x].j < U_t->entries[y].j )
{
++x;
}
else
{
++y;
}
}
sum = DAD->entries[j].val - sum;
/* diagonal entries */
if ( (k - 1) == U_t->entries[j].j )
{
/* sanity check */
if ( sum < ZERO )
{
fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n");
#if defined(DEBUG_FOCUS)
fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
k - 1, A->entries[j].j, A->entries[j].val );
fprintf( stderr, "sum = %10.3f\n", sum);
#endif
exit(NUMERIC_BREAKDOWN);
}
U_t->entries[j].val = SQRT( sum );
}
/* non-diagonal entries */
else
{
U_t->entries[j].val = sum / U_t->entries[ei_y - 1].val;
}
}
}
/* apply inverse transformation D^{-1}U^{T},
* since DAD \approx U^{T}U, so
* D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */
for ( i = 0; i < A->n; ++i )
{
for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
{
U_t->entries[pj].val *= D_inv[i];
}
}
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 );
#endif
/* transpose U^{T} and copy into U */
for ( i = 0; i < U_t->n; ++i )
{
/* count nonzeros in each row of U (columns of U^{T}),
* store in U->start */
for ( pj = U_t->start[i]; pj < U_t->start[i + 1]; ++pj )
{
U->start[U_t->entries[pj].j + 1]++;
}
}
/* set correct row pointer in U */
for ( i = 1; i <= U->n; ++i )
{
Utop[i] = U->start[i] = U->start[i] + U->start[i - 1];
}
/* for each row */
for ( i = 0; i < U_t->n; ++i )
{
/* for each nonzero (column-wise) in U^T */
for ( pj = U_t->start[i]; pj < U_t->start[i + 1]; ++pj )
{
j = U_t->entries[pj].j;
U->entries[Utop[j]].j = i;
U->entries[Utop[j]].val = U_t->entries[pj].val;
/* Utop contains pointer within rows of U
* (columns of U^T) to add next nonzero, so increment */
Utop[j]++;
}
}
#if defined(DEBUG_FOCUS)
fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
#endif
Deallocate_Matrix( DAD );
free(D_inv);
free(D);
free(Utop);
}
void Init_MatVec( reax_system *system, control_params *control,
simulation_data *data, static_storage *workspace,
list *far_nbrs )
{
int i, fillin;
real s_tmp, t_tmp;
// char fname[100];
if (control->refactor > 0 &&
((data->step - data->prev_steps) % control->refactor == 0 || workspace->L == NULL))
{
//Print_Linear_System( system, control, workspace, data->step );
Sort_Matrix_Rows( workspace->H );
Sort_Matrix_Rows( workspace->H_sp );
//fprintf( stderr, "H matrix sorted\n" );
Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
//fprintf( stderr, "drop tolerances calculated\n" );
if ( workspace->L == NULL )
{
fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 )
{
fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
exit(INSUFFICIENT_SPACE);
}
/* factors have sparsity pattern as H */
// if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 ||
// Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 )
// {
// fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
// exit(INSUFFICIENT_SPACE);
// }
#if defined(DEBUG_FOCUS)
fprintf( stderr, "fillin = %d\n", fillin );
fprintf( stderr, "allocated memory: L = U = %ldMB\n",
fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) );
#endif
}
data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
// data->timing.pre_comp += ICHOLT( workspace->H_sp, workspace->droptol, workspace->L, workspace->U );
// TODO: add parameters for sweeps to control file
// ICHOL_PAR( workspace->H, 1, workspace->L, workspace->U );
// fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
#if defined(DEBUG_FOCUS)
sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
Print_Sparse_Matrix2( workspace->L, fname );
sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
Print_Sparse_Matrix2( workspace->U, fname );
fprintf( stderr, "icholt-" );
//sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
//Print_Sparse_Matrix2( workspace->L, fname );
//Print_Sparse_Matrix( U );
#endif
}
/* extrapolation for s & t */
for ( i = 0; i < system->N; ++i )
{
// no extrapolation
//s_tmp = workspace->s[0][i];
//t_tmp = workspace->t[0][i];
// linear
//s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
//t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
// quadratic
//s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
t_tmp = workspace->t[2][i] + 3 * (workspace->t[0][i] - workspace->t[1][i]);
// cubic
s_tmp = 4 * (workspace->s[0][i] + workspace->s[2][i]) -
(6 * workspace->s[1][i] + workspace->s[3][i] );
//t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) -
// (6 * workspace->t[1][i] + workspace->t[3][i] );
// 4th order
//s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) +
// 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
//t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) +
// 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
workspace->s[4][i] = workspace->s[3][i];
workspace->s[3][i] = workspace->s[2][i];
workspace->s[2][i] = workspace->s[1][i];
workspace->s[1][i] = workspace->s[0][i];
workspace->s[0][i] = s_tmp;
workspace->t[4][i] = workspace->t[3][i];
workspace->t[3][i] = workspace->t[2][i];
workspace->t[2][i] = workspace->t[1][i];
workspace->t[1][i] = workspace->t[0][i];
workspace->t[0][i] = t_tmp;
}
}
void Calculate_Charges( reax_system *system, static_storage *workspace )
{
int i;
real u, s_sum, t_sum;
s_sum = t_sum = 0.;
for ( i = 0; i < system->N; ++i )
{
s_sum += workspace->s[0][i];
t_sum += workspace->t[0][i];
}
u = s_sum / t_sum;
for ( i = 0; i < system->N; ++i )
system->atoms[i].q = workspace->s[0][i] - u * workspace->t[0][i];
}
void QEq( reax_system *system, control_params *control, simulation_data *data,
static_storage *workspace, list *far_nbrs,
output_controls *out_control )
{
int matvecs;
Init_MatVec( system, control, data, workspace, far_nbrs );
// if( data->step == 0 || data->step == 100 )
// Print_Linear_System( system, control, workspace, data->step );
//TODO: add parameters in control file for solver choice and options
// matvecs = GMRES( workspace, workspace->H,
// workspace->b_s, control->q_err, workspace->s[0], out_control->log, &(data->timing.pre_app) );
// matvecs += GMRES( workspace, workspace->H,
// workspace->b_t, control->q_err, workspace->t[0], out_control->log, &(data->timing.pre_app) );
// matvecs = GMRES_HouseHolder( workspace, workspace->H,
// workspace->b_s, control->q_err, workspace->s[0], out_control->log );
// matvecs += GMRES_HouseHolder( workspace, workspace->H,
// workspace->b_t, control->q_err, workspace->t[0], out_control->log );
matvecs = PGMRES( workspace, workspace->H, workspace->b_s, control->q_err,
workspace->L, workspace->U, workspace->s[0], out_control->log, &(data->timing.pre_app) );
matvecs += PGMRES( workspace, workspace->H, workspace->b_t, control->q_err,
workspace->L, workspace->U, workspace->t[0], out_control->log, &(data->timing.pre_app) );
// matvecs = PGMRES_Jacobi( workspace, workspace->H, workspace->b_s, control->q_err,
// workspace->L, workspace->U, workspace->s[0], out_control->log, &(data->timing.pre_app) );
// matvecs += PGMRES_Jacobi( workspace, workspace->H, workspace->b_t, control->q_err,
// workspace->L, workspace->U, workspace->t[0], out_control->log, &(data->timing.pre_app) );
//matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err,
// workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err,
// workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
//matvecs = CG( workspace, workspace->H,
// workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
//matvecs += CG( workspace, workspace->H,
// workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
//matvecs = SDM( workspace, workspace->H,
// workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
//matvecs += SDM( workspace, workspace->H,
// workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
data->timing.matvecs += matvecs;
#if defined(DEBUG_FOCUS)
fprintf( stderr, "linsolve-" );
#endif
Calculate_Charges( system, workspace );
//fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n",
// data->step,
// workspace->s[0][0], workspace->t[0][0],
// workspace->s[0][1], workspace->t[0][1],
// workspace->s[0][2], workspace->t[0][2] );
// if( data->step == control->nsteps )
//Print_Charges( system, control, workspace, data->step );
}