Newer
Older
/*----------------------------------------------------------------------
PuReMD - Purdue ReaxFF Molecular Dynamics Program
Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
#include "allocate.h"
#include "basic_comm.h"
#include "io_tools.h"
Kurt A. O'Hearn
committed
#ifdef HAVE_CUDA
#include "cuda_charges.h"
#include "cuda_lin_alg.h"
#include "cuda_validation.h"
Kurt A. O'Hearn
committed
#endif
int compare_matrix_entry(const void *v1, const void *v2)
{
return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
}
void Sort_Matrix_Rows( sparse_matrix *A )
{
int i, si, ei;
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
ei = A->end[i];
qsort( &(A->entries[si]), ei - si,
sizeof(sparse_matrix_entry), compare_matrix_entry );
}
void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
{
int i, j, k;
real val;
/* init droptol to 0 - not necessary for an upper-triangular A */
for ( i = 0; i < A->n; ++i )
/* calculate sqaure of the norm of each row */
for ( i = 0; i < A->n; ++i )
{
val = A->entries[A->start[i]].val; // diagonal entry
droptol[i] += val * val;
// only within my block
for ( k = A->start[i] + 1; A->entries[k].j < A->n; ++k )
{
j = A->entries[k].j;
val = A->entries[k].val;
droptol[i] += val * val;
droptol[j] += val * val;
}
/* calculate local droptol for each row */
//fprintf( stderr, "droptol: " );
for ( i = 0; i < A->n; ++i )
{
droptol[i] = SQRT( droptol[i] ) * dtol;
//fprintf( stderr, "%f\n", droptol[i] );
}
//fprintf( stderr, "\n" );
}
int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
{
int i, j, pj;
int fillin;
real val;
fillin = 0;
for ( i = 0; i < A->n; ++i )
for ( pj = A->start[i] + 1; A->entries[pj].j < A->n; ++pj )
{
j = A->entries[pj].j;
val = A->entries[pj].val;
Kurt A. O'Hearn
committed
if ( FABS(val) > droptol[i] )
sparse_matrix_entry tmp[1000];
int i, j, pj, k1, k2, tmptop, Utop;
real val, dval;
int *Ltop;
Ltop = (int*) malloc((A->n) * sizeof(int));
// clear data structures
Utop = 0;
for ( i = A->n - 1; i >= 0; --i )
{
U->start[i] = Utop;
tmptop = 0;
for ( pj = A->end[i] - 1; A->entries[pj].j >= A->n; --pj ); // skip ghosts
for ( ; pj > A->start[i]; --pj )
{
j = A->entries[pj].j;
val = A->entries[pj].val;
fprintf( stderr, "i: %d, j: %d val=%f ", i, j, val );
//fprintf( stdout, "%d %d %24.16f\n", 6540-i, 6540-j, val );
//fprintf( stdout, "%d %d %24.16f\n", 6540-j, 6540-i, val );
Kurt A. O'Hearn
committed
if ( FABS(val) > droptol[i] )
{
k1 = tmptop - 1;
k2 = U->start[j] + 1;
while ( k1 >= 0 && k2 < U->end[j] )
{
if ( tmp[k1].j < U->entries[k2].j )
}
// U matrix is upper triangular
val /= U->entries[U->start[j]].val;
fprintf( stderr, " newval=%f", val );
tmp[tmptop].j = j;
tmp[tmptop].val = val;
tmptop++;
}
fprintf( stderr, "\n" );
}
//fprintf( stderr, "i = %d - tmptop = %d\n", i, tmptop );
// compute the ith diagonal in U
dval = A->entries[A->start[i]].val;
//fprintf( stdout, "%d %d %24.16f\n", 6540-i, 6540-i, dval );
for ( k1 = 0; k1 < tmptop; ++k1 )
Kurt A. O'Hearn
committed
//if( FABS(tmp[k1].val) > droptol[i] )
dval = SQRT(dval);
// keep the diagonal in any case
U->entries[Utop].j = i;
U->entries[Utop].val = dval;
Utop++;
fprintf(stderr, "row%d: droptol=%.15f val=%.15f\n", i, droptol[i], dval);
for ( k1 = tmptop - 1; k1 >= 0; --k1 )
{
// apply the dropping rule once again
Kurt A. O'Hearn
committed
if ( FABS(tmp[k1].val) > droptol[i] / dval )
{
U->entries[Utop].j = tmp[k1].j;
U->entries[Utop].val = tmp[k1].val;
Utop++;
Ltop[tmp[k1].j]++;
fprintf( stderr, "%d(%.15f)\n", tmp[k1].j, tmp[k1].val );
}
}
U->end[i] = Utop;
//fprintf( stderr, "i = %d - Utop = %d\n", i, Utop );
fprintf( stderr, "nnz(U): %d\n", Utop );
for ( i = 0; i < U->n; ++i )
{
fprintf( stderr, "row%d: ", i );
for ( pj = U->start[i]; pj < U->end[i]; ++pj )
// transpose matrix U into L
L->start[0] = L->end[0] = 0;
for ( i = 1; i < L->n; ++i )
{
L->start[i] = L->end[i] = L->start[i - 1] + Ltop[i - 1] + 1;
//fprintf( stderr, "i=%d L->start[i]=%d\n", i, L->start[i] );
for ( pj = U->start[i]; pj < U->end[i]; ++pj )
{
j = U->entries[pj].j;
L->entries[L->end[j]].j = i;
L->entries[L->end[j]].val = U->entries[pj].val;
L->end[j]++;
}
fprintf( stderr, "nnz(L): %d\n", L->end[L->n - 1] );
for ( i = 0; i < L->n; ++i )
{
fprintf( stderr, "row%d: ", i );
for ( pj = L->start[i]; pj < L->end[i]; ++pj )
void Init_MatVec( reax_system *system, simulation_data *data,
control_params *control, storage *workspace, mpi_datatypes *mpi_data )
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
int i; //, fillin;
reax_atom *atom;
/*if( (data->step - data->prev_steps) % control->refactor == 0 ||
workspace->L == NULL ) {
//Print_Linear_System( system, control, workspace, data->step );
Sort_Matrix_Rows( workspace->H );
fprintf( stderr, "H matrix sorted\n" );
Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
fprintf( stderr, "drop tolerances calculated\n" );
if( workspace->L == NULL ) {
fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
}
workspace->L->n = workspace->H->n;
workspace->U->n = workspace->H->n;
#if defined(DEBUG_FOCUS)
fprintf( stderr, "p%d: n=%d, fillin = %d\n",
system->my_rank, workspace->L->n, fillin );
fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
#endif
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
#if defined(DEBUG_FOCUS)
fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
//sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
//Print_Sparse_Matrix2( workspace->L, fname );
//Print_Sparse_Matrix( U );
#endif
}*/
for ( i = 0; i < system->n; ++i )
{
atom = &( system->my_atoms[i] );
/* init pre-conditioner for H and init solution vectors */
workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
workspace->b_t[i] = -1.0;
workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
workspace->b[i][1] = -1.0;
/* linear extrapolation for s and for t */
// newQEq: no extrapolation!
//workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
//workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
//workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
//workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
/* quadratic extrapolation for s and t */
// workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
// workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
//workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
/* cubic extrapolation for s and t */
workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
//workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
// fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
}
void Calculate_Charges( reax_system *system, storage *workspace,
Kurt A. O'Hearn
committed
int i, scale;
real u;//, s_sum, t_sum;
rvec2 my_sum, all_sum;
reax_atom *atom;
real *q;
scale = sizeof(real) / sizeof(void);
q = (real*) malloc(system->N * sizeof(real));
//s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world);
//t_sum = Parallel_Vector_Acc(workspace->t, system->n, mpi_data->world);
my_sum[0] = my_sum[1] = 0;
for ( i = 0; i < system->n; ++i )
{
my_sum[0] += workspace->x[i][0];
my_sum[1] += workspace->x[i][1];
}
Kurt A. O'Hearn
committed
#if defined(DEBUG)
fprintf (stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]);
Kurt A. O'Hearn
committed
#endif
MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
u = all_sum[0] / all_sum[1];
Kurt A. O'Hearn
committed
#if defined(DEBUG)
Kurt A. O'Hearn
committed
#endif
for ( i = 0; i < system->n; ++i )
{
atom = &( system->my_atoms[i] );
/* compute charge based on s & t */
//atom->q = workspace->s[i] - u * workspace->t[i];
q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
/* backup s & t */
atom->s[3] = atom->s[2];
atom->s[2] = atom->s[1];
atom->s[1] = atom->s[0];
//atom->s[0] = workspace->s[i];
atom->s[0] = workspace->x[i][0];
atom->t[3] = atom->t[2];
atom->t[2] = atom->t[1];
atom->t[1] = atom->t[0];
//atom->t[0] = workspace->t[i];
atom->t[0] = workspace->x[i][1];
}
Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
for ( i = system->n; i < system->N; ++i )
Kurt A. O'Hearn
committed
#ifdef HAVE_CUDA
void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
mpi_datatypes *mpi_data )
int i, scale;
real u;//, s_sum, t_sum;
rvec2 my_sum, all_sum;
reax_atom *atom;
real *q;
my_sum [0] = my_sum[1] = 0.0;
scale = sizeof(real) / sizeof(void);
q = (real *) host_scratch;
//fprintf (stderr, "Device: my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]);
MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
u = all_sum[0] / all_sum[1];
//fprintf (stderr, "Device: u: %f \n", u);
Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
Kurt A. O'Hearn
committed
#endif
void QEq( reax_system *system, control_params *control, simulation_data *data,
storage *workspace, output_controls *out_control,
mpi_datatypes *mpi_data )
int s_matvecs, t_matvecs;
Init_MatVec( system, data, control, workspace, mpi_data );
//if( data->step == 50010 ) {
// Print_Linear_System( system, control, workspace, data->step );
// }
fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
//Print_Linear_System( system, control, workspace, data->step );
//MATRIX CHANGES
s_matvecs = dual_CG(system, workspace, &workspace->H, workspace->b,
control->q_err, workspace->x, mpi_data, out_control->log, data);
t_matvecs = 0;
//fprintf (stderr, "Host: First CG complated with iterations: %d \n", s_matvecs);
//s_matvecs = CG(system, workspace, workspace->H, workspace->b_s, //newQEq sCG
// control->q_err, workspace->s, mpi_data, out_control->log );
//s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s,
// control->q_err, workspace->L, workspace->U, workspace->s,
// mpi_data, out_control->log );
fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
//t_matvecs = CG(system, workspace, workspace->H, workspace->b_t, //newQEq sCG
// control->q_err, workspace->t, mpi_data, out_control->log );
//t_matvecs = PCG( system, workspace, workspace->H, workspace->b_t,
// control->q_err, workspace->L, workspace->U, workspace->t,
// mpi_data, out_control->log );
fprintf( stderr, "p%d: second CG completed\n", system->my_rank );
fprintf( stderr, "p%d: computed charges\n", system->my_rank );
//Print_Charges( system );
if ( system->my_rank == MASTER_NODE )
{
data->timing.s_matvecs += s_matvecs;
data->timing.t_matvecs += t_matvecs;
}
Kurt A. O'Hearn
committed
#ifdef HAVE_CUDA
void Cuda_QEq( reax_system *system, control_params *control, simulation_data
*data, storage *workspace, output_controls *out_control, mpi_datatypes
*mpi_data )
int s_matvecs, t_matvecs;
Cuda_Init_MatVec( system, workspace );
//if (data->step > 0) {
// compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
// compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
// compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
// compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
//}
//#ifdef __CUDA_DEBUG__
// Init_MatVec( system, data, control, workspace, mpi_data );
//#endif
#if defined(DEBUG)
fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
//Print_Linear_System( system, control, workspace, data->step );
s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
out_control->log, data);
t_matvecs = 0;
//fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
Cuda_Calculate_Charges( system, workspace, mpi_data );
fprintf( stderr, "p%d: computed charges\n", system->my_rank );
//Print_Charges( system );
if ( system->my_rank == MASTER_NODE )
{
data->timing.s_matvecs += s_matvecs;
data->timing.t_matvecs += t_matvecs;
}