From a31f1f11e67f9413d995906f7490e04d319ee80c Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnk@msu.edu> Date: Sat, 5 Jan 2019 16:47:39 -0500 Subject: [PATCH] PuReMD-old: add preconditioned pipelined CG (PIPECG) (WIP). --- PuReMD/src/allocate.c | 331 ++++---- PuReMD/src/basic_comm.c | 494 ++++++++---- PuReMD/src/basic_comm.h | 33 +- PuReMD/src/forces.c | 63 +- PuReMD/src/linear_solvers.c | 1519 ++++++++++++++++------------------- PuReMD/src/linear_solvers.h | 21 +- PuReMD/src/parallelreax.c | 60 +- PuReMD/src/qEq.c | 108 ++- PuReMD/src/reax_types.h | 18 +- PuReMD/src/vector.c | 16 + PuReMD/src/vector.h | 67 +- 11 files changed, 1468 insertions(+), 1262 deletions(-) diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c index bb8bef3e..98fb6502 100644 --- a/PuReMD/src/allocate.c +++ b/PuReMD/src/allocate.c @@ -147,7 +147,7 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->bond_mark, "bond_mark" ); sfree( workspace->done_after, "done_after" ); - /* QEq storage */ + /* CM storage */ sfree( workspace->Hdia_inv, "Hdia_inv" ); sfree( workspace->b_s, "b_s" ); sfree( workspace->b_t, "b_t" ); @@ -159,28 +159,54 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->b, "b" ); sfree( workspace->x, "x" ); - /* GMRES storage */ - for ( i = 0; i < RESTART + 1; ++i ) + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S ) { - sfree( workspace->h[i], "h[i]" ); - sfree( workspace->v[i], "v[i]" ); + for ( i = 0; i < RESTART + 1; ++i ) + { + sfree( workspace->h[i], "h[i]" ); + sfree( workspace->v[i], "v[i]" ); + } + + sfree( workspace->y, "y" ); + sfree( workspace->g, "g" ); + sfree( workspace->hc, "hc" ); + sfree( workspace->hs, "hs" ); + sfree( workspace->h, "h" ); + sfree( workspace->v, "v" ); + } + + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S + || control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->z, "z" ); + } + + if ( control->cm_solver_type == CG_S + || control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->d, "d" ); + sfree( workspace->p, "p" ); + sfree( workspace->q, "q" ); + sfree( workspace->r, "r" ); + } + + if ( control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->m, "m" ); + sfree( workspace->n, "n" ); + sfree( workspace->u, "u" ); + sfree( workspace->w, "w" ); + } + + if ( control->cm_solver_type == CG_S ) + { + sfree( workspace->r2, "r2" ); + sfree( workspace->d2, "d2" ); + sfree( workspace->q2, "q2" ); + sfree( workspace->p2, "p2" ); } - sfree( workspace->h, "h" ); - sfree( workspace->v, "v" ); - sfree( workspace->y, "y" ); - sfree( workspace->z, "z" ); - sfree( workspace->g, "g" ); - sfree( workspace->hs, "hs" ); - sfree( workspace->hc, "hc" ); - /* CG storage */ - sfree( workspace->r, "r" ); - sfree( workspace->d, "d" ); - sfree( workspace->q, "q" ); - sfree( workspace->p, "p" ); - sfree( workspace->r2, "r2" ); - sfree( workspace->d2, "d2" ); - sfree( workspace->q2, "q2" ); - sfree( workspace->p2, "p2" ); /* integrator */ // sfree( workspace->f_old ); @@ -239,8 +265,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) int Allocate_Workspace( reax_system *system, control_params *control, - storage *workspace, int local_cap, int total_cap, - MPI_Comm comm, char *msg ) + storage *workspace, int local_cap, int total_cap, + MPI_Comm comm, char *msg ) { int i, total_real, total_rvec, local_rvec; @@ -252,129 +278,147 @@ int Allocate_Workspace( reax_system *system, control_params 
*control, /* communication storage */ for ( i = 0; i < MAX_NBRS; ++i ) { - workspace->tmp_dbl[i] = (real*) - scalloc( total_cap, sizeof(real), "tmp_dbl", comm ); - workspace->tmp_rvec[i] = (rvec*) - scalloc( total_cap, sizeof(rvec), "tmp_rvec", comm ); - workspace->tmp_rvec2[i] = (rvec2*) - scalloc( total_cap, sizeof(rvec2), "tmp_rvec2", comm ); + workspace->tmp_dbl[i] = scalloc( total_cap, sizeof(real), "tmp_dbl", comm ); + workspace->tmp_rvec[i] = scalloc( total_cap, sizeof(rvec), "tmp_rvec", comm ); + workspace->tmp_rvec2[i] = scalloc( total_cap, sizeof(rvec2), "tmp_rvec2", comm ); } /* bond order related storage */ - workspace->within_bond_box = (int*) - scalloc( total_cap, sizeof(int), "skin", comm ); - workspace->total_bond_order = (real*) smalloc( total_real, "total_bo", comm ); - workspace->Deltap = (real*) smalloc( total_real, "Deltap", comm ); - workspace->Deltap_boc = (real*) smalloc( total_real, "Deltap_boc", comm ); - workspace->dDeltap_self = (rvec*) smalloc( total_rvec, "dDeltap_self", comm ); - workspace->Delta = (real*) smalloc( total_real, "Delta", comm ); - workspace->Delta_lp = (real*) smalloc( total_real, "Delta_lp", comm ); - workspace->Delta_lp_temp = (real*) - smalloc( total_real, "Delta_lp_temp", comm ); - workspace->dDelta_lp = (real*) smalloc( total_real, "dDelta_lp", comm ); - workspace->dDelta_lp_temp = (real*) - smalloc( total_real, "dDelta_lp_temp", comm ); - workspace->Delta_e = (real*) smalloc( total_real, "Delta_e", comm ); - workspace->Delta_boc = (real*) smalloc( total_real, "Delta_boc", comm ); - workspace->nlp = (real*) smalloc( total_real, "nlp", comm ); - workspace->nlp_temp = (real*) smalloc( total_real, "nlp_temp", comm ); - workspace->Clp = (real*) smalloc( total_real, "Clp", comm ); - workspace->vlpex = (real*) smalloc( total_real, "vlpex", comm ); - workspace->bond_mark = (int*) - scalloc( total_cap, sizeof(int), "bond_mark", comm ); - workspace->done_after = (int*) - scalloc( total_cap, sizeof(int), "done_after", comm ); - // fprintf( stderr, "p%d: bond order storage\n", system->my_rank ); - - /* QEq storage */ - workspace->Hdia_inv = (real*) - scalloc( total_cap, sizeof(real), "Hdia_inv", comm ); - workspace->b_s = (real*) scalloc( total_cap, sizeof(real), "b_s", comm ); - workspace->b_t = (real*) scalloc( total_cap, sizeof(real), "b_t", comm ); - workspace->b_prc = (real*) scalloc( total_cap, sizeof(real), "b_prc", comm ); - workspace->b_prm = (real*) scalloc( total_cap, sizeof(real), "b_prm", comm ); - workspace->s = (real*) scalloc( total_cap, sizeof(real), "s", comm ); - workspace->t = (real*) scalloc( total_cap, sizeof(real), "t", comm ); - workspace->droptol = (real*) - scalloc( total_cap, sizeof(real), "droptol", comm ); - workspace->b = (rvec2*) scalloc( total_cap, sizeof(rvec2), "b", comm ); - workspace->x = (rvec2*) scalloc( total_cap, sizeof(rvec2), "x", comm ); - - /* GMRES storage */ - workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y", comm ); - workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z", comm ); - workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g", comm ); - workspace->h = (real**) scalloc( RESTART + 1, sizeof(real*), "h", comm ); - workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs", comm ); - workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc", comm ); - workspace->v = (real**) scalloc( RESTART + 1, sizeof(real*), "v", comm ); - - for ( i = 0; i < RESTART + 1; ++i ) + workspace->within_bond_box = scalloc( total_cap, sizeof(int), "skin", comm ); + 
workspace->total_bond_order = smalloc( total_real, "total_bo", comm ); + workspace->Deltap = smalloc( total_real, "Deltap", comm ); + workspace->Deltap_boc = smalloc( total_real, "Deltap_boc", comm ); + workspace->dDeltap_self = smalloc( total_rvec, "dDeltap_self", comm ); + workspace->Delta = smalloc( total_real, "Delta", comm ); + workspace->Delta_lp = smalloc( total_real, "Delta_lp", comm ); + workspace->Delta_lp_temp = smalloc( total_real, "Delta_lp_temp", comm ); + workspace->dDelta_lp = smalloc( total_real, "dDelta_lp", comm ); + workspace->dDelta_lp_temp = smalloc( total_real, "dDelta_lp_temp", comm ); + workspace->Delta_e = smalloc( total_real, "Delta_e", comm ); + workspace->Delta_boc = smalloc( total_real, "Delta_boc", comm ); + workspace->nlp = smalloc( total_real, "nlp", comm ); + workspace->nlp_temp = smalloc( total_real, "nlp_temp", comm ); + workspace->Clp = smalloc( total_real, "Clp", comm ); + workspace->vlpex = smalloc( total_real, "vlpex", comm ); + workspace->bond_mark = scalloc( total_cap, sizeof(int), "bond_mark", comm ); + workspace->done_after = scalloc( total_cap, sizeof(int), "done_after", comm ); + + /* CM storage */ + workspace->Hdia_inv = scalloc( total_cap, sizeof(real), "Hdia_inv", comm ); + workspace->b_s = scalloc( total_cap, sizeof(real), "b_s", comm ); + workspace->b_t = scalloc( total_cap, sizeof(real), "b_t", comm ); + workspace->b_prc = scalloc( total_cap, sizeof(real), "b_prc", comm ); + workspace->b_prm = scalloc( total_cap, sizeof(real), "b_prm", comm ); + workspace->s = scalloc( total_cap, sizeof(real), "s", comm ); + workspace->t = scalloc( total_cap, sizeof(real), "t", comm ); + workspace->droptol = scalloc( total_cap, sizeof(real), "droptol", comm ); + workspace->b = scalloc( total_cap, sizeof(rvec2), "b", comm ); + workspace->x = scalloc( total_cap, sizeof(rvec2), "x", comm ); + + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S ) { - workspace->h[i] = (real*) scalloc( RESTART + 1, sizeof(real), "h[i]", comm ); - workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]", comm ); + workspace->y = scalloc( RESTART + 1, sizeof(real), "y", comm ); + workspace->g = scalloc( RESTART + 1, sizeof(real), "g", comm ); + workspace->hc = scalloc( RESTART + 1, sizeof(real), "hc", comm ); + workspace->hs = scalloc( RESTART + 1, sizeof(real), "hs", comm ); + workspace->h = scalloc( RESTART + 1, sizeof(real*), "h", comm ); + workspace->v = scalloc( RESTART + 1, sizeof(real*), "v", comm ); + + for ( i = 0; i < RESTART + 1; ++i ) + { + workspace->h[i] = (real*) scalloc( RESTART + 1, sizeof(real), "h[i]", comm ); + workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]", comm ); + } } - /* CG storage */ - workspace->r = (real*) scalloc( total_cap, sizeof(real), "r", comm ); - workspace->d = (real*) scalloc( total_cap, sizeof(real), "d", comm ); - workspace->q = (real*) scalloc( total_cap, sizeof(real), "q", comm ); - workspace->p = (real*) scalloc( total_cap, sizeof(real), "p", comm ); - workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2", comm ); - workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2", comm ); - workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2", comm ); - workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2", comm ); + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S + || control->cm_solver_type == PIPECG_S ) + { + workspace->z = scalloc( RESTART + 1, sizeof(real), "z", comm ); + } + else if ( 
control->cm_solver_type == PIPECG_S ) + { + workspace->z = scalloc( total_cap, sizeof(real), "z", comm ); + } + + if ( control->cm_solver_type == CG_S + || control->cm_solver_type == PIPECG_S ) + { + workspace->d = scalloc( total_cap, sizeof(real), "d", comm ); + workspace->p = scalloc( total_cap, sizeof(real), "p", comm ); + workspace->q = scalloc( total_cap, sizeof(real), "q", comm ); + workspace->r = scalloc( total_cap, sizeof(real), "r", comm ); + } + + if ( control->cm_solver_type == PIPECG_S ) + { + workspace->m = scalloc( total_cap, sizeof(real), "m", comm ); + workspace->n = scalloc( total_cap, sizeof(real), "n", comm ); + workspace->u = scalloc( total_cap, sizeof(real), "u", comm ); + workspace->w = scalloc( total_cap, sizeof(real), "w", comm ); + } + + if ( control->cm_solver_type == CG_S ) + { + workspace->d2 = scalloc( total_cap, sizeof(rvec2), "d2", comm ); + workspace->r2 = scalloc( total_cap, sizeof(rvec2), "r2", comm ); + workspace->p2 = scalloc( total_cap, sizeof(rvec2), "p2", comm ); + workspace->q2 = scalloc( total_cap, sizeof(rvec2), "q2", comm ); + } /* integrator storage */ - workspace->v_const = (rvec*) smalloc( local_rvec, "v_const", comm ); + workspace->v_const = smalloc( local_rvec, "v_const", comm ); /* storage for analysis */ if ( control->molecular_analysis || control->diffusion_coef ) { - workspace->mark = (int*) scalloc( local_cap, sizeof(int), "mark", comm ); - workspace->old_mark = (int*) - scalloc( local_cap, sizeof(int), "old_mark", comm ); + workspace->mark = scalloc( local_cap, sizeof(int), "mark", comm ); + workspace->old_mark = scalloc( local_cap, sizeof(int), "old_mark", comm ); } else - workspace->mark = workspace->old_mark = NULL; + { + workspace->mark = NULL; + workspace->old_mark = NULL; + } if ( control->diffusion_coef ) - workspace->x_old = (rvec*) - scalloc( local_cap, sizeof(rvec), "x_old", comm ); - else workspace->x_old = NULL; + { + workspace->x_old = scalloc( local_cap, sizeof(rvec), "x_old", comm ); + } + else + { + workspace->x_old = NULL; + } /* force related storage */ - workspace->f = (rvec*) scalloc( total_cap, sizeof(rvec), "f", comm ); - workspace->CdDelta = (real*) - scalloc( total_cap, sizeof(real), "CdDelta", comm ); + workspace->f = scalloc( total_cap, sizeof(rvec), "f", comm ); + workspace->CdDelta = scalloc( total_cap, sizeof(real), "CdDelta", comm ); #ifdef TEST_FORCES - workspace->dDelta = (rvec*) smalloc( total_rvec, "dDelta", comm ); - workspace->f_ele = (rvec*) smalloc( total_rvec, "f_ele", comm ); - workspace->f_vdw = (rvec*) smalloc( total_rvec, "f_vdw", comm ); - workspace->f_bo = (rvec*) smalloc( total_rvec, "f_bo", comm ); - workspace->f_be = (rvec*) smalloc( total_rvec, "f_be", comm ); - workspace->f_lp = (rvec*) smalloc( total_rvec, "f_lp", comm ); - workspace->f_ov = (rvec*) smalloc( total_rvec, "f_ov", comm ); - workspace->f_un = (rvec*) smalloc( total_rvec, "f_un", comm ); - workspace->f_ang = (rvec*) smalloc( total_rvec, "f_ang", comm ); - workspace->f_coa = (rvec*) smalloc( total_rvec, "f_coa", comm ); - workspace->f_pen = (rvec*) smalloc( total_rvec, "f_pen", comm ); - workspace->f_hb = (rvec*) smalloc( total_rvec, "f_hb", comm ); - workspace->f_tor = (rvec*) smalloc( total_rvec, "f_tor", comm ); - workspace->f_con = (rvec*) smalloc( total_rvec, "f_con", comm ); - workspace->f_tot = (rvec*) smalloc( total_rvec, "f_tot", comm ); + workspace->dDelta = smalloc( total_rvec, "dDelta", comm ); + workspace->f_ele = smalloc( total_rvec, "f_ele", comm ); + workspace->f_vdw = smalloc( total_rvec, "f_vdw", comm ); + 
workspace->f_bo = smalloc( total_rvec, "f_bo", comm ); + workspace->f_be = smalloc( total_rvec, "f_be", comm ); + workspace->f_lp = smalloc( total_rvec, "f_lp", comm ); + workspace->f_ov = smalloc( total_rvec, "f_ov", comm ); + workspace->f_un = smalloc( total_rvec, "f_un", comm ); + workspace->f_ang = smalloc( total_rvec, "f_ang", comm ); + workspace->f_coa = smalloc( total_rvec, "f_coa", comm ); + workspace->f_pen = smalloc( total_rvec, "f_pen", comm ); + workspace->f_hb = smalloc( total_rvec, "f_hb", comm ); + workspace->f_tor = smalloc( total_rvec, "f_tor", comm ); + workspace->f_con = smalloc( total_rvec, "f_con", comm ); + workspace->f_tot = smalloc( total_rvec, "f_tot", comm ); if ( system->my_rank == MASTER_NODE ) { - workspace->rcounts = (int*) - smalloc( system->wsize * sizeof(int), "rcount", comm ); - workspace->displs = (int*) - smalloc( system->wsize * sizeof(int), "displs", comm ); - workspace->id_all = (int*) - smalloc( system->bigN * sizeof(int), "id_all", comm ); - workspace->f_all = (rvec*) - smalloc( system->bigN * sizeof(rvec), "f_all", comm ); + workspace->rcounts = smalloc( system->wsize * sizeof(int), "rcount", comm ); + workspace->displs = smalloc( system->wsize * sizeof(int), "displs", comm ); + workspace->id_all = smalloc( system->bigN * sizeof(int), "id_all", comm ); + workspace->f_all = smalloc( system->bigN * sizeof(rvec), "f_all", comm ); } else { @@ -717,22 +761,27 @@ int Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv, comm = mpi_data->world; - /* in buffers */ - mpi_data->in1_buffer = (void*) - scalloc( est_recv, sizeof(boundary_atom), "in1_buffer", comm ); - mpi_data->in2_buffer = (void*) - scalloc( est_recv, sizeof(boundary_atom), "in2_buffer", comm ); - - /* out buffers */ + /* buffers for incoming messages, + * see SendRecv for MPI datatypes sent */ + mpi_data->in1_buffer = scalloc( est_recv, + MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ), + "Allocate_MPI_Buffers::in1_buffer", comm ); + mpi_data->in2_buffer = scalloc( est_recv, + MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ), + "Allocate_MPI_Buffers::in2_buffer", comm ); + + /* buffers for outgoing messages, + * see SendRecv for MPI datatypes sent */ for ( i = 0; i < MAX_NBRS; ++i ) { - mpi_buf = &( mpi_data->out_buffers[i] ); + mpi_buf = &mpi_data->out_buffers[i]; + /* allocate storage for the neighbor processor i */ - mpi_buf->index = (int*) - scalloc( my_nbrs[i].est_send, sizeof(int), "mpibuf:index", comm ); - mpi_buf->out_atoms = (void*) - scalloc( my_nbrs[i].est_send, sizeof(boundary_atom), "mpibuf:out_atoms", - comm ); + mpi_buf->index = scalloc( my_nbrs[i].est_send, sizeof(int), + "Allocate_MPI_Buffers::mpi_buf->index", comm ); + mpi_buf->out_atoms = scalloc( my_nbrs[i].est_send, + MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ), + "Allocate_MPI_Buffers::mpi_buf->out_atoms", comm ); } #if defined(NEUTRAL_TERRITORY) @@ -742,9 +791,9 @@ int Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv, /* in buffers */ mpi_data->in_nt_buffer[i] = scalloc( my_nt_nbrs[i].est_recv, sizeof(real), "mpibuf:in_nt_buffer", comm ); - /* out buffer */ mpi_buf = &mpi_data->out_nt_buffers[i]; + /* allocate storage for the neighbor processor i */ mpi_buf->index = scalloc( my_nt_nbrs[i].est_send, sizeof(int), "mpibuf:nt_index", comm ); @@ -762,14 +811,14 @@ void Deallocate_MPI_Buffers( mpi_datatypes *mpi_data ) int i; mpi_out_data *mpi_buf; - sfree( mpi_data->in1_buffer, "in1_buffer" ); - sfree( mpi_data->in2_buffer, "in2_buffer" ); + sfree( 
mpi_data->in1_buffer, "Deallocate_MPI_Buffers::in1_buffer" ); + sfree( mpi_data->in2_buffer, "Deallocate_MPI_Buffers::in2_buffer" ); for ( i = 0; i < MAX_NBRS; ++i ) { mpi_buf = &mpi_data->out_buffers[i]; - sfree( mpi_buf->index, "mpibuf:index" ); - sfree( mpi_buf->out_atoms, "mpibuf:out_atoms" ); + sfree( mpi_buf->index, "Deallocate_MPI_Buffers::mpi_buf->index" ); + sfree( mpi_buf->out_atoms, "Deallocate_MPI_Buffers::mpi_buf->out_atoms" ); } #if defined(NEUTRAL_TERRITORY) diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c index 18d27fa1..d9a7e97b 100644 --- a/PuReMD/src/basic_comm.c +++ b/PuReMD/src/basic_comm.c @@ -20,16 +20,21 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" + #if defined(PURE_REAX) -#include "basic_comm.h" -#include "vector.h" + #include "basic_comm.h" + #include "vector.h" #elif defined(LAMMPS_REAX) -#include "reax_basic_comm.h" -#include "reax_vector.h" + #include "reax_basic_comm.h" + #include "reax_vector.h" #endif -#if defined(PURE_REAX) -void int_packer( void *dummy, mpi_out_data *out_buf ) + +typedef void (*dist_packer)( void*, mpi_out_data* ); +typedef void (*coll_unpacker)( void*, void*, mpi_out_data* ); + + +static void int_packer( void *dummy, mpi_out_data *out_buf ) { int i; int *buf = (int*) dummy; @@ -37,102 +42,225 @@ void int_packer( void *dummy, mpi_out_data *out_buf ) for ( i = 0; i < out_buf->cnt; ++i ) { + //if( buf[ out_buf->index[i] ] !=-1 ) out[i] = buf[ out_buf->index[i] ]; } } -void real_packer( void *dummy, mpi_out_data *out_buf ) +static void real_packer( void *dummy, mpi_out_data *out_buf ) { int i; real *buf = (real*) dummy; real *out = (real*) out_buf->out_atoms; for ( i = 0; i < out_buf->cnt; ++i ) + { out[i] = buf[ out_buf->index[i] ]; + } } -void rvec_packer( void *dummy, mpi_out_data *out_buf ) +static void rvec_packer( void *dummy, mpi_out_data *out_buf ) { int i; - rvec *buf = (rvec*) dummy; - rvec *out = (rvec*)out_buf->out_atoms; + rvec *buf, *out; + + buf = (rvec*) dummy; + out = (rvec*) out_buf->out_atoms; for ( i = 0; i < out_buf->cnt; ++i ) - memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec) ); + { + memcpy( out + i, buf + out_buf->index[i], sizeof(rvec) ); + } } -void rvec2_packer( void *dummy, mpi_out_data *out_buf ) +static void rvec2_packer( void *dummy, mpi_out_data *out_buf ) { int i; - rvec2 *buf = (rvec2*) dummy; - rvec2 *out = (rvec2*) out_buf->out_atoms; + rvec2 *buf, *out; + + buf = (rvec2*) dummy; + out = (rvec2*) out_buf->out_atoms; for ( i = 0; i < out_buf->cnt; ++i ) - memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec2) ); + { + memcpy( out + i, buf + out_buf->index[i], sizeof(rvec2) ); + } } -void Dist( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, dist_packer pack ) +static void int_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) { - int d; - mpi_out_data *out_bufs; - MPI_Comm comm; - MPI_Request req1, req2; - MPI_Status stat1, stat2; - neighbor_proc *nbr1, *nbr2; + int i; + int *in, *buf; + + in = (int*) dummy_in; + buf = (int*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) + { + if( buf[ out_buf->index[i] ] == -1 && in[i] != -1 ) + { + buf[ out_buf->index[i] ] = in[i]; + } + } +} + + +static void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) +{ + int i; + real *in, *buf; + + in = (real*) dummy_in; + buf = (real*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) + { + buf[ out_buf->index[i] ] += in[i]; + } +} + + +static void rvec_unpacker( 
void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) +{ + int i; + rvec *in, *buf; + + in = (rvec*) dummy_in; + buf = (rvec*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) + { + rvec_Add( buf[ out_buf->index[i] ], in[i] ); #if defined(DEBUG) - fprintf( stderr, "p%d dist: entered\n", system->my_rank ); + fprintf( stderr, "rvec_unpacker: cnt=%d i =%d index[i]=%d\n", + out_buf->cnt, i, out_buf->index[i] ); #endif - comm = mpi_data->comm_mesh3D; - out_bufs = mpi_data->out_buffers; + } +} - for ( d = 0; d < 3; ++d ) + +static void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) +{ + int i; + rvec2 *in, *buf; + + in = (rvec2*) dummy_in; + buf = (rvec2*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) { - /* initiate recvs */ - nbr1 = &(system->my_nbrs[2 * d]); - if ( nbr1->atoms_cnt ) - MPI_Irecv( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type, - nbr1->rank, 2 * d + 1, comm, &req1 ); + buf[ out_buf->index[i] ][0] += in[i][0]; + buf[ out_buf->index[i] ][1] += in[i][1]; + } +} - nbr2 = &(system->my_nbrs[2 * d + 1]); - if ( nbr2->atoms_cnt ) - MPI_Irecv( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type, - nbr2->rank, 2 * d, comm, &req2 ); - /* send both messages in dimension d */ - if ( out_bufs[2 * d].cnt ) - { - pack( buf, out_bufs + (2 * d) ); - MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type, - nbr1->rank, 2 * d, comm ); - } +static void * Get_Buffer_Offset( const void * const buffer, + const int offset, const int type ) +{ + void * ptr; - if ( out_bufs[2 * d + 1].cnt ) - { - pack( buf, out_bufs + (2 * d + 1) ); - MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type, - nbr2->rank, 2 * d + 1, comm ); - } + switch ( type ) + { + case INT_PTR_TYPE: + ptr = (int *) buffer + offset; + break; + + case REAL_PTR_TYPE: + ptr = (real *) buffer + offset; + break; + + case RVEC_PTR_TYPE: + ptr = (rvec *) buffer + offset; + break; + + case RVEC2_PTR_TYPE: + ptr = (rvec2 *) buffer + offset; + break; + + default: + fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" ); + exit( UNKNOWN_OPTION ); + break; + } + + return ptr; +} + + +static dist_packer Get_Packer( const int type ) +{ + dist_packer ptr; + + switch ( type ) + { + case INT_PTR_TYPE: + ptr = &int_packer; + break; + + case REAL_PTR_TYPE: + ptr = &real_packer; + break; + + case RVEC_PTR_TYPE: + ptr = &rvec_packer; + break; + + case RVEC2_PTR_TYPE: + ptr = &rvec2_packer; + break; + + default: + fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" ); + exit( UNKNOWN_OPTION ); + break; + } + + return ptr; +} - if ( nbr1->atoms_cnt ) MPI_Wait( &req1, &stat1 ); - if ( nbr2->atoms_cnt ) MPI_Wait( &req2, &stat2 ); + +static coll_unpacker Get_Unpacker( const int type ) +{ + coll_unpacker ptr; + + switch ( type ) + { + case INT_PTR_TYPE: + ptr = &int_unpacker; + break; + + case REAL_PTR_TYPE: + ptr = &real_unpacker; + break; + + case RVEC_PTR_TYPE: + ptr = &rvec_unpacker; + break; + + case RVEC2_PTR_TYPE: + ptr = &rvec2_unpacker; + break; + + default: + fprintf( stderr, "[ERROR] unknown pointer type. 
Terminating...\n" ); + exit( UNKNOWN_OPTION ); + break; } -#if defined(DEBUG) - fprintf( stderr, "p%d dist: done\n", system->my_rank ); -#endif + return ptr; } -#if defined(NEUTRAL_TERRITORY) -void Dist_NT( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, dist_packer pack ) +void Dist( const reax_system * const system, mpi_datatypes * const mpi_data, + void *buf, int buf_type, MPI_Datatype type ) { +#if defined(NEUTRAL_TERRITORY) int d, count, index; mpi_out_data *out_bufs; MPI_Comm comm; @@ -140,31 +268,31 @@ void Dist_NT( reax_system* system, mpi_datatypes *mpi_data, MPI_Status stat[6]; neighbor_proc *nbr; -#if defined(DEBUG) -#endif comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->out_nt_buffers; - count = 0; + /* initiate recvs */ - for( d = 0; d < 6; ++d ) + for ( d = 0; d < 6; ++d ) { - nbr = &(system->my_nt_nbrs[d]); + nbr = &system->my_nt_nbrs[d]; + if ( nbr->atoms_cnt ) { count++; - MPI_Irecv( buf + nbr->atoms_str * scale, nbr->atoms_cnt, type, - nbr->receive_rank, d, comm, &(req[d]) ); + MPI_Irecv( Get_Buffer_Offset( buf, nbr->atoms_str, buf_type ), + nbr->atoms_cnt, type, nbr->receive_rank, d, comm, &req[d] ); } } - for( d = 0; d < 6; ++d) + for ( d = 0; d < 6; ++d) { - nbr = &(system->my_nt_nbrs[d]); + nbr = &system->my_nt_nbrs[d]; + /* send both messages in dimension d */ if ( out_bufs[d].cnt ) { - pack( buf, out_bufs + d ); + pack( buf, out_bufs[d] ); MPI_Send( out_bufs[d].out_atoms, out_bufs[d].cnt, type, nbr->rank, d, comm ); } @@ -178,127 +306,78 @@ void Dist_NT( reax_system* system, mpi_datatypes *mpi_data, #if defined(DEBUG) fprintf( stderr, "p%d dist: done\n", system->my_rank ); #endif -} -#endif - -void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) -{ - int i; - real *in = (real*) dummy_in; - real *buf = (real*) dummy_buf; - - for ( i = 0; i < out_buf->cnt; ++i ) - { - buf[ out_buf->index[i] ] += in[i]; - } -} - - -void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) -{ - int i; - rvec *in = (rvec*) dummy_in; - rvec *buf = (rvec*) dummy_buf; - - for ( i = 0; i < out_buf->cnt; ++i ) - { - rvec_Add( buf[ out_buf->index[i] ], in[i] ); - -#if defined(DEBUG) - fprintf( stderr, "rvec_unpacker: cnt=%d i =%d index[i]=%d\n", - out_buf->cnt, i, out_buf->index[i] ); -#endif - } -} - - -void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) -{ - int i; - rvec2 *in = (rvec2*) dummy_in; - rvec2 *buf = (rvec2*) dummy_buf; - - for ( i = 0; i < out_buf->cnt; ++i ) - { - buf[ out_buf->index[i] ][0] += in[i][0]; - buf[ out_buf->index[i] ][1] += in[i][1]; - } -} - - -void Coll( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, coll_unpacker unpack ) -{ +#else int d; - void *in1, *in2; mpi_out_data *out_bufs; MPI_Comm comm; MPI_Request req1, req2; MPI_Status stat1, stat2; - neighbor_proc *nbr1, *nbr2; + const neighbor_proc *nbr1, *nbr2; + dist_packer pack; #if defined(DEBUG) - fprintf( stderr, "p%d coll: entered\n", system->my_rank ); + fprintf( stderr, "p%d dist: entered\n", system->my_rank ); #endif comm = mpi_data->comm_mesh3D; - in1 = mpi_data->in1_buffer; - in2 = mpi_data->in2_buffer; out_bufs = mpi_data->out_buffers; + pack = Get_Packer( buf_type ); - for ( d = 2; d >= 0; --d ) + for ( d = 0; d < 3; ++d ) { /* initiate recvs */ - nbr1 = &(system->my_nbrs[2 * d]); - if ( out_bufs[2 * d].cnt ) - MPI_Irecv(in1, out_bufs[2 * d].cnt, type, nbr1->rank, 2 * d + 1, comm, &req1); - - nbr2 = &(system->my_nbrs[2 * d + 1]); - if ( out_bufs[2 * 
d + 1].cnt ) - MPI_Irecv(in2, out_bufs[2 * d + 1].cnt, type, nbr2->rank, 2 * d, comm, &req2); - - /* send both messages in dimension d */ + nbr1 = &system->my_nbrs[2 * d]; if ( nbr1->atoms_cnt ) - MPI_Send( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type, - nbr1->rank, 2 * d, comm ); + { + MPI_Irecv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ), + nbr1->atoms_cnt, type, nbr1->rank, 2 * d + 1, comm, &req1 ); + } + nbr2 = &system->my_nbrs[2 * d + 1]; if ( nbr2->atoms_cnt ) - MPI_Send( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type, - nbr2->rank, 2 * d + 1, comm ); - -#if defined(DEBUG) - fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n", - system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt, - out_bufs[2 * d].cnt ); - fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n", - system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt, - out_bufs[2 * d + 1].cnt ); -#endif + { + MPI_Irecv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ), + nbr2->atoms_cnt, type, nbr2->rank, 2 * d, comm, &req2 ); + } + /* send both messages in dimension d */ if ( out_bufs[2 * d].cnt ) { - MPI_Wait( &req1, &stat1 ); - unpack( in1, buf, out_bufs + (2 * d) ); + pack( buf, &out_bufs[2 * d] ); + MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, + type, nbr1->rank, 2 * d, comm ); } if ( out_bufs[2 * d + 1].cnt ) + { + pack( buf, &out_bufs[2 * d + 1] ); + MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, + type, nbr2->rank, 2 * d + 1, comm ); + } + + if( nbr1->atoms_cnt ) + { + MPI_Wait( &req1, &stat1 ); + } + if( nbr2->atoms_cnt ) { MPI_Wait( &req2, &stat2 ); - unpack( in2, buf, out_bufs + (2 * d + 1) ); } } + #if defined(DEBUG) - fprintf( stderr, "p%d coll: done\n", system->my_rank ); + fprintf( stderr, "p%d dist: done\n", system->my_rank ); +#endif #endif } +void Coll( const reax_system * const system, mpi_datatypes * const mpi_data, + void *buf, int buf_type, MPI_Datatype type ) +{ #if defined(NEUTRAL_TERRITORY) -void Coll_NT( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, coll_unpacker unpack ) -{ int d, count, index; void *in[6]; mpi_out_data *out_bufs; @@ -313,53 +392,133 @@ void Coll_NT( reax_system* system, mpi_datatypes *mpi_data, comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->out_nt_buffers; - count = 0; + for ( d = 0; d < 6; ++d ) { nbr = &system->my_nt_nbrs[d]; in[d] = mpi_data->in_nt_buffer[d]; + if ( out_bufs[d].cnt ) { count++; - MPI_Irecv(in[d], out_bufs[d].cnt, type, nbr->rank, d, comm, &(req[d])); + MPI_Irecv( in[d], out_bufs[d].cnt, type, nbr->rank, d, comm, &req[d] ); } } - for( d = 0; d < 6; ++d ) + for ( d = 0; d < 6; ++d ) { + nbr = &system->my_nt_nbrs[d]; + /* send both messages in direction d */ - nbr = &(system->my_nt_nbrs[d]); if ( nbr->atoms_cnt ) { - MPI_Send( buf + nbr->atoms_str * scale, nbr->atoms_cnt, type, - nbr->receive_rank, d, comm ); + MPI_Send( Get_Buffer_Offset( buf, nbr->atoms_str, buf_type ), + nbr->atoms_cnt, type, nbr->receive_rank, d, comm ); } } for ( d = 0; d < count; ++d ) { MPI_Waitany( REAX_MAX_NT_NBRS, req, &index, stat); - unpack( in[index], buf, out_bufs + index ); + unpack( in[index], buf, out_bufs[index] ); } #if defined(DEBUG) fprintf( stderr, "p%d coll: done\n", system->my_rank ); #endif -} + +#else + int d; + mpi_out_data *out_bufs; + MPI_Comm comm; + MPI_Request req1, req2; + MPI_Status stat1, stat2; + const neighbor_proc *nbr1, *nbr2; + coll_unpacker unpack; + +#if defined(DEBUG) + fprintf( stderr, "p%d coll: entered\n", system->my_rank ); #endif -#endif 
/*PURE_REAX*/ + + comm = mpi_data->comm_mesh3D; + out_bufs = mpi_data->out_buffers; + unpack = Get_Unpacker( buf_type ); + + for ( d = 2; d >= 0; --d ) + { + /* initiate recvs */ + nbr1 = &system->my_nbrs[2 * d]; + + if ( out_bufs[2 * d].cnt ) + { + MPI_Irecv( mpi_data->in1_buffer, out_bufs[2 * d].cnt, + type, nbr1->rank, 2 * d + 1, comm, &req1 ); + } + + nbr2 = &system->my_nbrs[2 * d + 1]; + + if ( out_bufs[2 * d + 1].cnt ) + { + + MPI_Irecv( mpi_data->in2_buffer, out_bufs[2 * d + 1].cnt, + type, nbr2->rank, 2 * d, comm, &req2 ); + } + + /* send both messages in dimension d */ + if ( nbr1->atoms_cnt ) + { + MPI_Send( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ), + nbr1->atoms_cnt, type, nbr1->rank, 2 * d, comm ); + } + + if ( nbr2->atoms_cnt ) + { + MPI_Send( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ), + nbr2->atoms_cnt, type, nbr2->rank, 2 * d + 1, comm ); + } + +#if defined(DEBUG) + fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n", + system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt, + out_bufs[2 * d].cnt ); + fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n", + system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt, + out_bufs[2 * d + 1].cnt ); +#endif + + if ( out_bufs[2 * d].cnt ) + { + MPI_Wait( &req1, &stat1 ); + unpack( mpi_data->in1_buffer, buf, &out_bufs[2 * d] ); + } + + if ( out_bufs[2 * d + 1].cnt ) + { + MPI_Wait( &req2, &stat2 ); + unpack( mpi_data->in2_buffer, buf, &out_bufs[2 * d + 1] ); + } + } + +#if defined(DEBUG) + fprintf( stderr, "p%d coll: done\n", system->my_rank ); +#endif +#endif +} /*****************************************************************************/ real Parallel_Norm( real *v, int n, MPI_Comm comm ) { - int i; + int i; real my_sum, norm_sqr; - my_sum = 0; + my_sum = 0.0; + for ( i = 0; i < n; ++i ) + { my_sum += SQR( v[i] ); + } MPI_Allreduce( &my_sum, &norm_sqr, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -367,15 +526,17 @@ real Parallel_Norm( real *v, int n, MPI_Comm comm ) } - real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm ) { int i; real my_dot, res; - my_dot = 0; + my_dot = 0.0; + for ( i = 0; i < n; ++i ) + { my_dot += v1[i] * v2[i]; + } MPI_Allreduce( &my_dot, &res, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -383,7 +544,6 @@ real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm ) } - real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm ) { int i; diff --git a/PuReMD/src/basic_comm.h b/PuReMD/src/basic_comm.h index 8c914973..bf461c05 100644 --- a/PuReMD/src/basic_comm.h +++ b/PuReMD/src/basic_comm.h @@ -24,30 +24,33 @@ #include "reax_types.h" -void int_packer( void*, mpi_out_data* ); -void real_packer( void*, mpi_out_data* ); -void rvec_packer( void*, mpi_out_data* ); -void rvec2_packer( void*, mpi_out_data* ); -void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer); -#if defined(NEUTRAL_TERRITORY) -void Dist_NT(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer); -#endif -void real_unpacker( void*, void*, mpi_out_data* ); -void rvec_unpacker( void*, void*, mpi_out_data* ); -void rvec2_unpacker( void*, void*, mpi_out_data* ); -void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, coll_unpacker ); -#if defined(NEUTRAL_TERRITORY) -void Coll_NT( reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, coll_unpacker ); -#endif +enum pointer_type +{ + INT_PTR_TYPE = 0, + REAL_PTR_TYPE = 1, + RVEC_PTR_TYPE = 2, + RVEC2_PTR_TYPE = 3, +}; + + +void Dist( const reax_system * const, mpi_datatypes * const, + void*, int, MPI_Datatype ); + +void Coll( const 
reax_system * const, mpi_datatypes * const, + void*, int, MPI_Datatype ); real Parallel_Norm( real*, int, MPI_Comm ); + real Parallel_Dot( real*, real*, int, MPI_Comm ); + real Parallel_Vector_Acc( real*, int, MPI_Comm ); #if defined(TEST_FORCES) void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* ); + void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* ); #endif + #endif diff --git a/PuReMD/src/forces.c b/PuReMD/src/forces.c index 4ebaf651..da66053c 100644 --- a/PuReMD/src/forces.c +++ b/PuReMD/src/forces.c @@ -147,49 +147,60 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control, /* this version of Compute_Total_Force computes forces from - coefficients accumulated by all interaction functions. - Saves enormous time & space! */ + * coefficients accumulated by all interaction functions. + * Saves enormous time & space! */ void Compute_Total_Force( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, mpi_datatypes *mpi_data ) + simulation_data *data, storage *workspace, + reax_list **lists, mpi_datatypes *mpi_data ) { int i, pj; reax_list *bonds = lists[BONDS]; for ( i = 0; i < system->N; ++i ) + { for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + { if ( i < bonds->bond_list[pj].nbr ) { if ( control->virial == 0 ) + { Add_dBond_to_Forces( i, pj, workspace, lists ); + } else + { Add_dBond_to_Forces_NPT( i, pj, data, workspace, lists ); + } } + } + } //Print_Total_Force( system, data, workspace ); + #if defined(PURE_REAX) /* now all forces are computed to their partially-final values - based on the neighbors information each processor has had. - final values of force on each atom needs to be computed by adding up - all partially-final pieces */ - Coll( system, mpi_data, workspace->f, mpi_data->mpi_rvec, - sizeof(rvec) / sizeof(void), rvec_unpacker ); + * based on the neighbors information each processor has had. 
+ * final values of force on each atom needs to be computed by adding up + * all partially-final pieces */ + Coll( system, mpi_data, workspace->f, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + for ( i = 0; i < system->n; ++i ) + { rvec_Copy( system->my_atoms[i].f, workspace->f[i] ); + } #if defined(TEST_FORCES) - Coll( system, mpi_data, workspace->f_ele, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_vdw, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_be, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_lp, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_ov, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_un, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_ang, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_coa, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_pen, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_hb, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_tor, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_con, mpi_data->mpi_rvec, rvec_unpacker); + Coll( system, mpi_data, workspace->f_ele, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_vdw, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_be, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_lp, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_ov, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_un, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_ang, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_coa, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_pen, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_hb, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_tor, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_con, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); #endif #endif @@ -2309,7 +2320,7 @@ void Compute_Forces( reax_system *system, control_params *control, /********* bonded interactions ************/ Compute_Bonded_Forces( system, control, data, workspace, - lists, out_control, mpi_data->world ); + lists, out_control, mpi_data->world ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( mpi_data->world ); @@ -2320,6 +2331,7 @@ void Compute_Forces( reax_system *system, control_params *control, t_start = t_end; } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: completed bonded\n", system->my_rank, data->step ); @@ -2329,7 +2341,9 @@ void Compute_Forces( reax_system *system, control_params *control, /**************** qeq ************************/ #if defined(PURE_REAX) if ( qeq_flag ) + { QEq( system, control, data, workspace, out_control, mpi_data ); + } #if defined(LOG_PERFORMANCE) //MPI_Barrier( mpi_data->world ); @@ -2340,6 +2354,7 @@ void Compute_Forces( reax_system *system, control_params *control, t_start = t_end; } #endif + #if defined(DEBUG_FOCUS) fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step); MPI_Barrier( mpi_data->world ); @@ -2348,7 +2363,7 @@ void Compute_Forces( reax_system *system, control_params *control, /********* nonbonded interactions ************/ Compute_NonBonded_Forces( system, control, 
data, workspace, - lists, out_control, mpi_data->world ); + lists, out_control, mpi_data->world ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( mpi_data->world ); @@ -2359,6 +2374,7 @@ void Compute_Forces( reax_system *system, control_params *control, t_start = t_end; } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n", system->my_rank, data->step ); @@ -2376,6 +2392,7 @@ void Compute_Forces( reax_system *system, control_params *control, data->timing.bonded += t_end - t_start; } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: total forces computed\n", system->my_rank, data->step ); diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c index d1ad369b..f29eab07 100644 --- a/PuReMD/src/linear_solvers.c +++ b/PuReMD/src/linear_solvers.c @@ -106,6 +106,163 @@ static int find_bucket( double *list, int len, double a ) } +static void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) +{ + int i, j, k, si; + real H; + + for ( i = 0; i < N; ++i ) + { + b[i][0] = b[i][1] = 0; + } + + if ( A->format == SYM_HALF_MATRIX ) + { + /* perform multiplication */ + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + + b[i][0] += A->entries[si].val * x[i][0]; + b[i][1] += A->entries[si].val * x[i][1]; + + for ( k = si + 1; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + H = A->entries[k].val; + + b[i][0] += H * x[j][0]; + b[i][1] += H * x[j][1]; + + // comment out for tryQEq + //if( j < A->n ) { + b[j][0] += H * x[i][0]; + b[j][1] += H * x[i][1]; + //} + } + } + } + else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) + { + /* perform multiplication */ + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + + for ( k = si; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + H = A->entries[k].val; + + b[i][0] += H * x[j][0]; + b[i][1] += H * x[j][1]; + } + } + } +} + + +static void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) +{ + int i, j, k, si, num_rows; + real val; + + for ( i = 0; i < N; ++i ) + { + b[i] = 0.0; + } + +#if defined(NEUTRAL_TERRITORY) + num_rows = A->NT; + + if ( A->format == SYM_HALF_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + /* diagonal only contributes once */ + if( i < A->n ) + { + b[i] += A->entries[si].val * x[i]; + k = si + 1; + } + /* zeros on the diagonal for i >= A->n, + * so skip the diagonal multplication step as zeros + * are not stored (idea: keep the NNZ's the same + * for full shell and neutral territory half-stored + * charge matrices to make debugging easier) */ + else + { + k = si; + } + + for ( ; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + b[j] += val * x[i]; + } + } + } + else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + for ( k = si; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + } + } + } +#else + num_rows = A->n; + + if ( A->format == SYM_HALF_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + /* diagonal only contributes once */ + b[i] += A->entries[si].val * x[i]; + + for ( k = si + 1; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + b[j] += val * x[i]; + } + } + } + else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + for ( k = si; k < A->end[i]; ++k ) + { + 
j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + } + } + } +#endif +} + + real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, storage *workspace, mpi_datatypes *mpi_data, sparse_matrix *A, sparse_matrix **A_spar_patt, int nprocs, real filter ) @@ -508,7 +665,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, lapack_int m, n, nrhs, lda, ldb, info; int *pos_x, *X; real *e_j, *dense_matrix; - int cnt, scale; + int cnt; reax_atom *atom; int *row_nnz; @@ -580,10 +737,9 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, /* Announce the nnz's in each row that will be communicated later */ t_start = MPI_Wtime(); - scale = sizeof(int) / sizeof(void); - Dist_NT( system, mpi_data, row_nnz, MPI_INT, scale, int_packer ); + Dist( system, mpi_data, row_nnz, REAL_PTR_TYPE, MPI_INT ); t_comm += MPI_Wtime() - t_start; - fprintf( stdout,"SAI after Dist_NT call\n"); + fprintf( stdout,"SAI after Dist call\n"); fflush( stdout ); comm = mpi_data->comm_mesh3D; @@ -612,11 +768,11 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, j_recv[d] = (int *) malloc( sizeof(int) * cnt ); val_recv[d] = (real *) malloc( sizeof(real) * cnt ); - fprintf( stdout,"Dist_NT communication receive phase direction %d will receive %d\n", d, cnt); + fprintf( stdout,"Dist communication receive phase direction %d will receive %d\n", d, cnt); fflush( stdout ); t_start = MPI_Wtime(); - MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank, d, comm, &(req[2*d]) ); - MPI_Irecv( val_recv + d, cnt, MPI_DOUBLE, nbr->receive_rank, d, comm, &(req[2*d+1]) ); + MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank, d, comm, &req[2 * d] ); + MPI_Irecv( val_recv + d, cnt, MPI_DOUBLE, nbr->receive_rank, d, comm, &req[2 * d + 1] ); t_comm += MPI_Wtime() - t_start; } } @@ -639,7 +795,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, } // row_nnz[ out_bufs[d].index[i] ]; } - fprintf( stdout,"Dist_NT communication send phase direction %d should send %d\n", d, cnt); + fprintf( stdout,"Dist communication send phase direction %d should send %d\n", d, cnt); fflush( stdout ); if( cnt ) @@ -659,21 +815,21 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, } } - fprintf( stdout,"Dist_NT communication send phase direction %d will send %d\n", d, cnt ); + fprintf( stdout,"Dist communication send phase direction %d will send %d\n", d, cnt ); fflush( stdout ); t_start = MPI_Wtime(); MPI_Send( j_send, cnt, MPI_INT, nbr->rank, d, comm ); - fprintf( stdout,"Dist_NT communication send phase direction %d cnt = %d\n", d, cnt); + fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt); fflush( stdout ); MPI_Send( val_send, cnt, MPI_DOUBLE, nbr->rank, d, comm ); - fprintf( stdout,"Dist_NT communication send phase direction %d cnt = %d\n", d, cnt); + fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt); fflush( stdout ); t_comm += MPI_Wtime() - t_start; } } } - fprintf( stdout," Dist_NT communication for sending row info before waitany\n"); + fprintf( stdout," Dist communication for sending row info before waitany\n"); fflush( stdout ); /////////////////////// for ( d = 0; d < count; ++d ) @@ -708,7 +864,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, } } ////////////////////// - fprintf( stdout," wow wow wow, Dist_NT communication for sending row info worked\n"); + fprintf( stdout," wow wow wow, Dist communication for 
sending row info worked\n"); fflush( stdout ); //TODO: size? X = (int *) malloc( sizeof(int) * (system->bigN + 1) ); @@ -883,6 +1039,8 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, return MPI_Wtime() - start; } + + #else real sparse_approx_inverse( reax_system *system, simulation_data *data, storage *workspace, mpi_datatypes *mpi_data, @@ -896,7 +1054,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, int *X, *q; real *e_j, *dense_matrix; int size_e, size_dense; - int cnt, scale; + int cnt; reax_atom *atom; int *row_nnz; int **j_list; @@ -968,8 +1126,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, /* Announce the nnz's in each row that will be communicated later */ t_start = MPI_Wtime(); - scale = sizeof(int) / sizeof(void); - Dist( system, mpi_data, row_nnz, MPI_INT, scale, int_packer ); + Dist( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT ); t_comm += MPI_Wtime() - t_start; comm = mpi_data->comm_mesh3D; @@ -1424,66 +1581,11 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, #endif -void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) -{ - int i, j, k, si; - real H; - - for ( i = 0; i < N; ++i ) - { - b[i][0] = b[i][1] = 0; - } - - if ( A->format == SYM_HALF_MATRIX ) - { - /* perform multiplication */ - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - - b[i][0] += A->entries[si].val * x[i][0]; - b[i][1] += A->entries[si].val * x[i][1]; - - for ( k = si + 1; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - H = A->entries[k].val; - - b[i][0] += H * x[j][0]; - b[i][1] += H * x[j][1]; - - // comment out for tryQEq - //if( j < A->n ) { - b[j][0] += H * x[i][0]; - b[j][1] += H * x[i][1]; - //} - } - } - } - else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) - { - /* perform multiplication */ - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - - for ( k = si; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - H = A->entries[k].val; - - b[i][0] += H * x[j][0]; - b[i][1] += H * x[j][1]; - } - } - } -} - - -/*int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, - rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout ) +int dual_CG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, rvec2 *b, + real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout ) { - int i, j, n, N, matvecs, scale; + int i, j, n, N, matvecs; rvec2 tmp, alpha, beta; rvec2 my_sum, norm_sqr, b_norm, my_dot; rvec2 sig_old, sig_new; @@ -1493,7 +1595,6 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) N = system->N; comm = mpi_data->world; matvecs = 0; - scale = sizeof(rvec2) / sizeof(void); #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) @@ -1504,11 +1605,11 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) } #endif - Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer ); + Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); dual_Sparse_MatVec( H, x, workspace->q2, N ); if ( H->format == SYM_HALF_MATRIX ) { - Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); } #if defined(CG_PERFORMANCE) @@ -1565,11 +1666,11 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) for ( i = 1; i < 300; ++i ) { - Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, 
rvec2_packer); + Dist( system, mpi_data, workspace->d2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N ); if ( H->format == SYM_HALF_MATRIX ) { - Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); } #if defined(CG_PERFORMANCE) @@ -1646,8 +1747,10 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) { workspace->t[j] = workspace->x[j][1]; } - matvecs = CG( system, workspace, H, workspace->b_t, tol, - workspace->t,mpi_data, fout, -1 ); + + matvecs = CG( system, control, data, workspace, + H, workspace->b_t, tol, workspace->t, mpi_data ); + for ( j = 0; j < n; ++j ) { workspace->x[j][1] = workspace->t[j]; @@ -1659,8 +1762,10 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) { workspace->s[j] = workspace->x[j][0]; } - matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s, - mpi_data, fout, -1 ); + + matvecs = CG( system, control, data, workspace, + H, workspace->b_s, tol, workspace->s, mpi_data ); + for ( j = 0; j < system->n; ++j ) { workspace->x[j][0] = workspace->s[j]; @@ -1669,7 +1774,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) if ( i >= 300 ) { - fprintf( stderr, "CG convergence failed!\n" ); + fprintf( stderr, "[WARNING] CG convergence failed!\n" ); } #if defined(CG_PERFORMANCE) @@ -1681,289 +1786,160 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) #endif return (i + 1) + matvecs; -}*/ +} -void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) +/* Preconditioned Conjugate Gradient Method */ +int CG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data ) { - int i, j, k, si, num_rows; - real val; + int i, j; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5]; - for ( i = 0; i < N; ++i ) - { - b[i] = 0; - } + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) - num_rows = A->NT; + Sparse_MatVec( H, x, workspace->q, H->NT ); +#else + Sparse_MatVec( H, x, workspace->q, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - if ( A->format == SYM_HALF_MATRIX ) + if ( H->format == SYM_HALF_MATRIX ) { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - /* diagonal only contributes once */ - if( i < A->n ) - { - b[i] += A->entries[si].val * x[i]; - k = si + 1; - } - /* zeros on the diagonal for i >= A->n, - * so skip the diagonal multplication step as zeros - * are not stored (idea: keep the NNZ's the same - * for full shell and neutral territory half-stored - * charge matrices to make debugging easier) */ - else - { - k = si; - } - - for ( ; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - b[j] += val * x[i]; - } - } - } - else if ( 
A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) - { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; - - for ( k = si; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - } - } - } -#else - num_rows = A->n; - - if ( A->format == SYM_HALF_MATRIX ) - { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; - - /* diagonal only contributes once */ - b[i] += A->entries[si].val * x[i]; - - for ( k = si + 1; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - b[j] += val * x[i]; - } - } - } - else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) - { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; - - for ( k = si; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - } - } - } -#endif -} - - -int CG( reax_system *system, control_params *control, simulation_data *data, - storage *workspace, sparse_matrix *H, real *b, - real tol, real *x, mpi_datatypes* mpi_data, FILE *fout, int nprocs ) -{ - int i, j, scale; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new; - real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; - //real total_pa, total_spmv, total_vops, total_comm, total_allreduce; - real timings[5], total_t[5]; - - t_pa = 0.0; - t_spmv = 0.0; - t_vops = 0.0; - t_comm = 0.0; - t_allreduce = 0.0; - - t_start = MPI_Wtime(); - scale = sizeof(real) / sizeof(void); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, x, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; - - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Sparse_MatVec( H, x, workspace->q, H->NT ); -#else - Sparse_MatVec( H, x, workspace->q, system->N ); -#endif - t_spmv += MPI_Wtime() - t_start; - - if ( H->format == SYM_HALF_MATRIX ) - { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#else - Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#endif - t_comm += MPI_Wtime() - t_start; - } - - else - { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#endif - t_comm += MPI_Wtime() - t_start; - } - - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); - t_vops += MPI_Wtime() - t_start; + t_vops += MPI_Wtime( ) - t_start; /* pre-conditioning */ - if( control->cm_solver_pre_comp_type == SAI_PC ) + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT ); #else Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n ); #endif - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - else if ( control->cm_solver_pre_comp_type == JACOBI_PC) { - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); for ( j = 0; j 
< system->n; ++j ) { workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); b_norm = Parallel_Norm( b, system->n, mpi_data->world ); - sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world); - t_allreduce += MPI_Wtime() - t_start; + sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; for ( i = 0; i < control->cm_solver_max_iters && sqrt(sig_new) / b_norm > tol; ++i ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( H, workspace->d, workspace->q, H->NT ); #else Sparse_MatVec( H, workspace->d, workspace->q, system->N ); #endif - t_spmv += MPI_Wtime() - t_start; + t_spmv += MPI_Wtime( ) - t_start; if ( H->format == SYM_HALF_MATRIX ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker); -#else - Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#if defined(NEUTRAL_TERRITORY) else { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#endif - t_start = MPI_Wtime(); - tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); - t_allreduce += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + tmp = Parallel_Dot( workspace->d, workspace->q, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); alpha = sig_new / tmp; Vector_Add( x, alpha, workspace->d, system->n ); Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - t_vops += MPI_Wtime() - t_start; + t_vops += MPI_Wtime( ) - t_start; /* pre-conditioning */ - if( control->cm_solver_pre_comp_type == SAI_PC ) + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT ); #else Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n ); #endif - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - - else if ( control->cm_solver_pre_comp_type == JACOBI_PC) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) { - 
t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); for ( j = 0; j < system->n; ++j ) { workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); sig_old = sig_new; - sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); + sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world ); t_allreduce += MPI_Wtime() - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); beta = sig_new / sig_old; Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); - t_vops += MPI_Wtime() - t_start; + t_vops += MPI_Wtime( ) - t_start; } timings[0] = t_pa; @@ -1972,32 +1948,26 @@ int CG( reax_system *system, control_params *control, simulation_data *data, timings[3] = t_comm; timings[4] = t_allreduce; - //MPI_Reduce(&t_pa, &total_pa, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_spmv, &total_spmv, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_vops, &total_vops, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_allreduce, &total_allreduce, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - MPI_Reduce(timings, total_t, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - if ( system->my_rank == MASTER_NODE ) { - //data->timing.cm_solver_pre_app += total_pa / nprocs; - //data->timing.cm_solver_spmv += total_spmv / nprocs; - //data->timing.cm_solver_vector_ops += total_vops / nprocs; - //data->timing.cm_solver_comm += total_comm / nprocs; - //data->timing.cm_solver_allreduce += total_allreduce / nprocs; - data->timing.cm_solver_pre_app += total_t[0] / nprocs; - data->timing.cm_solver_spmv += total_t[1] / nprocs; - data->timing.cm_solver_vector_ops += total_t[2] / nprocs; - data->timing.cm_solver_comm += total_t[3] / nprocs; - data->timing.cm_solver_allreduce += total_t[4] / nprocs; + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); + + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); } MPI_Barrier(mpi_data->world); if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "CG convergence failed!\n" ); + fprintf( stderr, "[WARNING] CG convergence failed!\n" ); return i; } @@ -2005,634 +1975,495 @@ int CG( reax_system *system, control_params *control, simulation_data *data, } -/*int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) +/* Pipelined Preconditioned Conjugate Gradient Method + * + * References: + * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, + * P. Ghysels and W. Vanroose, Parallel Computing, 2014. + * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods, + * Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference + * for High Performance Computing, Networking, Storage and Analysis, 2016. 
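For orientation, the iteration described in the two references above can be outlined as follows. This is an editor's sketch, not part of the patch hunks; the names are chosen to match the workspace fields used by PIPECG() below, and the ordering follows the papers, so a work-in-progress implementation may arrange the updates differently.

    /* pipelined preconditioned CG, one fused reduction per iteration:
     *
     *   r = b - A*x;   u = M^{-1}*r;   w = A*u
     *   repeat:
     *     gamma = (r,u);  delta = (w,u);  nrm2 = (u,u)    <-- packed into one
     *     m = M^{-1}*w;   n = A*m                             MPI_Iallreduce,
     *                                                         overlapped with m, n
     *     beta  = gamma / gamma_old               (beta = 0 on the first pass)
     *     alpha = gamma / (delta - beta * gamma / alpha_old)
     *     z = n + beta*z;   q = m + beta*q;   p = u + beta*p;   d = w + beta*d
     *     x = x + alpha*p;  r = r - alpha*d;  u = u - alpha*q;  w = w - alpha*z
     */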
+ * */ +int PIPECG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data ) { - int i, j, scale; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new; - - scale = sizeof(real) / sizeof(void); - b_norm = Parallel_Norm( b, system->n, mpi_data->world ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - { - fprintf( stderr, "n=%d, N=%d\n", system->n, system->N ); - fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); - } - MPI_Barrier( mpi_data->world ); -#endif + int i, j; + real alpha, beta, b_norm, norm; + real delta; + real gamma_old, gamma_new; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5], redux[4]; + MPI_Request req; - Sparse_MatVec( H, x, workspace->q, system->N ); - //Coll( system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker ); + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; - Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); - for ( j = 0; j < system->n; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, - mpi_data->world ); -#if defined(DEBUG) - //if( system->my_rank == MASTER_NODE ) { - fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n", - system->my_rank, sqrt(sig_new), - Parallel_Norm(workspace->d, system->n, mpi_data->world), - Parallel_Norm(workspace->q, system->n, mpi_data->world) ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); - //} - MPI_Barrier( mpi_data->world ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, x, workspace->q, H->NT ); +#else + Sparse_MatVec( H, x, workspace->q, system->N ); #endif + t_spmv += MPI_Wtime( ) - t_start; - for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i ) + if ( H->format == SYM_HALF_MATRIX ) { -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - t_start = MPI_Wtime(); -#endif - Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); - Sparse_MatVec( H, workspace->d, workspace->q, system->N ); - //tryQEq - //Coll(system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker); -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - t_elapsed = MPI_Wtime() - t_start; - matvec_time += t_elapsed; - } -#endif - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - t_start = MPI_Wtime(); -#endif - tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); - alpha = sig_new / tmp; -#if defined(DEBUG) - //if( system->my_rank == MASTER_NODE ){ - fprintf(stderr, - "p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n", - system->my_rank, i, - //Parallel_Norm(workspace->d, system->n, mpi_data->world), - //Parallel_Norm(workspace->q, system->n, mpi_data->world), - Norm(workspace->d, system->n), Norm(workspace->q, system->n), tmp); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //for( j = 0; j < system->N; ++j ) - // fprintf( stdout, "%d %24.15e\n", - // system->my_atoms[j].orig_id, workspace->q[j] ); - //fprintf( stdout, "\n" ); - //} - MPI_Barrier( 
mpi_data->world ); -#endif - - Vector_Add( x, alpha, workspace->d, system->n ); - Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - // pre-conditioning - for ( j = 0; j < system->n; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_old = sig_new; - sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n", - system->my_rank, i, sqrt(sig_new) ); - MPI_Barrier( mpi_data->world ); -#endif -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - t_elapsed = MPI_Wtime() - t_start; - dot_time += t_elapsed; - } -#endif + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf( stderr, "CG took %d iterations\n", i ); -#endif -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - fprintf( stderr, "%f %f\n", matvec_time, dot_time ); -#endif - if ( i >= 300 ) +#if defined(NEUTRAL_TERRITORY) + else { - fprintf( stderr, "CG convergence failed!\n" ); - return i; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#endif - return i; -}*/ - - -/*void Forward_Subs( sparse_matrix *L, real *b, real *y ) -{ - int i, pj, j, si, ei; - real val; + t_start = MPI_Wtime( ); + Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->q, system->n ); + t_vops += MPI_Wtime( ) - t_start; - for ( i = 0; i < L->n; ++i ) + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - y[i] = b[i]; - si = L->start[i]; - ei = L->end[i]; - for ( pj = si; pj < ei - 1; ++pj ) - { - j = L->entries[pj].j; - val = L->entries[pj].val; - y[i] -= val * y[j]; - } - y[i] /= L->entries[pj].val; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; } -}*/ - - -/*void Backward_Subs( sparse_matrix *U, real *y, real *x ) -{ - int i, pj, j, si, ei; - real val; - - for ( i = U->n - 1; i >= 0; --i ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC) { - x[i] = y[i]; - si = U->start[i]; - ei = U->end[i]; - for ( pj = si + 1; pj < ei; ++pj ) + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - j = U->entries[pj].j; - val = U->entries[pj].val; - x[i] -= val * x[j]; + workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - x[i] /= U->entries[si].val; + t_pa += MPI_Wtime( ) - t_start; } -}*/ - - -/*int PCG( reax_system *system, storage *workspace, - sparse_matrix *H, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, - mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, n, N, scale; - real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new; - MPI_Comm world; -#if defined(DEBUG_FOCUS) - int me; - me = system->my_rank; -#endif + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - n = system->n; - N = system->N; - world = mpi_data->world; - scale = 
sizeof(real) / sizeof(void); - b_norm = Parallel_Norm( b, n, world ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - { - fprintf( stderr, "init_PCG: n=%d, N=%d\n", n, N ); - fprintf( stderr, "init_PCG: |b|=%24.15e\n", b_norm ); - } - MPI_Barrier( world ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->u, workspace->w, H->NT ); +#else + Sparse_MatVec( H, workspace->u, workspace->w, system->N ); #endif + t_spmv += MPI_Wtime( ) - t_start; - Sparse_MatVec( H, x, workspace->q, N ); - //Coll( system, workspace, mpi_data, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, n ); - r_norm = Parallel_Norm( workspace->r, n, world ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->p ); - sig_new = Parallel_Dot( workspace->r, workspace->p, n, world ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) + if ( H->format == SYM_HALF_MATRIX ) { - fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm ); - fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n", - Parallel_Norm(workspace->d, n, world), - Parallel_Norm(workspace->q, n, world) ); + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - MPI_Barrier( world ); -#endif - - for ( i = 1; i < 100 && r_norm / b_norm > tol; ++i ) +#if defined(NEUTRAL_TERRITORY) + else { - Dist( system, mpi_data, workspace->p, MPI_DOUBLE, scale, real_packer ); - Sparse_MatVec( H, workspace->p, workspace->q, N ); - // tryQEq - //Coll(system,mpi_data,workspace->q, MPI_DOUBLE, real_unpacker); - tmp = Parallel_Dot( workspace->q, workspace->p, n, world ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->p, n ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", - i, Parallel_Norm(workspace->p, n, world), - Parallel_Norm(workspace->q, n, world), tmp ); - MPI_Barrier( world ); -#endif - - Vector_Add( workspace->r, -alpha, workspace->q, n ); - r_norm = Parallel_Norm( workspace->r, n, world ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - fprintf( stderr, "iter%d: res=%.15e\n", i, r_norm ); - MPI_Barrier( world ); -#endif - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->d ); - sig_old = sig_new; - sig_new = Parallel_Dot( workspace->r, workspace->d, n, world ); - beta = sig_new / sig_old; - Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, n ); + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - fprintf( stderr, "PCG took %d iterations\n", i ); #endif - if ( i >= 100 ) - fprintf( stderr, "PCG convergence failed!\n" ); - - return i; -}*/ + t_start = MPI_Wtime( ); + redux[0] = Dot_local( workspace->w, workspace->u, system->n ); + redux[1] = Dot_local( workspace->r, workspace->u, system->n ); + redux[2] = Dot_local( workspace->u, workspace->u, system->n ); + redux[3] = Dot_local( b, b, system->n ); + t_vops += MPI_Wtime( ) - t_start; -#if defined(OLD_STUFF) -/*int sCG( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, j; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new; + t_start = MPI_Wtime( ); + MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req ); - b_norm = Norm( b, system->n ); -#if 
defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - fprintf( stderr, "n=%d, N=%d\n", system->n, system->N ); - fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); - } - MPI_Barrier( mpi_data->world ); + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n ); #endif - - Sparse_MatVec( H, x, workspace->q, system->N ); - //Coll_Vector( system, workspace, mpi_data, workspace->q ); - - Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); - for ( j = 0; j < system->n; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition - - sig_new = Dot( workspace->r, workspace->d, system->n ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - { - fprintf( stderr, "p%d CGinit:sig_new=%24.15e\n", system->my_rank, sig_new ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); + t_pa += MPI_Wtime( ) - t_start; } - MPI_Barrier( mpi_data->world ); -#endif - - for ( i = 1; i < 100 && sqrt(sig_new) / b_norm > tol; ++i ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) { - //Dist_Vector( system, mpi_data, workspace->d ); - Sparse_MatVec( H, workspace->d, workspace->q, system->N ); - //Coll_Vector( system, workspace, mpi_data, workspace->q ); - - tmp = Dot( workspace->d, workspace->q, system->n ); - alpha = sig_new / tmp; -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - fprintf(stderr, - "p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n", - system->my_rank, i, - Parallel_Norm(workspace->d, system->n, mpi_data->world), - Parallel_Norm(workspace->q, system->n, mpi_data->world), tmp ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); + workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j]; } - MPI_Barrier( mpi_data->world ); -#endif - - Vector_Add( x, alpha, workspace->d, system->n ); - Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - // pre-conditioning - for ( j = 0; j < system->n; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + t_pa += MPI_Wtime( ) - t_start; + } - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->p, system->n ); + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n", - system->my_rank, i, sig_new ); - MPI_Barrier( mpi_data->world ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->m, workspace->n, H->NT ); +#else + Sparse_MatVec( H, workspace->m, workspace->n, system->N ); #endif - } + t_spmv += MPI_Wtime( ) - t_start; -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf( stderr, "CG took %d iterations\n", i ); 
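The MPI_Iallreduce posted above returns immediately; the preconditioner application and the sparse matrix-vector product that follow run while the reduction is in flight, and the MPI_Wait further down retrieves the global sums. A minimal sketch of that overlap pattern, assuming hypothetical stand-ins apply_precond() and spmv() for the calls used here:

    #include <math.h>
    #include <mpi.h>

    /* assumed stand-ins for the work overlapped with the reduction */
    extern void apply_precond( const double *w, double *m, int n );   /* m = M^{-1} w */
    extern void spmv( const double *m, double *n_vec, int n );        /* n_vec = A m  */

    /* compute delta = (w,u), gamma = (r,u), and ||u|| with one non-blocking
     * all-reduce, hiding its latency behind the preconditioner and the SpMV */
    void fused_dots_with_overlap( const double *r, const double *u, const double *w,
            double *m, double *n_vec, int n, MPI_Comm comm,
            double *delta, double *gamma, double *u_norm )
    {
        int i;
        double redux[3] = { 0.0, 0.0, 0.0 };
        MPI_Request req;

        /* local partial dot products, as Dot_local() does in vector.c */
        for ( i = 0; i < n; ++i )
        {
            redux[0] += w[i] * u[i];
            redux[1] += r[i] * u[i];
            redux[2] += u[i] * u[i];
        }

        /* a single non-blocking reduction for all three scalars */
        MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, comm, &req );

        /* overlapped work: runs while the reduction is in progress */
        apply_precond( w, m, n );
        spmv( m, n_vec, n );

        MPI_Wait( &req, MPI_STATUS_IGNORE );
        *delta = redux[0];
        *gamma = redux[1];
        *u_norm = sqrt( redux[2] );
    }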
-#endif - if ( i >= 100 ) + if ( H->format == SYM_HALF_MATRIX ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - return i; -}*/ - - -/*int GMRES( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - - N = system->N; - bnorm = Norm( b, N ); - - // apply the diagonal pre-conditioner to rhs - for ( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + MPI_Wait( &req, MPI_STATUS_IGNORE ); + t_allreduce += MPI_Wtime( ) - t_start; + delta = redux[0]; + gamma_new = redux[1]; + norm = sqrt( redux[2] ); + b_norm = sqrt( redux[3] ); - // GMRES outer-loop - for ( itr = 0; itr < MAX_ITR; ++itr ) + for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i ) { - // calculate r0 - Sparse_MatVec( H, x, workspace->b_prm, N ); - for ( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner + if ( i > 0 ) + { + beta = gamma_new / gamma_old; + alpha = gamma_new / (delta - beta / alpha * gamma_new); + } + else + { + beta = 0.0; + alpha = gamma_new / delta; + } - Vector_Sum( workspace->v[0], - 1., workspace->b_prc, -1., workspace->b_prm, N ); - workspace->g[0] = Norm( workspace->v[0], N ); - Vector_Scale( workspace->v[0], - 1. / workspace->g[0], workspace->v[0], N ); + t_start = MPI_Wtime( ); + Vector_Sum( workspace->u , 1.0, workspace->u, -alpha, workspace->q, system->n ); + Vector_Sum( workspace->w , 1.0, workspace->w, -alpha, workspace->z, system->n ); + Vector_Sum( workspace->r , 1.0, workspace->r, -alpha, workspace->d, system->n ); + Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n ); + Vector_Sum( workspace->z , 1.0, workspace->n, beta, workspace->z, system->n ); + Vector_Sum( workspace->q , 1.0, workspace->m, beta, workspace->q, system->n ); + Vector_Sum( workspace->p , 1.0, workspace->u, beta, workspace->p, system->n ); + Vector_Sum( workspace->d , 1.0, workspace->w, beta, workspace->d, system->n ); + redux[0] = Dot_local( workspace->w, workspace->u, system->n ); + redux[1] = Dot_local( workspace->r, workspace->u, system->n ); + redux[2] = Dot_local( workspace->u, workspace->u, system->n ); + t_vops += MPI_Wtime( ) - t_start; - // fprintf( stderr, "%10.6f\n", workspace->g[0] ); + t_start = MPI_Wtime( ); + MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req ); - // GMRES inner-loop - for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - // matvec - Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1], N ); - - for ( k = 0; k < N; ++k ) - workspace->v[j + 1][k] *= workspace->Hdia_inv[k]; // pre-conditioner - // fprintf( stderr, "%d-%d: matvec done.\n", itr, j ); - - // apply modified Gram-Schmidt to orthogonalize the new residual - for ( i = 0; i <= j; i++ ) + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT ); +#else + 
Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N); - Vector_Add( workspace->v[j + 1], - -workspace->h[i][j], workspace->v[i], N ); + workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j]; } + t_pa += MPI_Wtime( ) - t_start; + } - workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N ); - Vector_Scale( workspace->v[j + 1], - 1. / workspace->h[j + 1][j], workspace->v[j + 1], N ); - // fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j); - - // Givens rotations on the H matrix to make it U - for ( i = 0; i <= j; i++ ) - { - if ( i == j ) - { - cc = sqrt(SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j])); - workspace->hc[j] = workspace->h[j][j] / cc; - workspace->hs[j] = workspace->h[j + 1][j] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[i][j] + - workspace->hs[i] * workspace->h[i + 1][j]; - tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; - - workspace->h[i][j] = tmp1; - workspace->h[i + 1][j] = tmp2; - } + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - // apply Givens rotations to the rhs as well - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j + 1] = tmp2; + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->m, workspace->n, H->NT ); +#else + Sparse_MatVec( H, workspace->m, workspace->n, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - // fprintf( stderr, "%10.6f\n", fabs(workspace->g[j+1]) ); + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - - // solve Hy = g. 
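A note on the step length used at the top of this loop: standard PCG computes alpha_i = gamma_i / (p_i, A p_i), which would cost a second reduction. In exact arithmetic the denominator can be rebuilt from scalars that the single fused reduction already provides (reference 1 above):

    /* with gamma_i = (r_i, u_i) and delta_i = (w_i, u_i),
     *
     *     (p_i, A p_i) = delta_i - (beta_i / alpha_{i-1}) * gamma_i
     *
     * which is the expression  alpha = gamma_new / (delta - beta / alpha * gamma_new)
     * appearing above.  In finite precision this recurrence can drift, which is
     * the usual accuracy trade-off of the pipelined variants. */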
- // H is now upper-triangular, do back-substitution - for ( i = j - 1; i >= 0; i-- ) +#if defined(NEUTRAL_TERRITORY) + else { - temp = workspace->g[i]; - for ( k = j - 1; k > i; k-- ) - temp -= workspace->h[i][k] * workspace->y[k]; - workspace->y[i] = temp / workspace->h[i][i]; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#endif - // update x = x_0 + Vy - for ( i = 0; i < j; i++ ) - Vector_Add( x, workspace->y[i], workspace->v[i], N ); + gamma_old = gamma_new; - // stopping condition - if ( fabs(workspace->g[j]) / bnorm <= tol ) - break; + MPI_Wait( &req, MPI_STATUS_IGNORE ); + t_allreduce += MPI_Wtime( ) - t_start; + delta = redux[0]; + gamma_new = redux[1]; + norm = sqrt( redux[2] ); } - //Sparse_MatVec( system, H, x, workspace->b_prm, mpi_data ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + timings[0] = t_pa; + timings[1] = t_spmv; + timings[2] = t_vops; + timings[3] = t_comm; + timings[4] = t_allreduce; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - //fprintf( fout, "%10.5f%15.12f%15.12f\n", - //workspace->b_prc[i], workspace->b_prm[i], x[i] ); + if ( system->my_rank == MASTER_NODE ) + { + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); - fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n", - itr, j, fabs( workspace->g[j] ) / bnorm ); + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); + } - if ( itr >= MAX_ITR ) + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return FAILURE; + fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" ); + return i; } - return SUCCESS; -}*/ + return i; +} -/*int GMRES_HouseHolder( reax_system *system, storage *workspace, - sparse_matrix *H, real *b, real tol, real *x, - mpi_datatypes* mpi_data, FILE *fout ) +/* Pipelined Preconditioned Conjugate Residual Method + * + * References: + * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, + * P. Ghysels and W. Vanroose, Parallel Computing, 2014. 
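PIPECR() below repeats the Dist / Sparse_MatVec / Coll sequence already used by CG() and PIPECG(); a short reading of that pattern, for orientation (not part of the patch hunks):

    /* communication around each sparse matrix-vector product:
     *
     *   Dist( system, mpi_data, x, ... );     owners push their entries of the
     *                                         input vector to the neighbor
     *                                         processes holding ghost copies
     *   Sparse_MatVec( H, x, q, ... );        local multiply
     *   Coll( system, mpi_data, q, ... );     needed for SYM_HALF_MATRIX (and
     *                                         under NEUTRAL_TERRITORY): the
     *                                         half-stored kernel also adds
     *                                         val * x[i] into rows owned by
     *                                         other processes, so those partial
     *                                         sums must be accumulated back at
     *                                         the owners
     */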
+ * */ +int PIPECR( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data ) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v[10000], z[RESTART + 2][10000], w[RESTART + 2]; - real u[RESTART + 2][10000]; - - N = system->N; - bnorm = Norm( b, N ); - - // apply the diagonal pre-conditioner to rhs - for ( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + int i, j; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5]; - // GMRES outer-loop - for ( itr = 0; itr < MAX_ITR; ++itr ) - { - // compute z = r0 - Sparse_MatVec( H, x, workspace->b_prm, N ); + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; - for ( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, x, workspace->q, H->NT ); +#else + Sparse_MatVec( H, x, workspace->q, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - Vector_MakeZero( w, RESTART + 1 ); - w[0] = Norm( z[0], N ); + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - Vector_Copy( u[0], z[0], N ); - u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; - Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); + t_start = MPI_Wtime( ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); + t_vops += MPI_Wtime( ) - t_start; - w[0] *= ( u[0][0] < 0.0 ? 
1 : -1 ); - // fprintf( stderr, "\n\n%12.6f\n", w[0] ); + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } - // GMRES inner-loop - for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - // compute v_j - Vector_Scale( z[j], -2 * u[j][j], u[j], N ); - z[j][j] += 1.; // due to e_j - - for ( i = j - 1; i >= 0; --i ) - Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i ); - - // matvec - Sparse_MatVec( H, z[j], v, N ); - - for ( k = 0; k < N; ++k ) - v[k] *= workspace->Hdia_inv[k]; // pre-conditioner - - for ( i = 0; i <= j; ++i ) - Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i ); + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + } + t_pa += MPI_Wtime( ) - t_start; + } - if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) ) - { - // compute the HouseHolder unit vector u_j+1 - for ( i = 0; i <= j; ++i ) - u[j + 1][i] = 0; + t_start = MPI_Wtime( ); + b_norm = Parallel_Norm( b, system->n, mpi_data->world ); + sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; - Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ); + for ( i = 0; i < control->cm_solver_max_iters && sqrt(sig_new) / b_norm > tol; ++i ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - u[j + 1][j + 1] += - ( v[j + 1] < 0.0 ? 
-1 : 1 ) * Norm( v + (j + 1), N - (j + 1) ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->d, workspace->q, H->NT ); +#else + Sparse_MatVec( H, workspace->d, workspace->q, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N ); + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - // overwrite v with P_m+1 * v - v[j + 1] -= - 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1]; - Vector_MakeZero( v + (j + 2), N - (j + 2) ); - } + t_start = MPI_Wtime( ); + tmp = Parallel_Dot( workspace->d, workspace->q, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; + t_start = MPI_Wtime( ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->d, system->n ); + Vector_Add( workspace->r, -alpha, workspace->q, system->n ); + t_vops += MPI_Wtime( ) - t_start; - // previous Givens rotations on H matrix to make it U - for ( i = 0; i < j; i++ ) - { - tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1]; - tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1]; + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - v[i] = tmp1; - v[i + 1] = tmp2; - } + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } - // apply the new Givens rotation to H and right-hand side - if ( fabs(v[j + 1]) >= ALMOST_ZERO ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - cc = sqrt( SQR( v[j] ) + SQR( v[j + 1] ) ); - workspace->hc[j] = v[j] / cc; - workspace->hs[j] = v[j + 1] / cc; - - tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1]; - tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1]; - - v[j] = tmp1; - v[j + 1] = tmp2; - - / Givens rotations to rhs - tmp1 = workspace->hc[j] * w[j]; - tmp2 = -workspace->hs[j] * w[j]; - w[j] = tmp1; - w[j + 1] = tmp2; + workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - - // extend R - for ( i = 0; i <= j; ++i ) - workspace->h[i][j] = v[i]; - - - // fprintf( stderr, "h:" ); - // for( i = 0; i <= j+1 ; ++i ) - // fprintf( stderr, "%.6f ", h[i][j] ); - // fprintf( stderr, "\n" ); - // fprintf( stderr, "%12.6f\n", w[j+1] ); + t_pa += MPI_Wtime( ) - t_start; } + t_start = MPI_Wtime( ); + sig_old = sig_new; + sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime() - t_start; - // solve Hy = w. 
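As in CG() above, each pass through this loop still pays for two blocking all-reduces: one Parallel_Dot for tmp and, just below, another for sig_new. The building block behind the pipelined variants is splitting each global dot product into a local partial sum plus an explicit reduction, so several scalars can share one MPI call. A sketch of that idea using Dot_local() from vector.c; the fused pair shown, (r, p) and (r, r), is illustrative only, since the loop as written needs just the former:

    double redux[2];

    /* local partial sums only, no communication yet */
    redux[0] = Dot_local( workspace->r, workspace->p, system->n );
    redux[1] = Dot_local( workspace->r, workspace->r, system->n );

    /* one blocking reduction for both scalars, i.e. a single synchronization
     * point instead of two Parallel_Dot calls */
    MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );

    sig_new = redux[0];              /* (r, M^{-1} r), drives beta               */
    /* sqrt( redux[1] ) would give ||r|| if a true residual norm were wanted */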
- // H is now upper-triangular, do back-substitution - for ( i = j - 1; i >= 0; i-- ) - { - temp = w[i]; - for ( k = j - 1; k > i; k-- ) - temp -= workspace->h[i][k] * workspace->y[k]; + t_start = MPI_Wtime( ); + beta = sig_new / sig_old; + Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); + t_vops += MPI_Wtime( ) - t_start; + } - workspace->y[i] = temp / workspace->h[i][i]; - } + timings[0] = t_pa; + timings[1] = t_spmv; + timings[2] = t_vops; + timings[3] = t_comm; + timings[4] = t_allreduce; - for ( i = j - 1; i >= 0; i-- ) - Vector_Add( x, workspace->y[i], z[i], N ); + if ( system->my_rank == MASTER_NODE ) + { + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); - // stopping condition - if ( fabs( w[j] ) / bnorm <= tol ) - break; + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); } - // Sparse_MatVec( system, H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] ); - - fprintf( fout, "GMRES outer:%d inner:%d iters, |rel residual| = %15.10f\n", - itr, j, fabs( workspace->g[j] ) / bnorm ); + MPI_Barrier(mpi_data->world); - if ( itr >= MAX_ITR ) + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return FAILURE; + fprintf( stderr, "[WARNING] CG convergence failed!\n" ); + return i; } - return SUCCESS; -}*/ -#endif + return i; +} diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h index 980a8b90..5bea768e 100644 --- a/PuReMD/src/linear_solvers.h +++ b/PuReMD/src/linear_solvers.h @@ -24,23 +24,24 @@ #include "reax_types.h" + real setup_sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, sparse_matrix *, sparse_matrix **, int, double ); real sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, sparse_matrix*, sparse_matrix*, sparse_matrix**, int ); -int GMRES( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); -int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); -int dual_CG( reax_system*, storage*, sparse_matrix*, +int dual_CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, rvec2*, real, rvec2*, mpi_datatypes*, FILE* ); + int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE*, int ); -int PCG( reax_system*, storage*, sparse_matrix*, real*, real, - sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* ); -int sCG( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); + real*, real, real*, mpi_datatypes* ); + +int PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, + real*, real, real*, mpi_datatypes* ); + +int PIPECR( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, + 
real*, real, real*, mpi_datatypes* ); + #endif diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c index e633a3d3..12e09210 100644 --- a/PuReMD/src/parallelreax.c +++ b/PuReMD/src/parallelreax.c @@ -134,21 +134,15 @@ int main( int argc, char* argv[] ) } /* allocated main datastructures */ - system = (reax_system *) - smalloc( sizeof(reax_system), "system", MPI_COMM_WORLD ); - control = (control_params *) - smalloc( sizeof(control_params), "control", MPI_COMM_WORLD ); - data = (simulation_data *) - smalloc( sizeof(simulation_data), "data", MPI_COMM_WORLD ); - - workspace = (storage *) - smalloc( sizeof(storage), "workspace", MPI_COMM_WORLD ); - lists = (reax_list **) - smalloc( LIST_N * sizeof(reax_list*), "lists", MPI_COMM_WORLD ); + system = smalloc( sizeof(reax_system), "system", MPI_COMM_WORLD ); + control = smalloc( sizeof(control_params), "control", MPI_COMM_WORLD ); + data = smalloc( sizeof(simulation_data), "data", MPI_COMM_WORLD ); + + workspace = smalloc( sizeof(storage), "workspace", MPI_COMM_WORLD ); + lists = smalloc( LIST_N * sizeof(reax_list*), "lists", MPI_COMM_WORLD ); for ( i = 0; i < LIST_N; ++i ) { - lists[i] = (reax_list *) - smalloc( sizeof(reax_list), "lists[i]", MPI_COMM_WORLD ); + lists[i] = smalloc( sizeof(reax_list), "lists[i]", MPI_COMM_WORLD ); lists[i]->allocated = 0; lists[i]->n = 0; lists[i]->num_intrs = 0; @@ -162,17 +156,15 @@ int main( int argc, char* argv[] ) lists[i]->dDelta_list = NULL; lists[i]->hbond_list = NULL; } - out_control = (output_controls *) - smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD ); - mpi_data = (mpi_datatypes *) - smalloc( sizeof(mpi_datatypes), "mpi_data", MPI_COMM_WORLD ); + out_control = smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD ); + mpi_data = smalloc( sizeof(mpi_datatypes), "mpi_data", MPI_COMM_WORLD ); /* setup the parallel environment */ - MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) ); - MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) ); + MPI_Comm_size( MPI_COMM_WORLD, &control->nprocs ); + MPI_Comm_rank( MPI_COMM_WORLD, &system->my_rank ); system->wsize = control->nprocs; - system->global_offset = (int*) - scalloc( system->wsize + 1, sizeof(int), "global_offset", MPI_COMM_WORLD ); + system->global_offset = scalloc( system->wsize + 1, sizeof(int), + "global_offset", MPI_COMM_WORLD ); /* read system description files */ Read_System( argv[1], argv[2], argv[3], system, control, @@ -185,7 +177,9 @@ int main( int argc, char* argv[] ) /* measure total simulation time after input is read */ if ( system->my_rank == MASTER_NODE ) - t_start = MPI_Wtime(); + { + t_start = MPI_Wtime( ); + } /* initialize datastructures */ Initialize( system, control, data, workspace, lists, out_control, mpi_data ); @@ -203,7 +197,6 @@ int main( int argc, char* argv[] ) lists, out_control, mpi_data ); Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); Output_Results( system, control, data, lists, out_control, mpi_data ); - #if defined(DEBUG) fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank ); @@ -215,28 +208,39 @@ int main( int argc, char* argv[] ) for ( ++data->step; data->step <= control->nsteps; data->step++ ) { if ( control->T_mode ) + { Temperature_Control( control, data ); + } Evolve( system, control, data, workspace, lists, out_control, mpi_data ); Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); + if( system->my_rank == MASTER_NODE ) { total_itr += data->timing.cm_solver_iters; } + Output_Results( system, control, data, lists, 
out_control, mpi_data ); //Analysis(system, control, data, workspace, lists, out_control, mpi_data); /* dump restart info */ - if ( out_control->restart_freq && - (data->step - data->prev_steps) % out_control->restart_freq == 0 ) + if ( out_control->restart_freq + && (data->step - data->prev_steps) % out_control->restart_freq == 0 ) { if ( out_control->restart_format == WRITE_ASCII ) + { Write_Restart( system, control, data, out_control, mpi_data ); + } else if ( out_control->restart_format == WRITE_BINARY ) + { Write_Binary_Restart( system, control, data, out_control, mpi_data ); + } } - /*if(data->step == 1 || data->step == control->nsteps) - Write_PDB( system, lists, data, control, mpi_data, out_control );*/ + +// if ( data->step == 1 || data->step == control->nsteps ) +// { +// Write_PDB( system, lists, data, control, mpi_data, out_control ); +// } #if defined(DEBUG) fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); @@ -255,7 +259,7 @@ int main( int argc, char* argv[] ) //Write_PDB( system, lists, data, control, mpi_data, out_control ); Close_Output_Files( system, control, out_control, mpi_data ); - MPI_Finalize(); + MPI_Finalize( ); /* de-allocate data structures */ sfree( system, "system" ); diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c index d029c742..b936c46e 100644 --- a/PuReMD/src/qEq.c +++ b/PuReMD/src/qEq.c @@ -318,14 +318,13 @@ void Init_MatVec( reax_system *system, simulation_data *data, void Calculate_Charges( reax_system *system, storage *workspace, mpi_datatypes *mpi_data ) { - int i, scale; - real u;//, s_sum, t_sum; - rvec2 my_sum, all_sum; + int i; + real u;//, s_sum, t_sum; + rvec2 my_sum, all_sum; reax_atom *atom; real *q; - scale = sizeof(real) / sizeof(void); - q = (real*) malloc(system->N * sizeof(real)); + q = malloc( system->N * sizeof(real) ); //s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world); //t_sum = Parallel_Vector_Acc(workspace->t, system->n, mpi_data->world); @@ -360,11 +359,14 @@ void Calculate_Charges( reax_system *system, storage *workspace, atom->t[0] = workspace->x[i][1]; } - Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer ); + Dist( system, mpi_data, q, REAL_PTR_TYPE, MPI_DOUBLE ); + for ( i = system->n; i < system->N; ++i ) + { system->my_atoms[i].q = q[i]; + } - sfree(q, "q"); + sfree( q, "q" ); } @@ -419,6 +421,8 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, { int j, iters; + iters = 0; + Init_MatVec( system, data, control, workspace, mpi_data ); #if defined(DEBUG) @@ -428,7 +432,8 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, if( control->cm_solver_pre_comp_type == SAI_PC ) { - if( control->cm_solver_pre_comp_refactor > 0 && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) + if( control->cm_solver_pre_comp_refactor > 0 + && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) { Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data ); @@ -438,29 +443,80 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, //TODO: used for timing to sync processors going into the linear solve, but remove for production code MPI_Barrier( mpi_data->world ); - for ( j = 0; j < system->n; ++j ) - workspace->s[j] = workspace->x[j][0]; - iters = CG(system, control, data, workspace, workspace->H, workspace->b_s, - control->cm_solver_q_err, workspace->s, mpi_data, out_control->log , control->nprocs ); - for ( j = 0; j < system->n; ++j ) - 
workspace->x[j][0] = workspace->s[j]; -#if defined(DEBUG) - fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); -#endif + switch ( control->cm_solver_type ) + { + case CG_S: + for ( j = 0; j < system->n; ++j ) + { + workspace->s[j] = workspace->x[j][0]; + } - for ( j = 0; j < system->n; ++j ) - workspace->t[j] = workspace->x[j][1]; - iters += CG(system, control, data, workspace, workspace->H, workspace->b_t,//newQEq sCG - control->cm_solver_q_err, workspace->t, mpi_data, out_control->log, control->nprocs ); - for ( j = 0; j < system->n; ++j ) - workspace->x[j][1] = workspace->t[j]; + iters = CG( system, control, data, workspace, workspace->H, workspace->b_s, + control->cm_solver_q_err, workspace->s, mpi_data ); -#if defined(DEBUG) - fprintf( stderr, "p%d: second CG completed\n", system->my_rank ); -#endif + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][0] = workspace->s[j]; + } + + for ( j = 0; j < system->n; ++j ) + { + workspace->t[j] = workspace->x[j][1]; + } + + iters += CG( system, control, data, workspace, workspace->H, workspace->b_t, + control->cm_solver_q_err, workspace->t, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][1] = workspace->t[j]; + } + break; + + case PIPECG_S: + for ( j = 0; j < system->n; ++j ) + { + workspace->s[j] = workspace->x[j][0]; + } + + iters = PIPECG( system, control, data, workspace, workspace->H, workspace->b_s, + control->cm_solver_q_err, workspace->s, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][0] = workspace->s[j]; + } + + for ( j = 0; j < system->n; ++j ) + { + workspace->t[j] = workspace->x[j][1]; + } + + iters += PIPECG( system, control, data, workspace, workspace->H, workspace->b_t, + control->cm_solver_q_err, workspace->t, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][1] = workspace->t[j]; + } + break; + + case GMRES_S: + case GMRES_H_S: + case SDM_S: + case BiCGStab_S: + fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" ); + break; + + default: + fprintf( stderr, "[ERROR] Unrecognized solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } Calculate_Charges( system, workspace, mpi_data ); + #if defined(DEBUG) fprintf( stderr, "p%d: computed charges\n", system->my_rank ); //Print_Charges( system ); diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h index ab8a7b50..244fdc0f 100644 --- a/PuReMD/src/reax_types.h +++ b/PuReMD/src/reax_types.h @@ -87,6 +87,7 @@ enum solver CG_S = 2, SDM_S = 3, BiCGStab_S = 4, + PIPECG_S = 5, }; /* preconditioner computation type for charge method linear solver */ @@ -807,13 +808,13 @@ typedef struct /**/ real cm_solver_pre_comp; /**/ - real cm_solver_pre_app; // update CG() + real cm_solver_pre_app; /* num. 
of steps in iterative linear solver for charge distribution */ int cm_solver_iters; /**/ - real cm_solver_spmv; // update CG() + real cm_solver_spmv; /**/ - real cm_solver_vector_ops; // update CG() + real cm_solver_vector_ops; /**/ real cm_solver_orthog; /**/ @@ -1046,11 +1047,16 @@ typedef struct rvec2 *b, *x; /* GMRES storage */ - real *y, *z, *g; + real *y, *g; real *hc, *hs; real **h, **v; - /* CG storage */ - real *r, *d, *q, *p; + /* GMRES, PIPECG storage */ + real *z; + /* CG, PIPECG storage */ + real *d, *p, *q, *r; + /* PIPECG storage */ + real *m, *n, *u, *w; + /* dual-CG storage */ rvec2 *r2, *d2, *q2, *p2; /* Taper */ real Tap[8]; //Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0; diff --git a/PuReMD/src/vector.c b/PuReMD/src/vector.c index efbc0954..78ed514f 100644 --- a/PuReMD/src/vector.c +++ b/PuReMD/src/vector.c @@ -83,6 +83,22 @@ real Dot( real* v1, real* v2, int k ) } +real Dot_local( real *v1, real *v2, int k ) +{ + int i; + real sum; + + sum = 0.0; + + for ( i = 0; i < k; ++i ) + { + sum += v1[i] * v2[i]; + } + + return sum; +} + + real Norm( real* v1, int k ) { real ret = 0; diff --git a/PuReMD/src/vector.h b/PuReMD/src/vector.h index baf1b3d1..e133d8ae 100644 --- a/PuReMD/src/vector.h +++ b/PuReMD/src/vector.h @@ -25,75 +25,138 @@ #include "reax_types.h" #include "reax_defs.h" + int Vector_isZero( real*, int ); + void Vector_MakeZero( real*, int ); + void Vector_Copy( real*, real*, int ); + void Vector_Scale( real*, real, real*, int ); + + static inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) { int i; + for ( i = 0; i < k; ++i ) + { dest[i] = c * v[i] + d * y[i]; + } } + + static inline void Vector_Add( real* dest, real c, real* v, int k ) { int i; + for ( i = 0; i < k; ++i ) + { dest[i] += c * v[i]; + } } + real Dot( real*, real*, int ); + +real Dot_local( real*, real*, int ); + real Norm( real*, int ); + void Vector_Print( FILE*, char*, real*, int ); void rvec_Copy( rvec, rvec ); + void rvec_Scale( rvec, real, rvec ); + void rvec_Add( rvec, rvec ); + void rvec_ScaledAdd( rvec, real, rvec ); + void rvec_Sum( rvec, rvec, rvec ); + void rvec_ScaledSum( rvec, real, rvec, real, rvec ); + real rvec_Dot( rvec, rvec ); + real rvec_ScaledDot( real, rvec, real, rvec ); + void rvec_Multiply( rvec, rvec, rvec ); + void rvec_iMultiply( rvec, ivec, rvec ); + void rvec_Divide( rvec, rvec, rvec ); + void rvec_iDivide( rvec, rvec, ivec ); + void rvec_Invert( rvec, rvec ); + void rvec_Cross( rvec, rvec, rvec ); + void rvec_OuterProduct( rtensor, rvec, rvec ); + real rvec_Norm_Sqr( rvec ); + real rvec_Norm( rvec ); + int rvec_isZero( rvec ); + void rvec_MakeZero( rvec ); + void rvec_Random( rvec ); void rtensor_MakeZero( rtensor ); + void rtensor_Multiply( rtensor, rtensor, rtensor ); + void rtensor_MatVec( rvec, rtensor, rvec ); + void rtensor_Scale( rtensor, real, rtensor ); + void rtensor_Add( rtensor, rtensor ); + void rtensor_ScaledAdd( rtensor, real, rtensor ); + void rtensor_Sum( rtensor, rtensor, rtensor ); + void rtensor_ScaledSum( rtensor, real, rtensor, real, rtensor ); + void rtensor_Scale( rtensor, real, rtensor ); + void rtensor_Copy( rtensor, rtensor ); + void rtensor_Identity( rtensor ); + void rtensor_Transpose( rtensor, rtensor ); + real rtensor_Det( rtensor ); + real rtensor_Trace( rtensor ); void Print_rTensor(FILE*, rtensor); -int ivec_isZero( ivec ); -int ivec_isEqual( ivec, ivec ); +int ivec_isZero( ivec ); + +int ivec_isEqual( ivec, ivec ); + void ivec_MakeZero( ivec ); + void ivec_Copy( ivec, ivec ); + void ivec_Scale( ivec, 
real, ivec ); + void ivec_rScale( ivec, real, rvec ); + void ivec_Sum( ivec, ivec, ivec ); + void ivec_ScaledSum( ivec, int, ivec, int, ivec ); + void ivec_Add( ivec, ivec ); + void ivec_ScaledAdd( ivec, int, ivec ); + void ivec_Max( ivec, ivec, ivec ); + void ivec_Max3( ivec, ivec, ivec, ivec ); + #endif -- GitLab
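A closing note on the timing aggregation shared by CG(), PIPECG(), and PIPECR() above: the master rank passes MPI_IN_PLACE so its own timings array serves as both send and receive buffer, while every other rank sends its local timings and may pass NULL for the unused receive buffer. A minimal standalone illustration of that MPI_Reduce pattern (not part of the patch):

    #include <stdio.h>
    #include <mpi.h>

    int main( int argc, char *argv[] )
    {
        int rank, nprocs;
        double timings[5] = { 1.0, 2.0, 3.0, 4.0, 5.0 };   /* per-rank timings */

        MPI_Init( &argc, &argv );
        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
        MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

        if ( rank == 0 )
        {
            /* root: in-place reduction, timings[] ends up holding the global sums */
            MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
            printf( "mean of timings[0] over %d ranks: %f\n", nprocs, timings[0] / nprocs );
        }
        else
        {
            /* non-root: send local timings; the receive buffer is ignored */
            MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
        }

        MPI_Finalize( );
        return 0;
    }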