From a31f1f11e67f9413d995906f7490e04d319ee80c Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnk@msu.edu> Date: Sat, 5 Jan 2019 16:47:39 -0500 Subject: [PATCH] PuReMD-old: add preconditioned pipelined CG (PIPECG) (WIP). --- PuReMD/src/allocate.c | 331 ++++---- PuReMD/src/basic_comm.c | 494 ++++++++---- PuReMD/src/basic_comm.h | 33 +- PuReMD/src/forces.c | 63 +- PuReMD/src/linear_solvers.c | 1519 ++++++++++++++++------------------- PuReMD/src/linear_solvers.h | 21 +- PuReMD/src/parallelreax.c | 60 +- PuReMD/src/qEq.c | 108 ++- PuReMD/src/reax_types.h | 18 +- PuReMD/src/vector.c | 16 + PuReMD/src/vector.h | 67 +- 11 files changed, 1468 insertions(+), 1262 deletions(-) diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c index bb8bef3e..98fb6502 100644 --- a/PuReMD/src/allocate.c +++ b/PuReMD/src/allocate.c @@ -147,7 +147,7 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->bond_mark, "bond_mark" ); sfree( workspace->done_after, "done_after" ); - /* QEq storage */ + /* CM storage */ sfree( workspace->Hdia_inv, "Hdia_inv" ); sfree( workspace->b_s, "b_s" ); sfree( workspace->b_t, "b_t" ); @@ -159,28 +159,54 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->b, "b" ); sfree( workspace->x, "x" ); - /* GMRES storage */ - for ( i = 0; i < RESTART + 1; ++i ) + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S ) { - sfree( workspace->h[i], "h[i]" ); - sfree( workspace->v[i], "v[i]" ); + for ( i = 0; i < RESTART + 1; ++i ) + { + sfree( workspace->h[i], "h[i]" ); + sfree( workspace->v[i], "v[i]" ); + } + + sfree( workspace->y, "y" ); + sfree( workspace->g, "g" ); + sfree( workspace->hc, "hc" ); + sfree( workspace->hs, "hs" ); + sfree( workspace->h, "h" ); + sfree( workspace->v, "v" ); + } + + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S + || control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->z, "z" ); + } + + if ( control->cm_solver_type == CG_S + || control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->d, "d" ); + sfree( workspace->p, "p" ); + sfree( workspace->q, "q" ); + sfree( workspace->r, "r" ); + } + + if ( control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->m, "m" ); + sfree( workspace->n, "n" ); + sfree( workspace->u, "u" ); + sfree( workspace->w, "w" ); + } + + if ( control->cm_solver_type == CG_S ) + { + sfree( workspace->r2, "r2" ); + sfree( workspace->d2, "d2" ); + sfree( workspace->q2, "q2" ); + sfree( workspace->p2, "p2" ); } - sfree( workspace->h, "h" ); - sfree( workspace->v, "v" ); - sfree( workspace->y, "y" ); - sfree( workspace->z, "z" ); - sfree( workspace->g, "g" ); - sfree( workspace->hs, "hs" ); - sfree( workspace->hc, "hc" ); - /* CG storage */ - sfree( workspace->r, "r" ); - sfree( workspace->d, "d" ); - sfree( workspace->q, "q" ); - sfree( workspace->p, "p" ); - sfree( workspace->r2, "r2" ); - sfree( workspace->d2, "d2" ); - sfree( workspace->q2, "q2" ); - sfree( workspace->p2, "p2" ); /* integrator */ // sfree( workspace->f_old ); @@ -239,8 +265,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) int Allocate_Workspace( reax_system *system, control_params *control, - storage *workspace, int local_cap, int total_cap, - MPI_Comm comm, char *msg ) + storage *workspace, int local_cap, int total_cap, + MPI_Comm comm, char *msg ) { int i, total_real, total_rvec, local_rvec; @@ -252,129 +278,147 @@ int Allocate_Workspace( reax_system *system, control_params 
*control, /* communication storage */ for ( i = 0; i < MAX_NBRS; ++i ) { - workspace->tmp_dbl[i] = (real*) - scalloc( total_cap, sizeof(real), "tmp_dbl", comm ); - workspace->tmp_rvec[i] = (rvec*) - scalloc( total_cap, sizeof(rvec), "tmp_rvec", comm ); - workspace->tmp_rvec2[i] = (rvec2*) - scalloc( total_cap, sizeof(rvec2), "tmp_rvec2", comm ); + workspace->tmp_dbl[i] = scalloc( total_cap, sizeof(real), "tmp_dbl", comm ); + workspace->tmp_rvec[i] = scalloc( total_cap, sizeof(rvec), "tmp_rvec", comm ); + workspace->tmp_rvec2[i] = scalloc( total_cap, sizeof(rvec2), "tmp_rvec2", comm ); } /* bond order related storage */ - workspace->within_bond_box = (int*) - scalloc( total_cap, sizeof(int), "skin", comm ); - workspace->total_bond_order = (real*) smalloc( total_real, "total_bo", comm ); - workspace->Deltap = (real*) smalloc( total_real, "Deltap", comm ); - workspace->Deltap_boc = (real*) smalloc( total_real, "Deltap_boc", comm ); - workspace->dDeltap_self = (rvec*) smalloc( total_rvec, "dDeltap_self", comm ); - workspace->Delta = (real*) smalloc( total_real, "Delta", comm ); - workspace->Delta_lp = (real*) smalloc( total_real, "Delta_lp", comm ); - workspace->Delta_lp_temp = (real*) - smalloc( total_real, "Delta_lp_temp", comm ); - workspace->dDelta_lp = (real*) smalloc( total_real, "dDelta_lp", comm ); - workspace->dDelta_lp_temp = (real*) - smalloc( total_real, "dDelta_lp_temp", comm ); - workspace->Delta_e = (real*) smalloc( total_real, "Delta_e", comm ); - workspace->Delta_boc = (real*) smalloc( total_real, "Delta_boc", comm ); - workspace->nlp = (real*) smalloc( total_real, "nlp", comm ); - workspace->nlp_temp = (real*) smalloc( total_real, "nlp_temp", comm ); - workspace->Clp = (real*) smalloc( total_real, "Clp", comm ); - workspace->vlpex = (real*) smalloc( total_real, "vlpex", comm ); - workspace->bond_mark = (int*) - scalloc( total_cap, sizeof(int), "bond_mark", comm ); - workspace->done_after = (int*) - scalloc( total_cap, sizeof(int), "done_after", comm ); - // fprintf( stderr, "p%d: bond order storage\n", system->my_rank ); - - /* QEq storage */ - workspace->Hdia_inv = (real*) - scalloc( total_cap, sizeof(real), "Hdia_inv", comm ); - workspace->b_s = (real*) scalloc( total_cap, sizeof(real), "b_s", comm ); - workspace->b_t = (real*) scalloc( total_cap, sizeof(real), "b_t", comm ); - workspace->b_prc = (real*) scalloc( total_cap, sizeof(real), "b_prc", comm ); - workspace->b_prm = (real*) scalloc( total_cap, sizeof(real), "b_prm", comm ); - workspace->s = (real*) scalloc( total_cap, sizeof(real), "s", comm ); - workspace->t = (real*) scalloc( total_cap, sizeof(real), "t", comm ); - workspace->droptol = (real*) - scalloc( total_cap, sizeof(real), "droptol", comm ); - workspace->b = (rvec2*) scalloc( total_cap, sizeof(rvec2), "b", comm ); - workspace->x = (rvec2*) scalloc( total_cap, sizeof(rvec2), "x", comm ); - - /* GMRES storage */ - workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y", comm ); - workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z", comm ); - workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g", comm ); - workspace->h = (real**) scalloc( RESTART + 1, sizeof(real*), "h", comm ); - workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs", comm ); - workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc", comm ); - workspace->v = (real**) scalloc( RESTART + 1, sizeof(real*), "v", comm ); - - for ( i = 0; i < RESTART + 1; ++i ) + workspace->within_bond_box = scalloc( total_cap, sizeof(int), "skin", comm ); + 
workspace->total_bond_order = smalloc( total_real, "total_bo", comm ); + workspace->Deltap = smalloc( total_real, "Deltap", comm ); + workspace->Deltap_boc = smalloc( total_real, "Deltap_boc", comm ); + workspace->dDeltap_self = smalloc( total_rvec, "dDeltap_self", comm ); + workspace->Delta = smalloc( total_real, "Delta", comm ); + workspace->Delta_lp = smalloc( total_real, "Delta_lp", comm ); + workspace->Delta_lp_temp = smalloc( total_real, "Delta_lp_temp", comm ); + workspace->dDelta_lp = smalloc( total_real, "dDelta_lp", comm ); + workspace->dDelta_lp_temp = smalloc( total_real, "dDelta_lp_temp", comm ); + workspace->Delta_e = smalloc( total_real, "Delta_e", comm ); + workspace->Delta_boc = smalloc( total_real, "Delta_boc", comm ); + workspace->nlp = smalloc( total_real, "nlp", comm ); + workspace->nlp_temp = smalloc( total_real, "nlp_temp", comm ); + workspace->Clp = smalloc( total_real, "Clp", comm ); + workspace->vlpex = smalloc( total_real, "vlpex", comm ); + workspace->bond_mark = scalloc( total_cap, sizeof(int), "bond_mark", comm ); + workspace->done_after = scalloc( total_cap, sizeof(int), "done_after", comm ); + + /* CM storage */ + workspace->Hdia_inv = scalloc( total_cap, sizeof(real), "Hdia_inv", comm ); + workspace->b_s = scalloc( total_cap, sizeof(real), "b_s", comm ); + workspace->b_t = scalloc( total_cap, sizeof(real), "b_t", comm ); + workspace->b_prc = scalloc( total_cap, sizeof(real), "b_prc", comm ); + workspace->b_prm = scalloc( total_cap, sizeof(real), "b_prm", comm ); + workspace->s = scalloc( total_cap, sizeof(real), "s", comm ); + workspace->t = scalloc( total_cap, sizeof(real), "t", comm ); + workspace->droptol = scalloc( total_cap, sizeof(real), "droptol", comm ); + workspace->b = scalloc( total_cap, sizeof(rvec2), "b", comm ); + workspace->x = scalloc( total_cap, sizeof(rvec2), "x", comm ); + + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S ) { - workspace->h[i] = (real*) scalloc( RESTART + 1, sizeof(real), "h[i]", comm ); - workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]", comm ); + workspace->y = scalloc( RESTART + 1, sizeof(real), "y", comm ); + workspace->g = scalloc( RESTART + 1, sizeof(real), "g", comm ); + workspace->hc = scalloc( RESTART + 1, sizeof(real), "hc", comm ); + workspace->hs = scalloc( RESTART + 1, sizeof(real), "hs", comm ); + workspace->h = scalloc( RESTART + 1, sizeof(real*), "h", comm ); + workspace->v = scalloc( RESTART + 1, sizeof(real*), "v", comm ); + + for ( i = 0; i < RESTART + 1; ++i ) + { + workspace->h[i] = (real*) scalloc( RESTART + 1, sizeof(real), "h[i]", comm ); + workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]", comm ); + } } - /* CG storage */ - workspace->r = (real*) scalloc( total_cap, sizeof(real), "r", comm ); - workspace->d = (real*) scalloc( total_cap, sizeof(real), "d", comm ); - workspace->q = (real*) scalloc( total_cap, sizeof(real), "q", comm ); - workspace->p = (real*) scalloc( total_cap, sizeof(real), "p", comm ); - workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2", comm ); - workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2", comm ); - workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2", comm ); - workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2", comm ); + if ( control->cm_solver_type == GMRES_S + || control->cm_solver_type == GMRES_H_S + || control->cm_solver_type == PIPECG_S ) + { + workspace->z = scalloc( RESTART + 1, sizeof(real), "z", comm ); + } + else if ( 
control->cm_solver_type == PIPECG_S ) + { + workspace->z = scalloc( total_cap, sizeof(real), "z", comm ); + } + + if ( control->cm_solver_type == CG_S + || control->cm_solver_type == PIPECG_S ) + { + workspace->d = scalloc( total_cap, sizeof(real), "d", comm ); + workspace->p = scalloc( total_cap, sizeof(real), "p", comm ); + workspace->q = scalloc( total_cap, sizeof(real), "q", comm ); + workspace->r = scalloc( total_cap, sizeof(real), "r", comm ); + } + + if ( control->cm_solver_type == PIPECG_S ) + { + workspace->m = scalloc( total_cap, sizeof(real), "m", comm ); + workspace->n = scalloc( total_cap, sizeof(real), "n", comm ); + workspace->u = scalloc( total_cap, sizeof(real), "u", comm ); + workspace->w = scalloc( total_cap, sizeof(real), "w", comm ); + } + + if ( control->cm_solver_type == CG_S ) + { + workspace->d2 = scalloc( total_cap, sizeof(rvec2), "d2", comm ); + workspace->r2 = scalloc( total_cap, sizeof(rvec2), "r2", comm ); + workspace->p2 = scalloc( total_cap, sizeof(rvec2), "p2", comm ); + workspace->q2 = scalloc( total_cap, sizeof(rvec2), "q2", comm ); + } /* integrator storage */ - workspace->v_const = (rvec*) smalloc( local_rvec, "v_const", comm ); + workspace->v_const = smalloc( local_rvec, "v_const", comm ); /* storage for analysis */ if ( control->molecular_analysis || control->diffusion_coef ) { - workspace->mark = (int*) scalloc( local_cap, sizeof(int), "mark", comm ); - workspace->old_mark = (int*) - scalloc( local_cap, sizeof(int), "old_mark", comm ); + workspace->mark = scalloc( local_cap, sizeof(int), "mark", comm ); + workspace->old_mark = scalloc( local_cap, sizeof(int), "old_mark", comm ); } else - workspace->mark = workspace->old_mark = NULL; + { + workspace->mark = NULL; + workspace->old_mark = NULL; + } if ( control->diffusion_coef ) - workspace->x_old = (rvec*) - scalloc( local_cap, sizeof(rvec), "x_old", comm ); - else workspace->x_old = NULL; + { + workspace->x_old = scalloc( local_cap, sizeof(rvec), "x_old", comm ); + } + else + { + workspace->x_old = NULL; + } /* force related storage */ - workspace->f = (rvec*) scalloc( total_cap, sizeof(rvec), "f", comm ); - workspace->CdDelta = (real*) - scalloc( total_cap, sizeof(real), "CdDelta", comm ); + workspace->f = scalloc( total_cap, sizeof(rvec), "f", comm ); + workspace->CdDelta = scalloc( total_cap, sizeof(real), "CdDelta", comm ); #ifdef TEST_FORCES - workspace->dDelta = (rvec*) smalloc( total_rvec, "dDelta", comm ); - workspace->f_ele = (rvec*) smalloc( total_rvec, "f_ele", comm ); - workspace->f_vdw = (rvec*) smalloc( total_rvec, "f_vdw", comm ); - workspace->f_bo = (rvec*) smalloc( total_rvec, "f_bo", comm ); - workspace->f_be = (rvec*) smalloc( total_rvec, "f_be", comm ); - workspace->f_lp = (rvec*) smalloc( total_rvec, "f_lp", comm ); - workspace->f_ov = (rvec*) smalloc( total_rvec, "f_ov", comm ); - workspace->f_un = (rvec*) smalloc( total_rvec, "f_un", comm ); - workspace->f_ang = (rvec*) smalloc( total_rvec, "f_ang", comm ); - workspace->f_coa = (rvec*) smalloc( total_rvec, "f_coa", comm ); - workspace->f_pen = (rvec*) smalloc( total_rvec, "f_pen", comm ); - workspace->f_hb = (rvec*) smalloc( total_rvec, "f_hb", comm ); - workspace->f_tor = (rvec*) smalloc( total_rvec, "f_tor", comm ); - workspace->f_con = (rvec*) smalloc( total_rvec, "f_con", comm ); - workspace->f_tot = (rvec*) smalloc( total_rvec, "f_tot", comm ); + workspace->dDelta = smalloc( total_rvec, "dDelta", comm ); + workspace->f_ele = smalloc( total_rvec, "f_ele", comm ); + workspace->f_vdw = smalloc( total_rvec, "f_vdw", comm ); + 
workspace->f_bo = smalloc( total_rvec, "f_bo", comm ); + workspace->f_be = smalloc( total_rvec, "f_be", comm ); + workspace->f_lp = smalloc( total_rvec, "f_lp", comm ); + workspace->f_ov = smalloc( total_rvec, "f_ov", comm ); + workspace->f_un = smalloc( total_rvec, "f_un", comm ); + workspace->f_ang = smalloc( total_rvec, "f_ang", comm ); + workspace->f_coa = smalloc( total_rvec, "f_coa", comm ); + workspace->f_pen = smalloc( total_rvec, "f_pen", comm ); + workspace->f_hb = smalloc( total_rvec, "f_hb", comm ); + workspace->f_tor = smalloc( total_rvec, "f_tor", comm ); + workspace->f_con = smalloc( total_rvec, "f_con", comm ); + workspace->f_tot = smalloc( total_rvec, "f_tot", comm ); if ( system->my_rank == MASTER_NODE ) { - workspace->rcounts = (int*) - smalloc( system->wsize * sizeof(int), "rcount", comm ); - workspace->displs = (int*) - smalloc( system->wsize * sizeof(int), "displs", comm ); - workspace->id_all = (int*) - smalloc( system->bigN * sizeof(int), "id_all", comm ); - workspace->f_all = (rvec*) - smalloc( system->bigN * sizeof(rvec), "f_all", comm ); + workspace->rcounts = smalloc( system->wsize * sizeof(int), "rcount", comm ); + workspace->displs = smalloc( system->wsize * sizeof(int), "displs", comm ); + workspace->id_all = smalloc( system->bigN * sizeof(int), "id_all", comm ); + workspace->f_all = smalloc( system->bigN * sizeof(rvec), "f_all", comm ); } else { @@ -717,22 +761,27 @@ int Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv, comm = mpi_data->world; - /* in buffers */ - mpi_data->in1_buffer = (void*) - scalloc( est_recv, sizeof(boundary_atom), "in1_buffer", comm ); - mpi_data->in2_buffer = (void*) - scalloc( est_recv, sizeof(boundary_atom), "in2_buffer", comm ); - - /* out buffers */ + /* buffers for incoming messages, + * see SendRecv for MPI datatypes sent */ + mpi_data->in1_buffer = scalloc( est_recv, + MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ), + "Allocate_MPI_Buffers::in1_buffer", comm ); + mpi_data->in2_buffer = scalloc( est_recv, + MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ), + "Allocate_MPI_Buffers::in2_buffer", comm ); + + /* buffers for outgoing messages, + * see SendRecv for MPI datatypes sent */ for ( i = 0; i < MAX_NBRS; ++i ) { - mpi_buf = &( mpi_data->out_buffers[i] ); + mpi_buf = &mpi_data->out_buffers[i]; + /* allocate storage for the neighbor processor i */ - mpi_buf->index = (int*) - scalloc( my_nbrs[i].est_send, sizeof(int), "mpibuf:index", comm ); - mpi_buf->out_atoms = (void*) - scalloc( my_nbrs[i].est_send, sizeof(boundary_atom), "mpibuf:out_atoms", - comm ); + mpi_buf->index = scalloc( my_nbrs[i].est_send, sizeof(int), + "Allocate_MPI_Buffers::mpi_buf->index", comm ); + mpi_buf->out_atoms = scalloc( my_nbrs[i].est_send, + MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ), + "Allocate_MPI_Buffers::mpi_buf->out_atoms", comm ); } #if defined(NEUTRAL_TERRITORY) @@ -742,9 +791,9 @@ int Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv, /* in buffers */ mpi_data->in_nt_buffer[i] = scalloc( my_nt_nbrs[i].est_recv, sizeof(real), "mpibuf:in_nt_buffer", comm ); - /* out buffer */ mpi_buf = &mpi_data->out_nt_buffers[i]; + /* allocate storage for the neighbor processor i */ mpi_buf->index = scalloc( my_nt_nbrs[i].est_send, sizeof(int), "mpibuf:nt_index", comm ); @@ -762,14 +811,14 @@ void Deallocate_MPI_Buffers( mpi_datatypes *mpi_data ) int i; mpi_out_data *mpi_buf; - sfree( mpi_data->in1_buffer, "in1_buffer" ); - sfree( mpi_data->in2_buffer, "in2_buffer" ); + sfree( 
mpi_data->in1_buffer, "Deallocate_MPI_Buffers::in1_buffer" ); + sfree( mpi_data->in2_buffer, "Deallocate_MPI_Buffers::in2_buffer" ); for ( i = 0; i < MAX_NBRS; ++i ) { mpi_buf = &mpi_data->out_buffers[i]; - sfree( mpi_buf->index, "mpibuf:index" ); - sfree( mpi_buf->out_atoms, "mpibuf:out_atoms" ); + sfree( mpi_buf->index, "Deallocate_MPI_Buffers::mpi_buf->index" ); + sfree( mpi_buf->out_atoms, "Deallocate_MPI_Buffers::mpi_buf->out_atoms" ); } #if defined(NEUTRAL_TERRITORY) diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c index 18d27fa1..d9a7e97b 100644 --- a/PuReMD/src/basic_comm.c +++ b/PuReMD/src/basic_comm.c @@ -20,16 +20,21 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" + #if defined(PURE_REAX) -#include "basic_comm.h" -#include "vector.h" + #include "basic_comm.h" + #include "vector.h" #elif defined(LAMMPS_REAX) -#include "reax_basic_comm.h" -#include "reax_vector.h" + #include "reax_basic_comm.h" + #include "reax_vector.h" #endif -#if defined(PURE_REAX) -void int_packer( void *dummy, mpi_out_data *out_buf ) + +typedef void (*dist_packer)( void*, mpi_out_data* ); +typedef void (*coll_unpacker)( void*, void*, mpi_out_data* ); + + +static void int_packer( void *dummy, mpi_out_data *out_buf ) { int i; int *buf = (int*) dummy; @@ -37,102 +42,225 @@ void int_packer( void *dummy, mpi_out_data *out_buf ) for ( i = 0; i < out_buf->cnt; ++i ) { + //if( buf[ out_buf->index[i] ] !=-1 ) out[i] = buf[ out_buf->index[i] ]; } } -void real_packer( void *dummy, mpi_out_data *out_buf ) +static void real_packer( void *dummy, mpi_out_data *out_buf ) { int i; real *buf = (real*) dummy; real *out = (real*) out_buf->out_atoms; for ( i = 0; i < out_buf->cnt; ++i ) + { out[i] = buf[ out_buf->index[i] ]; + } } -void rvec_packer( void *dummy, mpi_out_data *out_buf ) +static void rvec_packer( void *dummy, mpi_out_data *out_buf ) { int i; - rvec *buf = (rvec*) dummy; - rvec *out = (rvec*)out_buf->out_atoms; + rvec *buf, *out; + + buf = (rvec*) dummy; + out = (rvec*) out_buf->out_atoms; for ( i = 0; i < out_buf->cnt; ++i ) - memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec) ); + { + memcpy( out + i, buf + out_buf->index[i], sizeof(rvec) ); + } } -void rvec2_packer( void *dummy, mpi_out_data *out_buf ) +static void rvec2_packer( void *dummy, mpi_out_data *out_buf ) { int i; - rvec2 *buf = (rvec2*) dummy; - rvec2 *out = (rvec2*) out_buf->out_atoms; + rvec2 *buf, *out; + + buf = (rvec2*) dummy; + out = (rvec2*) out_buf->out_atoms; for ( i = 0; i < out_buf->cnt; ++i ) - memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec2) ); + { + memcpy( out + i, buf + out_buf->index[i], sizeof(rvec2) ); + } } -void Dist( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, dist_packer pack ) +static void int_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) { - int d; - mpi_out_data *out_bufs; - MPI_Comm comm; - MPI_Request req1, req2; - MPI_Status stat1, stat2; - neighbor_proc *nbr1, *nbr2; + int i; + int *in, *buf; + + in = (int*) dummy_in; + buf = (int*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) + { + if( buf[ out_buf->index[i] ] == -1 && in[i] != -1 ) + { + buf[ out_buf->index[i] ] = in[i]; + } + } +} + + +static void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) +{ + int i; + real *in, *buf; + + in = (real*) dummy_in; + buf = (real*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) + { + buf[ out_buf->index[i] ] += in[i]; + } +} + + +static void rvec_unpacker( 
void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) +{ + int i; + rvec *in, *buf; + + in = (rvec*) dummy_in; + buf = (rvec*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) + { + rvec_Add( buf[ out_buf->index[i] ], in[i] ); #if defined(DEBUG) - fprintf( stderr, "p%d dist: entered\n", system->my_rank ); + fprintf( stderr, "rvec_unpacker: cnt=%d i =%d index[i]=%d\n", + out_buf->cnt, i, out_buf->index[i] ); #endif - comm = mpi_data->comm_mesh3D; - out_bufs = mpi_data->out_buffers; + } +} - for ( d = 0; d < 3; ++d ) + +static void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) +{ + int i; + rvec2 *in, *buf; + + in = (rvec2*) dummy_in; + buf = (rvec2*) dummy_buf; + + for ( i = 0; i < out_buf->cnt; ++i ) { - /* initiate recvs */ - nbr1 = &(system->my_nbrs[2 * d]); - if ( nbr1->atoms_cnt ) - MPI_Irecv( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type, - nbr1->rank, 2 * d + 1, comm, &req1 ); + buf[ out_buf->index[i] ][0] += in[i][0]; + buf[ out_buf->index[i] ][1] += in[i][1]; + } +} - nbr2 = &(system->my_nbrs[2 * d + 1]); - if ( nbr2->atoms_cnt ) - MPI_Irecv( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type, - nbr2->rank, 2 * d, comm, &req2 ); - /* send both messages in dimension d */ - if ( out_bufs[2 * d].cnt ) - { - pack( buf, out_bufs + (2 * d) ); - MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type, - nbr1->rank, 2 * d, comm ); - } +static void * Get_Buffer_Offset( const void * const buffer, + const int offset, const int type ) +{ + void * ptr; - if ( out_bufs[2 * d + 1].cnt ) - { - pack( buf, out_bufs + (2 * d + 1) ); - MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type, - nbr2->rank, 2 * d + 1, comm ); - } + switch ( type ) + { + case INT_PTR_TYPE: + ptr = (int *) buffer + offset; + break; + + case REAL_PTR_TYPE: + ptr = (real *) buffer + offset; + break; + + case RVEC_PTR_TYPE: + ptr = (rvec *) buffer + offset; + break; + + case RVEC2_PTR_TYPE: + ptr = (rvec2 *) buffer + offset; + break; + + default: + fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" ); + exit( UNKNOWN_OPTION ); + break; + } + + return ptr; +} + + +static dist_packer Get_Packer( const int type ) +{ + dist_packer ptr; + + switch ( type ) + { + case INT_PTR_TYPE: + ptr = &int_packer; + break; + + case REAL_PTR_TYPE: + ptr = &real_packer; + break; + + case RVEC_PTR_TYPE: + ptr = &rvec_packer; + break; + + case RVEC2_PTR_TYPE: + ptr = &rvec2_packer; + break; + + default: + fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" ); + exit( UNKNOWN_OPTION ); + break; + } + + return ptr; +} - if ( nbr1->atoms_cnt ) MPI_Wait( &req1, &stat1 ); - if ( nbr2->atoms_cnt ) MPI_Wait( &req2, &stat2 ); + +static coll_unpacker Get_Unpacker( const int type ) +{ + coll_unpacker ptr; + + switch ( type ) + { + case INT_PTR_TYPE: + ptr = &int_unpacker; + break; + + case REAL_PTR_TYPE: + ptr = &real_unpacker; + break; + + case RVEC_PTR_TYPE: + ptr = &rvec_unpacker; + break; + + case RVEC2_PTR_TYPE: + ptr = &rvec2_unpacker; + break; + + default: + fprintf( stderr, "[ERROR] unknown pointer type. 
Terminating...\n" ); + exit( UNKNOWN_OPTION ); + break; } -#if defined(DEBUG) - fprintf( stderr, "p%d dist: done\n", system->my_rank ); -#endif + return ptr; } -#if defined(NEUTRAL_TERRITORY) -void Dist_NT( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, dist_packer pack ) +void Dist( const reax_system * const system, mpi_datatypes * const mpi_data, + void *buf, int buf_type, MPI_Datatype type ) { +#if defined(NEUTRAL_TERRITORY) int d, count, index; mpi_out_data *out_bufs; MPI_Comm comm; @@ -140,31 +268,31 @@ void Dist_NT( reax_system* system, mpi_datatypes *mpi_data, MPI_Status stat[6]; neighbor_proc *nbr; -#if defined(DEBUG) -#endif comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->out_nt_buffers; - count = 0; + /* initiate recvs */ - for( d = 0; d < 6; ++d ) + for ( d = 0; d < 6; ++d ) { - nbr = &(system->my_nt_nbrs[d]); + nbr = &system->my_nt_nbrs[d]; + if ( nbr->atoms_cnt ) { count++; - MPI_Irecv( buf + nbr->atoms_str * scale, nbr->atoms_cnt, type, - nbr->receive_rank, d, comm, &(req[d]) ); + MPI_Irecv( Get_Buffer_Offset( buf, nbr->atoms_str, buf_type ), + nbr->atoms_cnt, type, nbr->receive_rank, d, comm, &req[d] ); } } - for( d = 0; d < 6; ++d) + for ( d = 0; d < 6; ++d) { - nbr = &(system->my_nt_nbrs[d]); + nbr = &system->my_nt_nbrs[d]; + /* send both messages in dimension d */ if ( out_bufs[d].cnt ) { - pack( buf, out_bufs + d ); + pack( buf, out_bufs[d] ); MPI_Send( out_bufs[d].out_atoms, out_bufs[d].cnt, type, nbr->rank, d, comm ); } @@ -178,127 +306,78 @@ void Dist_NT( reax_system* system, mpi_datatypes *mpi_data, #if defined(DEBUG) fprintf( stderr, "p%d dist: done\n", system->my_rank ); #endif -} -#endif - -void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) -{ - int i; - real *in = (real*) dummy_in; - real *buf = (real*) dummy_buf; - - for ( i = 0; i < out_buf->cnt; ++i ) - { - buf[ out_buf->index[i] ] += in[i]; - } -} - - -void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) -{ - int i; - rvec *in = (rvec*) dummy_in; - rvec *buf = (rvec*) dummy_buf; - - for ( i = 0; i < out_buf->cnt; ++i ) - { - rvec_Add( buf[ out_buf->index[i] ], in[i] ); - -#if defined(DEBUG) - fprintf( stderr, "rvec_unpacker: cnt=%d i =%d index[i]=%d\n", - out_buf->cnt, i, out_buf->index[i] ); -#endif - } -} - - -void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf ) -{ - int i; - rvec2 *in = (rvec2*) dummy_in; - rvec2 *buf = (rvec2*) dummy_buf; - - for ( i = 0; i < out_buf->cnt; ++i ) - { - buf[ out_buf->index[i] ][0] += in[i][0]; - buf[ out_buf->index[i] ][1] += in[i][1]; - } -} - - -void Coll( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, coll_unpacker unpack ) -{ +#else int d; - void *in1, *in2; mpi_out_data *out_bufs; MPI_Comm comm; MPI_Request req1, req2; MPI_Status stat1, stat2; - neighbor_proc *nbr1, *nbr2; + const neighbor_proc *nbr1, *nbr2; + dist_packer pack; #if defined(DEBUG) - fprintf( stderr, "p%d coll: entered\n", system->my_rank ); + fprintf( stderr, "p%d dist: entered\n", system->my_rank ); #endif comm = mpi_data->comm_mesh3D; - in1 = mpi_data->in1_buffer; - in2 = mpi_data->in2_buffer; out_bufs = mpi_data->out_buffers; + pack = Get_Packer( buf_type ); - for ( d = 2; d >= 0; --d ) + for ( d = 0; d < 3; ++d ) { /* initiate recvs */ - nbr1 = &(system->my_nbrs[2 * d]); - if ( out_bufs[2 * d].cnt ) - MPI_Irecv(in1, out_bufs[2 * d].cnt, type, nbr1->rank, 2 * d + 1, comm, &req1); - - nbr2 = &(system->my_nbrs[2 * d + 1]); - if ( out_bufs[2 * 
d + 1].cnt ) - MPI_Irecv(in2, out_bufs[2 * d + 1].cnt, type, nbr2->rank, 2 * d, comm, &req2); - - /* send both messages in dimension d */ + nbr1 = &system->my_nbrs[2 * d]; if ( nbr1->atoms_cnt ) - MPI_Send( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type, - nbr1->rank, 2 * d, comm ); + { + MPI_Irecv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ), + nbr1->atoms_cnt, type, nbr1->rank, 2 * d + 1, comm, &req1 ); + } + nbr2 = &system->my_nbrs[2 * d + 1]; if ( nbr2->atoms_cnt ) - MPI_Send( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type, - nbr2->rank, 2 * d + 1, comm ); - -#if defined(DEBUG) - fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n", - system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt, - out_bufs[2 * d].cnt ); - fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n", - system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt, - out_bufs[2 * d + 1].cnt ); -#endif + { + MPI_Irecv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ), + nbr2->atoms_cnt, type, nbr2->rank, 2 * d, comm, &req2 ); + } + /* send both messages in dimension d */ if ( out_bufs[2 * d].cnt ) { - MPI_Wait( &req1, &stat1 ); - unpack( in1, buf, out_bufs + (2 * d) ); + pack( buf, &out_bufs[2 * d] ); + MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, + type, nbr1->rank, 2 * d, comm ); } if ( out_bufs[2 * d + 1].cnt ) + { + pack( buf, &out_bufs[2 * d + 1] ); + MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, + type, nbr2->rank, 2 * d + 1, comm ); + } + + if( nbr1->atoms_cnt ) + { + MPI_Wait( &req1, &stat1 ); + } + if( nbr2->atoms_cnt ) { MPI_Wait( &req2, &stat2 ); - unpack( in2, buf, out_bufs + (2 * d + 1) ); } } + #if defined(DEBUG) - fprintf( stderr, "p%d coll: done\n", system->my_rank ); + fprintf( stderr, "p%d dist: done\n", system->my_rank ); +#endif #endif } +void Coll( const reax_system * const system, mpi_datatypes * const mpi_data, + void *buf, int buf_type, MPI_Datatype type ) +{ #if defined(NEUTRAL_TERRITORY) -void Coll_NT( reax_system* system, mpi_datatypes *mpi_data, - void *buf, MPI_Datatype type, int scale, coll_unpacker unpack ) -{ int d, count, index; void *in[6]; mpi_out_data *out_bufs; @@ -313,53 +392,133 @@ void Coll_NT( reax_system* system, mpi_datatypes *mpi_data, comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->out_nt_buffers; - count = 0; + for ( d = 0; d < 6; ++d ) { nbr = &system->my_nt_nbrs[d]; in[d] = mpi_data->in_nt_buffer[d]; + if ( out_bufs[d].cnt ) { count++; - MPI_Irecv(in[d], out_bufs[d].cnt, type, nbr->rank, d, comm, &(req[d])); + MPI_Irecv( in[d], out_bufs[d].cnt, type, nbr->rank, d, comm, &req[d] ); } } - for( d = 0; d < 6; ++d ) + for ( d = 0; d < 6; ++d ) { + nbr = &system->my_nt_nbrs[d]; + /* send both messages in direction d */ - nbr = &(system->my_nt_nbrs[d]); if ( nbr->atoms_cnt ) { - MPI_Send( buf + nbr->atoms_str * scale, nbr->atoms_cnt, type, - nbr->receive_rank, d, comm ); + MPI_Send( Get_Buffer_Offset( buf, nbr->atoms_str, buf_type ), + nbr->atoms_cnt, type, nbr->receive_rank, d, comm ); } } for ( d = 0; d < count; ++d ) { MPI_Waitany( REAX_MAX_NT_NBRS, req, &index, stat); - unpack( in[index], buf, out_bufs + index ); + unpack( in[index], buf, out_bufs[index] ); } #if defined(DEBUG) fprintf( stderr, "p%d coll: done\n", system->my_rank ); #endif -} + +#else + int d; + mpi_out_data *out_bufs; + MPI_Comm comm; + MPI_Request req1, req2; + MPI_Status stat1, stat2; + const neighbor_proc *nbr1, *nbr2; + coll_unpacker unpack; + +#if defined(DEBUG) + fprintf( stderr, "p%d coll: entered\n", system->my_rank ); #endif -#endif 
/*PURE_REAX*/ + + comm = mpi_data->comm_mesh3D; + out_bufs = mpi_data->out_buffers; + unpack = Get_Unpacker( buf_type ); + + for ( d = 2; d >= 0; --d ) + { + /* initiate recvs */ + nbr1 = &system->my_nbrs[2 * d]; + + if ( out_bufs[2 * d].cnt ) + { + MPI_Irecv( mpi_data->in1_buffer, out_bufs[2 * d].cnt, + type, nbr1->rank, 2 * d + 1, comm, &req1 ); + } + + nbr2 = &system->my_nbrs[2 * d + 1]; + + if ( out_bufs[2 * d + 1].cnt ) + { + + MPI_Irecv( mpi_data->in2_buffer, out_bufs[2 * d + 1].cnt, + type, nbr2->rank, 2 * d, comm, &req2 ); + } + + /* send both messages in dimension d */ + if ( nbr1->atoms_cnt ) + { + MPI_Send( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ), + nbr1->atoms_cnt, type, nbr1->rank, 2 * d, comm ); + } + + if ( nbr2->atoms_cnt ) + { + MPI_Send( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ), + nbr2->atoms_cnt, type, nbr2->rank, 2 * d + 1, comm ); + } + +#if defined(DEBUG) + fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n", + system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt, + out_bufs[2 * d].cnt ); + fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n", + system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt, + out_bufs[2 * d + 1].cnt ); +#endif + + if ( out_bufs[2 * d].cnt ) + { + MPI_Wait( &req1, &stat1 ); + unpack( mpi_data->in1_buffer, buf, &out_bufs[2 * d] ); + } + + if ( out_bufs[2 * d + 1].cnt ) + { + MPI_Wait( &req2, &stat2 ); + unpack( mpi_data->in2_buffer, buf, &out_bufs[2 * d + 1] ); + } + } + +#if defined(DEBUG) + fprintf( stderr, "p%d coll: done\n", system->my_rank ); +#endif +#endif +} /*****************************************************************************/ real Parallel_Norm( real *v, int n, MPI_Comm comm ) { - int i; + int i; real my_sum, norm_sqr; - my_sum = 0; + my_sum = 0.0; + for ( i = 0; i < n; ++i ) + { my_sum += SQR( v[i] ); + } MPI_Allreduce( &my_sum, &norm_sqr, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -367,15 +526,17 @@ real Parallel_Norm( real *v, int n, MPI_Comm comm ) } - real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm ) { int i; real my_dot, res; - my_dot = 0; + my_dot = 0.0; + for ( i = 0; i < n; ++i ) + { my_dot += v1[i] * v2[i]; + } MPI_Allreduce( &my_dot, &res, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -383,7 +544,6 @@ real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm ) } - real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm ) { int i; diff --git a/PuReMD/src/basic_comm.h b/PuReMD/src/basic_comm.h index 8c914973..bf461c05 100644 --- a/PuReMD/src/basic_comm.h +++ b/PuReMD/src/basic_comm.h @@ -24,30 +24,33 @@ #include "reax_types.h" -void int_packer( void*, mpi_out_data* ); -void real_packer( void*, mpi_out_data* ); -void rvec_packer( void*, mpi_out_data* ); -void rvec2_packer( void*, mpi_out_data* ); -void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer); -#if defined(NEUTRAL_TERRITORY) -void Dist_NT(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer); -#endif -void real_unpacker( void*, void*, mpi_out_data* ); -void rvec_unpacker( void*, void*, mpi_out_data* ); -void rvec2_unpacker( void*, void*, mpi_out_data* ); -void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, coll_unpacker ); -#if defined(NEUTRAL_TERRITORY) -void Coll_NT( reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, coll_unpacker ); -#endif +enum pointer_type +{ + INT_PTR_TYPE = 0, + REAL_PTR_TYPE = 1, + RVEC_PTR_TYPE = 2, + RVEC2_PTR_TYPE = 3, +}; + + +void Dist( const reax_system * const, mpi_datatypes * const, + void*, int, MPI_Datatype ); + +void Coll( const 
reax_system * const, mpi_datatypes * const, + void*, int, MPI_Datatype ); real Parallel_Norm( real*, int, MPI_Comm ); + real Parallel_Dot( real*, real*, int, MPI_Comm ); + real Parallel_Vector_Acc( real*, int, MPI_Comm ); #if defined(TEST_FORCES) void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* ); + void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* ); #endif + #endif diff --git a/PuReMD/src/forces.c b/PuReMD/src/forces.c index 4ebaf651..da66053c 100644 --- a/PuReMD/src/forces.c +++ b/PuReMD/src/forces.c @@ -147,49 +147,60 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control, /* this version of Compute_Total_Force computes forces from - coefficients accumulated by all interaction functions. - Saves enormous time & space! */ + * coefficients accumulated by all interaction functions. + * Saves enormous time & space! */ void Compute_Total_Force( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, mpi_datatypes *mpi_data ) + simulation_data *data, storage *workspace, + reax_list **lists, mpi_datatypes *mpi_data ) { int i, pj; reax_list *bonds = lists[BONDS]; for ( i = 0; i < system->N; ++i ) + { for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + { if ( i < bonds->bond_list[pj].nbr ) { if ( control->virial == 0 ) + { Add_dBond_to_Forces( i, pj, workspace, lists ); + } else + { Add_dBond_to_Forces_NPT( i, pj, data, workspace, lists ); + } } + } + } //Print_Total_Force( system, data, workspace ); + #if defined(PURE_REAX) /* now all forces are computed to their partially-final values - based on the neighbors information each processor has had. - final values of force on each atom needs to be computed by adding up - all partially-final pieces */ - Coll( system, mpi_data, workspace->f, mpi_data->mpi_rvec, - sizeof(rvec) / sizeof(void), rvec_unpacker ); + * based on the neighbors information each processor has had. 
+ * final values of force on each atom needs to be computed by adding up + * all partially-final pieces */ + Coll( system, mpi_data, workspace->f, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + for ( i = 0; i < system->n; ++i ) + { rvec_Copy( system->my_atoms[i].f, workspace->f[i] ); + } #if defined(TEST_FORCES) - Coll( system, mpi_data, workspace->f_ele, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_vdw, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_be, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_lp, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_ov, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_un, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_ang, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_coa, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_pen, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_hb, mpi_data->mpi_rvec, rvec_unpacker ); - Coll( system, mpi_data, workspace->f_tor, mpi_data->mpi_rvec, rvec_unpacker); - Coll( system, mpi_data, workspace->f_con, mpi_data->mpi_rvec, rvec_unpacker); + Coll( system, mpi_data, workspace->f_ele, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_vdw, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_be, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_lp, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_ov, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_un, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_ang, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_coa, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_pen, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_hb, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_tor, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); + Coll( system, mpi_data, workspace->f_con, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); #endif #endif @@ -2309,7 +2320,7 @@ void Compute_Forces( reax_system *system, control_params *control, /********* bonded interactions ************/ Compute_Bonded_Forces( system, control, data, workspace, - lists, out_control, mpi_data->world ); + lists, out_control, mpi_data->world ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( mpi_data->world ); @@ -2320,6 +2331,7 @@ void Compute_Forces( reax_system *system, control_params *control, t_start = t_end; } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: completed bonded\n", system->my_rank, data->step ); @@ -2329,7 +2341,9 @@ void Compute_Forces( reax_system *system, control_params *control, /**************** qeq ************************/ #if defined(PURE_REAX) if ( qeq_flag ) + { QEq( system, control, data, workspace, out_control, mpi_data ); + } #if defined(LOG_PERFORMANCE) //MPI_Barrier( mpi_data->world ); @@ -2340,6 +2354,7 @@ void Compute_Forces( reax_system *system, control_params *control, t_start = t_end; } #endif + #if defined(DEBUG_FOCUS) fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step); MPI_Barrier( mpi_data->world ); @@ -2348,7 +2363,7 @@ void Compute_Forces( reax_system *system, control_params *control, /********* nonbonded interactions ************/ Compute_NonBonded_Forces( system, control, 
data, workspace, - lists, out_control, mpi_data->world ); + lists, out_control, mpi_data->world ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( mpi_data->world ); @@ -2359,6 +2374,7 @@ void Compute_Forces( reax_system *system, control_params *control, t_start = t_end; } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n", system->my_rank, data->step ); @@ -2376,6 +2392,7 @@ void Compute_Forces( reax_system *system, control_params *control, data->timing.bonded += t_end - t_start; } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: total forces computed\n", system->my_rank, data->step ); diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c index d1ad369b..f29eab07 100644 --- a/PuReMD/src/linear_solvers.c +++ b/PuReMD/src/linear_solvers.c @@ -106,6 +106,163 @@ static int find_bucket( double *list, int len, double a ) } +static void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) +{ + int i, j, k, si; + real H; + + for ( i = 0; i < N; ++i ) + { + b[i][0] = b[i][1] = 0; + } + + if ( A->format == SYM_HALF_MATRIX ) + { + /* perform multiplication */ + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + + b[i][0] += A->entries[si].val * x[i][0]; + b[i][1] += A->entries[si].val * x[i][1]; + + for ( k = si + 1; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + H = A->entries[k].val; + + b[i][0] += H * x[j][0]; + b[i][1] += H * x[j][1]; + + // comment out for tryQEq + //if( j < A->n ) { + b[j][0] += H * x[i][0]; + b[j][1] += H * x[i][1]; + //} + } + } + } + else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) + { + /* perform multiplication */ + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + + for ( k = si; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + H = A->entries[k].val; + + b[i][0] += H * x[j][0]; + b[i][1] += H * x[j][1]; + } + } + } +} + + +static void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) +{ + int i, j, k, si, num_rows; + real val; + + for ( i = 0; i < N; ++i ) + { + b[i] = 0.0; + } + +#if defined(NEUTRAL_TERRITORY) + num_rows = A->NT; + + if ( A->format == SYM_HALF_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + /* diagonal only contributes once */ + if( i < A->n ) + { + b[i] += A->entries[si].val * x[i]; + k = si + 1; + } + /* zeros on the diagonal for i >= A->n, + * so skip the diagonal multplication step as zeros + * are not stored (idea: keep the NNZ's the same + * for full shell and neutral territory half-stored + * charge matrices to make debugging easier) */ + else + { + k = si; + } + + for ( ; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + b[j] += val * x[i]; + } + } + } + else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + for ( k = si; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + } + } + } +#else + num_rows = A->n; + + if ( A->format == SYM_HALF_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + /* diagonal only contributes once */ + b[i] += A->entries[si].val * x[i]; + + for ( k = si + 1; k < A->end[i]; ++k ) + { + j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + b[j] += val * x[i]; + } + } + } + else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) + { + for ( i = 0; i < num_rows; ++i ) + { + si = A->start[i]; + + for ( k = si; k < A->end[i]; ++k ) + { + 
j = A->entries[k].j; + val = A->entries[k].val; + + b[i] += val * x[j]; + } + } + } +#endif +} + + real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, storage *workspace, mpi_datatypes *mpi_data, sparse_matrix *A, sparse_matrix **A_spar_patt, int nprocs, real filter ) @@ -508,7 +665,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, lapack_int m, n, nrhs, lda, ldb, info; int *pos_x, *X; real *e_j, *dense_matrix; - int cnt, scale; + int cnt; reax_atom *atom; int *row_nnz; @@ -580,10 +737,9 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, /* Announce the nnz's in each row that will be communicated later */ t_start = MPI_Wtime(); - scale = sizeof(int) / sizeof(void); - Dist_NT( system, mpi_data, row_nnz, MPI_INT, scale, int_packer ); + Dist( system, mpi_data, row_nnz, REAL_PTR_TYPE, MPI_INT ); t_comm += MPI_Wtime() - t_start; - fprintf( stdout,"SAI after Dist_NT call\n"); + fprintf( stdout,"SAI after Dist call\n"); fflush( stdout ); comm = mpi_data->comm_mesh3D; @@ -612,11 +768,11 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, j_recv[d] = (int *) malloc( sizeof(int) * cnt ); val_recv[d] = (real *) malloc( sizeof(real) * cnt ); - fprintf( stdout,"Dist_NT communication receive phase direction %d will receive %d\n", d, cnt); + fprintf( stdout,"Dist communication receive phase direction %d will receive %d\n", d, cnt); fflush( stdout ); t_start = MPI_Wtime(); - MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank, d, comm, &(req[2*d]) ); - MPI_Irecv( val_recv + d, cnt, MPI_DOUBLE, nbr->receive_rank, d, comm, &(req[2*d+1]) ); + MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank, d, comm, &req[2 * d] ); + MPI_Irecv( val_recv + d, cnt, MPI_DOUBLE, nbr->receive_rank, d, comm, &req[2 * d + 1] ); t_comm += MPI_Wtime() - t_start; } } @@ -639,7 +795,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, } // row_nnz[ out_bufs[d].index[i] ]; } - fprintf( stdout,"Dist_NT communication send phase direction %d should send %d\n", d, cnt); + fprintf( stdout,"Dist communication send phase direction %d should send %d\n", d, cnt); fflush( stdout ); if( cnt ) @@ -659,21 +815,21 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, } } - fprintf( stdout,"Dist_NT communication send phase direction %d will send %d\n", d, cnt ); + fprintf( stdout,"Dist communication send phase direction %d will send %d\n", d, cnt ); fflush( stdout ); t_start = MPI_Wtime(); MPI_Send( j_send, cnt, MPI_INT, nbr->rank, d, comm ); - fprintf( stdout,"Dist_NT communication send phase direction %d cnt = %d\n", d, cnt); + fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt); fflush( stdout ); MPI_Send( val_send, cnt, MPI_DOUBLE, nbr->rank, d, comm ); - fprintf( stdout,"Dist_NT communication send phase direction %d cnt = %d\n", d, cnt); + fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt); fflush( stdout ); t_comm += MPI_Wtime() - t_start; } } } - fprintf( stdout," Dist_NT communication for sending row info before waitany\n"); + fprintf( stdout," Dist communication for sending row info before waitany\n"); fflush( stdout ); /////////////////////// for ( d = 0; d < count; ++d ) @@ -708,7 +864,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, } } ////////////////////// - fprintf( stdout," wow wow wow, Dist_NT communication for sending row info worked\n"); + fprintf( stdout," wow wow wow, Dist communication for 
sending row info worked\n"); fflush( stdout ); //TODO: size? X = (int *) malloc( sizeof(int) * (system->bigN + 1) ); @@ -883,6 +1039,8 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, return MPI_Wtime() - start; } + + #else real sparse_approx_inverse( reax_system *system, simulation_data *data, storage *workspace, mpi_datatypes *mpi_data, @@ -896,7 +1054,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, int *X, *q; real *e_j, *dense_matrix; int size_e, size_dense; - int cnt, scale; + int cnt; reax_atom *atom; int *row_nnz; int **j_list; @@ -968,8 +1126,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, /* Announce the nnz's in each row that will be communicated later */ t_start = MPI_Wtime(); - scale = sizeof(int) / sizeof(void); - Dist( system, mpi_data, row_nnz, MPI_INT, scale, int_packer ); + Dist( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT ); t_comm += MPI_Wtime() - t_start; comm = mpi_data->comm_mesh3D; @@ -1424,66 +1581,11 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data, #endif -void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) -{ - int i, j, k, si; - real H; - - for ( i = 0; i < N; ++i ) - { - b[i][0] = b[i][1] = 0; - } - - if ( A->format == SYM_HALF_MATRIX ) - { - /* perform multiplication */ - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - - b[i][0] += A->entries[si].val * x[i][0]; - b[i][1] += A->entries[si].val * x[i][1]; - - for ( k = si + 1; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - H = A->entries[k].val; - - b[i][0] += H * x[j][0]; - b[i][1] += H * x[j][1]; - - // comment out for tryQEq - //if( j < A->n ) { - b[j][0] += H * x[i][0]; - b[j][1] += H * x[i][1]; - //} - } - } - } - else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) - { - /* perform multiplication */ - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - - for ( k = si; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - H = A->entries[k].val; - - b[i][0] += H * x[j][0]; - b[i][1] += H * x[j][1]; - } - } - } -} - - -/*int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, - rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout ) +int dual_CG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, rvec2 *b, + real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout ) { - int i, j, n, N, matvecs, scale; + int i, j, n, N, matvecs; rvec2 tmp, alpha, beta; rvec2 my_sum, norm_sqr, b_norm, my_dot; rvec2 sig_old, sig_new; @@ -1493,7 +1595,6 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) N = system->N; comm = mpi_data->world; matvecs = 0; - scale = sizeof(rvec2) / sizeof(void); #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) @@ -1504,11 +1605,11 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) } #endif - Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer ); + Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); dual_Sparse_MatVec( H, x, workspace->q2, N ); if ( H->format == SYM_HALF_MATRIX ) { - Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); } #if defined(CG_PERFORMANCE) @@ -1565,11 +1666,11 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) for ( i = 1; i < 300; ++i ) { - Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, 
rvec2_packer); + Dist( system, mpi_data, workspace->d2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N ); if ( H->format == SYM_HALF_MATRIX ) { - Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); } #if defined(CG_PERFORMANCE) @@ -1646,8 +1747,10 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) { workspace->t[j] = workspace->x[j][1]; } - matvecs = CG( system, workspace, H, workspace->b_t, tol, - workspace->t,mpi_data, fout, -1 ); + + matvecs = CG( system, control, data, workspace, + H, workspace->b_t, tol, workspace->t, mpi_data ); + for ( j = 0; j < n; ++j ) { workspace->x[j][1] = workspace->t[j]; @@ -1659,8 +1762,10 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) { workspace->s[j] = workspace->x[j][0]; } - matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s, - mpi_data, fout, -1 ); + + matvecs = CG( system, control, data, workspace, + H, workspace->b_s, tol, workspace->s, mpi_data ); + for ( j = 0; j < system->n; ++j ) { workspace->x[j][0] = workspace->s[j]; @@ -1669,7 +1774,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) if ( i >= 300 ) { - fprintf( stderr, "CG convergence failed!\n" ); + fprintf( stderr, "[WARNING] CG convergence failed!\n" ); } #if defined(CG_PERFORMANCE) @@ -1681,289 +1786,160 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) #endif return (i + 1) + matvecs; -}*/ +} -void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) +/* Preconditioned Conjugate Gradient Method */ +int CG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data ) { - int i, j, k, si, num_rows; - real val; + int i, j; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5]; - for ( i = 0; i < N; ++i ) - { - b[i] = 0; - } + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) - num_rows = A->NT; + Sparse_MatVec( H, x, workspace->q, H->NT ); +#else + Sparse_MatVec( H, x, workspace->q, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - if ( A->format == SYM_HALF_MATRIX ) + if ( H->format == SYM_HALF_MATRIX ) { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - /* diagonal only contributes once */ - if( i < A->n ) - { - b[i] += A->entries[si].val * x[i]; - k = si + 1; - } - /* zeros on the diagonal for i >= A->n, - * so skip the diagonal multplication step as zeros - * are not stored (idea: keep the NNZ's the same - * for full shell and neutral territory half-stored - * charge matrices to make debugging easier) */ - else - { - k = si; - } - - for ( ; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - b[j] += val * x[i]; - } - } - } - else if ( 
A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) - { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; - - for ( k = si; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - } - } - } -#else - num_rows = A->n; - - if ( A->format == SYM_HALF_MATRIX ) - { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; - - /* diagonal only contributes once */ - b[i] += A->entries[si].val * x[i]; - - for ( k = si + 1; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - b[j] += val * x[i]; - } - } - } - else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) - { - for ( i = 0; i < num_rows; ++i ) - { - si = A->start[i]; - - for ( k = si; k < A->end[i]; ++k ) - { - j = A->entries[k].j; - val = A->entries[k].val; - - b[i] += val * x[j]; - } - } - } -#endif -} - - -int CG( reax_system *system, control_params *control, simulation_data *data, - storage *workspace, sparse_matrix *H, real *b, - real tol, real *x, mpi_datatypes* mpi_data, FILE *fout, int nprocs ) -{ - int i, j, scale; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new; - real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; - //real total_pa, total_spmv, total_vops, total_comm, total_allreduce; - real timings[5], total_t[5]; - - t_pa = 0.0; - t_spmv = 0.0; - t_vops = 0.0; - t_comm = 0.0; - t_allreduce = 0.0; - - t_start = MPI_Wtime(); - scale = sizeof(real) / sizeof(void); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, x, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; - - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Sparse_MatVec( H, x, workspace->q, H->NT ); -#else - Sparse_MatVec( H, x, workspace->q, system->N ); -#endif - t_spmv += MPI_Wtime() - t_start; - - if ( H->format == SYM_HALF_MATRIX ) - { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#else - Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#endif - t_comm += MPI_Wtime() - t_start; - } - - else - { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#endif - t_comm += MPI_Wtime() - t_start; - } - - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); - t_vops += MPI_Wtime() - t_start; + t_vops += MPI_Wtime( ) - t_start; /* pre-conditioning */ - if( control->cm_solver_pre_comp_type == SAI_PC ) + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT ); #else Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n ); #endif - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - else if ( control->cm_solver_pre_comp_type == JACOBI_PC) { - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); for ( j = 0; j 
< system->n; ++j ) { workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); b_norm = Parallel_Norm( b, system->n, mpi_data->world ); - sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world); - t_allreduce += MPI_Wtime() - t_start; + sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; for ( i = 0; i < control->cm_solver_max_iters && sqrt(sig_new) / b_norm > tol; ++i ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( H, workspace->d, workspace->q, H->NT ); #else Sparse_MatVec( H, workspace->d, workspace->q, system->N ); #endif - t_spmv += MPI_Wtime() - t_start; + t_spmv += MPI_Wtime( ) - t_start; if ( H->format == SYM_HALF_MATRIX ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker); -#else - Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#if defined(NEUTRAL_TERRITORY) else { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#endif - t_start = MPI_Wtime(); - tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); - t_allreduce += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + tmp = Parallel_Dot( workspace->d, workspace->q, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); alpha = sig_new / tmp; Vector_Add( x, alpha, workspace->d, system->n ); Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - t_vops += MPI_Wtime() - t_start; + t_vops += MPI_Wtime( ) - t_start; /* pre-conditioning */ - if( control->cm_solver_pre_comp_type == SAI_PC ) + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - t_start = MPI_Wtime(); -#if defined(NEUTRAL_TERRITORY) - Dist_NT( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#else - Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer ); -#endif - t_comm += MPI_Wtime() - t_start; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT ); #else Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n ); #endif - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - - else if ( control->cm_solver_pre_comp_type == JACOBI_PC) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) { - 
t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); for ( j = 0; j < system->n; ++j ) { workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - t_pa += MPI_Wtime() - t_start; + t_pa += MPI_Wtime( ) - t_start; } - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); sig_old = sig_new; - sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); + sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world ); t_allreduce += MPI_Wtime() - t_start; - t_start = MPI_Wtime(); + t_start = MPI_Wtime( ); beta = sig_new / sig_old; Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); - t_vops += MPI_Wtime() - t_start; + t_vops += MPI_Wtime( ) - t_start; } timings[0] = t_pa; @@ -1972,32 +1948,26 @@ int CG( reax_system *system, control_params *control, simulation_data *data, timings[3] = t_comm; timings[4] = t_allreduce; - //MPI_Reduce(&t_pa, &total_pa, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_spmv, &total_spmv, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_vops, &total_vops, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - //MPI_Reduce(&t_allreduce, &total_allreduce, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - MPI_Reduce(timings, total_t, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); - if ( system->my_rank == MASTER_NODE ) { - //data->timing.cm_solver_pre_app += total_pa / nprocs; - //data->timing.cm_solver_spmv += total_spmv / nprocs; - //data->timing.cm_solver_vector_ops += total_vops / nprocs; - //data->timing.cm_solver_comm += total_comm / nprocs; - //data->timing.cm_solver_allreduce += total_allreduce / nprocs; - data->timing.cm_solver_pre_app += total_t[0] / nprocs; - data->timing.cm_solver_spmv += total_t[1] / nprocs; - data->timing.cm_solver_vector_ops += total_t[2] / nprocs; - data->timing.cm_solver_comm += total_t[3] / nprocs; - data->timing.cm_solver_allreduce += total_t[4] / nprocs; + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); + + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); } MPI_Barrier(mpi_data->world); if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "CG convergence failed!\n" ); + fprintf( stderr, "[WARNING] CG convergence failed!\n" ); return i; } @@ -2005,634 +1975,495 @@ int CG( reax_system *system, control_params *control, simulation_data *data, } -/*int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) +/* Pipelined Preconditioned Conjugate Gradient Method + * + * References: + * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, + * P. Ghysels and W. Vanroose, Parallel Computing, 2014. + * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods, + * Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference + * for High Performance Computing, Networking, Storage and Analysis, 2016. 
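For orientation, the iteration described in the two references above can be outlined as follows. This is an editor's sketch, not part of the patch hunks; the names are chosen to match the workspace fields used by PIPECG() below, and the ordering follows the papers, so a work-in-progress implementation may arrange the updates differently.

    /* pipelined preconditioned CG, one fused reduction per iteration:
     *
     *   r = b - A*x;   u = M^{-1}*r;   w = A*u
     *   repeat:
     *     gamma = (r,u);  delta = (w,u);  nrm2 = (u,u)    <-- packed into one
     *     m = M^{-1}*w;   n = A*m                             MPI_Iallreduce,
     *                                                         overlapped with m, n
     *     beta  = gamma / gamma_old               (beta = 0 on the first pass)
     *     alpha = gamma / (delta - beta * gamma / alpha_old)
     *     z = n + beta*z;   q = m + beta*q;   p = u + beta*p;   d = w + beta*d
     *     x = x + alpha*p;  r = r - alpha*d;  u = u - alpha*q;  w = w - alpha*z
     */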
+ * */ +int PIPECG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data ) { - int i, j, scale; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new; - - scale = sizeof(real) / sizeof(void); - b_norm = Parallel_Norm( b, system->n, mpi_data->world ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - { - fprintf( stderr, "n=%d, N=%d\n", system->n, system->N ); - fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); - } - MPI_Barrier( mpi_data->world ); -#endif + int i, j; + real alpha, beta, b_norm, norm; + real delta; + real gamma_old, gamma_new; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5], redux[4]; + MPI_Request req; - Sparse_MatVec( H, x, workspace->q, system->N ); - //Coll( system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker ); + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; - Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); - for ( j = 0; j < system->n; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, - mpi_data->world ); -#if defined(DEBUG) - //if( system->my_rank == MASTER_NODE ) { - fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n", - system->my_rank, sqrt(sig_new), - Parallel_Norm(workspace->d, system->n, mpi_data->world), - Parallel_Norm(workspace->q, system->n, mpi_data->world) ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); - //} - MPI_Barrier( mpi_data->world ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, x, workspace->q, H->NT ); +#else + Sparse_MatVec( H, x, workspace->q, system->N ); #endif + t_spmv += MPI_Wtime( ) - t_start; - for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i ) + if ( H->format == SYM_HALF_MATRIX ) { -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - t_start = MPI_Wtime(); -#endif - Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); - Sparse_MatVec( H, workspace->d, workspace->q, system->N ); - //tryQEq - //Coll(system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker); -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - t_elapsed = MPI_Wtime() - t_start; - matvec_time += t_elapsed; - } -#endif - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - t_start = MPI_Wtime(); -#endif - tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); - alpha = sig_new / tmp; -#if defined(DEBUG) - //if( system->my_rank == MASTER_NODE ){ - fprintf(stderr, - "p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n", - system->my_rank, i, - //Parallel_Norm(workspace->d, system->n, mpi_data->world), - //Parallel_Norm(workspace->q, system->n, mpi_data->world), - Norm(workspace->d, system->n), Norm(workspace->q, system->n), tmp); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //for( j = 0; j < system->N; ++j ) - // fprintf( stdout, "%d %24.15e\n", - // system->my_atoms[j].orig_id, workspace->q[j] ); - //fprintf( stdout, "\n" ); - //} - MPI_Barrier( 
mpi_data->world ); -#endif - - Vector_Add( x, alpha, workspace->d, system->n ); - Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - // pre-conditioning - for ( j = 0; j < system->n; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_old = sig_new; - sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n", - system->my_rank, i, sqrt(sig_new) ); - MPI_Barrier( mpi_data->world ); -#endif -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - t_elapsed = MPI_Wtime() - t_start; - dot_time += t_elapsed; - } -#endif + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf( stderr, "CG took %d iterations\n", i ); -#endif -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - fprintf( stderr, "%f %f\n", matvec_time, dot_time ); -#endif - if ( i >= 300 ) +#if defined(NEUTRAL_TERRITORY) + else { - fprintf( stderr, "CG convergence failed!\n" ); - return i; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#endif - return i; -}*/ - - -/*void Forward_Subs( sparse_matrix *L, real *b, real *y ) -{ - int i, pj, j, si, ei; - real val; + t_start = MPI_Wtime( ); + Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->q, system->n ); + t_vops += MPI_Wtime( ) - t_start; - for ( i = 0; i < L->n; ++i ) + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - y[i] = b[i]; - si = L->start[i]; - ei = L->end[i]; - for ( pj = si; pj < ei - 1; ++pj ) - { - j = L->entries[pj].j; - val = L->entries[pj].val; - y[i] -= val * y[j]; - } - y[i] /= L->entries[pj].val; + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; } -}*/ - - -/*void Backward_Subs( sparse_matrix *U, real *y, real *x ) -{ - int i, pj, j, si, ei; - real val; - - for ( i = U->n - 1; i >= 0; --i ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC) { - x[i] = y[i]; - si = U->start[i]; - ei = U->end[i]; - for ( pj = si + 1; pj < ei; ++pj ) + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - j = U->entries[pj].j; - val = U->entries[pj].val; - x[i] -= val * x[j]; + workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - x[i] /= U->entries[si].val; + t_pa += MPI_Wtime( ) - t_start; } -}*/ - - -/*int PCG( reax_system *system, storage *workspace, - sparse_matrix *H, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, - mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, n, N, scale; - real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new; - MPI_Comm world; -#if defined(DEBUG_FOCUS) - int me; - me = system->my_rank; -#endif + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - n = system->n; - N = system->N; - world = mpi_data->world; - scale = 
sizeof(real) / sizeof(void); - b_norm = Parallel_Norm( b, n, world ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - { - fprintf( stderr, "init_PCG: n=%d, N=%d\n", n, N ); - fprintf( stderr, "init_PCG: |b|=%24.15e\n", b_norm ); - } - MPI_Barrier( world ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->u, workspace->w, H->NT ); +#else + Sparse_MatVec( H, workspace->u, workspace->w, system->N ); #endif + t_spmv += MPI_Wtime( ) - t_start; - Sparse_MatVec( H, x, workspace->q, N ); - //Coll( system, workspace, mpi_data, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, n ); - r_norm = Parallel_Norm( workspace->r, n, world ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->p ); - sig_new = Parallel_Dot( workspace->r, workspace->p, n, world ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) + if ( H->format == SYM_HALF_MATRIX ) { - fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm ); - fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n", - Parallel_Norm(workspace->d, n, world), - Parallel_Norm(workspace->q, n, world) ); + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - MPI_Barrier( world ); -#endif - - for ( i = 1; i < 100 && r_norm / b_norm > tol; ++i ) +#if defined(NEUTRAL_TERRITORY) + else { - Dist( system, mpi_data, workspace->p, MPI_DOUBLE, scale, real_packer ); - Sparse_MatVec( H, workspace->p, workspace->q, N ); - // tryQEq - //Coll(system,mpi_data,workspace->q, MPI_DOUBLE, real_unpacker); - tmp = Parallel_Dot( workspace->q, workspace->p, n, world ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->p, n ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", - i, Parallel_Norm(workspace->p, n, world), - Parallel_Norm(workspace->q, n, world), tmp ); - MPI_Barrier( world ); -#endif - - Vector_Add( workspace->r, -alpha, workspace->q, n ); - r_norm = Parallel_Norm( workspace->r, n, world ); -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - fprintf( stderr, "iter%d: res=%.15e\n", i, r_norm ); - MPI_Barrier( world ); -#endif - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->d ); - sig_old = sig_new; - sig_new = Parallel_Dot( workspace->r, workspace->d, n, world ); - beta = sig_new / sig_old; - Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, n ); + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - -#if defined(DEBUG_FOCUS) - if ( me == MASTER_NODE ) - fprintf( stderr, "PCG took %d iterations\n", i ); #endif - if ( i >= 100 ) - fprintf( stderr, "PCG convergence failed!\n" ); - - return i; -}*/ + t_start = MPI_Wtime( ); + redux[0] = Dot_local( workspace->w, workspace->u, system->n ); + redux[1] = Dot_local( workspace->r, workspace->u, system->n ); + redux[2] = Dot_local( workspace->u, workspace->u, system->n ); + redux[3] = Dot_local( b, b, system->n ); + t_vops += MPI_Wtime( ) - t_start; -#if defined(OLD_STUFF) -/*int sCG( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, j; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new; + t_start = MPI_Wtime( ); + MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req ); - b_norm = Norm( b, system->n ); -#if 
defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - fprintf( stderr, "n=%d, N=%d\n", system->n, system->N ); - fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); - } - MPI_Barrier( mpi_data->world ); + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n ); #endif - - Sparse_MatVec( H, x, workspace->q, system->N ); - //Coll_Vector( system, workspace, mpi_data, workspace->q ); - - Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); - for ( j = 0; j < system->n; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition - - sig_new = Dot( workspace->r, workspace->d, system->n ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - { - fprintf( stderr, "p%d CGinit:sig_new=%24.15e\n", system->my_rank, sig_new ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); + t_pa += MPI_Wtime( ) - t_start; } - MPI_Barrier( mpi_data->world ); -#endif - - for ( i = 1; i < 100 && sqrt(sig_new) / b_norm > tol; ++i ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) { - //Dist_Vector( system, mpi_data, workspace->d ); - Sparse_MatVec( H, workspace->d, workspace->q, system->N ); - //Coll_Vector( system, workspace, mpi_data, workspace->q ); - - tmp = Dot( workspace->d, workspace->q, system->n ); - alpha = sig_new / tmp; -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - fprintf(stderr, - "p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n", - system->my_rank, i, - Parallel_Norm(workspace->d, system->n, mpi_data->world), - Parallel_Norm(workspace->q, system->n, mpi_data->world), tmp ); - //Vector_Print( stderr, "d", workspace->d, system->N ); - //Vector_Print( stderr, "q", workspace->q, system->N ); + workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j]; } - MPI_Barrier( mpi_data->world ); -#endif - - Vector_Add( x, alpha, workspace->d, system->n ); - Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - // pre-conditioning - for ( j = 0; j < system->n; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + t_pa += MPI_Wtime( ) - t_start; + } - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->p, system->n ); + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n", - system->my_rank, i, sig_new ); - MPI_Barrier( mpi_data->world ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->m, workspace->n, H->NT ); +#else + Sparse_MatVec( H, workspace->m, workspace->n, system->N ); #endif - } + t_spmv += MPI_Wtime( ) - t_start; -#if defined(DEBUG) - if ( system->my_rank == MASTER_NODE ) - fprintf( stderr, "CG took %d iterations\n", i ); 
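The MPI_Iallreduce posted above returns immediately; the preconditioner application and the sparse matrix-vector product that follow run while the reduction is in flight, and the MPI_Wait further down retrieves the global sums. A minimal sketch of that overlap pattern, assuming hypothetical stand-ins apply_precond() and spmv() for the calls used here:

    #include <math.h>
    #include <mpi.h>

    /* assumed stand-ins for the work overlapped with the reduction */
    extern void apply_precond( const double *w, double *m, int n );   /* m = M^{-1} w */
    extern void spmv( const double *m, double *n_vec, int n );        /* n_vec = A m  */

    /* compute delta = (w,u), gamma = (r,u), and ||u|| with one non-blocking
     * all-reduce, hiding its latency behind the preconditioner and the SpMV */
    void fused_dots_with_overlap( const double *r, const double *u, const double *w,
            double *m, double *n_vec, int n, MPI_Comm comm,
            double *delta, double *gamma, double *u_norm )
    {
        int i;
        double redux[3] = { 0.0, 0.0, 0.0 };
        MPI_Request req;

        /* local partial dot products, as Dot_local() does in vector.c */
        for ( i = 0; i < n; ++i )
        {
            redux[0] += w[i] * u[i];
            redux[1] += r[i] * u[i];
            redux[2] += u[i] * u[i];
        }

        /* a single non-blocking reduction for all three scalars */
        MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, comm, &req );

        /* overlapped work: runs while the reduction is in progress */
        apply_precond( w, m, n );
        spmv( m, n_vec, n );

        MPI_Wait( &req, MPI_STATUS_IGNORE );
        *delta = redux[0];
        *gamma = redux[1];
        *u_norm = sqrt( redux[2] );
    }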
-#endif - if ( i >= 100 ) + if ( H->format == SYM_HALF_MATRIX ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - return i; -}*/ - - -/*int GMRES( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - - N = system->N; - bnorm = Norm( b, N ); - - // apply the diagonal pre-conditioner to rhs - for ( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + MPI_Wait( &req, MPI_STATUS_IGNORE ); + t_allreduce += MPI_Wtime( ) - t_start; + delta = redux[0]; + gamma_new = redux[1]; + norm = sqrt( redux[2] ); + b_norm = sqrt( redux[3] ); - // GMRES outer-loop - for ( itr = 0; itr < MAX_ITR; ++itr ) + for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i ) { - // calculate r0 - Sparse_MatVec( H, x, workspace->b_prm, N ); - for ( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner + if ( i > 0 ) + { + beta = gamma_new / gamma_old; + alpha = gamma_new / (delta - beta / alpha * gamma_new); + } + else + { + beta = 0.0; + alpha = gamma_new / delta; + } - Vector_Sum( workspace->v[0], - 1., workspace->b_prc, -1., workspace->b_prm, N ); - workspace->g[0] = Norm( workspace->v[0], N ); - Vector_Scale( workspace->v[0], - 1. / workspace->g[0], workspace->v[0], N ); + t_start = MPI_Wtime( ); + Vector_Sum( workspace->u , 1.0, workspace->u, -alpha, workspace->q, system->n ); + Vector_Sum( workspace->w , 1.0, workspace->w, -alpha, workspace->z, system->n ); + Vector_Sum( workspace->r , 1.0, workspace->r, -alpha, workspace->d, system->n ); + Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n ); + Vector_Sum( workspace->z , 1.0, workspace->n, beta, workspace->z, system->n ); + Vector_Sum( workspace->q , 1.0, workspace->m, beta, workspace->q, system->n ); + Vector_Sum( workspace->p , 1.0, workspace->u, beta, workspace->p, system->n ); + Vector_Sum( workspace->d , 1.0, workspace->w, beta, workspace->d, system->n ); + redux[0] = Dot_local( workspace->w, workspace->u, system->n ); + redux[1] = Dot_local( workspace->r, workspace->u, system->n ); + redux[2] = Dot_local( workspace->u, workspace->u, system->n ); + t_vops += MPI_Wtime( ) - t_start; - // fprintf( stderr, "%10.6f\n", workspace->g[0] ); + t_start = MPI_Wtime( ); + MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req ); - // GMRES inner-loop - for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) { - // matvec - Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1], N ); - - for ( k = 0; k < N; ++k ) - workspace->v[j + 1][k] *= workspace->Hdia_inv[k]; // pre-conditioner - // fprintf( stderr, "%d-%d: matvec done.\n", itr, j ); - - // apply modified Gram-Schmidt to orthogonalize the new residual - for ( i = 0; i <= j; i++ ) + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT ); +#else + 
Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N); - Vector_Add( workspace->v[j + 1], - -workspace->h[i][j], workspace->v[i], N ); + workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j]; } + t_pa += MPI_Wtime( ) - t_start; + } - workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N ); - Vector_Scale( workspace->v[j + 1], - 1. / workspace->h[j + 1][j], workspace->v[j + 1], N ); - // fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j); - - // Givens rotations on the H matrix to make it U - for ( i = 0; i <= j; i++ ) - { - if ( i == j ) - { - cc = sqrt(SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j])); - workspace->hc[j] = workspace->h[j][j] / cc; - workspace->hs[j] = workspace->h[j + 1][j] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[i][j] + - workspace->hs[i] * workspace->h[i + 1][j]; - tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; - - workspace->h[i][j] = tmp1; - workspace->h[i + 1][j] = tmp2; - } + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - // apply Givens rotations to the rhs as well - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j + 1] = tmp2; + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->m, workspace->n, H->NT ); +#else + Sparse_MatVec( H, workspace->m, workspace->n, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - // fprintf( stderr, "%10.6f\n", fabs(workspace->g[j+1]) ); + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } - - // solve Hy = g. 
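A note on the step length used at the top of this loop: standard PCG computes alpha_i = gamma_i / (p_i, A p_i), which would cost a second reduction. In exact arithmetic the denominator can be rebuilt from scalars that the single fused reduction already provides (reference 1 above):

    /* with gamma_i = (r_i, u_i) and delta_i = (w_i, u_i),
     *
     *     (p_i, A p_i) = delta_i - (beta_i / alpha_{i-1}) * gamma_i
     *
     * which is the expression  alpha = gamma_new / (delta - beta / alpha * gamma_new)
     * appearing above.  In finite precision this recurrence can drift, which is
     * the usual accuracy trade-off of the pipelined variants. */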
- // H is now upper-triangular, do back-substitution - for ( i = j - 1; i >= 0; i-- ) +#if defined(NEUTRAL_TERRITORY) + else { - temp = workspace->g[i]; - for ( k = j - 1; k > i; k-- ) - temp -= workspace->h[i][k] * workspace->y[k]; - workspace->y[i] = temp / workspace->h[i][i]; + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; } +#endif - // update x = x_0 + Vy - for ( i = 0; i < j; i++ ) - Vector_Add( x, workspace->y[i], workspace->v[i], N ); + gamma_old = gamma_new; - // stopping condition - if ( fabs(workspace->g[j]) / bnorm <= tol ) - break; + MPI_Wait( &req, MPI_STATUS_IGNORE ); + t_allreduce += MPI_Wtime( ) - t_start; + delta = redux[0]; + gamma_new = redux[1]; + norm = sqrt( redux[2] ); } - //Sparse_MatVec( system, H, x, workspace->b_prm, mpi_data ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + timings[0] = t_pa; + timings[1] = t_spmv; + timings[2] = t_vops; + timings[3] = t_comm; + timings[4] = t_allreduce; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - //fprintf( fout, "%10.5f%15.12f%15.12f\n", - //workspace->b_prc[i], workspace->b_prm[i], x[i] ); + if ( system->my_rank == MASTER_NODE ) + { + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); - fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n", - itr, j, fabs( workspace->g[j] ) / bnorm ); + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); + } - if ( itr >= MAX_ITR ) + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return FAILURE; + fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" ); + return i; } - return SUCCESS; -}*/ + return i; +} -/*int GMRES_HouseHolder( reax_system *system, storage *workspace, - sparse_matrix *H, real *b, real tol, real *x, - mpi_datatypes* mpi_data, FILE *fout ) +/* Pipelined Preconditioned Conjugate Residual Method + * + * References: + * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, + * P. Ghysels and W. Vanroose, Parallel Computing, 2014. 
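PIPECR() below repeats the Dist / Sparse_MatVec / Coll sequence already used by CG() and PIPECG(); a short reading of that pattern, for orientation (not part of the patch hunks):

    /* communication around each sparse matrix-vector product:
     *
     *   Dist( system, mpi_data, x, ... );     owners push their entries of the
     *                                         input vector to the neighbor
     *                                         processes holding ghost copies
     *   Sparse_MatVec( H, x, q, ... );        local multiply
     *   Coll( system, mpi_data, q, ... );     needed for SYM_HALF_MATRIX (and
     *                                         under NEUTRAL_TERRITORY): the
     *                                         half-stored kernel also adds
     *                                         val * x[i] into rows owned by
     *                                         other processes, so those partial
     *                                         sums must be accumulated back at
     *                                         the owners
     */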
+ * */ +int PIPECR( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data ) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v[10000], z[RESTART + 2][10000], w[RESTART + 2]; - real u[RESTART + 2][10000]; - - N = system->N; - bnorm = Norm( b, N ); - - // apply the diagonal pre-conditioner to rhs - for ( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + int i, j; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5]; - // GMRES outer-loop - for ( itr = 0; itr < MAX_ITR; ++itr ) - { - // compute z = r0 - Sparse_MatVec( H, x, workspace->b_prm, N ); + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; - for ( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, x, workspace->q, H->NT ); +#else + Sparse_MatVec( H, x, workspace->q, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - Vector_MakeZero( w, RESTART + 1 ); - w[0] = Norm( z[0], N ); + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - Vector_Copy( u[0], z[0], N ); - u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; - Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); + t_start = MPI_Wtime( ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); + t_vops += MPI_Wtime( ) - t_start; - w[0] *= ( u[0][0] < 0.0 ? 
1 : -1 ); - // fprintf( stderr, "\n\n%12.6f\n", w[0] ); + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } - // GMRES inner-loop - for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - // compute v_j - Vector_Scale( z[j], -2 * u[j][j], u[j], N ); - z[j][j] += 1.; // due to e_j - - for ( i = j - 1; i >= 0; --i ) - Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i ); - - // matvec - Sparse_MatVec( H, z[j], v, N ); - - for ( k = 0; k < N; ++k ) - v[k] *= workspace->Hdia_inv[k]; // pre-conditioner - - for ( i = 0; i <= j; ++i ) - Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i ); + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + } + t_pa += MPI_Wtime( ) - t_start; + } - if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) ) - { - // compute the HouseHolder unit vector u_j+1 - for ( i = 0; i <= j; ++i ) - u[j + 1][i] = 0; + t_start = MPI_Wtime( ); + b_norm = Parallel_Norm( b, system->n, mpi_data->world ); + sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; - Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ); + for ( i = 0; i < control->cm_solver_max_iters && sqrt(sig_new) / b_norm > tol; ++i ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - u[j + 1][j + 1] += - ( v[j + 1] < 0.0 ? 
-1 : 1 ) * Norm( v + (j + 1), N - (j + 1) ); + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( H, workspace->d, workspace->q, H->NT ); +#else + Sparse_MatVec( H, workspace->d, workspace->q, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; - Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N ); + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif - // overwrite v with P_m+1 * v - v[j + 1] -= - 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1]; - Vector_MakeZero( v + (j + 2), N - (j + 2) ); - } + t_start = MPI_Wtime( ); + tmp = Parallel_Dot( workspace->d, workspace->q, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime( ) - t_start; + t_start = MPI_Wtime( ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->d, system->n ); + Vector_Add( workspace->r, -alpha, workspace->q, system->n ); + t_vops += MPI_Wtime( ) - t_start; - // previous Givens rotations on H matrix to make it U - for ( i = 0; i < j; i++ ) - { - tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1]; - tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1]; + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE ); + t_comm += MPI_Wtime( ) - t_start; - v[i] = tmp1; - v[i + 1] = tmp2; - } + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT ); +#else + Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } - // apply the new Givens rotation to H and right-hand side - if ( fabs(v[j + 1]) >= ALMOST_ZERO ) + else if ( control->cm_solver_pre_comp_type == JACOBI_PC) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) { - cc = sqrt( SQR( v[j] ) + SQR( v[j + 1] ) ); - workspace->hc[j] = v[j] / cc; - workspace->hs[j] = v[j + 1] / cc; - - tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1]; - tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1]; - - v[j] = tmp1; - v[j + 1] = tmp2; - - / Givens rotations to rhs - tmp1 = workspace->hc[j] * w[j]; - tmp2 = -workspace->hs[j] * w[j]; - w[j] = tmp1; - w[j + 1] = tmp2; + workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; } - - // extend R - for ( i = 0; i <= j; ++i ) - workspace->h[i][j] = v[i]; - - - // fprintf( stderr, "h:" ); - // for( i = 0; i <= j+1 ; ++i ) - // fprintf( stderr, "%.6f ", h[i][j] ); - // fprintf( stderr, "\n" ); - // fprintf( stderr, "%12.6f\n", w[j+1] ); + t_pa += MPI_Wtime( ) - t_start; } + t_start = MPI_Wtime( ); + sig_old = sig_new; + sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world ); + t_allreduce += MPI_Wtime() - t_start; - // solve Hy = w. 
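As in CG() above, each pass through this loop still pays for two blocking all-reduces: one Parallel_Dot for tmp and, just below, another for sig_new. The building block behind the pipelined variants is splitting each global dot product into a local partial sum plus an explicit reduction, so several scalars can share one MPI call. A sketch of that idea using Dot_local() from vector.c; the fused pair shown, (r, p) and (r, r), is illustrative only, since the loop as written needs just the former:

    double redux[2];

    /* local partial sums only, no communication yet */
    redux[0] = Dot_local( workspace->r, workspace->p, system->n );
    redux[1] = Dot_local( workspace->r, workspace->r, system->n );

    /* one blocking reduction for both scalars, i.e. a single synchronization
     * point instead of two Parallel_Dot calls */
    MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );

    sig_new = redux[0];              /* (r, M^{-1} r), drives beta               */
    /* sqrt( redux[1] ) would give ||r|| if a true residual norm were wanted */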
- // H is now upper-triangular, do back-substitution - for ( i = j - 1; i >= 0; i-- ) - { - temp = w[i]; - for ( k = j - 1; k > i; k-- ) - temp -= workspace->h[i][k] * workspace->y[k]; + t_start = MPI_Wtime( ); + beta = sig_new / sig_old; + Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); + t_vops += MPI_Wtime( ) - t_start; + } - workspace->y[i] = temp / workspace->h[i][i]; - } + timings[0] = t_pa; + timings[1] = t_spmv; + timings[2] = t_vops; + timings[3] = t_comm; + timings[4] = t_allreduce; - for ( i = j - 1; i >= 0; i-- ) - Vector_Add( x, workspace->y[i], z[i], N ); + if ( system->my_rank == MASTER_NODE ) + { + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); - // stopping condition - if ( fabs( w[j] ) / bnorm <= tol ) - break; + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); } - // Sparse_MatVec( system, H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] ); - - fprintf( fout, "GMRES outer:%d inner:%d iters, |rel residual| = %15.10f\n", - itr, j, fabs( workspace->g[j] ) / bnorm ); + MPI_Barrier(mpi_data->world); - if ( itr >= MAX_ITR ) + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return FAILURE; + fprintf( stderr, "[WARNING] CG convergence failed!\n" ); + return i; } - return SUCCESS; -}*/ -#endif + return i; +} diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h index 980a8b90..5bea768e 100644 --- a/PuReMD/src/linear_solvers.h +++ b/PuReMD/src/linear_solvers.h @@ -24,23 +24,24 @@ #include "reax_types.h" + real setup_sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, sparse_matrix *, sparse_matrix **, int, double ); real sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, sparse_matrix*, sparse_matrix*, sparse_matrix**, int ); -int GMRES( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); -int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); -int dual_CG( reax_system*, storage*, sparse_matrix*, +int dual_CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, rvec2*, real, rvec2*, mpi_datatypes*, FILE* ); + int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE*, int ); -int PCG( reax_system*, storage*, sparse_matrix*, real*, real, - sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* ); -int sCG( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); + real*, real, real*, mpi_datatypes* ); + +int PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, + real*, real, real*, mpi_datatypes* ); + +int PIPECR( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, + 
real*, real, real*, mpi_datatypes* ); + #endif diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c index e633a3d3..12e09210 100644 --- a/PuReMD/src/parallelreax.c +++ b/PuReMD/src/parallelreax.c @@ -134,21 +134,15 @@ int main( int argc, char* argv[] ) } /* allocated main datastructures */ - system = (reax_system *) - smalloc( sizeof(reax_system), "system", MPI_COMM_WORLD ); - control = (control_params *) - smalloc( sizeof(control_params), "control", MPI_COMM_WORLD ); - data = (simulation_data *) - smalloc( sizeof(simulation_data), "data", MPI_COMM_WORLD ); - - workspace = (storage *) - smalloc( sizeof(storage), "workspace", MPI_COMM_WORLD ); - lists = (reax_list **) - smalloc( LIST_N * sizeof(reax_list*), "lists", MPI_COMM_WORLD ); + system = smalloc( sizeof(reax_system), "system", MPI_COMM_WORLD ); + control = smalloc( sizeof(control_params), "control", MPI_COMM_WORLD ); + data = smalloc( sizeof(simulation_data), "data", MPI_COMM_WORLD ); + + workspace = smalloc( sizeof(storage), "workspace", MPI_COMM_WORLD ); + lists = smalloc( LIST_N * sizeof(reax_list*), "lists", MPI_COMM_WORLD ); for ( i = 0; i < LIST_N; ++i ) { - lists[i] = (reax_list *) - smalloc( sizeof(reax_list), "lists[i]", MPI_COMM_WORLD ); + lists[i] = smalloc( sizeof(reax_list), "lists[i]", MPI_COMM_WORLD ); lists[i]->allocated = 0; lists[i]->n = 0; lists[i]->num_intrs = 0; @@ -162,17 +156,15 @@ int main( int argc, char* argv[] ) lists[i]->dDelta_list = NULL; lists[i]->hbond_list = NULL; } - out_control = (output_controls *) - smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD ); - mpi_data = (mpi_datatypes *) - smalloc( sizeof(mpi_datatypes), "mpi_data", MPI_COMM_WORLD ); + out_control = smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD ); + mpi_data = smalloc( sizeof(mpi_datatypes), "mpi_data", MPI_COMM_WORLD ); /* setup the parallel environment */ - MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) ); - MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) ); + MPI_Comm_size( MPI_COMM_WORLD, &control->nprocs ); + MPI_Comm_rank( MPI_COMM_WORLD, &system->my_rank ); system->wsize = control->nprocs; - system->global_offset = (int*) - scalloc( system->wsize + 1, sizeof(int), "global_offset", MPI_COMM_WORLD ); + system->global_offset = scalloc( system->wsize + 1, sizeof(int), + "global_offset", MPI_COMM_WORLD ); /* read system description files */ Read_System( argv[1], argv[2], argv[3], system, control, @@ -185,7 +177,9 @@ int main( int argc, char* argv[] ) /* measure total simulation time after input is read */ if ( system->my_rank == MASTER_NODE ) - t_start = MPI_Wtime(); + { + t_start = MPI_Wtime( ); + } /* initialize datastructures */ Initialize( system, control, data, workspace, lists, out_control, mpi_data ); @@ -203,7 +197,6 @@ int main( int argc, char* argv[] ) lists, out_control, mpi_data ); Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); Output_Results( system, control, data, lists, out_control, mpi_data ); - #if defined(DEBUG) fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank ); @@ -215,28 +208,39 @@ int main( int argc, char* argv[] ) for ( ++data->step; data->step <= control->nsteps; data->step++ ) { if ( control->T_mode ) + { Temperature_Control( control, data ); + } Evolve( system, control, data, workspace, lists, out_control, mpi_data ); Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); + if( system->my_rank == MASTER_NODE ) { total_itr += data->timing.cm_solver_iters; } + Output_Results( system, control, data, lists, 
out_control, mpi_data ); //Analysis(system, control, data, workspace, lists, out_control, mpi_data); /* dump restart info */ - if ( out_control->restart_freq && - (data->step - data->prev_steps) % out_control->restart_freq == 0 ) + if ( out_control->restart_freq + && (data->step - data->prev_steps) % out_control->restart_freq == 0 ) { if ( out_control->restart_format == WRITE_ASCII ) + { Write_Restart( system, control, data, out_control, mpi_data ); + } else if ( out_control->restart_format == WRITE_BINARY ) + { Write_Binary_Restart( system, control, data, out_control, mpi_data ); + } } - /*if(data->step == 1 || data->step == control->nsteps) - Write_PDB( system, lists, data, control, mpi_data, out_control );*/ + +// if ( data->step == 1 || data->step == control->nsteps ) +// { +// Write_PDB( system, lists, data, control, mpi_data, out_control ); +// } #if defined(DEBUG) fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); @@ -255,7 +259,7 @@ int main( int argc, char* argv[] ) //Write_PDB( system, lists, data, control, mpi_data, out_control ); Close_Output_Files( system, control, out_control, mpi_data ); - MPI_Finalize(); + MPI_Finalize( ); /* de-allocate data structures */ sfree( system, "system" ); diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c index d029c742..b936c46e 100644 --- a/PuReMD/src/qEq.c +++ b/PuReMD/src/qEq.c @@ -318,14 +318,13 @@ void Init_MatVec( reax_system *system, simulation_data *data, void Calculate_Charges( reax_system *system, storage *workspace, mpi_datatypes *mpi_data ) { - int i, scale; - real u;//, s_sum, t_sum; - rvec2 my_sum, all_sum; + int i; + real u;//, s_sum, t_sum; + rvec2 my_sum, all_sum; reax_atom *atom; real *q; - scale = sizeof(real) / sizeof(void); - q = (real*) malloc(system->N * sizeof(real)); + q = malloc( system->N * sizeof(real) ); //s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world); //t_sum = Parallel_Vector_Acc(workspace->t, system->n, mpi_data->world); @@ -360,11 +359,14 @@ void Calculate_Charges( reax_system *system, storage *workspace, atom->t[0] = workspace->x[i][1]; } - Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer ); + Dist( system, mpi_data, q, REAL_PTR_TYPE, MPI_DOUBLE ); + for ( i = system->n; i < system->N; ++i ) + { system->my_atoms[i].q = q[i]; + } - sfree(q, "q"); + sfree( q, "q" ); } @@ -419,6 +421,8 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, { int j, iters; + iters = 0; + Init_MatVec( system, data, control, workspace, mpi_data ); #if defined(DEBUG) @@ -428,7 +432,8 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, if( control->cm_solver_pre_comp_type == SAI_PC ) { - if( control->cm_solver_pre_comp_refactor > 0 && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) + if( control->cm_solver_pre_comp_refactor > 0 + && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0)) { Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data ); @@ -438,29 +443,80 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, //TODO: used for timing to sync processors going into the linear solve, but remove for production code MPI_Barrier( mpi_data->world ); - for ( j = 0; j < system->n; ++j ) - workspace->s[j] = workspace->x[j][0]; - iters = CG(system, control, data, workspace, workspace->H, workspace->b_s, - control->cm_solver_q_err, workspace->s, mpi_data, out_control->log , control->nprocs ); - for ( j = 0; j < system->n; ++j ) - 
workspace->x[j][0] = workspace->s[j]; -#if defined(DEBUG) - fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); -#endif + switch ( control->cm_solver_type ) + { + case CG_S: + for ( j = 0; j < system->n; ++j ) + { + workspace->s[j] = workspace->x[j][0]; + } - for ( j = 0; j < system->n; ++j ) - workspace->t[j] = workspace->x[j][1]; - iters += CG(system, control, data, workspace, workspace->H, workspace->b_t,//newQEq sCG - control->cm_solver_q_err, workspace->t, mpi_data, out_control->log, control->nprocs ); - for ( j = 0; j < system->n; ++j ) - workspace->x[j][1] = workspace->t[j]; + iters = CG( system, control, data, workspace, workspace->H, workspace->b_s, + control->cm_solver_q_err, workspace->s, mpi_data ); -#if defined(DEBUG) - fprintf( stderr, "p%d: second CG completed\n", system->my_rank ); -#endif + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][0] = workspace->s[j]; + } + + for ( j = 0; j < system->n; ++j ) + { + workspace->t[j] = workspace->x[j][1]; + } + + iters += CG( system, control, data, workspace, workspace->H, workspace->b_t, + control->cm_solver_q_err, workspace->t, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][1] = workspace->t[j]; + } + break; + + case PIPECG_S: + for ( j = 0; j < system->n; ++j ) + { + workspace->s[j] = workspace->x[j][0]; + } + + iters = PIPECG( system, control, data, workspace, workspace->H, workspace->b_s, + control->cm_solver_q_err, workspace->s, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][0] = workspace->s[j]; + } + + for ( j = 0; j < system->n; ++j ) + { + workspace->t[j] = workspace->x[j][1]; + } + + iters += PIPECG( system, control, data, workspace, workspace->H, workspace->b_t, + control->cm_solver_q_err, workspace->t, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][1] = workspace->t[j]; + } + break; + + case GMRES_S: + case GMRES_H_S: + case SDM_S: + case BiCGStab_S: + fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" ); + break; + + default: + fprintf( stderr, "[ERROR] Unrecognized solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } Calculate_Charges( system, workspace, mpi_data ); + #if defined(DEBUG) fprintf( stderr, "p%d: computed charges\n", system->my_rank ); //Print_Charges( system ); diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h index ab8a7b50..244fdc0f 100644 --- a/PuReMD/src/reax_types.h +++ b/PuReMD/src/reax_types.h @@ -87,6 +87,7 @@ enum solver CG_S = 2, SDM_S = 3, BiCGStab_S = 4, + PIPECG_S = 5, }; /* preconditioner computation type for charge method linear solver */ @@ -807,13 +808,13 @@ typedef struct /**/ real cm_solver_pre_comp; /**/ - real cm_solver_pre_app; // update CG() + real cm_solver_pre_app; /* num. 
of steps in iterative linear solver for charge distribution */ int cm_solver_iters; /**/ - real cm_solver_spmv; // update CG() + real cm_solver_spmv; /**/ - real cm_solver_vector_ops; // update CG() + real cm_solver_vector_ops; /**/ real cm_solver_orthog; /**/ @@ -1046,11 +1047,16 @@ typedef struct rvec2 *b, *x; /* GMRES storage */ - real *y, *z, *g; + real *y, *g; real *hc, *hs; real **h, **v; - /* CG storage */ - real *r, *d, *q, *p; + /* GMRES, PIPECG storage */ + real *z; + /* CG, PIPECG storage */ + real *d, *p, *q, *r; + /* PIPECG storage */ + real *m, *n, *u, *w; + /* dual-CG storage */ rvec2 *r2, *d2, *q2, *p2; /* Taper */ real Tap[8]; //Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0; diff --git a/PuReMD/src/vector.c b/PuReMD/src/vector.c index efbc0954..78ed514f 100644 --- a/PuReMD/src/vector.c +++ b/PuReMD/src/vector.c @@ -83,6 +83,22 @@ real Dot( real* v1, real* v2, int k ) } +real Dot_local( real *v1, real *v2, int k ) +{ + int i; + real sum; + + sum = 0.0; + + for ( i = 0; i < k; ++i ) + { + sum += v1[i] * v2[i]; + } + + return sum; +} + + real Norm( real* v1, int k ) { real ret = 0; diff --git a/PuReMD/src/vector.h b/PuReMD/src/vector.h index baf1b3d1..e133d8ae 100644 --- a/PuReMD/src/vector.h +++ b/PuReMD/src/vector.h @@ -25,75 +25,138 @@ #include "reax_types.h" #include "reax_defs.h" + int Vector_isZero( real*, int ); + void Vector_MakeZero( real*, int ); + void Vector_Copy( real*, real*, int ); + void Vector_Scale( real*, real, real*, int ); + + static inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) { int i; + for ( i = 0; i < k; ++i ) + { dest[i] = c * v[i] + d * y[i]; + } } + + static inline void Vector_Add( real* dest, real c, real* v, int k ) { int i; + for ( i = 0; i < k; ++i ) + { dest[i] += c * v[i]; + } } + real Dot( real*, real*, int ); + +real Dot_local( real*, real*, int ); + real Norm( real*, int ); + void Vector_Print( FILE*, char*, real*, int ); void rvec_Copy( rvec, rvec ); + void rvec_Scale( rvec, real, rvec ); + void rvec_Add( rvec, rvec ); + void rvec_ScaledAdd( rvec, real, rvec ); + void rvec_Sum( rvec, rvec, rvec ); + void rvec_ScaledSum( rvec, real, rvec, real, rvec ); + real rvec_Dot( rvec, rvec ); + real rvec_ScaledDot( real, rvec, real, rvec ); + void rvec_Multiply( rvec, rvec, rvec ); + void rvec_iMultiply( rvec, ivec, rvec ); + void rvec_Divide( rvec, rvec, rvec ); + void rvec_iDivide( rvec, rvec, ivec ); + void rvec_Invert( rvec, rvec ); + void rvec_Cross( rvec, rvec, rvec ); + void rvec_OuterProduct( rtensor, rvec, rvec ); + real rvec_Norm_Sqr( rvec ); + real rvec_Norm( rvec ); + int rvec_isZero( rvec ); + void rvec_MakeZero( rvec ); + void rvec_Random( rvec ); void rtensor_MakeZero( rtensor ); + void rtensor_Multiply( rtensor, rtensor, rtensor ); + void rtensor_MatVec( rvec, rtensor, rvec ); + void rtensor_Scale( rtensor, real, rtensor ); + void rtensor_Add( rtensor, rtensor ); + void rtensor_ScaledAdd( rtensor, real, rtensor ); + void rtensor_Sum( rtensor, rtensor, rtensor ); + void rtensor_ScaledSum( rtensor, real, rtensor, real, rtensor ); + void rtensor_Scale( rtensor, real, rtensor ); + void rtensor_Copy( rtensor, rtensor ); + void rtensor_Identity( rtensor ); + void rtensor_Transpose( rtensor, rtensor ); + real rtensor_Det( rtensor ); + real rtensor_Trace( rtensor ); void Print_rTensor(FILE*, rtensor); -int ivec_isZero( ivec ); -int ivec_isEqual( ivec, ivec ); +int ivec_isZero( ivec ); + +int ivec_isEqual( ivec, ivec ); + void ivec_MakeZero( ivec ); + void ivec_Copy( ivec, ivec ); + void ivec_Scale( ivec, 
real, ivec ); + void ivec_rScale( ivec, real, rvec ); + void ivec_Sum( ivec, ivec, ivec ); + void ivec_ScaledSum( ivec, int, ivec, int, ivec ); + void ivec_Add( ivec, ivec ); + void ivec_ScaledAdd( ivec, int, ivec ); + void ivec_Max( ivec, ivec, ivec ); + void ivec_Max3( ivec, ivec, ivec, ivec ); + #endif -- GitLab
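A closing note on the timing aggregation shared by CG(), PIPECG(), and PIPECR() above: the master rank passes MPI_IN_PLACE so its own timings array serves as both send and receive buffer, while every other rank sends its local timings and may pass NULL for the unused receive buffer. A minimal standalone illustration of that MPI_Reduce pattern (not part of the patch):

    #include <stdio.h>
    #include <mpi.h>

    int main( int argc, char *argv[] )
    {
        int rank, nprocs;
        double timings[5] = { 1.0, 2.0, 3.0, 4.0, 5.0 };   /* per-rank timings */

        MPI_Init( &argc, &argv );
        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
        MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

        if ( rank == 0 )
        {
            /* root: in-place reduction, timings[] ends up holding the global sums */
            MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
            printf( "mean of timings[0] over %d ranks: %f\n", nprocs, timings[0] / nprocs );
        }
        else
        {
            /* non-root: send local timings; the receive buffer is ignored */
            MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD );
        }

        MPI_Finalize( );
        return 0;
    }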