diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c
index 0753ea1090ec9b2f4c60007047c86f7c4a91728e..b2c9aa5b14e096cca696033083de3d1f804d57ab 100644
--- a/PuReMD/src/allocate.c
+++ b/PuReMD/src/allocate.c
@@ -145,9 +145,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     sfree( workspace->Clp, "Clp" );
     sfree( workspace->vlpex, "vlpex" );
     sfree( workspace->bond_mark, "bond_mark" );
-    sfree( workspace->done_after, "done_after" );
 
-    /* QEq storage */
+    /* CM storage */
     sfree( workspace->Hdia_inv, "Hdia_inv" );
     sfree( workspace->b_s, "b_s" );
     sfree( workspace->b_t, "b_t" );
@@ -159,28 +158,67 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
     sfree( workspace->b, "b" );
     sfree( workspace->x, "x" );
 
-    /* GMRES storage */
-    for ( i = 0; i < RESTART + 1; ++i )
-    {
-        sfree( workspace->h[i], "h[i]" );
-        sfree( workspace->v[i], "v[i]" );
-    }
-    sfree( workspace->h, "h" );
-    sfree( workspace->v, "v" );
-    sfree( workspace->y, "y" );
-    sfree( workspace->z, "z" );
-    sfree( workspace->g, "g" );
-    sfree( workspace->hs, "hs" );
-    sfree( workspace->hc, "hc" );
-    /* CG storage */
-    sfree( workspace->r, "r" );
-    sfree( workspace->d, "d" );
-    sfree( workspace->q, "q" );
-    sfree( workspace->p, "p" );
-    sfree( workspace->r2, "r2" );
-    sfree( workspace->d2, "d2" );
-    sfree( workspace->q2, "q2" );
-    sfree( workspace->p2, "p2" );
+    if ( control->cm_solver_type == GMRES_S
+            || control->cm_solver_type == GMRES_H_S )
+    {
+        for ( i = 0; i < RESTART + 1; ++i )
+        {
+            sfree( workspace->h[i], "h[i]" );
+            sfree( workspace->v[i], "v[i]" );
+        }
+
+        sfree( workspace->y, "y" );
+        sfree( workspace->g, "g" );
+        sfree( workspace->hc, "hc" );
+        sfree( workspace->hs, "hs" );
+        sfree( workspace->h, "h" );
+        sfree( workspace->v, "v" );
+    }
+
+    if ( control->cm_solver_type == GMRES_S
+            || control->cm_solver_type == GMRES_H_S
+            || control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
+    {
+        sfree( workspace->z, "z" );
+    }
+
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
+    {
+        sfree( workspace->d, "d" );
+        sfree( workspace->p, "p" );
+        sfree( workspace->q, "q" );
+        sfree( workspace->r, "r" );
+    }
+
+    if ( control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
+    {
+        sfree( workspace->m, "m" );
+        sfree( workspace->n, "n" );
+        sfree( workspace->u, "u" );
+        sfree( workspace->w, "w" );
+    }
+
+    if ( control->cm_solver_type == CG_S 
+            || control->cm_solver_type == PIPECG_S )
+    {
+        sfree( workspace->r2, "r2" );
+        sfree( workspace->d2, "d2" );
+        sfree( workspace->q2, "q2" );
+        sfree( workspace->p2, "p2" );
+    }
+
+    if ( control->cm_solver_type == PIPECG_S ) /* rvec2 vectors allocated only for PIPECG */
+    {
+        sfree( workspace->m2, "m2" );
+        sfree( workspace->n2, "n2" );
+        sfree( workspace->u2, "u2" );
+        sfree( workspace->w2, "w2" );
+        sfree( workspace->z2, "z2" );
+    }
 
     /* integrator */
     // sfree( workspace->f_old );
@@ -239,144 +277,171 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
 
 
 int Allocate_Workspace( reax_system *system, control_params *control,
-                        storage *workspace, int local_cap, int total_cap,
-                        MPI_Comm comm, char *msg )
+        storage *workspace, int local_cap, int total_cap,
+        MPI_Comm comm, char *msg )
 {
-    int i, total_real, total_rvec, local_int, local_real, local_rvec;
+    int i, total_real, total_rvec, local_rvec;
 
     workspace->allocated = 1;
     total_real = total_cap * sizeof(real);
     total_rvec = total_cap * sizeof(rvec);
-    local_int = local_cap * sizeof(int);
-    local_real = local_cap * sizeof(real);
     local_rvec = local_cap * sizeof(rvec);
 
     /* communication storage */
     for ( i = 0; i < MAX_NBRS; ++i )
     {
-        workspace->tmp_dbl[i] = (real*)
-                                scalloc( total_cap, sizeof(real), "tmp_dbl", comm );
-        workspace->tmp_rvec[i] = (rvec*)
-                                 scalloc( total_cap, sizeof(rvec), "tmp_rvec", comm );
-        workspace->tmp_rvec2[i] = (rvec2*)
-                                  scalloc( total_cap, sizeof(rvec2), "tmp_rvec2", comm );
+        workspace->tmp_dbl[i] = scalloc( total_cap, sizeof(real), "tmp_dbl", comm );
+        workspace->tmp_rvec[i] = scalloc( total_cap, sizeof(rvec), "tmp_rvec", comm );
+        workspace->tmp_rvec2[i] = scalloc( total_cap, sizeof(rvec2), "tmp_rvec2", comm );
     }
 
     /* bond order related storage  */
-    workspace->within_bond_box = (int*)
-                                 scalloc( total_cap, sizeof(int), "skin", comm );
-    workspace->total_bond_order = (real*) smalloc( total_real, "total_bo", comm );
-    workspace->Deltap = (real*) smalloc( total_real, "Deltap", comm );
-    workspace->Deltap_boc = (real*) smalloc( total_real, "Deltap_boc", comm );
-    workspace->dDeltap_self = (rvec*) smalloc( total_rvec, "dDeltap_self", comm );
-    workspace->Delta = (real*) smalloc( total_real, "Delta", comm );
-    workspace->Delta_lp = (real*) smalloc( total_real, "Delta_lp", comm );
-    workspace->Delta_lp_temp = (real*)
-                               smalloc( total_real, "Delta_lp_temp", comm );
-    workspace->dDelta_lp = (real*) smalloc( total_real, "dDelta_lp", comm );
-    workspace->dDelta_lp_temp = (real*)
-                                smalloc( total_real, "dDelta_lp_temp", comm );
-    workspace->Delta_e = (real*) smalloc( total_real, "Delta_e", comm );
-    workspace->Delta_boc = (real*) smalloc( total_real, "Delta_boc", comm );
-    workspace->nlp = (real*) smalloc( total_real, "nlp", comm );
-    workspace->nlp_temp = (real*) smalloc( total_real, "nlp_temp", comm );
-    workspace->Clp = (real*) smalloc( total_real, "Clp", comm );
-    workspace->vlpex = (real*) smalloc( total_real, "vlpex", comm );
-    workspace->bond_mark = (int*)
-                           scalloc( total_cap, sizeof(int), "bond_mark", comm );
-    workspace->done_after = (int*)
-                            scalloc( total_cap, sizeof(int), "done_after", comm );
-    // fprintf( stderr, "p%d: bond order storage\n", system->my_rank );
-
-    /* QEq storage */
-    workspace->Hdia_inv = (real*)
-                          scalloc( total_cap, sizeof(real), "Hdia_inv", comm );
-    workspace->b_s = (real*) scalloc( total_cap, sizeof(real), "b_s", comm );
-    workspace->b_t = (real*) scalloc( total_cap, sizeof(real), "b_t", comm );
-    workspace->b_prc = (real*) scalloc( total_cap, sizeof(real), "b_prc", comm );
-    workspace->b_prm = (real*) scalloc( total_cap, sizeof(real), "b_prm", comm );
-    workspace->s = (real*) scalloc( total_cap, sizeof(real), "s", comm );
-    workspace->t = (real*) scalloc( total_cap, sizeof(real), "t", comm );
-    workspace->droptol = (real*)
-                         scalloc( total_cap, sizeof(real), "droptol", comm );
-    workspace->b = (rvec2*) scalloc( total_cap, sizeof(rvec2), "b", comm );
-    workspace->x = (rvec2*) scalloc( total_cap, sizeof(rvec2), "x", comm );
-
-    /* GMRES storage */
-    workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y", comm );
-    workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z", comm );
-    workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g", comm );
-    workspace->h = (real**) scalloc( RESTART + 1, sizeof(real*), "h", comm );
-    workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs", comm );
-    workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc", comm );
-    workspace->v = (real**) scalloc( RESTART + 1, sizeof(real*), "v", comm );
-
-    for ( i = 0; i < RESTART + 1; ++i )
-    {
-        workspace->h[i] = (real*) scalloc( RESTART + 1, sizeof(real), "h[i]", comm );
-        workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]", comm );
-    }
-
-    /* CG storage */
-    workspace->r = (real*) scalloc( total_cap, sizeof(real), "r", comm );
-    workspace->d = (real*) scalloc( total_cap, sizeof(real), "d", comm );
-    workspace->q = (real*) scalloc( total_cap, sizeof(real), "q", comm );
-    workspace->p = (real*) scalloc( total_cap, sizeof(real), "p", comm );
-    workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2", comm );
-    workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2", comm );
-    workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2", comm );
-    workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2", comm );
+    workspace->within_bond_box = scalloc( total_cap, sizeof(int), "skin", comm );
+    workspace->total_bond_order = smalloc( total_real, "total_bo", comm );
+    workspace->Deltap = smalloc( total_real, "Deltap", comm );
+    workspace->Deltap_boc = smalloc( total_real, "Deltap_boc", comm );
+    workspace->dDeltap_self = smalloc( total_rvec, "dDeltap_self", comm );
+    workspace->Delta = smalloc( total_real, "Delta", comm );
+    workspace->Delta_lp = smalloc( total_real, "Delta_lp", comm );
+    workspace->Delta_lp_temp = smalloc( total_real, "Delta_lp_temp", comm );
+    workspace->dDelta_lp = smalloc( total_real, "dDelta_lp", comm );
+    workspace->dDelta_lp_temp = smalloc( total_real, "dDelta_lp_temp", comm );
+    workspace->Delta_e = smalloc( total_real, "Delta_e", comm );
+    workspace->Delta_boc = smalloc( total_real, "Delta_boc", comm );
+    workspace->nlp = smalloc( total_real, "nlp", comm );
+    workspace->nlp_temp = smalloc( total_real, "nlp_temp", comm );
+    workspace->Clp = smalloc( total_real, "Clp", comm );
+    workspace->vlpex = smalloc( total_real, "vlpex", comm );
+    workspace->bond_mark = scalloc( total_cap, sizeof(int), "bond_mark", comm );
+
+    /* CM storage */
+    workspace->Hdia_inv = scalloc( total_cap, sizeof(real), "Hdia_inv", comm );
+    workspace->b_s = scalloc( total_cap, sizeof(real), "b_s", comm );
+    workspace->b_t = scalloc( total_cap, sizeof(real), "b_t", comm );
+    workspace->b_prc = scalloc( total_cap, sizeof(real), "b_prc", comm );
+    workspace->b_prm = scalloc( total_cap, sizeof(real), "b_prm", comm );
+    workspace->s = scalloc( total_cap, sizeof(real), "s", comm );
+    workspace->t = scalloc( total_cap, sizeof(real), "t", comm );
+    workspace->droptol = scalloc( total_cap, sizeof(real), "droptol", comm );
+    workspace->b = scalloc( total_cap, sizeof(rvec2), "b", comm );
+    workspace->x = scalloc( total_cap, sizeof(rvec2), "x", comm );
+
+    if ( control->cm_solver_type == GMRES_S
+            || control->cm_solver_type == GMRES_H_S )
+    {
+        workspace->y = scalloc( RESTART + 1, sizeof(real), "y", comm );
+        workspace->g = scalloc( RESTART + 1, sizeof(real), "g", comm );
+        workspace->hc = scalloc( RESTART + 1, sizeof(real), "hc", comm );
+        workspace->hs = scalloc( RESTART + 1, sizeof(real), "hs", comm );
+        workspace->h = scalloc( RESTART + 1, sizeof(real*), "h", comm );
+        workspace->v = scalloc( RESTART + 1, sizeof(real*), "v", comm );
+
+        for ( i = 0; i < RESTART + 1; ++i )
+        {
+            workspace->h[i] = scalloc( RESTART + 1, sizeof(real), "h[i]", comm );
+            workspace->v[i] = scalloc( total_cap, sizeof(real), "v[i]", comm );
+        }
+    }
+
+    if ( control->cm_solver_type == GMRES_S
+            || control->cm_solver_type == GMRES_H_S )
+    {
+        workspace->z = scalloc( RESTART + 1, sizeof(real), "z", comm );
+    }
+    else if ( control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
+    {
+        workspace->z = scalloc( total_cap, sizeof(real), "z", comm );
+    }
+
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
+    {
+        workspace->d = scalloc( total_cap, sizeof(real), "d", comm );
+        workspace->p = scalloc( total_cap, sizeof(real), "p", comm );
+        workspace->q = scalloc( total_cap, sizeof(real), "q", comm );
+        workspace->r = scalloc( total_cap, sizeof(real), "r", comm );
+    }
+
+    if ( control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
+    {
+        workspace->m = scalloc( total_cap, sizeof(real), "m", comm );
+        workspace->n = scalloc( total_cap, sizeof(real), "n", comm );
+        workspace->u = scalloc( total_cap, sizeof(real), "u", comm );
+        workspace->w = scalloc( total_cap, sizeof(real), "w", comm );
+    }
+
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S )
+    {
+        workspace->d2 = scalloc( total_cap, sizeof(rvec2), "d2", comm );
+        workspace->r2 = scalloc( total_cap, sizeof(rvec2), "r2", comm );
+        workspace->p2 = scalloc( total_cap, sizeof(rvec2), "p2", comm );
+        workspace->q2 = scalloc( total_cap, sizeof(rvec2), "q2", comm );
+    }
+
+    if ( control->cm_solver_type == PIPECG_S )
+    {
+        workspace->m2 = scalloc( total_cap, sizeof(rvec2), "m2", comm );
+        workspace->n2 = scalloc( total_cap, sizeof(rvec2), "n2", comm );
+        workspace->u2 = scalloc( total_cap, sizeof(rvec2), "u2", comm );
+        workspace->w2 = scalloc( total_cap, sizeof(rvec2), "w2", comm );
+        workspace->z2 = scalloc( total_cap, sizeof(rvec2), "z2", comm );
+    }
 
     /* integrator storage */
-    workspace->v_const = (rvec*) smalloc( local_rvec, "v_const", comm );
+    workspace->v_const = smalloc( local_rvec, "v_const", comm );
 
     /* storage for analysis */
     if ( control->molecular_analysis || control->diffusion_coef )
     {
-        workspace->mark = (int*) scalloc( local_cap, sizeof(int), "mark", comm );
-        workspace->old_mark = (int*)
-                              scalloc( local_cap, sizeof(int), "old_mark", comm );
+        workspace->mark = scalloc( local_cap, sizeof(int), "mark", comm );
+        workspace->old_mark = scalloc( local_cap, sizeof(int), "old_mark", comm );
     }
     else
-        workspace->mark = workspace->old_mark = NULL;
+    {
+        workspace->mark = NULL;
+        workspace->old_mark = NULL;
+    }
 
     if ( control->diffusion_coef )
-        workspace->x_old = (rvec*)
-                           scalloc( local_cap, sizeof(rvec), "x_old", comm );
-    else workspace->x_old = NULL;
+    {
+        workspace->x_old = scalloc( local_cap, sizeof(rvec), "x_old", comm );
+    }
+    else
+    {
+        workspace->x_old = NULL;
+    }
 
     /* force related storage */
-    workspace->f = (rvec*) scalloc( total_cap, sizeof(rvec), "f", comm );
-    workspace->CdDelta = (real*)
-                         scalloc( total_cap, sizeof(real), "CdDelta", comm );
+    workspace->f = scalloc( total_cap, sizeof(rvec), "f", comm );
+    workspace->CdDelta = scalloc( total_cap, sizeof(real), "CdDelta", comm );
 
 #ifdef TEST_FORCES
-    workspace->dDelta = (rvec*) smalloc( total_rvec, "dDelta", comm );
-    workspace->f_ele = (rvec*) smalloc( total_rvec, "f_ele", comm );
-    workspace->f_vdw = (rvec*) smalloc( total_rvec, "f_vdw", comm );
-    workspace->f_bo = (rvec*) smalloc( total_rvec, "f_bo", comm );
-    workspace->f_be = (rvec*) smalloc( total_rvec, "f_be", comm );
-    workspace->f_lp = (rvec*) smalloc( total_rvec, "f_lp", comm );
-    workspace->f_ov = (rvec*) smalloc( total_rvec, "f_ov", comm );
-    workspace->f_un = (rvec*) smalloc( total_rvec, "f_un", comm );
-    workspace->f_ang = (rvec*) smalloc( total_rvec, "f_ang", comm );
-    workspace->f_coa = (rvec*) smalloc( total_rvec, "f_coa", comm );
-    workspace->f_pen = (rvec*) smalloc( total_rvec, "f_pen", comm );
-    workspace->f_hb = (rvec*) smalloc( total_rvec, "f_hb", comm );
-    workspace->f_tor = (rvec*) smalloc( total_rvec, "f_tor", comm );
-    workspace->f_con = (rvec*) smalloc( total_rvec, "f_con", comm );
-    workspace->f_tot = (rvec*) smalloc( total_rvec, "f_tot", comm );
+    workspace->dDelta = smalloc( total_rvec, "dDelta", comm );
+    workspace->f_ele = smalloc( total_rvec, "f_ele", comm );
+    workspace->f_vdw = smalloc( total_rvec, "f_vdw", comm );
+    workspace->f_bo = smalloc( total_rvec, "f_bo", comm );
+    workspace->f_be = smalloc( total_rvec, "f_be", comm );
+    workspace->f_lp = smalloc( total_rvec, "f_lp", comm );
+    workspace->f_ov = smalloc( total_rvec, "f_ov", comm );
+    workspace->f_un = smalloc( total_rvec, "f_un", comm );
+    workspace->f_ang = smalloc( total_rvec, "f_ang", comm );
+    workspace->f_coa = smalloc( total_rvec, "f_coa", comm );
+    workspace->f_pen = smalloc( total_rvec, "f_pen", comm );
+    workspace->f_hb = smalloc( total_rvec, "f_hb", comm );
+    workspace->f_tor = smalloc( total_rvec, "f_tor", comm );
+    workspace->f_con = smalloc( total_rvec, "f_con", comm );
+    workspace->f_tot = smalloc( total_rvec, "f_tot", comm );
 
     if ( system->my_rank == MASTER_NODE )
     {
-        workspace->rcounts = (int*)
-                             smalloc( system->wsize * sizeof(int), "rcount", comm );
-        workspace->displs = (int*)
-                            smalloc( system->wsize * sizeof(int), "displs", comm );
-        workspace->id_all = (int*)
-                            smalloc( system->bigN * sizeof(int), "id_all", comm );
-        workspace->f_all = (rvec*)
-                           smalloc( system->bigN * sizeof(rvec), "f_all", comm );
+        workspace->rcounts = smalloc( system->wsize * sizeof(int), "rcount", comm );
+        workspace->displs = smalloc( system->wsize * sizeof(int), "displs", comm );
+        workspace->id_all = smalloc( system->bigN * sizeof(int), "id_all", comm );
+        workspace->f_all = smalloc( system->bigN * sizeof(rvec), "f_all", comm );
     }
     else
     {
@@ -394,8 +459,12 @@ int Allocate_Workspace( reax_system *system, control_params *control,
 void Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs,
                                MPI_Comm comm )
 {
+    int format;
+
+    format = far_nbrs->format;
+
     Delete_List( far_nbrs, comm );
-    if (!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, comm ))
+    if (!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, format, far_nbrs, comm ))
     {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
@@ -403,7 +472,8 @@ void Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs,
 }
 
 
-int Allocate_Matrix( sparse_matrix **pH, int cap, int m, MPI_Comm comm )
+int Allocate_Matrix( sparse_matrix **pH, int cap, int m,
+       int format, MPI_Comm comm )
 {
     sparse_matrix *H;
 
@@ -412,6 +482,26 @@ int Allocate_Matrix( sparse_matrix **pH, int cap, int m, MPI_Comm comm )
     H = *pH;
     H->cap = cap;
     H->m = m;
+    H->format = format;
+    H->start = (int*) smalloc( sizeof(int) * cap, "matrix_start", comm );
+    H->end = (int*) smalloc( sizeof(int) * cap, "matrix_end", comm );
+    H->entries = (sparse_matrix_entry*)
+                 smalloc( sizeof(sparse_matrix_entry) * m, "matrix_entries", comm );
+
+    return SUCCESS;
+}
+int Allocate_Matrix2( sparse_matrix **pH, int n, int cap, int m,
+        int format, MPI_Comm comm )
+{
+    sparse_matrix *H;
+
+    *pH = (sparse_matrix*)
+          smalloc( sizeof(sparse_matrix), "sparse_matrix", comm );
+    H = *pH;
+    H->n = n;
+    H->cap = cap;
+    H->m = m;
+    H->format = format;
     H->start = (int*) smalloc( sizeof(int) * cap, "matrix_start", comm );
     H->end = (int*) smalloc( sizeof(int) * cap, "matrix_end", comm );
     H->entries = (sparse_matrix_entry*)
@@ -433,8 +523,12 @@ void Deallocate_Matrix( sparse_matrix *H )
 int Reallocate_Matrix( sparse_matrix **H, int n, int m, char *name,
                        MPI_Comm comm )
 {
+    int format;
+
+    format = (*H)->format;
+
     Deallocate_Matrix( *H );
-    if ( !Allocate_Matrix( H, n, m, comm ) )
+    if ( !Allocate_Matrix( H, n, m, format, comm ) )
     {
         fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
@@ -452,7 +546,9 @@ int Reallocate_Matrix( sparse_matrix **H, int n, int m, char *name,
 int Reallocate_HBonds_List( reax_system *system, reax_list *hbonds,
                             MPI_Comm comm )
 {
-    int i, id, total_hbonds;
+    int i, id, total_hbonds, format;
+
+    format = hbonds->format;
 
     total_hbonds = 0;
     for ( i = 0; i < system->n; ++i )
@@ -466,7 +562,7 @@ int Reallocate_HBonds_List( reax_system *system, reax_list *hbonds,
     total_hbonds = (int)(MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS ));
 
     Delete_List( hbonds, comm );
-    if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND, hbonds, comm ) )
+    if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND, format, hbonds, comm ) )
     {
         fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
@@ -479,7 +575,9 @@ int Reallocate_HBonds_List( reax_system *system, reax_list *hbonds,
 int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
                            int *total_bonds, int *est_3body, MPI_Comm comm )
 {
-    int i;
+    int i, format;
+
+    format = bonds->format;
 
     *total_bonds = 0;
     *est_3body = 0;
@@ -493,7 +591,7 @@ int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
     *total_bonds = (int)(MAX( *total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS ));
 
     Delete_List( bonds, comm );
-    if (!Make_List(system->total_cap, *total_bonds, TYP_BOND, bonds, comm))
+    if (!Make_List(system->total_cap, *total_bonds, TYP_BOND, format, bonds, comm))
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
@@ -510,10 +608,11 @@ int Estimate_GCell_Population( reax_system* system, MPI_Comm comm )
     ivec c;
     grid *g;
     grid_cell *gc;
-    simulation_box *big_box, *my_ext_box;
+    simulation_box *my_ext_box;
+    //simulation_box *big_box;
     reax_atom *atoms;
 
-    big_box    = &(system->big_box);
+    //big_box    = &(system->big_box);
     my_ext_box = &(system->my_ext_box);
     g          = &(system->my_grid);
     atoms      = system->my_atoms;
@@ -676,7 +775,8 @@ void Deallocate_Grid( grid *g )
    buffers are void*, type cast to the correct pointer type to access
    the allocated buffers */
 int  Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv,
-                           neighbor_proc *my_nbrs, char *msg )
+                           neighbor_proc *my_nbrs, neighbor_proc *my_nt_nbrs,
+                           char *msg )
 {
     int i;
     mpi_out_data  *mpi_buf;
@@ -684,24 +784,47 @@ int  Allocate_MPI_Buffers( mpi_datatypes *mpi_data, int est_recv,
 
     comm = mpi_data->world;
 
-    /* in buffers */
-    mpi_data->in1_buffer = (void*)
-                           scalloc( est_recv, sizeof(boundary_atom), "in1_buffer", comm );
-    mpi_data->in2_buffer = (void*)
-                           scalloc( est_recv, sizeof(boundary_atom), "in2_buffer", comm );
-
-    /* out buffers */
+    /* buffers for incoming messages,
+     * see SendRecv for MPI datatypes sent */
+    mpi_data->in1_buffer = scalloc( est_recv,
+            MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ),
+            "Allocate_MPI_Buffers::in1_buffer", comm );
+    mpi_data->in2_buffer = scalloc( est_recv,
+            MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ),
+            "Allocate_MPI_Buffers::in2_buffer", comm );
+
+    /* buffers for outgoing messages,
+     * see SendRecv for MPI datatypes sent */
     for ( i = 0; i < MAX_NBRS; ++i )
     {
-        mpi_buf = &( mpi_data->out_buffers[i] );
+        mpi_buf = &mpi_data->out_buffers[i];
+
         /* allocate storage for the neighbor processor i */
-        mpi_buf->index = (int*)
-                         scalloc( my_nbrs[i].est_send, sizeof(int), "mpibuf:index", comm );
-        mpi_buf->out_atoms = (void*)
-                             scalloc( my_nbrs[i].est_send, sizeof(boundary_atom), "mpibuf:out_atoms",
-                                      comm );
+        mpi_buf->index = scalloc( my_nbrs[i].est_send, sizeof(int),
+                "Allocate_MPI_Buffers::mpi_buf->index", comm );
+        mpi_buf->out_atoms = scalloc( my_nbrs[i].est_send,
+                MAX3( sizeof(mpi_atom), sizeof(boundary_atom), sizeof(rvec) ),
+                "Allocate_MPI_Buffers::mpi_buf->out_atoms", comm );
     }
 
+#if defined(NEUTRAL_TERRITORY)
+    /* Neutral Territory in and out buffers */
+    for ( i = 0; i < REAX_MAX_NT_NBRS; ++i )
+    {
+        /* in buffers */
+        mpi_data->in_nt_buffer[i] = scalloc( my_nt_nbrs[i].est_recv, sizeof(real),
+                "mpibuf:in_nt_buffer", comm );
+        /* out buffer */
+        mpi_buf = &mpi_data->out_nt_buffers[i];
+
+        /* allocate storage for the neighbor processor i */
+        mpi_buf->index = scalloc( my_nt_nbrs[i].est_send, sizeof(int),
+                "mpibuf:nt_index", comm );
+        mpi_buf->out_atoms = scalloc( my_nt_nbrs[i].est_send, sizeof(real),
+                "mpibuf:nt_out_atoms", comm );
+    }
+#endif
+
     return SUCCESS;
 }
 
@@ -711,15 +834,26 @@ void Deallocate_MPI_Buffers( mpi_datatypes *mpi_data )
     int i;
     mpi_out_data  *mpi_buf;
 
-    sfree( mpi_data->in1_buffer, "in1_buffer" );
-    sfree( mpi_data->in2_buffer, "in2_buffer" );
+    sfree( mpi_data->in1_buffer, "Deallocate_MPI_Buffers::in1_buffer" );
+    sfree( mpi_data->in2_buffer, "Deallocate_MPI_Buffers::in2_buffer" );
 
     for ( i = 0; i < MAX_NBRS; ++i )
     {
-        mpi_buf = &( mpi_data->out_buffers[i] );
-        sfree( mpi_buf->index, "mpibuf:index" );
-        sfree( mpi_buf->out_atoms, "mpibuf:out_atoms" );
+        mpi_buf = &mpi_data->out_buffers[i];
+        sfree( mpi_buf->index, "Deallocate_MPI_Buffers::mpi_buf->index" );
+        sfree( mpi_buf->out_atoms, "Deallocate_MPI_Buffers::mpi_buf->out_atoms" );
+    }
+
+#if defined(NEUTRAL_TERRITORY)
+    for ( i = 0; i < REAX_MAX_NT_NBRS; ++i )
+    {
+        sfree( mpi_data->in_nt_buffer[i], "in_nt_buffer" );
+
+        mpi_buf = &mpi_data->out_nt_buffers[i];
+        sfree( mpi_buf->index, "mpibuf:nt_index" );
+        sfree( mpi_buf->out_atoms, "mpibuf:nt_out_atoms" );
     }
+#endif
 }
 
 
@@ -729,7 +863,7 @@ void ReAllocate( reax_system *system, control_params *control,
 {
     int i, j, k, p;
     int num_bonds, est_3body, nflag, Nflag, Hflag, mpi_flag, ret, total_send;
-    int renbr;
+    int renbr, format;
     reallocate_data *realloc;
     reax_list *far_nbrs;
     sparse_matrix *H;
@@ -766,10 +900,20 @@ void ReAllocate( reax_system *system, control_params *control,
     if ( system->n >= DANGER_ZONE * system->local_cap ||
             (0 && system->n <= LOOSE_ZONE * system->local_cap) )
     {
+#if !defined(NEUTRAL_TERRITORY)
         nflag = 1;
+#endif
         system->local_cap = (int)(system->n * SAFE_ZONE);
     }
 
+#if defined(NEUTRAL_TERRITORY)
+    if ( workspace->H->NT >= DANGER_ZONE * workspace->H->cap )
+    {
+        nflag = 1;
+        workspace->H->cap = (int)(workspace->H->NT * SAFE_ZONE_NT);
+    }
+#endif
+
     Nflag = 0;
     if ( system->N >= DANGER_ZONE * system->total_cap ||
             (0 && system->N <= LOOSE_ZONE * system->total_cap) )
@@ -855,8 +999,13 @@ void ReAllocate( reax_system *system, control_params *control,
                  (int)(realloc->Htop * SAFE_ZONE * sizeof(sparse_matrix_entry) /
                        (1024 * 1024)) );
 #endif
+#if defined(NEUTRAL_TERRITORY)
+        Reallocate_Matrix( &(workspace->H), H->cap,
+                           realloc->Htop * SAFE_ZONE_NT, "H", comm );
+#else
         Reallocate_Matrix( &(workspace->H), system->local_cap,
                            realloc->Htop * SAFE_ZONE, "H", comm );
+#endif
         //Deallocate_Matrix( workspace->L );
         //Deallocate_Matrix( workspace->U );
         workspace->L = NULL;
@@ -911,6 +1060,9 @@ void ReAllocate( reax_system *system, control_params *control,
                  (int)(realloc->num_3body * sizeof(three_body_interaction_data) /
                        (1024 * 1024)) );
 #endif
+
+        format = lists[THREE_BODIES]->format;
+
         Delete_List( lists[THREE_BODIES], comm );
 
         if ( num_bonds == -1 )
@@ -919,7 +1071,7 @@ void ReAllocate( reax_system *system, control_params *control,
         realloc->num_3body = (int)(MAX(realloc->num_3body * SAFE_ZONE, MIN_3BODIES));
 
         if ( !Make_List( num_bonds, realloc->num_3body, TYP_THREE_BODY,
-                         lists[THREE_BODIES], comm ) )
+                    format, lists[THREE_BODIES], comm ) )
         {
             fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
             MPI_Abort( comm, CANNOT_INITIALIZE );
@@ -968,6 +1120,21 @@ void ReAllocate( reax_system *system, control_params *control,
                 break;
             }
         }
+
+#if defined(NEUTRAL_TERRITORY)
+        /* also check individual outgoing Neutral Territory buffers */
+        for ( p = 0; p < REAX_MAX_NT_NBRS; ++p )
+        {
+            nbr_pr = &system->my_nt_nbrs[p];
+            nbr_data = &mpi_data->out_nt_buffers[p];
+
+            if ( nbr_data->cnt >= nbr_pr->est_send * 0.90 )
+            {
+                mpi_flag = 1;
+                break;
+            }
+        }
+#endif
     }
 
     if ( mpi_flag )
@@ -984,6 +1151,7 @@ void ReAllocate( reax_system *system, control_params *control,
         system->est_trans =
             (system->est_recv * sizeof(boundary_atom)) / sizeof(mpi_atom);
         total_send = 0;
+
         for ( p = 0; p < MAX_NBRS; ++p )
         {
             nbr_pr   = &( system->my_nbrs[p] );
@@ -991,6 +1159,16 @@ void ReAllocate( reax_system *system, control_params *control,
             nbr_pr->est_send = MAX( nbr_data->cnt * SAFER_ZONE, MIN_SEND );
             total_send += nbr_pr->est_send;
         }
+
+#if defined(NEUTRAL_TERRITORY)
+        for ( p = 0; p < REAX_MAX_NT_NBRS; ++p )
+        {
+            nbr_pr = &system->my_nt_nbrs[p];
+            nbr_data = &mpi_data->out_nt_buffers[p];
+            nbr_pr->est_send = MAX( nbr_data->cnt * SAFER_ZONE_NT, MIN_SEND );
+        }
+#endif
+
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating mpi_buf: recv=%d send=%d total=%dMB\n",
                  system->my_rank, system->est_recv, total_send,
@@ -1004,7 +1182,8 @@ void ReAllocate( reax_system *system, control_params *control,
         /* reallocate mpi buffers */
         Deallocate_MPI_Buffers( mpi_data );
         ret = Allocate_MPI_Buffers( mpi_data, system->est_recv,
-                                    system->my_nbrs, msg );
+                                    system->my_nbrs, system->my_nt_nbrs, 
+                                    msg );
         if ( ret != SUCCESS )
         {
             fprintf( stderr, "%s", msg );
diff --git a/PuReMD/src/allocate.h b/PuReMD/src/allocate.h
index 271cb054d636f19d1d3beeea09546f0a22b1c380..669861a2498afb97d24a8f04baf8a5e87c045788 100644
--- a/PuReMD/src/allocate.h
+++ b/PuReMD/src/allocate.h
@@ -26,17 +26,23 @@
 int PreAllocate_Space( reax_system*, control_params*, storage*, MPI_Comm );
 
 void reax_atom_Copy( reax_atom*, reax_atom* );
+
 int  Allocate_System( reax_system*, int, int, char* );
 
 int  Allocate_Workspace( reax_system*, control_params*, storage*,
                          int, int, MPI_Comm, char* );
 
 void Allocate_Grid( reax_system*, MPI_Comm );
+
 void Deallocate_Grid( grid* );
 
-int  Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, char* );
+int Allocate_MPI_Buffers( mpi_datatypes*, int, neighbor_proc*, neighbor_proc*, char* );
+
+int Allocate_Matrix( sparse_matrix**, int, int, int, MPI_Comm );
+
+int Allocate_Matrix2( sparse_matrix**, int, int, int, int, MPI_Comm );
 
-int Allocate_Matrix( sparse_matrix**, int, int, MPI_Comm );
+void Deallocate_Matrix( sparse_matrix * );
 
 int Allocate_HBond_List( int, int, int*, int*, reax_list* );
 
diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c
index 96c8397653e440d95feb25c9b074dc5b84de24d1..a50dbbf7ed150bb44a20fc2348d052192ccbb545 100644
--- a/PuReMD/src/basic_comm.c
+++ b/PuReMD/src/basic_comm.c
@@ -20,202 +20,630 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "basic_comm.h"
-#include "vector.h"
+  #include "basic_comm.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_basic_comm.h"
-#include "reax_vector.h"
+  #include "reax_basic_comm.h"
+  #include "reax_vector.h"
 #endif
 
-#if defined(PURE_REAX)
-void real_packer( void *dummy, mpi_out_data *out_buf )
+
+typedef void (*dist_packer)( void*, mpi_out_data* );
+typedef void (*coll_unpacker)( void*, void*, mpi_out_data* );
+
+
+static void int_packer( void *dummy, mpi_out_data *out_buf )
+{
+    int i;
+    int *buf = (int*) dummy;
+    int *out = (int*) out_buf->out_atoms;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        //if( buf[ out_buf->index[i] ] !=-1 )
+        out[i] = buf[ out_buf->index[i] ];
+    }
+}
+
+
+static void real_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
     real *buf = (real*) dummy;
     real *out = (real*) out_buf->out_atoms;
 
     for ( i = 0; i < out_buf->cnt; ++i )
+    {
         out[i] = buf[ out_buf->index[i] ];
+    }
 }
 
 
-void rvec_packer( void *dummy, mpi_out_data *out_buf )
+static void rvec_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
-    rvec *buf = (rvec*) dummy;
-    rvec *out = (rvec*)out_buf->out_atoms;
+    rvec *buf, *out;
+
+    buf = (rvec*) dummy;
+    out = (rvec*) out_buf->out_atoms;
 
     for ( i = 0; i < out_buf->cnt; ++i )
-        memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec) );
+    {
+        memcpy( out + i, buf + out_buf->index[i], sizeof(rvec) );
+    }
 }
 
 
-void rvec2_packer( void *dummy, mpi_out_data *out_buf )
+static void rvec2_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
-    rvec2 *buf = (rvec2*) dummy;
-    rvec2 *out = (rvec2*) out_buf->out_atoms;
+    rvec2 *buf, *out;
+
+    buf = (rvec2*) dummy;
+    out = (rvec2*) out_buf->out_atoms;
 
     for ( i = 0; i < out_buf->cnt; ++i )
-        memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec2) );
+    {
+        memcpy( out + i, buf + out_buf->index[i], sizeof(rvec2) );
+    }
+}
+
+
+static void int_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+{
+        int i;
+        int *in, *buf;
+
+        in = (int*) dummy_in;
+        buf = (int*) dummy_buf;
+
+        for ( i = 0; i < out_buf->cnt; ++i )
+        {
+            if( buf[ out_buf->index[i] ] == -1 && in[i] != -1 )
+            {
+                buf[ out_buf->index[i] ] = in[i];
+            }
+        }
+}
+
+
+static void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+{
+    int i;
+    real *in, *buf;
+
+    in = (real*) dummy_in;
+    buf = (real*) dummy_buf;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        buf[ out_buf->index[i] ] += in[i];
+    }
+}
+
+
+static void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+{
+    int i;
+    rvec *in, *buf;
+
+    in = (rvec*) dummy_in;
+    buf = (rvec*) dummy_buf;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        rvec_Add( buf[ out_buf->index[i] ], in[i] );
+
+#if defined(DEBUG)
+        fprintf( stderr, "rvec_unpacker: cnt=%d  i =%d  index[i]=%d\n",
+                out_buf->cnt, i, out_buf->index[i] );
+#endif
+    }
+}
+
+
+static void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+{
+    int i;
+    rvec2 *in, *buf;
+
+    in = (rvec2*) dummy_in;
+    buf = (rvec2*) dummy_buf;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        buf[ out_buf->index[i] ][0] += in[i][0];
+        buf[ out_buf->index[i] ][1] += in[i][1];
+    }
+}
+
+
+static void * Get_Buffer_Offset( const void * const buffer,
+        const int offset, const int type )
+{
+    void * ptr;
+
+    switch ( type )
+    {
+        case INT_PTR_TYPE:
+            ptr = (int *) buffer + offset;
+            break;
+
+        case REAL_PTR_TYPE:
+            ptr = (real *) buffer + offset;
+            break;
+
+        case RVEC_PTR_TYPE:
+            ptr = (rvec *) buffer + offset;
+            break;
+
+        case RVEC2_PTR_TYPE:
+            ptr = (rvec2 *) buffer + offset;
+            break;
+
+        default:
+            fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" );
+            exit( UNKNOWN_OPTION );
+            break;
+    }
+
+    return ptr;
+}
+
+
+static dist_packer Get_Packer( const int type )
+{
+    dist_packer ptr;
+
+    switch ( type )
+    {
+        case INT_PTR_TYPE:
+            ptr = &int_packer;
+            break;
+
+        case REAL_PTR_TYPE:
+            ptr = &real_packer;
+            break;
+
+        case RVEC_PTR_TYPE:
+            ptr = &rvec_packer;
+            break;
+
+        case RVEC2_PTR_TYPE:
+            ptr = &rvec2_packer;
+            break;
+
+        default:
+            fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" );
+            exit( UNKNOWN_OPTION );
+            break;
+    }
+
+    return ptr;
+}
+
+
+static coll_unpacker Get_Unpacker( const int type )
+{
+    coll_unpacker ptr;
+
+    switch ( type )
+    {
+        case INT_PTR_TYPE:
+            ptr = &int_unpacker;
+            break;
+
+        case REAL_PTR_TYPE:
+            ptr = &real_unpacker;
+            break;
+
+        case RVEC_PTR_TYPE:
+            ptr = &rvec_unpacker;
+            break;
+
+        case RVEC2_PTR_TYPE:
+            ptr = &rvec2_unpacker;
+            break;
+
+        default:
+            fprintf( stderr, "[ERROR] unknown pointer type. Terminating...\n" );
+            exit( UNKNOWN_OPTION );
+            break;
+    }
+
+    return ptr;
 }
 
 
-void Dist( reax_system* system, mpi_datatypes *mpi_data,
-           void *buf, MPI_Datatype type, int scale, dist_packer pack )
+void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
+        void *buf, int buf_type, MPI_Datatype type )
 {
+#if defined(NEUTRAL_TERRITORY)
+    int d, count, index;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req[6];
+    MPI_Status stat[6];
+    dist_packer pack;
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_nt_buffers;
+    pack = Get_Packer( buf_type );
+    count = 0;
+
+    /* initiate recvs */
+    for ( d = 0; d < 6; ++d )
+    {
+        if ( system->my_nt_nbrs[d].atoms_cnt )
+        {
+            count++;
+            MPI_Irecv( Get_Buffer_Offset( buf, system->my_nt_nbrs[d].atoms_str, buf_type ),
+                    system->my_nt_nbrs[d].atoms_cnt, type,
+                    system->my_nt_nbrs[d].receive_rank, d, comm, &req[d] );
+        }
+    }
+
+    for ( d = 0; d < 6; ++d)
+    {
+        /* pack and send the single outgoing message for neighbor d, if any */
+        if ( out_bufs[d].cnt )
+        {
+            pack( buf, &out_bufs[d] );
+            MPI_Send( out_bufs[d].out_atoms, out_bufs[d].cnt, type,
+                    system->my_nt_nbrs[d].rank, d, comm );
+        }
+    }
+
+    for ( d = 0; d < count; ++d )
+    {
+        MPI_Waitany( REAX_MAX_NT_NBRS, req, &index, stat);
+    }
+    
+#if defined(DEBUG)
+    fprintf( stderr, "p%d dist: done\n", system->my_rank );
+#endif
+
+#else
     int d;
     mpi_out_data *out_bufs;
     MPI_Comm comm;
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
-    neighbor_proc *nbr1, *nbr2;
+    const neighbor_proc *nbr1, *nbr2;
+    dist_packer pack;
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d dist: entered\n", system->my_rank );
 #endif
+
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
+    pack = Get_Packer( buf_type );
 
     for ( d = 0; d < 3; ++d )
     {
         /* initiate recvs */
-        nbr1 = &(system->my_nbrs[2 * d]);
+        nbr1 = &system->my_nbrs[2 * d];
         if ( nbr1->atoms_cnt )
-            MPI_Irecv( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
-                       nbr1->rank, 2 * d + 1, comm, &req1 );
+        {
+            MPI_Irecv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ),
+                    nbr1->atoms_cnt, type, nbr1->rank, 2 * d + 1, comm, &req1 );
+        }
 
-        nbr2 = &(system->my_nbrs[2 * d + 1]);
+        nbr2 = &system->my_nbrs[2 * d + 1];
         if ( nbr2->atoms_cnt )
-            MPI_Irecv( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
-                       nbr2->rank, 2 * d, comm, &req2 );
+        {
+            MPI_Irecv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ),
+                    nbr2->atoms_cnt, type, nbr2->rank, 2 * d, comm, &req2 );
+        }
 
         /* send both messages in dimension d */
         if ( out_bufs[2 * d].cnt )
         {
-            pack( buf, out_bufs + (2 * d) );
-            MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type,
-                      nbr1->rank, 2 * d, comm );
+            pack( buf, &out_bufs[2 * d] );
+            MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt,
+                    type, nbr1->rank, 2 * d, comm );
         }
 
         if ( out_bufs[2 * d + 1].cnt )
         {
-            pack( buf, out_bufs + (2 * d + 1) );
-            MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type,
-                      nbr2->rank, 2 * d + 1, comm );
+            pack( buf, &out_bufs[2 * d + 1] );
+            MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt,
+                    type, nbr2->rank, 2 * d + 1, comm );
         }
 
-        if ( nbr1->atoms_cnt ) MPI_Wait( &req1, &stat1 );
-        if ( nbr2->atoms_cnt ) MPI_Wait( &req2, &stat2 );
+        if( nbr1->atoms_cnt )
+        {
+            MPI_Wait( &req1, &stat1 );
+        }
+        if( nbr2->atoms_cnt )
+        {
+            MPI_Wait( &req2, &stat2 );
+        }
     }
 
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d dist: done\n", system->my_rank );
 #endif
+#endif
 }
 
 
-void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+void Dist_FS( const reax_system * const system, mpi_datatypes * const mpi_data,
+        void *buf, int buf_type, MPI_Datatype type )
 {
-    int i;
-    real *in = (real*) dummy_in;
-    real *buf = (real*) dummy_buf;
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2;
+    MPI_Status stat1, stat2;
+    const neighbor_proc *nbr1, *nbr2;
+    dist_packer pack;
 
-    for ( i = 0; i < out_buf->cnt; ++i )
-        buf[ out_buf->index[i] ] += in[i];
+#if defined(DEBUG)
+    fprintf( stderr, "p%d dist: entered\n", system->my_rank );
+#endif
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+    pack = Get_Packer( buf_type );
+
+    for ( d = 0; d < 3; ++d )
+    {
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+        if ( nbr1->atoms_cnt )
+        {
+            MPI_Irecv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ),
+                    nbr1->atoms_cnt, type, nbr1->rank, 2 * d + 1, comm, &req1 );
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+        if ( nbr2->atoms_cnt )
+        {
+            MPI_Irecv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ),
+                    nbr2->atoms_cnt, type, nbr2->rank, 2 * d, comm, &req2 );
+        }
+
+        /* send both messages in dimension d */
+        if ( out_bufs[2 * d].cnt )
+        {
+            pack( buf, &out_bufs[2 * d] );
+            MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt,
+                    type, nbr1->rank, 2 * d, comm );
+        }
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            pack( buf, &out_bufs[2 * d + 1] );
+            MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt,
+                    type, nbr2->rank, 2 * d + 1, comm );
+        }
+
+        if( nbr1->atoms_cnt )
+        {
+            MPI_Wait( &req1, &stat1 );
+        }
+        if( nbr2->atoms_cnt )
+        {
+            MPI_Wait( &req2, &stat2 );
+        }
+    }
+
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d dist: done\n", system->my_rank );
+#endif
 }
 
 
-void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
-{
-    int i;
-    rvec *in = (rvec*) dummy_in;
-    rvec *buf = (rvec*) dummy_buf;
+void Coll( const reax_system * const system, mpi_datatypes * const mpi_data,
+        void *buf, int buf_type, MPI_Datatype type )
+{   
+#if defined(NEUTRAL_TERRITORY)
+    int d, count, index;
+    void *in[6];
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req[6];
+    MPI_Status stat[6];
+    coll_unpacker unpack;
 
-    for ( i = 0; i < out_buf->cnt; ++i )
-    {
-        rvec_Add( buf[ out_buf->index[i] ], in[i] );
 #if defined(DEBUG)
-        fprintf( stderr, "rvec_unpacker: cnt=%d  i =%d  index[i]=%d\n",
-                 out_buf->cnt, i, out_buf->index[i] );
+    fprintf( stderr, "p%d coll: entered\n", system->my_rank );
 #endif
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_nt_buffers;
+    unpack = Get_Unpacker( buf_type );
+    count = 0;
+
+    for ( d = 0; d < 6; ++d )
+    {
+        in[d] = mpi_data->in_nt_buffer[d];
+
+        if ( out_bufs[d].cnt )
+        {
+            count++;
+            MPI_Irecv( in[d], out_bufs[d].cnt, type,
+                    system->my_nt_nbrs[d].rank, d, comm, &req[d] );
+        }
     }
-}
 
+    for ( d = 0; d < 6; ++d )
+    {
+        /* send the single message for neighbor d, if any */
+        if ( system->my_nt_nbrs[d].atoms_cnt )
+        {
+            MPI_Send( Get_Buffer_Offset( buf, system->my_nt_nbrs[d].atoms_str, buf_type ),
+                    system->my_nt_nbrs[d].atoms_cnt, type,
+                    system->my_nt_nbrs[d].receive_rank, d, comm );
+        }
+    }
+    
+    for ( d = 0; d < count; ++d )
+    {
+        MPI_Waitany( REAX_MAX_NT_NBRS, req, &index, stat);
+        unpack( in[index], buf, &out_bufs[index] );
+    }
 
-void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
-{
-    int i;
-    rvec2 *in = (rvec2*) dummy_in;
-    rvec2 *buf = (rvec2*) dummy_buf;
+#if defined(DEBUG)
+    fprintf( stderr, "p%d coll: done\n", system->my_rank );
+#endif
 
-    for ( i = 0; i < out_buf->cnt; ++i )
+#else
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2;
+    MPI_Status stat1, stat2;
+    const neighbor_proc *nbr1, *nbr2;
+    coll_unpacker unpack;
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d coll: entered\n", system->my_rank );
+#endif
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+    unpack = Get_Unpacker( buf_type );
+
+    for ( d = 2; d >= 0; --d )
     {
-        buf[ out_buf->index[i] ][0] += in[i][0];
-        buf[ out_buf->index[i] ][1] += in[i][1];
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+
+        if ( out_bufs[2 * d].cnt )
+        {
+            MPI_Irecv( mpi_data->in1_buffer, out_bufs[2 * d].cnt,
+                    type, nbr1->rank, 2 * d + 1, comm, &req1 );
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+
+            MPI_Irecv( mpi_data->in2_buffer, out_bufs[2 * d + 1].cnt,
+                    type, nbr2->rank, 2 * d, comm, &req2 );
+        }
+        
+        /* send both messages in dimension d */
+        if ( nbr1->atoms_cnt )
+        {
+            MPI_Send( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ),
+                    nbr1->atoms_cnt, type, nbr1->rank, 2 * d, comm );
+        }
+        
+        if ( nbr2->atoms_cnt )
+        {
+            MPI_Send( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ),
+                    nbr2->atoms_cnt, type, nbr2->rank, 2 * d + 1, comm );
+        }
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n",
+                system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
+                out_bufs[2 * d].cnt );
+        fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n",
+                system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
+                out_bufs[2 * d + 1].cnt );
+#endif
+
+        if ( out_bufs[2 * d].cnt )
+        {
+            MPI_Wait( &req1, &stat1 );
+            unpack( mpi_data->in1_buffer, buf, &out_bufs[2 * d] );
+        }
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            MPI_Wait( &req2, &stat2 );
+            unpack( mpi_data->in2_buffer, buf, &out_bufs[2 * d + 1] );
+        }
     }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d coll: done\n", system->my_rank );
+#endif
+#endif
 }
 
 
-void Coll( reax_system* system, mpi_datatypes *mpi_data,
-           void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
-{
+void Coll_FS( const reax_system * const system, mpi_datatypes * const mpi_data,
+        void *buf, int buf_type, MPI_Datatype type )
+{   
     int d;
-    void *in1, *in2;
     mpi_out_data *out_bufs;
     MPI_Comm comm;
     MPI_Request req1, req2;
     MPI_Status stat1, stat2;
-    neighbor_proc *nbr1, *nbr2;
+    const neighbor_proc *nbr1, *nbr2;
+    coll_unpacker unpack;
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d coll: entered\n", system->my_rank );
 #endif
+
     comm = mpi_data->comm_mesh3D;
-    in1 = mpi_data->in1_buffer;
-    in2 = mpi_data->in2_buffer;
     out_bufs = mpi_data->out_buffers;
+    unpack = Get_Unpacker( buf_type );
 
     for ( d = 2; d >= 0; --d )
     {
         /* initiate recvs */
-        nbr1 = &(system->my_nbrs[2 * d]);
+        nbr1 = &system->my_nbrs[2 * d];
+
         if ( out_bufs[2 * d].cnt )
-            MPI_Irecv(in1, out_bufs[2 * d].cnt, type, nbr1->rank, 2 * d + 1, comm, &req1);
+        {
+            MPI_Irecv( mpi_data->in1_buffer, out_bufs[2 * d].cnt,
+                    type, nbr1->rank, 2 * d + 1, comm, &req1 );
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
 
-        nbr2 = &(system->my_nbrs[2 * d + 1]);
         if ( out_bufs[2 * d + 1].cnt )
-            MPI_Irecv(in2, out_bufs[2 * d + 1].cnt, type, nbr2->rank, 2 * d, comm, &req2);
+        {
 
+            MPI_Irecv( mpi_data->in2_buffer, out_bufs[2 * d + 1].cnt,
+                    type, nbr2->rank, 2 * d, comm, &req2 );
+        }
+        
         /* send both messages in dimension d */
         if ( nbr1->atoms_cnt )
-            MPI_Send( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
-                      nbr1->rank, 2 * d, comm );
-
+        {
+            MPI_Send( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ),
+                    nbr1->atoms_cnt, type, nbr1->rank, 2 * d, comm );
+        }
+        
         if ( nbr2->atoms_cnt )
-            MPI_Send( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
-                      nbr2->rank, 2 * d + 1, comm );
+        {
+            MPI_Send( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ),
+                    nbr2->atoms_cnt, type, nbr2->rank, 2 * d + 1, comm );
+        }
 
 #if defined(DEBUG)
         fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n",
-                 system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
-                 out_bufs[2 * d].cnt );
+                system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
+                out_bufs[2 * d].cnt );
         fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n",
-                 system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
-                 out_bufs[2 * d + 1].cnt );
+                system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
+                out_bufs[2 * d + 1].cnt );
 #endif
 
         if ( out_bufs[2 * d].cnt )
         {
             MPI_Wait( &req1, &stat1 );
-            unpack( in1, buf, out_bufs + (2 * d) );
+            unpack( mpi_data->in1_buffer, buf, &out_bufs[2 * d] );
         }
 
         if ( out_bufs[2 * d + 1].cnt )
         {
             MPI_Wait( &req2, &stat2 );
-            unpack( in2, buf, out_bufs + (2 * d + 1) );
+            unpack( mpi_data->in2_buffer, buf, &out_bufs[2 * d + 1] );
         }
     }
 
@@ -223,17 +651,20 @@ void Coll( reax_system* system, mpi_datatypes *mpi_data,
     fprintf( stderr, "p%d coll: done\n", system->my_rank );
 #endif
 }
-#endif /*PURE_REAX*/
+
 
 /*****************************************************************************/
 real Parallel_Norm( real *v, int n, MPI_Comm comm )
 {
-    int  i;
+    int i;
     real my_sum, norm_sqr;
 
-    my_sum = 0;
+    my_sum = 0.0;
+
     for ( i = 0; i < n; ++i )
+    {
         my_sum += SQR( v[i] );
+    }
 
     MPI_Allreduce( &my_sum, &norm_sqr, 1, MPI_DOUBLE, MPI_SUM, comm );
 
@@ -241,15 +672,17 @@ real Parallel_Norm( real *v, int n, MPI_Comm comm )
 }
 
 
-
 real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm )
 {
     int  i;
     real my_dot, res;
 
-    my_dot = 0;
+    my_dot = 0.0;
+
     for ( i = 0; i < n; ++i )
+    {
         my_dot += v1[i] * v2[i];
+    }
 
     MPI_Allreduce( &my_dot, &res, 1, MPI_DOUBLE, MPI_SUM, comm );
 
@@ -257,7 +690,6 @@ real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm )
 }
 
 
-
 real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm )
 {
     int  i;
@@ -276,13 +708,13 @@ real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm )
 /*****************************************************************************/
 #if defined(TEST_FORCES)
 void Coll_ids_at_Master( reax_system *system, storage *workspace,
-                         mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int i;
     int *id_list;
 
     MPI_Gather( &system->n, 1, MPI_INT, workspace->rcounts, 1, MPI_INT,
-                MASTER_NODE, mpi_data->world );
+            MASTER_NODE, mpi_data->world );
 
     if ( system->my_rank == MASTER_NODE )
     {
@@ -296,8 +728,8 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
         id_list[i] = system->my_atoms[i].orig_id;
 
     MPI_Gatherv( id_list, system->n, MPI_INT,
-                 workspace->id_all, workspace->rcounts, workspace->displs,
-                 MPI_INT, MASTER_NODE, mpi_data->world );
+            workspace->id_all, workspace->rcounts, workspace->displs,
+            MPI_INT, MASTER_NODE, mpi_data->world );
 
     sfree( id_list, "id_list" );
 
@@ -312,11 +744,10 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
 
 
 void Coll_rvecs_at_Master( reax_system *system, storage *workspace,
-                           mpi_datatypes *mpi_data, rvec* v )
+        mpi_datatypes *mpi_data, rvec* v )
 {
     MPI_Gatherv( v, system->n, mpi_data->mpi_rvec,
-                 workspace->f_all, workspace->rcounts, workspace->displs,
-                 mpi_data->mpi_rvec, MASTER_NODE, mpi_data->world );
+            workspace->f_all, workspace->rcounts, workspace->displs,
+            mpi_data->mpi_rvec, MASTER_NODE, mpi_data->world );
 }
-
 #endif
diff --git a/PuReMD/src/basic_comm.h b/PuReMD/src/basic_comm.h
index b3d7a5222c786f8c71662547e3a36a77abf8fb92..e2fe70903a15f4dd61423cf14fe4b557d6802f6d 100644
--- a/PuReMD/src/basic_comm.h
+++ b/PuReMD/src/basic_comm.h
@@ -24,24 +24,39 @@
 
 #include "reax_types.h"
 
-void real_packer( void*, mpi_out_data* );
-void rvec_packer( void*, mpi_out_data* );
-void rvec2_packer( void*, mpi_out_data* );
-void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer);
 
-void real_unpacker( void*, void*, mpi_out_data* );
-void rvec_unpacker( void*, void*, mpi_out_data* );
-void rvec2_unpacker( void*, void*, mpi_out_data* );
-void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype,
-           int, coll_unpacker );
+enum pointer_type
+{
+    INT_PTR_TYPE = 0,
+    REAL_PTR_TYPE = 1,
+    RVEC_PTR_TYPE = 2,
+    RVEC2_PTR_TYPE = 3,
+};
+
+
+void Dist( const reax_system * const, mpi_datatypes * const,
+        void*, int, MPI_Datatype );
+
+void Dist_FS( const reax_system * const, mpi_datatypes * const,
+        void*, int, MPI_Datatype );
+
+void Coll( const reax_system * const, mpi_datatypes * const,
+        void*, int, MPI_Datatype );
+
+void Coll_FS( const reax_system * const, mpi_datatypes * const,
+        void*, int, MPI_Datatype );
 
 real Parallel_Norm( real*, int, MPI_Comm );
+
 real Parallel_Dot( real*, real*, int, MPI_Comm );
+
 real Parallel_Vector_Acc( real*, int, MPI_Comm );
 
 #if defined(TEST_FORCES)
 void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
+
 void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
 #endif
 
+
 #endif
diff --git a/PuReMD/src/bond_orders.c b/PuReMD/src/bond_orders.c
index cf6c69911f889bb528ef32c39e6542960ba0d7e0..0c5950e5b28c28a82591288b2ab0a3be82fbd0ec 100644
--- a/PuReMD/src/bond_orders.c
+++ b/PuReMD/src/bond_orders.c
@@ -662,41 +662,58 @@ void Add_dBond_to_Forces( int i, int pj,
 }
 
 
+/* Compute the bond order term between atoms i and j and,
+ * if this term meets or exceeds the cutoff bo_cut, add the
+ * bond to the bond lists of BOTH atoms (i.e., compute the term
+ * once and copy it to avoid redundant computation) */
 int BOp( storage *workspace, reax_list *bonds, real bo_cut,
-         int i, int btop_i, far_neighbor_data *nbr_pj,
-         single_body_parameters *sbp_i, single_body_parameters *sbp_j,
-         two_body_parameters *twbp )
+         int i, int btop_i, int j, ivec *rel_box, real d, rvec *dvec,
+         int far_nbr_list_format, single_body_parameters *sbp_i,
+         single_body_parameters *sbp_j, two_body_parameters *twbp )
 {
-    int j, btop_j;
     real r2, C12, C34, C56;
     real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
     real BO, BO_s, BO_pi, BO_pi2;
-    bond_data *ibond, *jbond;
-    bond_order_data *bo_ij, *bo_ji;
+    bond_data *ibond;
+    bond_order_data *bo_ij;
+    int btop_j;
+    bond_data *jbond;
+    bond_order_data *bo_ji;
 
-    j = nbr_pj->nbr;
-    r2 = SQR(nbr_pj->d);
+    r2 = SQR(d);
 
     if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
     {
-        C12 = twbp->p_bo1 * pow( nbr_pj->d / twbp->r_s, twbp->p_bo2 );
+        C12 = twbp->p_bo1 * pow( d / twbp->r_s, twbp->p_bo2 );
         BO_s = (1.0 + bo_cut) * exp( C12 );
     }
-    else BO_s = C12 = 0.0;
+    else
+    {
+        C12 = 0.0;
+        BO_s = 0.0;
+    }
 
     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
     {
-        C34 = twbp->p_bo3 * pow( nbr_pj->d / twbp->r_p, twbp->p_bo4 );
+        C34 = twbp->p_bo3 * pow( d / twbp->r_p, twbp->p_bo4 );
         BO_pi = exp( C34 );
     }
-    else BO_pi = C34 = 0.0;
+    else
+    {
+        C34 = 0.0;
+        BO_pi = 0.0;
+    }
 
     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
     {
-        C56 = twbp->p_bo5 * pow( nbr_pj->d / twbp->r_pp, twbp->p_bo6 );
+        C56 = twbp->p_bo5 * pow( d / twbp->r_pp, twbp->p_bo6 );
         BO_pi2 = exp( C56 );
     }
-    else BO_pi2 = C56 = 0.0;
+    else
+    {
+        C56 = 0.0;
+        BO_pi2 = 0.0;
+    }
 
     /* Initially BO values are the uncorrected ones, page 1 */
     BO = BO_s + BO_pi + BO_pi2;
@@ -704,30 +721,35 @@ int BOp( storage *workspace, reax_list *bonds, real bo_cut,
     if ( BO >= bo_cut )
     {
         /****** bonds i-j and j-i ******/
-        ibond = &( bonds->bond_list[btop_i] );
+        ibond = &bonds->bond_list[btop_i];
         btop_j = End_Index( j, bonds );
-        jbond = &(bonds->bond_list[btop_j]);
+        jbond = &bonds->bond_list[btop_j];
 
         ibond->nbr = j;
-        jbond->nbr = i;
-        ibond->d = nbr_pj->d;
-        jbond->d = nbr_pj->d;
-        rvec_Copy( ibond->dvec, nbr_pj->dvec );
-        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+        ibond->d = d;
+        rvec_Copy( ibond->dvec, *dvec );
+        ivec_Copy( ibond->rel_box, *rel_box );
         ibond->dbond_index = btop_i;
-        jbond->dbond_index = btop_i;
         ibond->sym_index = btop_j;
+        jbond->nbr = i;
+        jbond->d = d;
+        rvec_Scale( jbond->dvec, -1.0, *dvec );
+        ivec_Scale( jbond->rel_box, -1.0, *rel_box );
+        jbond->dbond_index = btop_i;
         jbond->sym_index = btop_i;
-        Set_End_Index( j, btop_j + 1, bonds );
 
-        bo_ij = &( ibond->bo_data );
-        bo_ji = &( jbond->bo_data );
-        bo_ji->BO = bo_ij->BO = BO;
-        bo_ji->BO_s = bo_ij->BO_s = BO_s;
-        bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
-        bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
+        Set_End_Index( j, btop_j + 1, bonds );
+        
+        bo_ij = &ibond->bo_data;
+        bo_ij->BO = BO;
+        bo_ij->BO_s = BO_s;
+        bo_ij->BO_pi = BO_pi;
+        bo_ij->BO_pi2 = BO_pi2;
+        bo_ji = &jbond->bo_data;
+        bo_ji->BO = BO;
+        bo_ji->BO_s = BO_s;
+        bo_ji->BO_pi = BO_pi;
+        bo_ji->BO_pi2 = BO_pi2;
 
         /* Bond Order page2-3, derivative of total bond order prime */
         Cln_BOp_s = twbp->p_bo2 * C12 / r2;
@@ -735,63 +757,193 @@ int BOp( storage *workspace, reax_list *bonds, real bo_cut,
         Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
         /* Only dln_BOp_xx wrt. dr_i is stored here, note that
-           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-        rvec_Scale(bo_ij->dln_BOp_s, -bo_ij->BO_s * Cln_BOp_s, ibond->dvec);
-        rvec_Scale(bo_ij->dln_BOp_pi, -bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec);
-        rvec_Scale(bo_ij->dln_BOp_pi2,
-                   -bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec);
-        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+         * dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+        rvec_Scale( bo_ij->dln_BOp_s, -1.0 * bo_ij->BO_s * Cln_BOp_s, ibond->dvec );
+        rvec_Scale( bo_ij->dln_BOp_pi, -1.0 * bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec );
+        rvec_Scale( bo_ij->dln_BOp_pi2, -1.0 * bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec );
+        rvec_Scale( bo_ji->dln_BOp_s, -1.0, bo_ij->dln_BOp_s );
+        rvec_Scale( bo_ji->dln_BOp_pi, -1.0, bo_ij->dln_BOp_pi );
+        rvec_Scale( bo_ji->dln_BOp_pi2, -1.0, bo_ij->dln_BOp_pi2 );
 
         /* Only dBOp wrt. dr_i is stored here, note that
-           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-        rvec_Scale( bo_ij->dBOp,
-                    -(bo_ij->BO_s * Cln_BOp_s +
-                      bo_ij->BO_pi * Cln_BOp_pi +
-                      bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-        rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
+         * dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+        rvec_Scale( bo_ij->dBOp, -1.0 * (bo_ij->BO_s * Cln_BOp_s 
+                    + bo_ij->BO_pi * Cln_BOp_pi 
+                    + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+        rvec_Scale( bo_ji->dBOp, -1.0, bo_ij->dBOp );
 
         rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
         rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
 
         bo_ij->BO_s -= bo_cut;
         bo_ij->BO -= bo_cut;
+        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
+        bo_ij->Cdbo = 0.0;
+        bo_ij->Cdbopi = 0.0;
+        bo_ij->Cdbopi2 = 0.0;
         bo_ji->BO_s -= bo_cut;
         bo_ji->BO -= bo_cut;
-        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
         workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-
-        /*fprintf( stderr, "%d %d %g %g %g\n",
-          i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
-
-        /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n",
-          Cln_BOp_s, twbp->p_bo2, C12 );
-          fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n",
-          Cln_BOp_pi, twbp->p_bo4, C34 );
-          fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
-          Cln_BOp_pi2, twbp->p_bo6, C56 );*/
-        /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
-          fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
-          fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
-          fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n",
-          twbp->r_s, twbp->r_p, twbp->r_pp );
-          fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
-
-        /*fprintf( stderr, "\tfactors: %g %g %g\n",
-          -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi +
-          bo_ij->BO_pi2 * Cln_BOp_pp),
-          -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
-        /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n",
-          bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
-          fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n",
-          bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1],
-          bo_ij->dln_BOp_pi[2] );
-          fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
-          bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1],
-          bo_ij->dln_BOp_pi2[2] );*/
+        bo_ji->Cdbo = 0.0;
+        bo_ji->Cdbopi = 0.0;
+        bo_ji->Cdbopi2 = 0.0;
+
+        return 1;
+    }
+
+    return 0;
+}
+
+
+/* Compute the bond order term between atoms i and j and,
+ * if this term meets or exceeds the cutoff bo_cut, add the bond
+ * to the bond list according to the following convention:
+ *   * if the far neighbor list is stored in half format,
+ *      add BOTH atoms to each other's portion of the bond list
+ *   * if the far neighbor list is stored in full format,
+ *      add ONLY the i-j entry (atom j as a neighbor, at slot btop_i) */
+int BOp_redundant( storage *workspace, reax_list *bonds, real bo_cut,
+         int i, int btop_i, int j, ivec *rel_box, real d, rvec *dvec,
+         int far_nbr_list_format, single_body_parameters *sbp_i,
+         single_body_parameters *sbp_j, two_body_parameters *twbp )
+{
+    real r2, C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    bond_data *ibond;
+    bond_order_data *bo_ij;
+    int btop_j;
+    bond_data *jbond;
+    bond_order_data *bo_ji;
+
+    r2 = SQR(d);
+
+    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
+    {
+        C12 = twbp->p_bo1 * pow( d / twbp->r_s, twbp->p_bo2 );
+        BO_s = (1.0 + bo_cut) * exp( C12 );
+    }
+    else
+    {
+        C12 = 0.0;
+        BO_s = 0.0;
+    }
+
+    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
+    {
+        C34 = twbp->p_bo3 * pow( d / twbp->r_p, twbp->p_bo4 );
+        BO_pi = exp( C34 );
+    }
+    else
+    {
+        C34 = 0.0;
+        BO_pi = 0.0;
+    }
+
+    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
+    {
+        C56 = twbp->p_bo5 * pow( d / twbp->r_pp, twbp->p_bo6 );
+        BO_pi2 = exp( C56 );
+    }
+    else
+    {
+        C56 = 0.0;
+        BO_pi2 = 0.0;
+    }
+
+    /* Initially BO values are the uncorrected ones, page 1 */
+    BO = BO_s + BO_pi + BO_pi2;
+
+    if ( BO >= bo_cut )
+    {
+        /****** bonds i-j and j-i ******/
+        ibond = &bonds->bond_list[btop_i];
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            btop_j = End_Index( j, bonds );
+            jbond = &bonds->bond_list[btop_j];
+        }
+
+        ibond->nbr = j;
+        ibond->d = d;
+        rvec_Copy( ibond->dvec, *dvec );
+        ivec_Copy( ibond->rel_box, *rel_box );
+        ibond->dbond_index = btop_i;
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            ibond->sym_index = btop_j;
+            jbond->nbr = i;
+            jbond->d = d;
+            rvec_Scale( jbond->dvec, -1.0, *dvec );
+            ivec_Scale( jbond->rel_box, -1.0, *rel_box );
+            jbond->dbond_index = btop_i;
+            jbond->sym_index = btop_i;
+
+            Set_End_Index( j, btop_j + 1, bonds );
+        }
+        
+        bo_ij = &ibond->bo_data;
+        bo_ij->BO = BO;
+        bo_ij->BO_s = BO_s;
+        bo_ij->BO_pi = BO_pi;
+        bo_ij->BO_pi2 = BO_pi2;
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            bo_ji = &jbond->bo_data;
+            bo_ji->BO = BO;
+            bo_ji->BO_s = BO_s;
+            bo_ji->BO_pi = BO_pi;
+            bo_ji->BO_pi2 = BO_pi2;
+        }
+
+        /* Bond Order page2-3, derivative of total bond order prime */
+        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+
+        /* Only dln_BOp_xx wrt. dr_i is stored here, note that
+         * dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+        rvec_Scale( bo_ij->dln_BOp_s, -1.0 * bo_ij->BO_s * Cln_BOp_s, ibond->dvec );
+        rvec_Scale( bo_ij->dln_BOp_pi, -1.0 * bo_ij->BO_pi * Cln_BOp_pi, ibond->dvec );
+        rvec_Scale( bo_ij->dln_BOp_pi2, -1.0 * bo_ij->BO_pi2 * Cln_BOp_pi2, ibond->dvec );
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            rvec_Scale( bo_ji->dln_BOp_s, -1.0, bo_ij->dln_BOp_s );
+            rvec_Scale( bo_ji->dln_BOp_pi, -1.0, bo_ij->dln_BOp_pi );
+            rvec_Scale( bo_ji->dln_BOp_pi2, -1.0, bo_ij->dln_BOp_pi2 );
+        }
+
+        /* Only dBOp wrt. dr_i is stored here, note that
+         * dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+        rvec_Scale( bo_ij->dBOp, -1.0 * (bo_ij->BO_s * Cln_BOp_s 
+                    + bo_ij->BO_pi * Cln_BOp_pi 
+                    + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            rvec_Scale( bo_ji->dBOp, -1.0, bo_ij->dBOp );
+        }
+
+        rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
+        }
+
+        bo_ij->BO_s -= bo_cut;
+        bo_ij->BO -= bo_cut;
+        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
+        bo_ij->Cdbo = 0.0;
+        bo_ij->Cdbopi = 0.0;
+        bo_ij->Cdbopi2 = 0.0;
+        if ( far_nbr_list_format == HALF_LIST )
+        {
+            bo_ji->BO_s -= bo_cut;
+            bo_ji->BO -= bo_cut;
+            workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
+            bo_ji->Cdbo = 0.0;
+            bo_ji->Cdbopi = 0.0;
+            bo_ji->Cdbopi2 = 0.0;
+        }
 
         return 1;
     }
@@ -800,7 +952,7 @@ int BOp( storage *workspace, reax_list *bonds, real bo_cut,
 }
 
 
-int compare_bonds( const void *p1, const void *p2 )
+static int compare_bonds( const void *p1, const void *p2 )
 {
     return ((bond_data *)p1)->nbr - ((bond_data *)p2)->nbr;
 }
diff --git a/PuReMD/src/bond_orders.h b/PuReMD/src/bond_orders.h
index 1975e20b6320a003b08527fae665dbd0bbc3c2e4..fcf4d71a24a6acbe90e49d5ad087cb4da973f51a 100644
--- a/PuReMD/src/bond_orders.h
+++ b/PuReMD/src/bond_orders.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 typedef struct
 {
     real C1dbo, C2dbo, C3dbo;
@@ -32,28 +33,42 @@ typedef struct
     real C1dDelta, C2dDelta, C3dDelta;
 } dbond_coefficients;
 
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Get_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Add_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO_to_Forces( reax_system*, reax_list**, int, int, real );
+
 void Add_dBOpinpi2_to_Forces( reax_system*, reax_list**,
-                              int, int, real, real );
+        int, int, real, real );
 
 void Add_dDelta( reax_system*, reax_list**, int, real, rvec* );
+
 void Add_dDelta_to_Forces( reax_system *, reax_list**, int, real );
 #endif
 
 void Add_dBond_to_Forces( int, int, storage*, reax_list** );
+
 void Add_dBond_to_Forces_NPT( int, int, simulation_data*,
-                              storage*, reax_list** );
-int BOp(storage*, reax_list*, real, int, int, far_neighbor_data*,
-        single_body_parameters*, single_body_parameters*, two_body_parameters*);
+        storage*, reax_list** );
+
+int BOp( storage*, reax_list*, real, int, int, int, ivec*, real, rvec*,
+        int, single_body_parameters*, single_body_parameters*,
+        two_body_parameters* );
+
+int BOp_redundant( storage*, reax_list*, real, int, int, int, ivec*, real, rvec*,
+        int, single_body_parameters*, single_body_parameters*,
+        two_body_parameters* );
+
 void BO( reax_system*, control_params*, simulation_data*,
          storage*, reax_list**, output_controls* );
+
 #endif
diff --git a/PuReMD/src/box.c b/PuReMD/src/box.c
index c5f18cb35c6305da91536e9b7a26e1788aa83cde..d81a96db80b21fceabe13fc3b518f932c13320af 100644
--- a/PuReMD/src/box.c
+++ b/PuReMD/src/box.c
@@ -285,6 +285,10 @@ void Setup_Environment( reax_system *system, control_params *control,
     Setup_My_Box( system, control );
     Setup_My_Ext_Box( system, control );
     Setup_Comm( system, control, mpi_data );
+#if defined(NEUTRAL_TERRITORY)
+    Setup_NT_Comm( system, control, mpi_data );
+#endif
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d coord: %d %d %d\n",
              system->my_rank,
diff --git a/PuReMD/src/comm_tools.c b/PuReMD/src/comm_tools.c
index 8419e3efe44377e7d1680fe09b2bc06ef5756ca6..52b8c98618b19e626b71757febd6e795fddfaea0 100644
--- a/PuReMD/src/comm_tools.c
+++ b/PuReMD/src/comm_tools.c
@@ -25,6 +25,195 @@
 #include "tool_box.h"
 #include "vector.h"
 
+#if defined(NEUTRAL_TERRITORY) /* Setup_NT_Comm: fill system->my_nt_nbrs[] (ranks, boundary region, periodicity) */
+void Setup_NT_Comm( reax_system* system, control_params* control,
+                 mpi_datatypes *mpi_data )
+{
+    int i, d;
+    real bndry_cut;
+    neighbor_proc *nbr_pr;
+    simulation_box *my_box;
+    ivec nbr_coords, nbr_recv_coords;
+    ivec r[12] = { /* offsets 0-5: relative mesh coords used for nbr_pr->rank */
+        {0, 0, -1}, // -z
+        {0, 0, +1}, // +z
+        {0, -1, 0}, // -y
+        {-1, -1, 0}, // -x-y
+        {-1, 0, 0}, // -x
+        {-1, +1, 0},  // -x+y
+        /* offsets 6-11: negations of offsets 0-5, used for nbr_pr->receive_rank */
+        {0, 0, +1}, // +z
+        {0, 0, -1}, // -z
+        {0, +1, 0}, // +y
+        {+1, +1, 0}, // +x+y
+        {+1, 0, 0}, // +x
+        {+1, -1, 0}  // +x-y
+    };
+    my_box = &system->my_box;
+    bndry_cut = system->bndry_cuts.ghost_cutoff;
+    system->num_nt_nbrs = REAX_MAX_NT_NBRS; /* NOTE(review): r[i + 6] below needs REAX_MAX_NT_NBRS <= 6 -- confirm */
+
+    /* identify my neighbors */
+    for ( i = 0; i < system->num_nt_nbrs; ++i )
+    {
+        nbr_pr = &system->my_nt_nbrs[i];
+        ivec_Sum( nbr_coords, system->my_coords, r[i] ); /* actual nbr coords */
+        MPI_Cart_rank( mpi_data->comm_mesh3D, nbr_coords, &nbr_pr->rank );
+        /* NOTE(review): nbr_coords may lie outside the mesh; presumably comm_mesh3D is periodic -- confirm */
+        /* set the rank of the neighbor processor in the receiving direction */
+        ivec_Sum( nbr_recv_coords, system->my_coords, r[i + 6] ); /* coords of the nbr in the opposite direction */
+        MPI_Cart_rank( mpi_data->comm_mesh3D, nbr_recv_coords, &nbr_pr->receive_rank );
+
+        for ( d = 0; d < 3; ++d )
+        {
+            /* determine the boundary area with this nbr */
+            if ( r[i][d] < 0 )
+            {
+                nbr_pr->bndry_min[d] = my_box->min[d];
+                nbr_pr->bndry_max[d] = my_box->min[d] + bndry_cut;
+            }
+            else if ( r[i][d] > 0 )
+            {
+                nbr_pr->bndry_min[d] = my_box->max[d] - bndry_cut;
+                nbr_pr->bndry_max[d] = my_box->max[d];
+            }
+            else
+            {
+                nbr_pr->bndry_min[d] = my_box->min[d];
+                nbr_pr->bndry_max[d] = my_box->max[d];
+            }
+
+            /* periodicity flag: -1 / +1 if nbr_coords[d] falls below / above the mesh, else 0 */
+            if ( nbr_coords[d] < 0 )
+            {
+                nbr_pr->prdc[d] = -1;
+            }
+            else if ( nbr_coords[d] >= control->procs_by_dim[d] )
+            {
+                nbr_pr->prdc[d] = 1;
+            }
+            else
+            {
+                nbr_pr->prdc[d] = 0;
+            }
+        }
+
+    }
+}
+#endif
+
+
+#if defined(NEUTRAL_TERRITORY) /* Sort_Neutral_Territory: pick local atoms inside the boundary region of NT nbr dir */
+int Sort_Neutral_Territory( reax_system *system, int dir, mpi_out_data *out_bufs, int write )
+{
+    int i, cnt;
+    reax_atom *atoms;
+    neighbor_proc *nbr_pr;
+    /* write != 0: append matching atom indices to out_bufs[dir]; write == 0: count them */
+    cnt = 0;
+    atoms = system->my_atoms;
+    /* boundary region of the neighbor in direction dir */
+    nbr_pr = &( system->my_nt_nbrs[dir] );
+    /* scan the first system->n atoms; the region is half-open: min <= x < max per axis */
+    for ( i = 0; i < system->n; ++i )
+    {
+        if ( nbr_pr->bndry_min[0] <= atoms[i].x[0]
+                && atoms[i].x[0] < nbr_pr->bndry_max[0]
+                && nbr_pr->bndry_min[1] <= atoms[i].x[1]
+                && atoms[i].x[1] < nbr_pr->bndry_max[1]
+                && nbr_pr->bndry_min[2] <= atoms[i].x[2]
+                && atoms[i].x[2] < nbr_pr->bndry_max[2] )
+        {
+            if ( write )
+            {
+                out_bufs[dir].index[out_bufs[dir].cnt] = i; /* NOTE(review): no capacity check on index[] -- confirm caller sizes it */
+                out_bufs[dir].cnt++;
+            }
+            else
+            {
+                cnt++;
+            }
+        }
+    }
+    /* cnt is advanced only in count mode, so this is 0 when write != 0 */
+    return cnt;
+}
+#endif
+
+
+#if defined(NEUTRAL_TERRITORY) /* Init_Neutral_Territory: fill NT send lists and exchange per-direction atom counts */
+void Init_Neutral_Territory( reax_system* system, mpi_datatypes *mpi_data )
+{
+    int d, end, cnt;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req;
+    MPI_Status stat;
+    neighbor_proc *nbr;
+
+    Reset_Out_Buffers( mpi_data->out_nt_buffers, system->num_nt_nbrs );
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_nt_buffers;
+    cnt = 0;
+    end = system->n; /* running start offset for incoming atoms, beginning at system->n */
+
+    for ( d = 0; d < 6; ++d )
+    {
+        nbr = &system->my_nt_nbrs[d];
+        /* append indices of atoms to send in direction d to out_bufs[d] */
+        Sort_Neutral_Territory( system, d, out_bufs, 1 );
+        /* swap counts: post the receive first, then send ours, tagged by direction */
+        MPI_Irecv( &cnt, 1, MPI_INT, nbr->receive_rank, d, comm, &req );
+        MPI_Send( &out_bufs[d].cnt, 1, MPI_INT, nbr->rank, d, comm );
+        MPI_Wait( &req, &stat );
+        /* first use only: size the receive buffer from the incoming count */
+        if ( mpi_data->in_nt_buffer[d] == NULL )
+        {
+            nbr->est_recv = MAX( SAFER_ZONE_NT * cnt, MIN_SEND );
+            mpi_data->in_nt_buffer[d] = smalloc( nbr->est_recv * sizeof(real),
+                    "Init_Neutral_Territory::mpi_data->in_nt_buffer[d]", comm );
+        }
+
+        /* record where this direction's incoming atoms start and how many there are */
+        nbr->atoms_str = end;
+        nbr->atoms_cnt = cnt;
+        end += cnt;
+    }
+}
+#endif
+
+
+#if defined(NEUTRAL_TERRITORY) /* Estimate_NT_Atoms: size and allocate the NT outgoing buffers */
+void Estimate_NT_Atoms( reax_system *system, mpi_datatypes *mpi_data )
+{
+    int d;
+    mpi_out_data *out_bufs;
+    neighbor_proc *nbr;
+
+    out_bufs = mpi_data->out_nt_buffers;
+
+    for ( d = 0; d < 6; ++d )
+    {
+        /* count (without recording) the atoms in the outgoing region for direction d */
+        nbr = &system->my_nt_nbrs[d];
+        nbr->est_send = Sort_Neutral_Territory( system, d, out_bufs, 0 );
+
+        /* pad the count by SAFER_ZONE_NT, with MIN_SEND as a floor */
+        nbr->est_send = MAX( MIN_SEND, nbr->est_send * SAFER_ZONE_NT );
+
+        /* allocate est_send entries each for the index and data buffers */
+        out_bufs[d].index = scalloc( nbr->est_send, sizeof(int),
+                "Estimate_NT_Atoms::out_bufs[d].index", MPI_COMM_WORLD );
+        out_bufs[d].out_atoms = scalloc( nbr->est_send, sizeof(real),
+                "Estimate_NT_Atoms::out_bufs[d].out_atoms", MPI_COMM_WORLD );
+        /* NOTE(review): existing index/out_atoms pointers are overwritten unfreed -- confirm single call */
+        /* sort the atoms to their outgoing buffers */
+        // TODO: to call or not to call?
+        //Sort_Neutral_Territory( system, d, out_bufs, 1 );
+    }
+}
+#endif
+
 
 void Setup_Comm( reax_system* system, control_params* control,
                  mpi_datatypes *mpi_data )
@@ -270,7 +459,6 @@ void Sort_Boundary_Atoms( reax_system *system, int start, int end,
 {
     int i, d, p, out_cnt;
     reax_atom *atoms;
-    simulation_box *my_box;
     boundary_atom *out_buf;
     neighbor_proc *nbr_pr;
 
@@ -280,7 +468,6 @@ void Sort_Boundary_Atoms( reax_system *system, int start, int end,
 #endif
 
     atoms = system->my_atoms;
-    my_box = &( system->my_box );
 
     /* place each atom into the appropriate outgoing list */
     for ( i = start; i < end; ++i )
@@ -320,7 +507,6 @@ void Estimate_Boundary_Atoms( reax_system *system, int start, int end,
 {
     int i, p, out_cnt;
     reax_atom *atoms;
-    simulation_box *my_box;
     boundary_atom *out_buf;
     neighbor_proc *nbr1, *nbr2, *nbr_pr;
 
@@ -329,7 +515,6 @@ void Estimate_Boundary_Atoms( reax_system *system, int start, int end,
              system->my_rank, start, end, d );
 #endif
     atoms = system->my_atoms;
-    my_box = &( system->my_box );
     nbr1 = &(system->my_nbrs[2 * d]);
     nbr2 = &(system->my_nbrs[2 * d + 1]);
     nbr1->est_send = 0;
@@ -609,7 +794,7 @@ void Comm_Atoms( reax_system *system, control_params *control,
 
     if ( system->my_rank == MASTER_NODE )
     {
-        t_start = Get_Time( );
+        t_start = MPI_Wtime();
     }
 #endif
 
@@ -653,6 +838,10 @@ void Comm_Atoms( reax_system *system, control_params *control,
 #endif
 
         Bin_Boundary_Atoms( system );
+        
+#if defined(NEUTRAL_TERRITORY)
+        Init_Neutral_Territory( system, mpi_data );
+#endif
     }
     else
     {
@@ -673,7 +862,7 @@ void Comm_Atoms( reax_system *system, control_params *control,
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
-        t_elapsed = Get_Timing_Info( t_start );
+        t_elapsed = MPI_Wtime() - t_start;
         data->timing.comm += t_elapsed;
     }
 #endif
diff --git a/PuReMD/src/comm_tools.h b/PuReMD/src/comm_tools.h
index 48b676ebbe67cbd8fb17af717fd7da5eac96ffc8..c333fa0cd1bce2bc3ae1b71029a9b9522e75573c 100644
--- a/PuReMD/src/comm_tools.h
+++ b/PuReMD/src/comm_tools.h
@@ -25,10 +25,16 @@
 #include "reax_types.h"
 
 void Setup_Comm( reax_system*, control_params*, mpi_datatypes* );
+#if defined(NEUTRAL_TERRITORY)
+void Setup_NT_Comm( reax_system*, control_params*, mpi_datatypes* );
+#endif
 void Update_Comm( reax_system* );
 
 void Sort_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
 void Estimate_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+#if defined(NEUTRAL_TERRITORY)
+void Estimate_NT_Atoms( reax_system*, mpi_datatypes* );
+#endif
 void Unpack_Exchange_Message( reax_system*, int, void*, int,
                               neighbor_proc*, int );
 void Unpack_Estimate_Message( reax_system*, int, void*, int,
diff --git a/PuReMD/src/ffield.c b/PuReMD/src/ffield.c
index b05216bdcb19f8002bfe02293a3cf6e28dd4faa7..b25b8db44ab7d20d1380069533e9a65bc82db55b 100644
--- a/PuReMD/src/ffield.c
+++ b/PuReMD/src/ffield.c
@@ -43,11 +43,7 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax,
     comm = MPI_COMM_WORLD;
 
     /* open force field file */
-    if ( (fp = fopen( ffield_file, "r" ) ) == NULL )
-    {
-        fprintf( stderr, "error opening the force filed file! terminating...\n" );
-        MPI_Abort( comm, FILE_NOT_FOUND );
-    }
+    fp = sfopen( ffield_file, "r", "Read_Force_Field::fp" );
 
     s = (char*) malloc(sizeof(char) * MAX_LINE);
     tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
diff --git a/PuReMD/src/forces.c b/PuReMD/src/forces.c
index c406417b315dc4181fe286fd0e143a7a2e2f5fc7..fbd7acf952b5b4c053aa3e19d868b6bbd78d6077 100644
--- a/PuReMD/src/forces.c
+++ b/PuReMD/src/forces.c
@@ -20,176 +20,65 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #if defined(PURE_REAX)
-#include "forces.h"
-#include "bond_orders.h"
-#include "bonds.h"
-#include "basic_comm.h"
-#include "hydrogen_bonds.h"
-#include "io_tools.h"
-#include "list.h"
-#include "lookup.h"
-#include "multi_body.h"
-#include "nonbonded.h"
-#include "qEq.h"
-#include "tool_box.h"
-#include "torsion_angles.h"
-#include "valence_angles.h"
-#include "vector.h"
+  #include "forces.h"
+  #include "bond_orders.h"
+  #include "bonds.h"
+  #include "basic_comm.h"
+  #include "hydrogen_bonds.h"
+  #include "io_tools.h"
+  #include "list.h"
+  #include "lookup.h"
+  #include "multi_body.h"
+  #include "nonbonded.h"
+  #include "qEq.h"
+  #include "tool_box.h"
+  #include "torsion_angles.h"
+  #include "valence_angles.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_forces.h"
-#include "reax_bond_orders.h"
-#include "reax_bonds.h"
-#include "reax_basic_comm.h"
-#include "reax_hydrogen_bonds.h"
-#include "reax_io_tools.h"
-#include "reax_list.h"
-#include "reax_lookup.h"
-#include "reax_multi_body.h"
-#include "reax_nonbonded.h"
-#include "reax_tool_box.h"
-#include "reax_torsion_angles.h"
-#include "reax_valence_angles.h"
-#include "reax_vector.h"
+  #include "reax_forces.h"
+  #include "reax_bond_orders.h"
+  #include "reax_bonds.h"
+  #include "reax_basic_comm.h"
+  #include "reax_hydrogen_bonds.h"
+  #include "reax_io_tools.h"
+  #include "reax_list.h"
+  #include "reax_lookup.h"
+  #include "reax_multi_body.h"
+  #include "reax_nonbonded.h"
+  #include "reax_tool_box.h"
+  #include "reax_torsion_angles.h"
+  #include "reax_valence_angles.h"
+  #include "reax_vector.h"
 #endif
 
-interaction_function Interaction_Functions[NUM_INTRS];
-
-void Dummy_Interaction( reax_system *system, control_params *control,
-                        simulation_data *data, storage *workspace,
-                        reax_list **lists, output_controls *out_control )
-{
-}
-
 
-void Init_Force_Functions( control_params *control )
-{
-    Interaction_Functions[0] = BO;
-    Interaction_Functions[1] = Bonds; //Dummy_Interaction;
-    Interaction_Functions[2] = Atom_Energy; //Dummy_Interaction;
-    Interaction_Functions[3] = Valence_Angles; //Dummy_Interaction;
-    Interaction_Functions[4] = Torsion_Angles; //Dummy_Interaction;
-    if ( control->hbond_cut > 0 )
-        Interaction_Functions[5] = Hydrogen_Bonds;
-    else Interaction_Functions[5] = Dummy_Interaction;
-    Interaction_Functions[6] = Dummy_Interaction; //empty
-    Interaction_Functions[7] = Dummy_Interaction; //empty
-    Interaction_Functions[8] = Dummy_Interaction; //empty
-    Interaction_Functions[9] = Dummy_Interaction; //empty
-}
+interaction_function Interaction_Functions[NUM_INTRS];
 
 
-void Compute_Bonded_Forces( reax_system *system, control_params *control,
-                            simulation_data *data, storage *workspace,
-                            reax_list **lists, output_controls *out_control,
-                            MPI_Comm comm )
+static int compare_bonds( const void *p1, const void *p2 )
 {
-    int i;
-
-    /* Mark beginning of a new timestep in bonded energy files */
-#if defined(TEST_ENERGY)
-    Debug_Marker_Bonded( out_control, data->step );
-#endif
-
-    /* Implement all force calls as function pointers */
-    for ( i = 0; i < NUM_INTRS; i++ )
-    {
-#if defined(DEBUG)
-        fprintf( stderr, "p%d: starting f%d\n", system->my_rank, i );
-        MPI_Barrier( comm );
-#endif
-        (Interaction_Functions[i])( system, control, data, workspace,
-                                    lists, out_control );
-#if defined(DEBUG)
-        fprintf( stderr, "p%d: f%d done\n", system->my_rank, i );
-        MPI_Barrier( comm );
-#endif
-    }
+    return ((bond_data *)p1)->nbr - ((bond_data *)p2)->nbr;
 }
 
 
-void Compute_NonBonded_Forces( reax_system *system, control_params *control,
-                               simulation_data *data, storage *workspace,
-                               reax_list **lists, output_controls *out_control,
-                               MPI_Comm comm )
+static void Dummy_Interaction( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-    /* Mark beginning of a new timestep in nonbonded energy files */
-#if defined(TEST_ENERGY)
-    Debug_Marker_Nonbonded( out_control, data->step );
-#endif
-
-    /* van der Waals and Coulomb interactions */
-    if ( control->tabulate == 0 )
-        vdW_Coulomb_Energy( system, control, data, workspace,
-                            lists, out_control );
-    else
-        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
-                                      lists, out_control );
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: nonbonded forces done\n", system->my_rank );
-    MPI_Barrier( comm );
-#endif
+    ;
 }
 
 
-
-/* this version of Compute_Total_Force computes forces from
-   coefficients accumulated by all interaction functions.
-   Saves enormous time & space! */
-void Compute_Total_Force( reax_system *system, control_params *control,
-                          simulation_data *data, storage *workspace,
-                          reax_list **lists, mpi_datatypes *mpi_data )
-{
-    int i, pj;
-    reax_list *bonds = lists[BONDS];
-
-    for ( i = 0; i < system->N; ++i )
-        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-            if ( i < bonds->bond_list[pj].nbr )
-            {
-                if ( control->virial == 0 )
-                    Add_dBond_to_Forces( i, pj, workspace, lists );
-                else
-                    Add_dBond_to_Forces_NPT( i, pj, data, workspace, lists );
-            }
-
-    //Print_Total_Force( system, data, workspace );
-#if defined(PURE_REAX)
-    /* now all forces are computed to their partially-final values
-       based on the neighbors information each processor has had.
-       final values of force on each atom needs to be computed by adding up
-       all partially-final pieces */
-    Coll( system, mpi_data, workspace->f, mpi_data->mpi_rvec,
-          sizeof(rvec) / sizeof(void), rvec_unpacker );
-    for ( i = 0; i < system->n; ++i )
-        rvec_Copy( system->my_atoms[i].f, workspace->f[i] );
-
-#if defined(TEST_FORCES)
-    Coll( system, mpi_data, workspace->f_ele, mpi_data->mpi_rvec, rvec_unpacker);
-    Coll( system, mpi_data, workspace->f_vdw, mpi_data->mpi_rvec, rvec_unpacker);
-    Coll( system, mpi_data, workspace->f_be, mpi_data->mpi_rvec, rvec_unpacker );
-    Coll( system, mpi_data, workspace->f_lp, mpi_data->mpi_rvec, rvec_unpacker );
-    Coll( system, mpi_data, workspace->f_ov, mpi_data->mpi_rvec, rvec_unpacker );
-    Coll( system, mpi_data, workspace->f_un, mpi_data->mpi_rvec, rvec_unpacker );
-    Coll( system, mpi_data, workspace->f_ang, mpi_data->mpi_rvec, rvec_unpacker);
-    Coll( system, mpi_data, workspace->f_coa, mpi_data->mpi_rvec, rvec_unpacker);
-    Coll( system, mpi_data, workspace->f_pen, mpi_data->mpi_rvec, rvec_unpacker);
-    Coll( system, mpi_data, workspace->f_hb, mpi_data->mpi_rvec, rvec_unpacker );
-    Coll( system, mpi_data, workspace->f_tor, mpi_data->mpi_rvec, rvec_unpacker);
-    Coll( system, mpi_data, workspace->f_con, mpi_data->mpi_rvec, rvec_unpacker);
-#endif
-
-#endif
-}
-
-void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
-                     int step, int n, int N, int numH, MPI_Comm comm )
+static void Validate_Lists( reax_system *system, storage *workspace,
+        reax_list **lists, int step, int n, int N, int numH, MPI_Comm comm )
 {
     int i, comp, Hindex;
     reax_list *bonds, *hbonds;
     reallocate_data *realloc;
-    realloc = &(workspace->realloc);
+    realloc = &workspace->realloc;
 
     /* bond list */
     if ( N > 0 )
@@ -205,19 +94,23 @@ void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
             //workspace->realloc.bonds = 1;
 
             if ( i < N - 1 )
+            {
                 comp = Start_Index(i + 1, bonds);
-            else comp = bonds->num_intrs;
+            }
+            else
+            {
+                comp = bonds->num_intrs;
+            }
 
             if ( End_Index(i, bonds) > comp )
             {
-                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                fprintf( stderr, "[ERROR] step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
                          step, i, End_Index(i, bonds), comp );
                 MPI_Abort( comm, INSUFFICIENT_MEMORY );
             }
         }
     }
 
-
     /* hbonds list */
     if ( numH > 0 )
     {
@@ -226,6 +119,7 @@ void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
         for ( i = 0; i < n; ++i )
         {
             Hindex = system->my_atoms[i].Hindex;
+
             if ( Hindex > -1 )
             {
                 system->my_atoms[i].num_hbonds =
@@ -236,49 +130,55 @@ void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
                 //  workspace->realloc.hbonds = 1;
 
                 if ( Hindex < numH - 1 )
-                    comp = Start_Index(Hindex + 1, hbonds);
-                else comp = hbonds->num_intrs;
+                {
+                    comp = Start_Index( Hindex + 1, hbonds );
+                }
+                else
+                {
+                    comp = hbonds->num_intrs;
+                }
 
                 if ( End_Index(Hindex, hbonds) > comp )
                 {
-                    fprintf(stderr, "step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
+                    fprintf(stderr, "[ERROR] step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
                             step, Hindex, End_Index(Hindex, hbonds), comp );
                     MPI_Abort( comm, INSUFFICIENT_MEMORY );
                 }
             }
-/*
-            if ( Hindex > -1 )
-            {
-                system->my_atoms[i].num_hbonds =
-                    MAX( Num_Entries(Hindex, hbonds) * SAFER_ZONE, MIN_HBONDS );
-*/
+
+//            if ( Hindex > -1 )
+//            {
+//                system->my_atoms[i].num_hbonds =
+//                    MAX( Num_Entries(Hindex, hbonds) * SAFER_ZONE, MIN_HBONDS );
+
                 //if( Num_Entries(i, hbonds) >=
                 //(Start_Index(i+1,hbonds)-Start_Index(i,hbonds))*0.90/*DANGER_ZONE*/){
                 //  workspace->realloc.hbonds = 1;
-/*
+                
                 //TODO
-                if ( Hindex < system->n - 1 )
-                    comp = Start_Index(Hindex + 1, hbonds);
-                else comp = hbonds->num_intrs;
-
-                if ( End_Index(Hindex, hbonds) > comp )
-                {
-                    fprintf(stderr, "step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
-                            step, Hindex, End_Index(Hindex, hbonds), comp );
-                    MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-                }
-            }
-		
-*/
-
-
-
+//                if ( Hindex < system->n - 1 )
+//                {
+//                    comp = Start_Index(Hindex + 1, hbonds);
+//                }
+//                else
+//                {
+//                    comp = hbonds->num_intrs;
+//                }
+//
+//                if ( End_Index(Hindex, hbonds) > comp )
+//                {
+//                    fprintf(stderr, "[ERROR] step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
+//                            step, Hindex, End_Index(Hindex, hbonds), comp );
+//                    MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+//                }
+//            }
         }
     }
 }
 
 
-real Compute_H( real r, real gamma, real *ctap )
+/* Computes a charge matrix entry using the Taper function */
+static real Compute_H( real r, real gamma, real *ctap )
 {
     real taper, dr3gamij_1, dr3gamij_3;
 
@@ -291,24 +191,32 @@ real Compute_H( real r, real gamma, real *ctap )
     taper = taper * r + ctap[0];
 
     dr3gamij_1 = ( r * r * r + gamma );
-    dr3gamij_3 = pow( dr3gamij_1 , 0.33333333333333 );
+    dr3gamij_3 = pow( dr3gamij_1, 1.0 / 3.0 );
+
     return taper * EV_to_KCALpMOL / dr3gamij_3;
 }
 
 
-real Compute_tabH( real r_ij, int ti, int tj )
+/* Computes a charge matrix entry using the force tabulation
+ * (i.e., an arithmetic-reducing optimization) */
+static real Compute_tabH( real r_ij, int ti, int tj )
 {
     int r, tmin, tmax;
     real val, dif, base;
     LR_lookup_table *t;
 
-    tmin  = MIN( ti, tj );
-    tmax  = MAX( ti, tj );
-    t = &( LR[tmin][tmax] );
+    tmin = MIN( ti, tj );
+    tmax = MAX( ti, tj );
+    t = &LR[tmin][tmax];
 
     /* cubic spline interpolation */
     r = (int)(r_ij * t->inv_dx);
-    if ( r == 0 )  ++r;
+
+    if ( r == 0 )
+    {
+        ++r;
+    }
+
     base = (real)(r + 1) * t->dx;
     dif = r_ij - base;
     val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
@@ -319,307 +227,1680 @@ real Compute_tabH( real r_ij, int ti, int tj )
 }
 
 
-void Init_Forces( reax_system *system, control_params *control,
-                  simulation_data *data, storage *workspace, reax_list **lists,
-                  output_controls *out_control, MPI_Comm comm )
+/* Compute the distances and displacement vectors for entries
+ * in the far neighbors list if it's NOT a re-neighboring step */
+static void Init_Distance( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
 {
     int i, j, pj;
     int start_i, end_i;
-    int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-    int ihb, jhb, ihb_top, jhb_top;
-    int local, flag, renbr;
-    real r_ij, cutoff;
-    sparse_matrix *H;
-    reax_list *far_nbrs, *bonds, *hbonds;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
+    int renbr;
+    reax_list *far_nbrs;
     reax_atom *atom_i, *atom_j;
 
     far_nbrs = lists[FAR_NBRS];
-    bonds = lists[BONDS];
-    hbonds = lists[HBONDS];
+    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
 
-    for ( i = 0; i < system->n; ++i )
-        workspace->bond_mark[i] = 0;
-    for ( i = system->n; i < system->N; ++i )
+    if ( !renbr )
     {
-        workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
-        //workspace->done_after[i] = Start_Index( i, far_nbrs );
+        for ( i = 0; i < system->N; ++i )
+        {
+            atom_i = &system->my_atoms[i];
+            start_i = Start_Index( i, far_nbrs );
+            end_i = End_Index( i, far_nbrs );
+
+            /* update distance and displacement vector between atoms i and j (i-j) */
+            for ( pj = start_i; pj < end_i; ++pj )
+            {
+                j = far_nbrs->far_nbr_list.nbr[pj];
+                atom_j = &system->my_atoms[j];
+                
+                far_nbrs->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
+                far_nbrs->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
+                far_nbrs->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
+                far_nbrs->far_nbr_list.d[pj] = rvec_Norm_Sqr( far_nbrs->far_nbr_list.dvec[pj] );
+                far_nbrs->far_nbr_list.d[pj] = sqrt( far_nbrs->far_nbr_list.d[pj] );
+            }
+        }
     }
+}
+
+
+#if defined(NEUTRAL_TERRITORY)
+/* Compute the charge matrix entries and store the matrix in half format
+ * using the far neighbors list (stored in full format) and according to
+ * the neutral territory communication method */
+static void Init_CM_Half_NT( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop;
+    int local, renbr;
+    real r_ij;
+    sparse_matrix *H;
+    reax_list *far_nbrs;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+    int mark[6];
+    int total_cnt[6];
+    int bin[6];
+    int total_sum[6];
+    int nt_flag;
+
+    far_nbrs = lists[FAR_NBRS];
 
     H = workspace->H;
     H->n = system->n;
     Htop = 0;
-    num_bonds = 0;
-    num_hbonds = 0;
-    btop_i = btop_j = 0;
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
 
+    nt_flag = 1;
+    if( renbr )
+    {
+        for ( i = 0; i < 6; ++i )
+        {
+            total_cnt[i] = 0;
+            bin[i] = 0;
+            total_sum[i] = 0;
+        }
+
+        for ( i = system->n; i < system->N; ++i )
+        {
+            atom_i = &system->my_atoms[i];
+
+            if( atom_i->nt_dir != -1 )
+            {
+                total_cnt[ atom_i->nt_dir ]++;
+            }
+        }
+
+        total_sum[0] = system->n;
+        for ( i = 1; i < 6; ++i )
+        {
+            total_sum[i] = total_sum[i-1] + total_cnt[i-1];
+        }
+
+        for ( i = system->n; i < system->N; ++i )
+        {
+            atom_i = &system->my_atoms[i];
+
+            if( atom_i->nt_dir != -1 )
+            {
+                atom_i->pos = total_sum[ atom_i->nt_dir ] + bin[ atom_i->nt_dir ];
+                bin[ atom_i->nt_dir ]++;
+            }
+        }
+        H->NT = total_sum[5] + total_cnt[5];
+    }
+
+    mark[0] = mark[1] = 1;
+    mark[2] = mark[3] = mark[4] = mark[5] = 2;
+
     for ( i = 0; i < system->N; ++i )
     {
-        atom_i = &(system->my_atoms[i]);
-        type_i  = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        btop_i = End_Index( i, bonds );
-        sbp_i = &(system->reax_param.sbp[type_i]);
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+
+        sbp_i = &system->reax_param.sbp[type_i];
 
         if ( i < system->n )
         {
             local = 1;
-            cutoff = control->nonb_cut;
+        }
+        else if ( atom_i->nt_dir != -1 )
+        {
+            local = 2;
+            nt_flag = 0;
         }
         else
         {
-            local = 0;
-            cutoff = control->bond_cut;
+            continue;
         }
 
-        ihb = -1;
-        ihb_top = -1;
-        if ( local )
+        if ( local == 1 )
         {
             H->start[i] = Htop;
             H->entries[Htop].j = i;
             H->entries[Htop].val = sbp_i->eta;
             ++Htop;
+        }
 
-            if ( control->hbond_cut > 0 )
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbrs->far_nbr_list.nbr[pj];
+            atom_j = &system->my_atoms[j];
+
+            if ( far_nbrs->far_nbr_list.d[pj] <= control->nonb_cut )
             {
-                ihb = sbp_i->p_hbond;
-                if ( ihb == 1 )
-                    ihb_top = End_Index( atom_i->Hindex, hbonds );
-                else ihb_top = -1;
+                type_j = atom_j->type;
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                twbp = &system->reax_param.tbp[type_i][type_j];
+
+                if ( local == 1 )
+                {
+                    /* H matrix entry */
+                    if ( atom_j->nt_dir > 0 || (j < system->n && i < j) )
+                    {
+                        if ( j < system->n )
+                        {
+                            H->entries[Htop].j = j;
+                        }
+                        else
+                        {
+                            H->entries[Htop].j = atom_j->pos;
+                        }
+
+                        if ( control->tabulate == 0 )
+                        {
+                            H->entries[Htop].val = Compute_H( r_ij, twbp->gamma, workspace->Tap );
+                        }
+                        else 
+                        {
+                            H->entries[Htop].val = Compute_tabH( r_ij, type_i, type_j );
+                        }
+
+                        ++Htop;
+                    }
+
+                }
+                else if ( local == 2 )
+                {
+                    /* H matrix entry */
+                    if ( atom_j->nt_dir != -1
+                            && mark[atom_i->nt_dir] != mark[atom_j->nt_dir]
+                            && atom_i->pos < atom_j->pos )
+                    {
+                        if ( !nt_flag )
+                        {
+                            nt_flag = 1;
+                            H->start[atom_i->pos] = Htop;
+                        }
+
+                        //TODO: necessary?
+                        if ( j < system->n )
+                        {
+                            H->entries[Htop].j = j;
+                        }
+                        else
+                        {
+                            H->entries[Htop].j = atom_j->pos;
+                        }
+
+                        if ( control->tabulate == 0 )
+                        {
+                            H->entries[Htop].val = Compute_H( r_ij, twbp->gamma, workspace->Tap );
+                        }
+                        else 
+                        {
+                            H->entries[Htop].val = Compute_tabH( r_ij, type_i, type_j );
+                        }
+
+                        ++Htop;
+                    }
+                }
+
+            }
+        }
+
+        if ( local == 1 )
+        {
+            H->end[i] = Htop;
+        }
+        else if ( local == 2 )
+        {
+            if ( nt_flag )
+            {
+                H->end[atom_i->pos] = Htop;
+            }
+            else
+            {
+                 H->start[atom_i->pos] = 0;
+                 H->end[atom_i->pos] = 0;
             }
         }
+    }
+
+    workspace->realloc.Htop = Htop;
+
+#if defined( DEBUG )
+    Print_Sparse_Matrix( system, H );
+    for ( i = 0; i < H->n; ++i )
+        for ( j = H->start[i]; j < H->end[i]; ++j )
+            fprintf( stderr, "%d %d %.15e\n",
+                     MIN(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     MAX(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     H->entries[j].val );
+#endif
+
+}
+
+
+/* Compute the charge matrix entries and store the matrix in full format
+ * using the far neighbors list (stored in full format) and according to
+ * the neutral territory communication method */
+static void Init_CM_Full_NT( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop;
+    int local, renbr;
+    real r_ij;
+    sparse_matrix *H;
+    reax_list *far_nbrs;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+    int mark[6];
+    int total_cnt[6];
+    int bin[6];
+    int total_sum[6];
+    int nt_flag;
+
+    far_nbrs = lists[FAR_NBRS];
+
+    H = workspace->H;
+    H->n = system->n;
+    Htop = 0;
+    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
+
+    nt_flag = 1;
+    if ( renbr )
+    {
+        for ( i = 0; i < 6; ++i )
+        {
+            total_cnt[i] = 0;
+            bin[i] = 0;
+            total_sum[i] = 0;
+        }
+
+        for ( i = system->n; i < system->N; ++i )
+        {
+            atom_i = &system->my_atoms[i];
+
+            if ( atom_i->nt_dir != -1 )
+            {
+                total_cnt[ atom_i->nt_dir ]++;
+            }
+        }
+
+        total_sum[0] = system->n;
+        for ( i = 1; i < 6; ++i )
+        {
+            total_sum[i] = total_sum[i-1] + total_cnt[i-1];
+        }
+
+        for ( i = system->n; i < system->N; ++i )
+        {
+            atom_i = &system->my_atoms[i];
+
+            if ( atom_i->nt_dir != -1 )
+            {
+                atom_i->pos = total_sum[ atom_i->nt_dir ] + bin[ atom_i->nt_dir ];
+                bin[ atom_i->nt_dir ]++;
+            }
+        }
+        H->NT = total_sum[5] + total_cnt[5];
+    }
+
+    mark[0] = mark[1] = 1;
+    mark[2] = mark[3] = mark[4] = mark[5] = 2;
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+
+        sbp_i = &system->reax_param.sbp[type_i];
+
+        if ( i < system->n )
+        {
+            local = 1;
+        }
+        else if ( atom_i->nt_dir != -1 )
+        {
+            local = 2;
+            nt_flag = 0;
+        }
+        else
+        {
+            continue;
+        }
+
+        if ( local == 1 )
+        {
+            H->start[i] = Htop;
+            H->entries[Htop].j = i;
+            H->entries[Htop].val = sbp_i->eta;
+            ++Htop;
+        }
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbrs->far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                j = far_nbrs->far_nbr_list.nbr[pj];
+                atom_j = &system->my_atoms[j];
+
+                type_j = atom_j->type;
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                twbp = &system->reax_param.tbp[type_i][type_j];
+
+                if ( local == 1 )
+                {
+                    /* H matrix entry */
+                    if ( atom_j->nt_dir > 0 || (j < system->n) )
+                    {
+                        if ( j < system->n )
+                        {
+                            H->entries[Htop].j = j;
+                        }
+                        else
+                        {
+                            H->entries[Htop].j = atom_j->pos;
+                        }
+
+                        if ( control->tabulate == 0 )
+                        {
+                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+                        }
+                        else 
+                        {
+                            H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+                        }
+
+                        ++Htop;
+                    }
+
+                }
+                else if ( local == 2 )
+                {
+                    /* H matrix entry */
+                    if ( ( atom_j->nt_dir != -1
+                                && mark[atom_i->nt_dir] != mark[atom_j->nt_dir] )
+                            || ( j < system->n && atom_i->nt_dir != 0 ) )
+                    {
+                        if ( !nt_flag )
+                        {
+                            nt_flag = 1;
+                            H->start[atom_i->pos] = Htop;
+                        }
+
+                        if ( j < system->n )
+                        {
+                            H->entries[Htop].j = j;
+                        }
+                        else
+                        {
+                            H->entries[Htop].j = atom_j->pos;
+                        }
+
+                        if ( control->tabulate == 0 )
+                        {
+                            H->entries[Htop].val = Compute_H( r_ij, twbp->gamma, workspace->Tap );
+                        }
+                        else 
+                        {
+                            H->entries[Htop].val = Compute_tabH( r_ij, type_i, type_j );
+                        }
+
+                        ++Htop;
+                    }
+                }
+
+            }
+        }
+
+        if ( local == 1 )
+        {
+            H->end[i] = Htop;
+        }
+        else if ( local == 2 )
+        {
+            if ( nt_flag )
+            {
+                H->end[atom_i->pos] = Htop;
+            }
+            else
+            {
+                 H->start[atom_i->pos] = 0;
+                 H->end[atom_i->pos] = 0;
+            }
+        }
+    }
+
+    workspace->realloc.Htop = Htop;
+
+#if defined( DEBUG )
+    Print_Sparse_Matrix( system, H );
+    for ( i = 0; i < H->n; ++i )
+        for ( j = H->start[i]; j < H->end[i]; ++j )
+            fprintf( stderr, "%d %d %.15e\n",
+                     MIN(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     MAX(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     H->entries[j].val );
+#endif
+
+}
+
+
+#else
+/* Compute the charge matrix entries and store the matrix in half format
+ * using the far neighbors list (stored in half format) and according to
+ * the full shell communication method */
+static void Init_CM_Half_FS( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop;
+    real r_ij;
+    sparse_matrix *H;
+    reax_list *far_nbrs;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+
+    far_nbrs = lists[FAR_NBRS];
+
+    H = workspace->H;
+    H->n = system->n;
+    Htop = 0;
+
+    for ( i = 0; i < system->n; ++i )
+    {
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+
+        sbp_i = &system->reax_param.sbp[type_i];
+
+        H->start[i] = Htop;
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = sbp_i->eta;
+        ++Htop;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            // H matrix entry
+            if ( far_nbrs->far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                j = far_nbrs->far_nbr_list.nbr[pj];
+                atom_j = &system->my_atoms[j];
+            
+                if ( j < system->n || atom_i->orig_id < atom_j->orig_id )
+                {
+                    type_j = atom_j->type;
+                    r_ij = far_nbrs->far_nbr_list.d[pj];
+                    twbp = &system->reax_param.tbp[type_i][type_j];
+
+                    H->entries[Htop].j = j;
+
+                    if ( control->tabulate == 0 )
+                    {
+                        H->entries[Htop].val = Compute_H( r_ij, twbp->gamma, workspace->Tap );
+                    }
+                    else
+                    {
+                        H->entries[Htop].val = Compute_tabH( r_ij, type_i, type_j );
+                    }
+
+                    ++Htop;
+                }
+            }
+        }
+
+        H->end[i] = Htop;
+    }
+
+    workspace->realloc.Htop = Htop;
+
+#if defined( DEBUG )
+    Print_Sparse_Matrix( system, H );
+    for ( i = 0; i < H->n; ++i )
+        for ( j = H->start[i]; j < H->end[i]; ++j )
+            fprintf( stderr, "%d %d %.15e\n",
+                     MIN(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     MAX(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     H->entries[j].val );
+#endif
+}
+
+
+/* Compute the charge matrix entries and store the matrix in full format
+ * using the far neighbors list (stored in full format) and according to
+ * the full shell communication method */
+static void Init_CM_Full_FS( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop;
+    real r_ij;
+    sparse_matrix *H;
+    reax_list *far_nbrs;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+
+    far_nbrs = lists[FAR_NBRS];
+
+    H = workspace->H;
+    H->n = system->n;
+    Htop = 0;
+
+    for ( i = 0; i < system->n; ++i )
+    {
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+
+        sbp_i = &system->reax_param.sbp[type_i];
+
+        H->start[i] = Htop;
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = sbp_i->eta;
+        ++Htop;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbrs->far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                j = far_nbrs->far_nbr_list.nbr[pj];
+                atom_j = &system->my_atoms[j];
+                type_j = atom_j->type;
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                twbp = &system->reax_param.tbp[type_i][type_j];
+
+                // H matrix entry
+                H->entries[Htop].j = j;
+
+                if ( control->tabulate == 0 )
+                {
+                    H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+                }
+                else
+                {
+                    H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+                }
+
+                ++Htop;
+            }
+        }
+
+        H->end[i] = Htop;
+    }
+
+    workspace->realloc.Htop = Htop;
+
+#if defined( DEBUG )
+    Print_Sparse_Matrix( system, H );
+    for ( i = 0; i < H->n; ++i )
+        for ( j = H->start[i]; j < H->end[i]; ++j )
+            fprintf( stderr, "%d %d %.15e\n",
+                     MIN(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     MAX(system->my_atoms[i].orig_id,
+                         system->my_atoms[H->entries[j].j].orig_id),
+                     H->entries[j].val );
+#endif
+}
+#endif
+
+
+/* Compute entries of the bonds/hbonds lists and store the lists in full format
+ * using the far neighbors list (stored in half format)
+ * 
+ * Note: this version does NOT contain an optimization to restrict the bond_mark
+ *  array to at most the 3-hop neighborhood */
+static void Init_Bond_Half( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int btop_i, num_bonds, num_hbonds;
+    int ihb, jhb, ihb_top;
+    int local;
+    real cutoff;
+    reax_list *far_nbrs, *bonds, *hbonds;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+    int jhb_top;
+    
+    far_nbrs = lists[FAR_NBRS];
+    bonds = lists[BONDS];
+    hbonds = lists[HBONDS];
+
+    for ( i = 0; i < system->n; ++i )
+    {
+        workspace->bond_mark[i] = 0;
+    }
+    for ( i = system->n; i < system->N; ++i )
+    {
+        /* put ghost atoms to an infinite distance (i.e., 1000) */
+        workspace->bond_mark[i] = 1000;
+    }
+
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = 0;
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+
+        /* start at end because other atoms
+         * can add to this atom's list (half-list) */
+        btop_i = End_Index( i, bonds );
+        sbp_i = &system->reax_param.sbp[type_i];
+
+        if ( i < system->n )
+        {
+            local = 1;
+            cutoff = control->nonb_cut;
+        }
+        else
+        {
+            local = 0;
+            cutoff = control->bond_cut;
+        }
+
+        ihb = -1;
+        ihb_top = -1;
+        if ( local == 1 )
+        {
+            if ( control->hbond_cut > 0 )
+            {
+                ihb = sbp_i->p_hbond;
+
+                if ( ihb == 1 )
+                {
+                    /* start at end because other atoms
+                     * can add to this atom's list (half-list) */ 
+                    ihb_top = End_Index( atom_i->Hindex, hbonds );
+                }
+                else
+                {
+                    ihb_top = -1;
+                }
+            }
+        }
+
+        /* check if j is within cutoff using the stored i-j distance */
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbrs->far_nbr_list.nbr[pj];
+            atom_j = &system->my_atoms[j];
+            
+            if ( far_nbrs->far_nbr_list.d[pj] <= cutoff )
+            {
+                type_j = atom_j->type;
+                sbp_j = &system->reax_param.sbp[type_j];
+                twbp = &system->reax_param.tbp[type_i][type_j];
+
+                if ( local == 1 )
+                {
+                    /* hydrogen bond lists */
+                    if ( control->hbond_cut > 0
+                            && (ihb == 1 || ihb == 2)
+                            && far_nbrs->far_nbr_list.d[pj] <= control->hbond_cut )
+                    {
+                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                        jhb = sbp_j->p_hbond;
+
+                        if ( ihb == 1 && jhb == 2 )
+                        {
+                            hbonds->hbond_list[ihb_top].nbr = j;
+                            hbonds->hbond_list[ihb_top].scl = 1;
+                            hbonds->hbond_list[ihb_top].ptr = pj;
+                            ++ihb_top;
+                            ++num_hbonds;
+                        }
+                        /* only add to list for local j (far nbrs is half-list) */
+                        else if ( j < system->n && ihb == 2 && jhb == 1 ) 
+                        {
+                            jhb_top = End_Index( atom_j->Hindex, hbonds );
+                            hbonds->hbond_list[jhb_top].nbr = i;
+                            hbonds->hbond_list[jhb_top].scl = -1;
+                            hbonds->hbond_list[jhb_top].ptr = pj;
+                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbonds );
+                            ++num_hbonds;
+                        }
+                    }
+                }
+
+                /* uncorrected bond orders */
+                if ( far_nbrs->far_nbr_list.d[pj] <= control->bond_cut
+                        && BOp( workspace, bonds, control->bo_cut,
+                            i, btop_i, far_nbrs->far_nbr_list.nbr[pj],
+                            &far_nbrs->far_nbr_list.rel_box[pj], far_nbrs->far_nbr_list.d[pj],
+                            &far_nbrs->far_nbr_list.dvec[pj], far_nbrs->format,
+                            sbp_i, sbp_j, twbp ) )
+                {
+                    num_bonds += 2;
+                    ++btop_i;
+
+                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+                    {
+                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+                    }
+                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
+                    {
+                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+                    }
+                }
+
+            }
+        }
+
+        Set_End_Index( i, btop_i, bonds );
+
+        if ( local == 1 && ihb == 1 )
+        {
+            Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
+        }
+    }
+
+    workspace->realloc.num_bonds = num_bonds;
+    workspace->realloc.num_hbonds = num_hbonds;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: Htop = %d num_bonds = %d num_hbonds = %d\n",
+        system->my_rank, data->step, workspace->realloc.Htop, num_bonds, num_hbonds );
+    MPI_Barrier( comm );
+#endif
+
+#if defined( DEBUG )
+    Print_Bonds( system, bonds, "debugbonds.out" );
+    Print_Bond_List2( system, bonds, "pbonds.out" );
+#endif
+
+    Validate_Lists( system, workspace, lists, data->step,
+            system->n, system->N, system->numH, comm );
+
+}
+
+
+/* Compute entries of the bonds/hbonds lists and store the lists in full format
+ * using the far neighbors list (stored in full format).
+ *
+ * The work is done in two passes over the far neighbors list:
+ *   1) for every local atom i (i < system->n), hydrogen bond entries are
+ *      recorded (when enabled) and BOp is called for each neighbor j >= i
+ *      within control->bond_cut; a ghost atom that gets bonded to a local
+ *      atom is marked and appended to the queue q;
+ *   2) the queued ghost atoms are processed in FIFO order and BOp is called
+ *      for their not-yet-processed ghost neighbors within control->bond_cut.
+ *
+ * workspace->bond_mark holds 0 for local atoms and 1000 for unreached ghost
+ * atoms; a reached ghost atom temporarily holds (hops + 100) while queued
+ * and (hops) once processed. Only atoms fewer than 3 hops away enqueue
+ * newly reached neighbors.
+ *
+ * On return, the end indices of the bonds/hbonds lists have been updated,
+ * workspace->realloc.num_bonds and workspace->realloc.num_hbonds hold the
+ * counts accumulated here, and Validate_Lists has been called.
+ *
+ * out_control and mpi_data are not used by this routine. */
+static void Init_Bond_Full( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int num_bonds, num_hbonds;
+    int ihb, ihb_top, use_hbonds;
+    int btop_i;
+    int k, push;
+    int *q;
+    reax_list *far_nbrs, *bonds, *hbonds;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+
+    far_nbrs = lists[FAR_NBRS];
+    bonds = lists[BONDS];
+    hbonds = lists[HBONDS];
+    num_bonds = 0;
+    num_hbonds = 0;
+    push = 0;
+    btop_i = 0;
+
+    /* queue of ghost atoms reached through bonds; every ghost atom is
+     * enqueued at most once, so (N - n) slots are sufficient */
+    q = smalloc( sizeof(int) * (system->N - system->n),
+            "Init_Bond_Full::q", MPI_COMM_WORLD );
+
+    for ( i = 0; i < system->n; ++i )
+    {
+        workspace->bond_mark[i] = 0;
+    }
+    for ( i = system->n; i < system->N; ++i )
+    {
+        /* put ghost atoms to an infinite distance (i.e., 1000) */
+        workspace->bond_mark[i] = 1000;
+    }
+
+    /* bonds that are directly connected to local atoms */
+    for ( i = 0; i < system->n; ++i )
+    {
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        btop_i = End_Index( i, bonds );
+        sbp_i = &system->reax_param.sbp[type_i];
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+        ihb = sbp_i->p_hbond;
+
+        /* only look up the hbond list slot when it is going to be used:
+         * atom_i->Hindex may not be a valid list index for atoms that
+         * are not hydrogen bond donors (p_hbond != 1) */
+        use_hbonds = ( control->hbond_cut > 0.0 && ihb == 1 );
+        ihb_top = -1;
+        if ( use_hbonds )
+        {
+            ihb_top = Start_Index( atom_i->Hindex, hbonds );
+        }
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbrs->far_nbr_list.nbr[pj];
+            atom_j = &system->my_atoms[j];
+
+            if ( use_hbonds )
+            {
+                /* check if j is within cutoff */
+                if ( far_nbrs->far_nbr_list.d[pj] <= control->hbond_cut
+                  && system->reax_param.sbp[atom_j->type].p_hbond == 2 )
+                {
+                    hbonds->hbond_list[ihb_top].nbr = j;
+                    hbonds->hbond_list[ihb_top].scl = 1;
+                    hbonds->hbond_list[ihb_top].ptr = pj;
+                    ++ihb_top;
+                    ++num_hbonds;
+                }
+            }
+
+            if ( i <= j && far_nbrs->far_nbr_list.d[pj] <= control->bond_cut )
+            {
+                type_j = atom_j->type;
+                sbp_j = &system->reax_param.sbp[type_j];
+                twbp = &system->reax_param.tbp[type_i][type_j];
+
+                if ( BOp( workspace, bonds, control->bo_cut,
+                            i, btop_i, far_nbrs->far_nbr_list.nbr[pj],
+                            &far_nbrs->far_nbr_list.rel_box[pj], far_nbrs->far_nbr_list.d[pj],
+                            &far_nbrs->far_nbr_list.dvec[pj], far_nbrs->format,
+                            sbp_i, sbp_j, twbp ) )
+                {
+                    num_bonds += 2;
+                    ++btop_i;
+
+                    /* if j is a non-local atom, push it on the queue
+                     * to search for its bonded neighbors later;
+                     * 101 = one hop from a local atom, still queued */
+                    if ( workspace->bond_mark[j] == 1000 )
+                    {
+                        workspace->bond_mark[j] = 101;
+                        q[ push++ ] = j;
+                    }
+                }
+            }
+        }
+
+        if ( use_hbonds )
+        {
+            Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
+        }
+
+        Set_End_Index( i, btop_i, bonds );
+    }
+
+    /* bonds that are indirectly connected to local atoms */
+    for ( k = 0; k < push; ++k )
+    {
+        i = q[k];
+        /* convert the "queued" mark (hops + 100) into the hop count */
+        workspace->bond_mark[i] -= 100;
+        atom_i = &system->my_atoms[i];
+        type_i = atom_i->type;
+        btop_i = End_Index( i, bonds );
+        sbp_i = &system->reax_param.sbp[type_i];
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbrs->far_nbr_list.nbr[pj];
+
+            /* an atom that is already 3 hops away does not reach out
+             * to ghost atoms that have not been reached yet */
+            if ( workspace->bond_mark[i] == 3
+                    && workspace->bond_mark[j] == 1000 )
+            {
+                continue;
+            }
+
+            atom_j = &system->my_atoms[j];
+
+            /* marks above 100 denote ghost atoms that are either
+             * unreached (1000) or still waiting in the queue */
+            if (  workspace->bond_mark[j] > 100
+                    && far_nbrs->far_nbr_list.d[pj] <= control->bond_cut )
+            {
+                type_j = atom_j->type;
+                sbp_j = &system->reax_param.sbp[type_j];
+                twbp = &system->reax_param.tbp[type_i][type_j];
+
+                if ( BOp( workspace, bonds, control->bo_cut,
+                            i, btop_i, far_nbrs->far_nbr_list.nbr[pj],
+                            &far_nbrs->far_nbr_list.rel_box[pj], far_nbrs->far_nbr_list.d[pj],
+                            &far_nbrs->far_nbr_list.dvec[pj], far_nbrs->format,
+                            sbp_i, sbp_j, twbp ) )
+                {
+                    num_bonds += 2;
+                    ++btop_i;
+
+                    if ( workspace->bond_mark[j] == 1000 )
+                    {
+                        workspace->bond_mark[j] = workspace->bond_mark[i] + 100;
+
+                        if ( workspace->bond_mark[i] < 3 )
+                        {
+                            q[ push++ ] = j;
+                        }
+                    }
+                }
+            }
+        }
+
+        Set_End_Index( i, btop_i, bonds );
+    }
+
+    sfree( q, "Init_Bond_Full::q" );
+
+    workspace->realloc.num_bonds = num_bonds;
+    workspace->realloc.num_hbonds = num_hbonds;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: Htop = %d num_bonds = %d num_hbonds = %d\n",
+             system->my_rank, data->step, workspace->realloc.Htop, workspace->realloc.num_bonds, num_hbonds );
+    MPI_Barrier( comm );
+#endif
+
+#if defined( DEBUG )
+    Print_Bonds( system, bonds, "debugbonds.out" );
+    Print_Bond_List2( system, bonds, "pbonds.out" );
+#endif
+
+    Validate_Lists( system, workspace, lists, data->step,
+            system->n, system->N, system->numH, comm );
+
+}
+
+
+/* Fill the Interaction_Functions dispatch table (slots 0-9).
+ * Slots 0-4 hold the bonded interaction kernels, slot 5 holds the hydrogen
+ * bond kernel only when control->hbond_cut is positive, and every remaining
+ * slot is set to Dummy_Interaction. */
+void Init_Force_Functions( control_params *control )
+{
+    int slot;
+
+    Interaction_Functions[0] = &BO;
+    Interaction_Functions[1] = &Bonds;
+    Interaction_Functions[2] = &Atom_Energy;
+    Interaction_Functions[3] = &Valence_Angles;
+    Interaction_Functions[4] = &Torsion_Angles;
+
+    /* hydrogen bonds are evaluated only for a positive cutoff */
+    Interaction_Functions[5] = &Dummy_Interaction;
+    if ( control->hbond_cut > 0.0 )
+    {
+        Interaction_Functions[5] = &Hydrogen_Bonds;
+    }
+
+    /* unused (empty) slots */
+    for ( slot = 6; slot <= 9; ++slot )
+    {
+        Interaction_Functions[slot] = &Dummy_Interaction;
+    }
+}
+
+
+/* Evaluate the bonded interactions for the current step by calling the
+ * first NUM_INTRS entries of the Interaction_Functions table in order.
+ * comm is only used for the barriers in DEBUG builds. */
+void Compute_Bonded_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm )
+{
+    int intr;
+
+#if defined(TEST_ENERGY)
+    /* flag the start of a new timestep in the bonded energy files */
+    Debug_Marker_Bonded( out_control, data->step );
+#endif
+
+    /* every force kernel is dispatched through its function pointer */
+    intr = 0;
+    while ( intr < NUM_INTRS )
+    {
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: starting f%d\n", system->my_rank, intr );
+        MPI_Barrier( comm );
+#endif
+
+        Interaction_Functions[intr]( system, control, data, workspace,
+                lists, out_control );
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: f%d done\n", system->my_rank, intr );
+        MPI_Barrier( comm );
+#endif
+
+        ++intr;
+    }
+}
+
+
+/* Evaluate the van der Waals and Coulomb interactions for the current step,
+ * using the tabulated kernel when control->tabulate is nonzero and the
+ * direct kernel otherwise. comm is only used for the barrier in DEBUG
+ * builds. */
+void Compute_NonBonded_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, MPI_Comm comm )
+{
+#if defined(TEST_ENERGY)
+    /* flag the start of a new timestep in the nonbonded energy files */
+    Debug_Marker_Nonbonded( out_control, data->step );
+#endif
+
+    if ( control->tabulate != 0 )
+    {
+        /* lookup-table based van der Waals and Coulomb kernel */
+        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
+                lists, out_control );
+    }
+    else
+    {
+        /* directly evaluated van der Waals and Coulomb kernel */
+        vdW_Coulomb_Energy( system, control, data, workspace,
+                lists, out_control );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: nonbonded forces done\n", system->my_rank );
+    MPI_Barrier( comm );
+#endif
+}
+
+
+/* This version of Compute_Total_Force computes forces from the
+ * coefficients accumulated by all interaction functions.
+ * Saves enormous time & space!
+ *
+ * For every entry (i, nbr) of the bonds list with i < nbr, the bond
+ * contribution is added through Add_dBond_to_Forces when control->virial
+ * is 0, and through Add_dBond_to_Forces_NPT otherwise.
+ * When PURE_REAX is defined, workspace->f is afterwards passed to Coll_FS
+ * and rvec_Copy is applied to the force of each of the system->n local
+ * atoms and the matching workspace->f entry. */
+void Compute_Total_Force( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, mpi_datatypes *mpi_data )
+{
+    int i, pj;
+    reax_list *bonds;
+
+    bonds = lists[BONDS];
 
-        /* update i-j distance - check if j is within cutoff */
-        for ( pj = start_i; pj < end_i; ++pj )
+    /* loop over all atoms (system->N), not only the first system->n */
+    for ( i = 0; i < system->N; ++i )
+    {
+        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
-            nbr_pj = &( far_nbrs->far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(system->my_atoms[j]);
-            //fprintf( stderr, "%d%d i=%d x_i: %f %f %f,j=%d x_j: %f %f %f, d=%f\n",
-            //     MIN(atom_i->orig_id, atom_j->orig_id),
-            //     MAX(atom_i->orig_id, atom_j->orig_id),
-            //     i, atom_i->x[0], atom_i->x[1], atom_i->x[2],
-            //     j, atom_j->x[0], atom_j->x[1], atom_j->x[2], nbr_pj->d );
-            if ( renbr )
-            {
-                if (nbr_pj->d <= cutoff)
-                    flag = 1;
-                else flag = 0;
-            }
-            else
+            /* handle each i-nbr pair once, from the lower-indexed atom */
+            if ( i < bonds->bond_list[pj].nbr )
             {
-                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-                if ( nbr_pj->d <= SQR(cutoff) )
+                if ( control->virial == 0 )
                 {
-                    nbr_pj->d = sqrt(nbr_pj->d);
-                    flag = 1;
+                    Add_dBond_to_Forces( i, pj, workspace, lists );
                 }
                 else
                 {
-                    flag = 0;
+                    Add_dBond_to_Forces_NPT( i, pj, data, workspace, lists );
                 }
             }
+        }
+    }
 
-            if ( flag )
-            {
-                type_j = atom_j->type;
-                r_ij = nbr_pj->d;
-                sbp_j = &(system->reax_param.sbp[type_j]);
-                twbp = &(system->reax_param.tbp[type_i][type_j]);
+    //Print_Total_Force( system, data, workspace );
 
-                if ( local )
-                {
-                    /* H matrix entry */
-                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq||1
-                    {
-                        H->entries[Htop].j = j;
-                        //fprintf( stdout, "%d%d %d %d\n",
-                        //     MIN(atom_i->orig_id, atom_j->orig_id),
-                        //     MAX(atom_i->orig_id, atom_j->orig_id),
-                        //     MIN(atom_i->orig_id, atom_j->orig_id),
-                        //     MAX(atom_i->orig_id, atom_j->orig_id) );
-                        if ( control->tabulate == 0 )
-                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
-                        else H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
-                        ++Htop;
-                    }
+#if defined(PURE_REAX)
+    /* now all forces are computed to their partially-final values
+     * based on the neighbors information each processor has had.
+     * final values of force on each atom needs to be computed by adding up
+     * all partially-final pieces */
+    Coll_FS( system, mpi_data, workspace->f, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
 
-                    /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
-                            nbr_pj->d <= control->hbond_cut )
-                    {
-                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                        jhb = sbp_j->p_hbond;
-                        if ( ihb == 1 && jhb == 2 )
-                        {
-                            hbonds->hbond_list[ihb_top].nbr = j;
-                            hbonds->hbond_list[ihb_top].scl = 1;
-                            hbonds->hbond_list[ihb_top].ptr = nbr_pj;
-                            ++ihb_top;
-                            ++num_hbonds;
-                        }
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
-                        {
-                            jhb_top = End_Index( atom_j->Hindex, hbonds );
-                            hbonds->hbond_list[jhb_top].nbr = i;
-                            hbonds->hbond_list[jhb_top].scl = -1;
-                            hbonds->hbond_list[jhb_top].ptr = nbr_pj;
-                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbonds );
-                            ++num_hbonds;
-                        }
-                    }
-                }
+    /* only the first system->n (local) atoms are updated here */
+    for ( i = 0; i < system->n; ++i )
+    {
+        rvec_Copy( system->my_atoms[i].f, workspace->f[i] );
+    }
 
-                /* uncorrected bond orders */
-                if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
-                    nbr_pj->d <= control->bond_cut &&
-                    BOp( workspace, bonds, control->bo_cut,
-                         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) )
-                {
-                    num_bonds += 2;
-                    ++btop_i;
+#if defined(TEST_FORCES)
+    /* the per-interaction force arrays go through the same Coll_FS call */
+    Coll_FS( system, mpi_data, workspace->f_ele, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_vdw, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_be, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_lp, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_ov, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_un, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_ang, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_coa, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_pen, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_hb, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_tor, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+    Coll_FS( system, mpi_data, workspace->f_con, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
+#endif
 
-                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
-                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
-                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
-                    {
-                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-                        //if( workspace->bond_mark[i] == 1000 )
-                        //  workspace->done_after[i] = pj;
-                    }
-                    //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
-                    //   i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
-                }
-            }
-        }
+#endif
+}
 
-        Set_End_Index( i, btop_i, bonds );
-        if ( local )
-        {
-            H->end[i] = Htop;
-            if ( ihb == 1 )
-                Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
-        }
-    }
 
-    //fprintf( stderr, "after the first init loop\n" );
-    /*for( i = system->n; i < system->N; ++i )
-      if( workspace->bond_mark[i] > 3 ) {
-        start_i = Start_Index(i, bonds);
-        end_i = End_Index(i, bonds);
-        num_bonds -= (end_i - start_i);
-        Set_End_Index(i, start_i, bonds );
-        }*/
+/* Driver for the per-step initialization of the force computation.
+ * Runs three phases in order and times each one with MPI_Wtime:
+ *   1) Init_Distance;
+ *   2) one of the Init_CM_* routines, selected by the NEUTRAL_TERRITORY
+ *      build flag and by workspace->H->format;
+ *   3) Init_Bond_Half or Init_Bond_Full, selected by the format of the
+ *      far neighbors list.
+ * The three elapsed times are summed over mpi_data->world onto MASTER_NODE,
+ * where the sum divided by control->nprocs is added to
+ * data->timing.init_dist, init_cm and init_bond. */
+void Init_Forces( reax_system *system, control_params *control,
+                  simulation_data *data, storage *workspace, reax_list **lists,
+                  output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+{
+    double t_start, t_dist, t_cm, t_bond;
+    double timings[3], t_total[3];
+    
+    t_start = MPI_Wtime( );
 
-    /*for( i = system->n; i < system->N; ++i ) {
-      start_i = Start_Index(i, far_nbrs);
-      end_i = workspace->done_after[i];
+    /* phase 1 */
+    Init_Distance( system, control, data, workspace, lists, out_control, comm, mpi_data );
 
-      if( workspace->bond_mark[i] >= 2 && start_i < end_i ) {
-        atom_i = &(system->my_atoms[i]);
-        type_i = atom_i->type;
-        btop_i = End_Index( i, bonds );
-        sbp_i = &(system->reax_param.sbp[type_i]);
+    t_dist = MPI_Wtime( );
 
-        for( pj = start_i; pj < end_i; ++pj ) {
-    nbr_pj = &( far_nbrs->far_nbr_list[pj] );
-    j = nbr_pj->nbr;
+    /* phase 2: variant chosen by build flag and by the format of H */
+#if defined(NEUTRAL_TERRITORY)
+    if ( workspace->H->format == SYM_HALF_MATRIX )
+    {
+        Init_CM_Half_NT( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
+    else
+    {
+        Init_CM_Full_NT( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
+#else
+    if ( workspace->H->format == SYM_HALF_MATRIX )
+    {
+        Init_CM_Half_FS( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
+    else
+    {
+        Init_CM_Full_FS( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
+#endif
 
-    if( workspace->bond_mark[j] >= 2 && nbr_pj->d <= control->bond_cut ) {
-      atom_j = &(system->my_atoms[j]);
-      type_j = atom_j->type;
-      sbp_j = &(system->reax_param.sbp[type_j]);
-      twbp = &(system->reax_param.tbp[type_i][type_j]);
+    t_cm = MPI_Wtime();
 
-      if( BOp( workspace, bonds, control->bo_cut,
-         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) ) {
-        num_bonds += 2;
-        ++btop_i;
+    /* phase 3: variant chosen by the far neighbors list format */
+    if ( lists[FAR_NBRS]->format == HALF_LIST )
+    {
+        Init_Bond_Half( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
+    else
+    {
+        Init_Bond_Full( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
 
-        if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
-          workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
-        else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
-          workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+    t_bond = MPI_Wtime();
 
-        //fprintf( stdout, "%d%d - %d(%d) %d(%d) new\n",
-        // i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
-      }
-    }
-        }
-        Set_End_Index( i, btop_i, bonds );
-      }
-      }*/
+    /* elapsed time per phase on this process */
+    timings[0] = t_dist - t_start;
+    timings[1] = t_cm - t_dist;
+    timings[2] = t_bond - t_cm;
 
-    workspace->realloc.Htop = Htop;
-    workspace->realloc.num_bonds = num_bonds;
-    workspace->realloc.num_hbonds = num_hbonds;
+    /* sum the per-process timings onto the master rank */
+    MPI_Reduce( timings, t_total, 3, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d: Htop = %d num_bonds = %d num_hbonds = %d\n",
-             system->my_rank, data->step, Htop, num_bonds, num_hbonds );
-    MPI_Barrier( comm );
-#endif
-#if defined( DEBUG )
-    Print_Bonds( system, bonds, "debugbonds.out" );
-    Print_Bond_List2( system, bonds, "pbonds.out" );
-    Print_Sparse_Matrix( system, H );
-    for ( i = 0; i < H->n; ++i )
-        for ( j = H->start[i]; j < H->end[i]; ++j )
-            fprintf( stderr, "%d %d %.15e\n",
-                     MIN(system->my_atoms[i].orig_id,
-                         system->my_atoms[H->entries[j].j].orig_id),
-                     MAX(system->my_atoms[i].orig_id,
-                         system->my_atoms[H->entries[j].j].orig_id),
-                     H->entries[j].val );
-#endif
+    /* t_total is only meaningful on the reduction root; accumulate the
+     * average over control->nprocs processes */
+    if ( system->my_rank == MASTER_NODE ) 
+    {
+        data->timing.init_dist += t_total[0] / control->nprocs;
+        data->timing.init_cm += t_total[1] / control->nprocs;
+        data->timing.init_bond += t_total[2] / control->nprocs;
+    }
 
-    Validate_Lists( system, workspace, lists, data->step,
-                    system->n, system->N, system->numH, comm );
 }
 
 
+//void Init_Forces( reax_system *system, control_params *control,
+//                  simulation_data *data, storage *workspace, reax_list **lists,
+//                  output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
+//{
+//    int i, j, pj;
+//    int start_i, end_i;
+//    int type_i, type_j;
+//    int Htop, btop_i, num_bonds, num_hbonds;
+//    int ihb, jhb, ihb_top;
+//    int local, flag, renbr;
+//    real r_ij, cutoff;
+//    sparse_matrix *H;
+//    reax_list *far_nbrs, *bonds, *hbonds;
+//    single_body_parameters *sbp_i, *sbp_j;
+//    two_body_parameters *twbp;
+//    reax_atom *atom_i, *atom_j;
+//    int jhb_top;
+//    int start_j, end_j;
+//    int btop_j;
+//#if defined(NEUTRAL_TERRITORY)
+//    int mark[6];
+//    int total_cnt[6];
+//    int bin[6];
+//    int total_sum[6];
+//    int nt_flag;
+//#endif
+//
+//    far_nbrs = lists[FAR_NBRS];
+//    bonds = lists[BONDS];
+//    hbonds = lists[HBONDS];
+//
+//
+//    for ( i = 0; i < system->n; ++i )
+//        workspace->bond_mark[i] = 0;
+//    for ( i = system->n; i < system->N; ++i )
+//    {
+//        /* put ghost atoms to an infinite distance (i.e., 1000) */
+//        workspace->bond_mark[i] = 1000;
+//    }
+//
+//    H = workspace->H;
+//    H->n = system->n;
+//    Htop = 0;
+//    num_bonds = 0;
+//    num_hbonds = 0;
+//    btop_i = 0;
+//    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
+//
+//#if defined(NEUTRAL_TERRITORY)
+//    nt_flag = 1;
+//    if( renbr )
+//    {
+//        for ( i = 0; i < 6; ++i )
+//        {
+//            total_cnt[i] = 0;
+//            bin[i] = 0;
+//            total_sum[i] = 0;
+//        }
+//
+//        for ( i = system->n; i < system->N; ++i )
+//        {
+//            atom_i = &system->my_atoms[i];
+//
+//            if( atom_i->nt_dir != -1 )
+//            {
+//                total_cnt[ atom_i->nt_dir ]++;
+//            }
+//        }
+//
+//        total_sum[0] = system->n;
+//        for ( i = 1; i < 6; ++i )
+//        {
+//            total_sum[i] = total_sum[i-1] + total_cnt[i-1];
+//        }
+//
+//        for ( i = system->n; i < system->N; ++i )
+//        {
+//            atom_i = &system->my_atoms[i];
+//
+//            if( atom_i->nt_dir != -1 )
+//            {
+//                atom_i->pos = total_sum[ atom_i->nt_dir ] + bin[ atom_i->nt_dir ];
+//                bin[ atom_i->nt_dir ]++;
+//            }
+//        }
+//        H->NT = total_sum[5] + total_cnt[5];
+//    }
+//
+//    mark[0] = mark[1] = 1;
+//    mark[2] = mark[3] = mark[4] = mark[5] = 2;
+//#endif
+//
+//    for ( i = 0; i < system->N; ++i )
+//    {
+//        atom_i = &system->my_atoms[i];
+//        type_i  = atom_i->type;
+//        start_i = Start_Index(i, far_nbrs);
+//        end_i = End_Index(i, far_nbrs);
+//
+//        if ( far_nbrs->format == HALF_LIST )
+//        {
+//            // start at end because other atoms
+//            // can add to this atom's list (half-list)
+//            btop_i = End_Index( i, bonds );
+//        }
+//        else if ( far_nbrs->format == FULL_LIST )
+//        {
+//            btop_i = Start_Index( i, bonds );
+//        }
+//        sbp_i = &system->reax_param.sbp[type_i];
+//
+//        if ( i < system->n )
+//        {
+//            local = 1;
+//            cutoff = control->nonb_cut;
+//        }
+//#if defined(NEUTRAL_TERRITORY)
+//        else if ( atom_i->nt_dir != -1 )
+//        {
+//            local = 2;
+//            cutoff = control->nonb_cut;
+//            nt_flag = 0;
+//        }
+//#endif
+//        else
+//        {
+//            local = 0;
+//            cutoff = control->bond_cut;
+//        }
+//
+//        ihb = -1;
+//        ihb_top = -1;
+//        if ( local == 1 )
+//        {
+//            H->start[i] = Htop;
+//            H->entries[Htop].j = i;
+//            H->entries[Htop].val = sbp_i->eta;
+//            ++Htop;
+//
+//            if ( control->hbond_cut > 0 )
+//            {
+//                ihb = sbp_i->p_hbond;
+//                if ( ihb == 1 )
+//                {
+//                    if ( far_nbrs->format == HALF_LIST )
+//                    {
+//                        // start at end because other atoms
+//                        // can add to this atom's list (half-list)
+//                        ihb_top = End_Index( atom_i->Hindex, hbonds );
+//                    }
+//                    else if ( far_nbrs->format == FULL_LIST )
+//                    {
+//                        ihb_top = Start_Index( atom_i->Hindex, hbonds );
+//                    }
+//                }
+//                else
+//                {
+//                    ihb_top = -1;
+//                }
+//            }
+//        }
+//
+//        // update i-j distance - check if j is within cutoff
+//        for ( pj = start_i; pj < end_i; ++pj )
+//        {
+//            j = far_nbrs->far_nbr_list.nbr[pj];
+//            atom_j = &system->my_atoms[j];
+//
+//            if ( renbr )
+//            {
+//                if ( far_nbrs->far_nbr_list.d[pj] <= cutoff )
+//                    flag = 1;
+//                else
+//                    flag = 0;
+//            }
+//            else
+//            {
+//                far_nbrs->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
+//                far_nbrs->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
+//                far_nbrs->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
+//                far_nbrs->far_nbr_list.d[pj] = rvec_Norm_Sqr( far_nbrs->far_nbr_list.dvec[pj] );
+//
+//                if ( far_nbrs->far_nbr_list.d[pj] <= SQR(cutoff) )
+//                {
+//                    far_nbrs->far_nbr_list.d[pj] = sqrt( far_nbrs->far_nbr_list.d[pj] );
+//                    flag = 1;
+//                }
+//                else
+//                {
+//                    flag = 0;
+//                }
+//            }
+//
+//            if ( flag )
+//            {
+//                type_j = atom_j->type;
+//                r_ij = far_nbrs->far_nbr_list.d[pj];
+//                sbp_j = &system->reax_param.sbp[type_j];
+//                twbp = &system->reax_param.tbp[type_i][type_j];
+//
+//                if ( local == 1 )
+//                {
+//                    // H matrix entry
+//#if defined(NEUTRAL_TERRITORY)
+//                    if ( atom_j->nt_dir > 0 || (j < system->n
+//                                && (H->format == SYM_FULL_MATRIX
+//                                    || (H->format == SYM_HALF_MATRIX && i < j))) )
+//                    {
+//                        if( j < system->n )
+//                        {
+//                            H->entries[Htop].j = j;
+//                        }
+//                        else
+//                        {
+//                            H->entries[Htop].j = atom_j->pos;
+//                        }
+//
+//                        if ( control->tabulate == 0 )
+//                        {
+//                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+//                        }
+//                        else 
+//                        {
+//                            H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+//                        }
+//
+//                        ++Htop;
+//                    }
+//#else
+//                    if ( (far_nbrs->format == HALF_LIST
+//                            && (j < system->n || atom_i->orig_id < atom_j->orig_id))
+//                      || far_nbrs->format == FULL_LIST )
+//                    {
+//                        H->entries[Htop].j = j;
+//
+//                        if ( control->tabulate == 0 )
+//                        {
+//                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+//                        }
+//                        else
+//                        {
+//                            H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+//                        }
+//
+//                        ++Htop;
+//                    }
+//#endif
+//
+//                    // hydrogen bond lists
+//                    if ( control->hbond_cut > 0.0
+//                            && (ihb == 1 || ihb == 2)
+//                            && far_nbrs->far_nbr_list.d[pj] <= control->hbond_cut )
+//                    {
+//                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
+//                        jhb = sbp_j->p_hbond;
+//                        if ( ihb == 1 && jhb == 2 )
+//                        {
+//                            hbonds->hbond_list[ihb_top].nbr = j;
+//                            hbonds->hbond_list[ihb_top].scl = 1;
+//                            hbonds->hbond_list[ihb_top].ptr = pj;
+//                            ++ihb_top;
+//                            ++num_hbonds;
+//                        }
+//                        // only add to list for local j (far nbrs is half-list)
+//                        else if ( far_nbrs->format == HALF_LIST
+//                                && (j < system->n && ihb == 2 && jhb == 1) )
+//                        {
+//                            jhb_top = End_Index( atom_j->Hindex, hbonds );
+//                            hbonds->hbond_list[jhb_top].nbr = i;
+//                            hbonds->hbond_list[jhb_top].scl = -1;
+//                            hbonds->hbond_list[jhb_top].ptr = pj;
+//                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbonds );
+//                            ++num_hbonds;
+//                        }
+//                    }
+//                }
+//#if defined(NEUTRAL_TERRITORY)
+//                else if ( local == 2 )
+//                {
+//                    // H matrix entry 
+//                    if( ( atom_j->nt_dir != -1 && mark[atom_i->nt_dir] != mark[atom_j->nt_dir] 
+//                                && ( H->format == SYM_FULL_MATRIX
+//                                    || (H->format == SYM_HALF_MATRIX && atom_i->pos < atom_j->pos))) 
+//                            || ( j < system->n && atom_i->nt_dir != 0 && H->format == SYM_FULL_MATRIX ))
+//                    {
+//                        if( !nt_flag )
+//                        {
+//                            nt_flag = 1;
+//                            H->start[atom_i->pos] = Htop;
+//                        }
+//
+//                        if( j < system->n )
+//                        {
+//                            H->entries[Htop].j = j;
+//                        }
+//                        else
+//                        {
+//                            H->entries[Htop].j = atom_j->pos;
+//                        }
+//
+//                        if ( control->tabulate == 0 )
+//                        {
+//                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+//                        }
+//                        else 
+//                        {
+//                            H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+//                        }
+//
+//                        ++Htop;
+//                    }
+//                }
+//#endif
+//
+//                // uncorrected bond orders
+//                if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
+//                    far_nbrs->far_nbr_list.d[pj] <= control->bond_cut
+//                    && BOp( workspace, bonds, control->bo_cut,
+//                         i, btop_i, far_nbrs->far_nbr_list.nbr[pj],
+//                         &far_nbrs->far_nbr_list.rel_box[pj], far_nbrs->far_nbr_list.d[pj],
+//                         &far_nbrs->far_nbr_list.dvec[pj], far_nbrs->format,
+//                         sbp_i, sbp_j, twbp ) )
+//                {
+//                    num_bonds += 2;
+//                    ++btop_i;
+//
+//                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+//                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+//                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
+//                    {
+//                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+//                    }
+//                }
+//            }
+//        }
+//
+//        Set_End_Index( i, btop_i, bonds );
+//        if ( local == 1 )
+//        {
+//            H->end[i] = Htop;
+//            if ( ihb == 1 )
+//                Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
+//        }
+//#if defined(NEUTRAL_TERRITORY)
+//        else if ( local == 2 )
+//        {
+//            if( nt_flag )
+//            {
+//                H->end[atom_i->pos] = Htop;
+//            }
+//            else
+//            {
+//                 H->start[atom_i->pos] = 0;
+//                 H->end[atom_i->pos] = 0;
+//            }
+//        }
+//#endif
+//    }
+//
+//    if ( far_nbrs->format == FULL_LIST )
+//    {
+//
+//        for( i = 0; i < system->N; ++i )
+//            qsort( &bonds->bond_list[Start_Index(i, bonds)],
+//                    Num_Entries(i, bonds), sizeof(bond_data), compare_bonds );
+//
+//        // set sym_index for bonds list (far_nbrs full list)
+//        for ( i = 0; i < system->N; ++i )
+//        {
+//            start_i = Start_Index( i, bonds );
+//            end_i = End_Index( i, bonds );
+//
+//            for ( btop_i = start_i; btop_i < end_i; ++btop_i )
+//            {
+//                j = bonds->bond_list[btop_i].nbr;
+//                start_j = Start_Index( j, bonds );
+//                end_j = End_Index( j, bonds );
+//
+//                for ( btop_j = start_j; btop_j < end_j; ++btop_j )
+//                {
+//                    if ( bonds->bond_list[btop_j].nbr == i )
+//                    {
+//                        bonds->bond_list[btop_i].sym_index = btop_j;
+//                        break;
+//                    }
+//                }
+//            }
+//        }
+//    }
+//
+//#if defined(DEBUG)
+//    Print_Sparse_Matrix2( system, H, NULL );
+//#endif
+//
+//    workspace->realloc.Htop = Htop;
+//    workspace->realloc.num_bonds = num_bonds;
+//    workspace->realloc.num_hbonds = num_hbonds;
+//
+//#if defined(DEBUG_FOCUS)
+//    fprintf( stderr, "p%d @ step%d: Htop = %d num_bonds = %d num_hbonds = %d\n",
+//             system->my_rank, data->step, Htop, num_bonds, num_hbonds );
+//    MPI_Barrier( comm );
+//#endif
+//
+//#if defined( DEBUG )
+//    Print_Bonds( system, bonds, "debugbonds.out" );
+//    Print_Bond_List2( system, bonds, "pbonds.out" );
+//    Print_Sparse_Matrix( system, H );
+//    for ( i = 0; i < H->n; ++i )
+//        for ( j = H->start[i]; j < H->end[i]; ++j )
+//            fprintf( stderr, "%d %d %.15e\n",
+//                     MIN(system->my_atoms[i].orig_id,
+//                         system->my_atoms[H->entries[j].j].orig_id),
+//                     MAX(system->my_atoms[i].orig_id,
+//                         system->my_atoms[H->entries[j].j].orig_id),
+//                     H->entries[j].val );
+//#endif
+//
+//    Validate_Lists( system, workspace, lists, data->step,
+//                    system->n, system->N, system->numH, comm );
+//
+//}
+
+
 void Init_Forces_noQEq( reax_system *system, control_params *control,
-                        simulation_data *data, storage *workspace,
-                        reax_list **lists, output_controls *out_control,
-                        MPI_Comm comm )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control, MPI_Comm comm )
 {
     int i, j, pj;
     int start_i, end_i;
     int type_i, type_j;
-    int btop_i, btop_j, num_bonds, num_hbonds;
-    int ihb, jhb, ihb_top, jhb_top;
+    int btop_i, num_bonds, num_hbonds;
+    int ihb, jhb, ihb_top;
     int local, flag, renbr;
     real r_ij, cutoff;
     reax_list *far_nbrs, *bonds, *hbonds;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
     reax_atom *atom_i, *atom_j;
+    int jhb_top;
+    int start_j, end_j;
+    int btop_j;
 
     far_nbrs = lists[FAR_NBRS];
     bonds = lists[BONDS];
     hbonds = lists[HBONDS];
 
     for ( i = 0; i < system->n; ++i )
+    {
         workspace->bond_mark[i] = 0;
+    }
     for ( i = system->n; i < system->N; ++i )
     {
-        workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
-        //workspace->done_after[i] = Start_Index( i, far_nbrs );
+        /* place ghost atoms at an effectively infinite distance (i.e., 1000) */
+        workspace->bond_mark[i] = 1000;
     }
 
     num_bonds = 0;
     num_hbonds = 0;
-    btop_i = btop_j = 0;
+    btop_i = 0;
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
 
     for ( i = 0; i < system->N; ++i )
@@ -628,7 +1909,16 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
         end_i   = End_Index(i, far_nbrs);
-        btop_i = End_Index( i, bonds );
+        if ( far_nbrs->format == HALF_LIST )
+        {
+            /* start at end because other atoms
+             * can add to this atom's list (half-list) */
+            btop_i = End_Index( i, bonds );
+        }
+        else if ( far_nbrs->format == FULL_LIST )
+        {
+            btop_i = Start_Index( i, bonds );
+        }
         sbp_i = &(system->reax_param.sbp[type_i]);
 
         if ( i < system->n )
@@ -648,32 +1938,51 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
         {
             ihb = sbp_i->p_hbond;
             if ( ihb == 1 )
-                ihb_top = End_Index( atom_i->Hindex, hbonds );
-            else ihb_top = -1;
+            {
+                if ( far_nbrs->format == HALF_LIST )
+                {
+                    /* start at end because other atoms
+                     * can add to this atom's list (half-list) */
+                    ihb_top = End_Index( atom_i->Hindex, hbonds );
+                }
+                else if ( far_nbrs->format == FULL_LIST )
+                {
+                    ihb_top = Start_Index( atom_i->Hindex, hbonds );
+                }
+            }
+            else 
+            {
+                ihb_top = -1;
+            }
         }
 
         /* update i-j distance - check if j is within cutoff */
         for ( pj = start_i; pj < end_i; ++pj )
         {
-            nbr_pj = &( far_nbrs->far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(system->my_atoms[j]);
+            j = far_nbrs->far_nbr_list.nbr[pj];
+            atom_j = &system->my_atoms[j];
 
             if ( renbr )
             {
-                if ( nbr_pj->d <= cutoff )
+                if ( far_nbrs->far_nbr_list.d[pj] <= cutoff )
+                {
                     flag = 1;
-                else flag = 0;
+                }
+                else
+                {
+                    flag = 0;
+                }
             }
             else
             {
-                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-                if ( nbr_pj->d <= SQR(cutoff) )
+                far_nbrs->far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
+                far_nbrs->far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
+                far_nbrs->far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
+                far_nbrs->far_nbr_list.d[pj] = rvec_Norm_Sqr( far_nbrs->far_nbr_list.dvec[pj] );
+
+                if ( far_nbrs->far_nbr_list.d[pj] <= SQR(cutoff) )
                 {
-                    nbr_pj->d = sqrt(nbr_pj->d);
+                    far_nbrs->far_nbr_list.d[pj] = sqrt( far_nbrs->far_nbr_list.d[pj] );
                     flag = 1;
                 }
                 else
@@ -685,15 +1994,16 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
             if ( flag )
             {
                 type_j = atom_j->type;
-                r_ij = nbr_pj->d;
-                sbp_j = &(system->reax_param.sbp[type_j]);
-                twbp = &(system->reax_param.tbp[type_i][type_j]);
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                sbp_j = &system->reax_param.sbp[type_j];
+                twbp = &system->reax_param.tbp[type_i][type_j];
 
                 if ( local )
                 {
                     /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
-                            nbr_pj->d <= control->hbond_cut )
+                    if ( control->hbond_cut > 0.0
+                            && (ihb == 1 || ihb == 2)
+                            && far_nbrs->far_nbr_list.d[pj] <= control->hbond_cut )
                     {
                         // fprintf( stderr, "%d %d\n", atom1, atom2 );
                         jhb = sbp_j->p_hbond;
@@ -701,16 +2011,18 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
                         {
                             hbonds->hbond_list[ihb_top].nbr = j;
                             hbonds->hbond_list[ihb_top].scl = 1;
-                            hbonds->hbond_list[ihb_top].ptr = nbr_pj;
+                            hbonds->hbond_list[ihb_top].ptr = pj;
                             ++ihb_top;
                             ++num_hbonds;
                         }
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
+                        /* only add to list for local j (far nbrs is half-list) */
+                        else if ( far_nbrs->format == HALF_LIST
+                                && (j < system->n && ihb == 2 && jhb == 1) )
                         {
                             jhb_top = End_Index( atom_j->Hindex, hbonds );
                             hbonds->hbond_list[jhb_top].nbr = i;
                             hbonds->hbond_list[jhb_top].scl = -1;
-                            hbonds->hbond_list[jhb_top].ptr = nbr_pj;
+                            hbonds->hbond_list[jhb_top].ptr = pj;
                             Set_End_Index( atom_j->Hindex, jhb_top + 1, hbonds );
                             ++num_hbonds;
                         }
@@ -720,9 +2032,12 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
 
                 /* uncorrected bond orders */
                 if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
-                    nbr_pj->d <= control->bond_cut &&
-                    BOp( workspace, bonds, control->bo_cut,
-                         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) )
+                    far_nbrs->far_nbr_list.d[pj] <= control->bond_cut
+                    && BOp( workspace, bonds, control->bo_cut,
+                         i, btop_i, far_nbrs->far_nbr_list.nbr[pj],
+                         &far_nbrs->far_nbr_list.rel_box[pj], far_nbrs->far_nbr_list.d[pj],
+                         &far_nbrs->far_nbr_list.dvec[pj], far_nbrs->format,
+                         sbp_i, sbp_j, twbp ) )
                 {
                     num_bonds += 2;
                     ++btop_i;
@@ -746,13 +2061,31 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
             Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
     }
 
-    /*for( i = system->n; i < system->N; ++i )
-      if( workspace->bond_mark[i] > 3 ) {
-        start_i = Start_Index(i, bonds);
-        end_i = End_Index(i, bonds);
-        num_bonds -= (end_i - start_i);
-        Set_End_Index(i, start_i, bonds );
-        }*/
+    if ( far_nbrs->format == FULL_LIST )
+    {
+        /* set sym_index for bonds list (far_nbrs full list) */
+        for ( i = 0; i < system->N; ++i )
+        {
+            start_i = Start_Index( i, bonds );
+            end_i = End_Index( i, bonds );
+
+            for ( btop_i = start_i; btop_i < end_i; ++btop_i )
+            {
+                j = bonds->bond_list[btop_i].nbr;
+                start_j = Start_Index( j, bonds );
+                end_j = End_Index( j, bonds );
+
+                for ( btop_j = start_j; btop_j < end_j; ++btop_j )
+                {
+                    if ( bonds->bond_list[btop_j].nbr == i )
+                    {
+                        bonds->bond_list[btop_i].sym_index = btop_j;
+                        break;
+                    }
+                }
+            }
+        }
+    }
 
     workspace->realloc.num_bonds = num_bonds;
     workspace->realloc.num_hbonds = num_hbonds;
@@ -768,13 +2101,14 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
 #endif
 
     Validate_Lists( system, workspace, lists, data->step,
-                    system->n, system->N, system->numH, comm );
+            system->n, system->N, system->numH, comm );
 }
 
 
 void Estimate_Storages( reax_system *system, control_params *control,
-                        reax_list **lists, int *Htop, int *hb_top,
-                        int *bond_top, int *num_3body, MPI_Comm comm )
+        reax_list **lists, int *Htop, int *hb_top,
+        int *bond_top, int *num_3body, MPI_Comm comm,
+        int *matrix_dim, int cm_format )
 {
     int i, j, pj;
     int start_i, end_i;
@@ -788,30 +2122,44 @@ void Estimate_Storages( reax_system *system, control_params *control,
     reax_list *far_nbrs;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
     reax_atom *atom_i, *atom_j;
 
     far_nbrs = lists[FAR_NBRS];
     *Htop = 0;
+    *matrix_dim = 0;
     memset( hb_top, 0, sizeof(int) * system->local_cap );
     memset( bond_top, 0, sizeof(int) * system->total_cap );
     *num_3body = 0;
 
+#if defined(NEUTRAL_TERRITORY)
+    int mark[6] = {1, 1, 2, 2, 2, 2};
+#endif
+
     for ( i = 0; i < system->N; ++i )
     {
-        atom_i = &(system->my_atoms[i]);
+        atom_i = &system->my_atoms[i];
         type_i  = atom_i->type;
         start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        sbp_i = &(system->reax_param.sbp[type_i]);
+        end_i = End_Index(i, far_nbrs);
+        sbp_i = &system->reax_param.sbp[type_i];
 
         if ( i < system->n )
         {
             local = 1;
             cutoff = control->nonb_cut;
             ++(*Htop);
+            ++(*matrix_dim);
             ihb = sbp_i->p_hbond;
         }
+#if defined(NEUTRAL_TERRITORY)
+        else if ( atom_i->nt_dir != -1 )
+        {
+            local = 2;
+            cutoff = control->nonb_cut;
+            ++(*matrix_dim);
+            ihb = -1;
+        }
+#endif
         else
         {
             local = 0;
@@ -821,36 +2169,71 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
         for ( pj = start_i; pj < end_i; ++pj )
         {
-            nbr_pj = &( far_nbrs->far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(system->my_atoms[j]);
+            j = far_nbrs->far_nbr_list.nbr[pj];
+
+#if !defined(NEUTRAL_TERRITORY)
+            if ( far_nbrs->format == HALF_LIST )
+#endif
+            {
+                atom_j = &system->my_atoms[j];
+            }
 
-            if (nbr_pj->d <= cutoff)
+            if ( far_nbrs->far_nbr_list.d[pj] <= cutoff )
             {
                 type_j = system->my_atoms[j].type;
-                r_ij = nbr_pj->d;
-                sbp_j = &(system->reax_param.sbp[type_j]);
-                twbp = &(system->reax_param.tbp[type_i][type_j]);
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                sbp_j = &system->reax_param.sbp[type_j];
+                twbp = &system->reax_param.tbp[type_i][type_j];
 
-                if ( local )
+                if ( local == 1 )
                 {
-                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
+#if defined(NEUTRAL_TERRITORY)
+                    if( atom_j->nt_dir > 0 || j < system->n )
+                    {
+                        ++(*Htop);
+                    }
+#else
+                    if ( (far_nbrs->format == HALF_LIST
+                                && (j < system->n || atom_i->orig_id < atom_j->orig_id))
+                            || far_nbrs->format == FULL_LIST )
+                    {
                         ++(*Htop);
+                    }
+#endif
 
                     /* hydrogen bond lists */
-                    if ( control->hbond_cut > 0.1 && (ihb == 1 || ihb == 2) &&
-                            nbr_pj->d <= control->hbond_cut )
+                    if ( control->hbond_cut > 0.1
+                            && (ihb == 1 || ihb == 2)
+                            && far_nbrs->far_nbr_list.d[pj] <= control->hbond_cut )
                     {
                         jhb = sbp_j->p_hbond;
+
                         if ( ihb == 1 && jhb == 2 )
+                        {
                             ++hb_top[i];
-                        else if ( j < system->n && ihb == 2 && jhb == 1 )
+                        }
+                        /* only add to list for local j (far nbrs is half-list) */
+                        else if ( far_nbrs->format == HALF_LIST
+                                && (j < system->n && ihb == 2 && jhb == 1) )
+                        {
                             ++hb_top[j];
+                        }
+                    }
+                }
+
+#if defined(NEUTRAL_TERRITORY)
+                else if ( local == 2 )
+                {
+                    if( ( atom_j->nt_dir != -1 && mark[atom_i->nt_dir] != mark[atom_j->nt_dir] ) 
+                            || ( j < system->n && atom_i->nt_dir != 0 ))
+                    {
+                        ++(*Htop);
                     }
                 }
+#endif
 
                 /* uncorrected bond orders */
-                if ( nbr_pj->d <= control->bond_cut )
+                if ( far_nbrs->far_nbr_list.d[pj] <= control->bond_cut )
                 {
                     r2 = SQR(r_ij);
 
@@ -859,21 +2242,33 @@ void Estimate_Storages( reax_system *system, control_params *control,
                         C12 = twbp->p_bo1 * pow( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * exp( C12 );
                     }
-                    else BO_s = C12 = 0.0;
+                    else
+                    {
+                        C12 = 0.0;
+                        BO_s = 0.0;
+                    }
 
                     if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
                     {
                         C34 = twbp->p_bo3 * pow( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = exp( C34 );
                     }
-                    else BO_pi = C34 = 0.0;
+                    else
+                    {
+                        C34 = 0.0;
+                        BO_pi = 0.0;
+                    }
 
                     if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
                     {
                         C56 = twbp->p_bo5 * pow( r_ij / twbp->r_pp, twbp->p_bo6 );
                         BO_pi2 = exp( C56 );
                     }
-                    else BO_pi2 = C56 = 0.0;
+                    else
+                    {
+                        C56 = 0.0;
+                        BO_pi2 = 0.0;
+                    }
 
                     /* Initially BO values are the uncorrected ones, page 1 */
                     BO = BO_s + BO_pi + BO_pi2;
@@ -881,137 +2276,187 @@ void Estimate_Storages( reax_system *system, control_params *control,
                     if ( BO >= control->bo_cut )
                     {
                         ++bond_top[i];
-                        ++bond_top[j];
+                        if ( far_nbrs->format == HALF_LIST )
+                        {
+                            ++bond_top[j];
+                        }
                     }
                 }
             }
         }
     }
 
-    *Htop = (int)(MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES ));
+#if defined(NEUTRAL_TERRITORY)
+    /* Since the NT atoms' positions are not known yet, Htop cannot be computed exactly.
+     * Therefore, we assume a full matrix and divide by 2 if the half format is used. */
+    if ( cm_format == SYM_HALF_MATRIX )
+    {
+        *Htop = (*Htop + system->n + 1) / 2;
+    }
+#endif
+
+#if defined(NEUTRAL_TERRITORY)
+    *matrix_dim = (int) MAX( *matrix_dim * SAFE_ZONE_NT, MIN_CAP );
+    *Htop = (int) MAX( *Htop * SAFE_ZONE_NT, MIN_CAP * MIN_HENTRIES );
+#else
+    *matrix_dim = (int) MAX( *matrix_dim * SAFE_ZONE, MIN_CAP );
+    *Htop = (int) MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES );
+#endif
+
     for ( i = 0; i < system->n; ++i )
-        hb_top[i] = (int)(MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS ));
+    {
+        hb_top[i] = (int) MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS );
+    }
 
     for ( i = 0; i < system->N; ++i )
     {
-        *num_3body += SQR(bond_top[i]);
-        //if( i < system->n )
+        *num_3body += SQR( bond_top[i] );
+        //TODO: why x2?
         bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
-        //else bond_top[i] = MAX_BONDS;
     }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ estimate storages: Htop = %d, num_3body = %d\n",
-             system->my_rank, *Htop, *num_3body );
+            system->my_rank, *Htop, *num_3body );
     MPI_Barrier( comm );
 #endif
 }
 
 
 void Compute_Forces( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control,
-                     mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     MPI_Comm comm;
     int qeq_flag;
 #if defined(LOG_PERFORMANCE)
-    real t_start = 0;
+    real t_start = 0.0, t_end;
 
-    //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        t_start = Get_Time( );
+    {
+        t_start = MPI_Wtime();
+    }
 #endif
 
     comm = mpi_data->world;
+
     /********* init forces ************/
 #if defined(PURE_REAX)
     if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
+    {
         qeq_flag = 1;
-    else qeq_flag = 0;
+    }
+    else
+    {
+        qeq_flag = 0;
+    }
 #elif defined(LAMMPS_REAX)
     qeq_flag = 0;
 #endif
 
     if ( qeq_flag )
-        Init_Forces( system, control, data, workspace, lists, out_control, comm );
+    {
+        Init_Forces( system, control, data, workspace, lists, out_control, comm, mpi_data );
+    }
     else
+    {
         Init_Forces_noQEq( system, control, data, workspace,
-                           lists, out_control, comm );
+                lists, out_control, comm );
+    }
 
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.init_forces) );
+    {
+        t_end = MPI_Wtime( );
+        data->timing.init_forces += t_end - t_start;
+        t_start = t_end;
+    }
 #endif
 
-
     /********* bonded interactions ************/
     Compute_Bonded_Forces( system, control, data, workspace,
-                           lists, out_control, mpi_data->world );
+            lists, out_control, mpi_data->world );
 
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
+    {
+        t_end = MPI_Wtime( );
+        data->timing.bonded += t_end - t_start;
+        t_start = t_end;
+    }
 #endif
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d: completed bonded\n",
              system->my_rank, data->step );
     MPI_Barrier( mpi_data->world );
 #endif
 
-
     /**************** qeq ************************/
 #if defined(PURE_REAX)
     if ( qeq_flag )
+    {
         QEq( system, control, data, workspace, out_control, mpi_data );
+    }
 
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &data->timing.cm );
+    {
+        t_end = MPI_Wtime( );
+        data->timing.cm += t_end - t_start;
+        t_start = t_end;
+    }
 #endif
+
 #if defined(DEBUG_FOCUS)
     fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
     MPI_Barrier( mpi_data->world );
 #endif
 #endif //PURE_REAX
 
-
     /********* nonbonded interactions ************/
     Compute_NonBonded_Forces( system, control, data, workspace,
-                              lists, out_control, mpi_data->world );
+            lists, out_control, mpi_data->world );
 
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.nonb) );
+    {
+        t_end = MPI_Wtime( );
+        data->timing.nonb += t_end - t_start;
+        t_start = t_end;
+    }
 #endif
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
              system->my_rank, data->step );
     MPI_Barrier( mpi_data->world );
 #endif
 
-
     /*********** total force ***************/
     Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
 
 #if defined(LOG_PERFORMANCE)
-    //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
+    {
+        t_end = MPI_Wtime( );
+        data->timing.bonded += t_end - t_start;
+    }
 #endif
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d: total forces computed\n",
              system->my_rank, data->step );
+
     //Print_Total_Force( system, data, workspace );
     MPI_Barrier( mpi_data->world );
 #endif
 
 #if defined(TEST_FORCES)
     Print_Force_Files( system, control, data, workspace,
-                       lists, out_control, mpi_data );
+            lists, out_control, mpi_data );
 #endif
 }
diff --git a/PuReMD/src/forces.h b/PuReMD/src/forces.h
index 43d47cb46f66f45dc2b54d9415f8c92405a1d557..105f35941c8aa0d791f12b55a59649ba2d4ead8b 100644
--- a/PuReMD/src/forces.h
+++ b/PuReMD/src/forces.h
@@ -31,5 +31,5 @@ void Init_Force_Functions( control_params* );
 void Compute_Forces( reax_system*, control_params*, simulation_data*,
                      storage*, reax_list**, output_controls*, mpi_datatypes* );
 void Estimate_Storages( reax_system*, control_params*, reax_list**,
-                        int*, int*, int*, int*, MPI_Comm );
+                        int*, int*, int*, int*, MPI_Comm, int*, int );
 #endif
diff --git a/PuReMD/src/geo_tools.c b/PuReMD/src/geo_tools.c
index c1e3549fedf2039f96af84aa263cd114c7dee3cb..77e1f95b32b7bf8e12b5de46578af8364fe505e3 100644
--- a/PuReMD/src/geo_tools.c
+++ b/PuReMD/src/geo_tools.c
@@ -81,11 +81,7 @@ char Read_Geo( char* geo_file, reax_system* system, control_params *control,
     comm = MPI_COMM_WORLD;
 
     /* open the geometry file */
-    if ( (geo = fopen(geo_file, "r")) == NULL )
-    {
-        fprintf( stderr, "fopen: error opening the geo file! terminating...\n" );
-        MPI_Abort( comm, FILE_NOT_FOUND );
-    }
+    geo = sfopen( geo_file, "r", "Read_Geo::geo" );
 
     /* read box information */
     fscanf( geo, CUSTOM_BOXGEO_FORMAT,
@@ -140,7 +136,7 @@ char Read_Geo( char* geo_file, reax_system* system, control_params *control,
         }
     }
 
-    fclose( geo );
+    sfclose( geo, "Read_Geo::geo" );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: finished reading the geo file\n", system->my_rank );
@@ -239,12 +235,12 @@ void Count_PDB_Atoms( FILE *geo, reax_system *system )
 
     system->N = system->n;
 
-    //#if defined(DEBUG)
+#if defined(DEBUG)
     fprintf( stderr, "p%d@count atoms:\n", system->my_rank );
     fprintf( stderr, "p%d: bigN = %d\n", system->my_rank, system->bigN );
     fprintf( stderr, "p%d: n = %d\n", system->my_rank, system->n );
     fprintf( stderr, "p%d: N = %d\n\n", system->my_rank, system->N );
-    //#endif
+#endif
 }
 
 
@@ -271,11 +267,7 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
     comm = MPI_COMM_WORLD;
 
     /* open pdb file */
-    if ( (pdb = fopen(pdb_file, "r")) == NULL )
-    {
-        fprintf( stderr, "fopen: error opening the pdb file! terminating...\n" );
-        MPI_Abort( comm, FILE_NOT_FOUND );
-    }
+    pdb = sfopen( pdb_file, "r", "Read_PDB::pdb" );
 
     /* allocate memory for tokenizing pdb lines */
     if ( Allocate_Tokenizer_Space( &s, &s1, &tmp ) == FAILURE )
@@ -481,7 +473,7 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
         return FAILURE;
     }
 
-    fclose( pdb );
+    sfclose( pdb, "Read_PDB::pdb" );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: finished reading the pdb file\n", system->my_rank );
@@ -497,7 +489,7 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
    Also, we do not write connect lines yet.
 */
 
-char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
+char Write_PDB(reax_system* system, reax_list** bonds, simulation_data *data,
                control_params *control, mpi_datatypes *mpi_data,
                output_controls *out_control)
 {
@@ -550,14 +542,14 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
 
 
         sprintf(fname, "%s-%d.pdb", control->sim_name, data->step);
-        pdb = fopen(fname, "w");
-        fprintf( pdb, PDB_CRYST1_FORMAT_O,
+        pdb = sfopen( fname, "w", "Write_PDB::pdb" );
+        /*fprintf( pdb, PDB_CRYST1_FORMAT_O,
                  "CRYST1",
                  system->big_box.box_norms[0], system->big_box.box_norms[1],
                  system->big_box.box_norms[2],
                  RAD2DEG(alpha), RAD2DEG(beta), RAD2DEG(gamma), " ", 0 );
         fprintf( out_control->log, "Box written\n" );
-        fflush( out_control->log );
+        fflush( out_control->log );*/
     }
 
     /*write atom lines to buffer*/
@@ -566,11 +558,13 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
         p_atom = &(system->my_atoms[i]);
         strncpy(name, p_atom->name, 8);
         Trim_Spaces(name);
-        sprintf( line, PDB_ATOM_FORMAT_O,
+        /*sprintf( line, PDB_ATOM_FORMAT_O,
                  "ATOM  ", p_atom->orig_id, p_atom->name, ' ', "REX", ' ', 1, ' ',
                  p_atom->x[0], p_atom->x[1], p_atom->x[2],
-                 1.0, 0.0, "0", name, "  " );
-        fprintf(stderr, "PDB NAME <%s>\n", p_atom->name);
+                 1.0, 0.0, "0", name, "  " );*/
+        sprintf( line, PDB_ATOM_FORMAT_O,
+                 p_atom->orig_id, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+        //fprintf( stderr, "PDB NAME <%s>\n", p_atom->name);
         strncpy( buffer + i * PDB_ATOM_FORMAT_O_LENGTH, line,
                  PDB_ATOM_FORMAT_O_LENGTH );
     }
@@ -599,7 +593,7 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data,
     if ( me == MASTER_NODE)
     {
         fprintf( pdb, "%s", buffer );
-        fclose( pdb );
+        sfclose( pdb, "Write_PDB::pdb" );
     }
 
     /* Writing connect information */
diff --git a/PuReMD/src/geo_tools.h b/PuReMD/src/geo_tools.h
index 8078685689afa1d6edbe7b4534dd3bc65c45d1c5..d34caaffade4fa4c8b8dee0efffa58afe977de63 100644
--- a/PuReMD/src/geo_tools.h
+++ b/PuReMD/src/geo_tools.h
@@ -110,14 +110,16 @@ COLUMNS       DATA TYPE       FIELD         DEFINITION
 #define PDB_CONECT_FORMAT "%6s%5d%5d%5d%5d%5d\n"
 #define PDB_CRYST1_FORMAT "%6s%9s%9s%9s%7s%7s%7s%11s%4s\n"
 
-#define PDB_ATOM_FORMAT_O "%6s%5d %4s%c%3s %c%4d%c   %8.3f%8.3f%8.3f%6.2f%6.2f      %-4s%2s%2s\n"
-#define PDB_ATOM_FORMAT_O_LENGTH 81
+//#define PDB_ATOM_FORMAT_O "%6s%5d %4s%c%3s %c%4d%c   %8.3f%8.3f%8.3f%6.2f%6.2f      %-4s%2s%2s\n"
+#define PDB_ATOM_FORMAT_O "%5d%8.3f%8.3f%8.3f\n"
+//#define PDB_ATOM_FORMAT_O_LENGTH 81
+#define PDB_ATOM_FORMAT_O_LENGTH 30
 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n"
 
 char Read_PDB( char*, reax_system*, control_params*,
                simulation_data*, storage*, mpi_datatypes* );
 
-char Write_PDB( reax_system*, reax_list*, simulation_data*,
+char Write_PDB( reax_system*, reax_list**, simulation_data*,
                 control_params*, mpi_datatypes*, output_controls* );
 
 #endif
diff --git a/PuReMD/src/grid.c b/PuReMD/src/grid.c
index 0064f2201695b85625dcd15db75861fe7fcb80b9..804999b437d60928ad9ac6dce010408c5ab95a74 100644
--- a/PuReMD/src/grid.c
+++ b/PuReMD/src/grid.c
@@ -30,11 +30,22 @@
 /* determines the exchange boundaries with nbrs in terms of gcells */
 void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
 {
-    int x, y, z, d;
+    int i, x, y, z, d, len;
     ivec r, nbr_coord, prdc;
     ivec send_span, recv_span;
     ivec str_send, end_send;
     ivec str_recv, end_recv;
+#if defined(NEUTRAL_TERRITORY)
+    ivec nt_str, nt_end;
+    ivec dir[6] = {
+        {0, 0, +1}, // +z
+        {0, 0, -1}, // -z
+        {0, +1, 0}, // +y
+        {+1, +1, 0}, // +x+y
+        {+1, 0, 0}, // +x
+        {+1, -1, 0}  // +x-y
+    };
+#endif
 
     /* clear all gcell type info */
     for ( x = 0; x < g->ncells[0]; x++ )
@@ -50,6 +61,42 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
                 g->cells[x][y][z].type = NATIVE;
                 ivec_MakeZero( g->cells[x][y][z].rel_box );
             }
+    
+#if defined(NEUTRAL_TERRITORY)
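+    /* mark neutral territory (NT) cells: tag the slab within vlist_span of the native region along each dir[i] as NT_NBRS + i */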
+    for ( i = 0; i < 6; ++i )
+    {
+        for ( d = 0; d < 3; ++d )
+        {
+            if ( dir[i][d] > 0 )
+            {
+                nt_str[d] = MIN( g->native_end[d], g->ncells[d] );
+                nt_end[d] = MIN( g->native_end[d] + g->vlist_span[d],
+                        g->ncells[d] );
+            }
+            else if ( dir[i][d] < 0 )
+            {
+                nt_str[d] = MAX( 0, g->native_str[d] - g->vlist_span[d] );
+                nt_end[d] = g->native_str[d];
+            }
+            else
+            {
+                nt_str[d] = g->native_str[d];
+                nt_end[d] = g->native_end[d];
+            }
+        }
+        for ( x = nt_str[0]; x < nt_end[0]; x++ )
+        {
+            for ( y = nt_str[1]; y < nt_end[1]; y++ )
+            {
+                for ( z = nt_str[2]; z < nt_end[2]; z++ )
+                {
+                    g->cells[x][y][z].type = NT_NBRS + i;
+                }
+            }
+        }
+    }
+#endif
 
     /* loop over neighbors */
     for ( r[0] = -1; r[0] <= 1; ++r[0])
@@ -136,8 +183,11 @@ void Find_Neighbor_GridCells( grid *g, control_params *control )
                 gc = &(g->cells[ci[0]][ci[1]][ci[2]]);
                 top = 0;
                 //fprintf( stderr, "grid1: %d %d %d:\n", ci[0], ci[1], ci[2] );
-
+#if defined(NEUTRAL_TERRITORY)
+                if ( gc->type == NATIVE || ( gc->type >= NT_NBRS && gc->type < NT_NBRS + 6 ) )
+#else
                 if ( gc->type == NATIVE )
+#endif
                     gc->cutoff = control->vlist_cut;
                 else gc->cutoff = control->bond_cut;
 
@@ -201,13 +251,13 @@ void Reorder_GridCells( grid *g )
     fprintf( stderr, "reordered gcells:\n" );
     for ( i = 0; i < top; ++i )
         fprintf( stderr, "order%d: %d %d %d\n",
-                 i, g->order[i][0], g->order[i][1], g->order[i][2] );
+                i, g->order[i][0], g->order[i][1], g->order[i][2] );
 #endif
 }
 
 
 void Setup_New_Grid( reax_system* system, control_params* control,
-                     MPI_Comm comm )
+        MPI_Comm comm )
 {
     int              d, i, j, k;
     grid            *g;
@@ -242,13 +292,13 @@ void Setup_New_Grid( reax_system* system, control_params* control,
         g->bond_span[d] = (int)ceil( control->bond_cut / g->cell_len[d] );
         /* span of the ghost region in terms of gcells */
         g->ghost_span[d] = (int)ceil(system->bndry_cuts.ghost_cutoff /
-                                     g->cell_len[d]);
+                g->cell_len[d]);
         g->ghost_nonb_span[d] = (int)ceil(system->bndry_cuts.ghost_nonb /
-                                          g->cell_len[d]);
+                g->cell_len[d]);
         g->ghost_hbond_span[d] = (int)ceil( system->bndry_cuts.ghost_hbond /
-                                            g->cell_len[d] );
+                g->cell_len[d] );
         g->ghost_bond_span[d] = (int)ceil( system->bndry_cuts.ghost_bond /
-                                           g->cell_len[d] );
+                g->cell_len[d] );
     }
 
     /* total number of grid cells */
@@ -262,8 +312,8 @@ void Setup_New_Grid( reax_system* system, control_params* control,
     /* upper bound on the number of gcells to be exchanged with a single nbr */
     system->gcell_cap =
         MAX3( g->native_cells[0] * g->native_cells[1] * g->ghost_span[2],
-              g->native_cells[0] * g->native_cells[2] * g->ghost_span[1],
-              g->native_cells[1] * g->native_cells[2] * g->ghost_span[0] ) + 1;
+                g->native_cells[0] * g->native_cells[2] * g->ghost_span[1],
+                g->native_cells[1] * g->native_cells[2] * g->ghost_span[0] ) + 1;
 
     /* allocate grid space */
     Allocate_Grid( system, comm );
@@ -331,9 +381,9 @@ void Update_Grid( reax_system* system, control_params* control, MPI_Comm comm )
         ghost_span[d] = (int)ceil(system->bndry_cuts.ghost_cutoff / cell_len[d]);
         ghost_nonb_span[d] = (int)ceil(system->bndry_cuts.ghost_nonb / cell_len[d]);
         ghost_hbond_span[d] = (int)ceil( system->bndry_cuts.ghost_hbond /
-                                         cell_len[d] );
+                cell_len[d] );
         ghost_bond_span[d] = (int)ceil( system->bndry_cuts.ghost_bond /
-                                        cell_len[d] );
+                cell_len[d] );
     }
 
 
@@ -418,14 +468,14 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
                 if ( atoms[l].x[d] < my_box->min[d] || atoms[l].x[d] > my_box->max[d] )
                 {
                     fprintf( stderr, "p%d: local atom%d [%f %f %f] is out of my box!\n",
-                             system->my_rank, l,
-                             atoms[l].x[0], atoms[l].x[1], atoms[l].x[2] );
+                            system->my_rank, l,
+                            atoms[l].x[0], atoms[l].x[1], atoms[l].x[2] );
                     fprintf( stderr, "p%d: orig atom id is %d!\n",
-                             system->my_rank, atoms[l].orig_id);
+                            system->my_rank, atoms[l].orig_id);
                     fprintf( stderr, "p%d: my_box=[%f-%f, %f-%f, %f-%f]\n",
-                             system->my_rank, my_box->min[0], my_box->max[0],
-                             my_box->min[1], my_box->max[1],
-                             my_box->min[2], my_box->max[2] );
+                            system->my_rank, my_box->min[0], my_box->max[0],
+                            my_box->min[1], my_box->max[1],
+                            my_box->min[2], my_box->max[2] );
                     MPI_Abort( MPI_COMM_WORLD, -1 );
                 }
 
@@ -437,10 +487,10 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
             }
 #if defined(DEBUG)
             fprintf( stderr, "p%d bin_my_atoms: l:%d - atom%d @ %.5f %.5f %.5f"\
-                     "--> cell: %d %d %d\n",
-                     system->my_rank, l, atoms[l].orig_id,
-                     atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
-                     c[0], c[1], c[2] );
+                    "--> cell: %d %d %d\n",
+                    system->my_rank, l, atoms[l].orig_id,
+                    atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
+                    c[0], c[1], c[2] );
 #endif
             gc = &( g->cells[c[0]][c[1]][c[2]] );
             gc->atoms[ gc->top++ ] = l;
@@ -460,13 +510,13 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
                     max_atoms = gc->top;
 #if defined(DEBUG)
                 fprintf( stderr, "p%d gc[%d,%d,%d]->top=%d\n",
-                         system->my_rank, i, j, k, gc->top );
+                        system->my_rank, i, j, k, gc->top );
 #endif
             }
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d max_atoms=%d, g->max_atoms=%d\n",
-             system->my_rank, max_atoms, g->max_atoms );
+            system->my_rank, max_atoms, g->max_atoms );
 #endif
     /* check if current gcell->max_atoms is safe */
     if ( max_atoms >= g->max_atoms * DANGER_ZONE )
@@ -524,7 +574,7 @@ void Reorder_My_Atoms( reax_system *system, storage *workspace )
 
 
 void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
-                         rvec *cur_min, rvec *cur_max )
+        rvec *cur_min, rvec *cur_max )
 {
     int d;
     ivec c;
@@ -540,7 +590,7 @@ void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
     }
 #if defined(DEBUG)
     fprintf( stderr, "get_bndry_gc: base=[%f %f %f] x=[%f %f %f] c=[%d %d %d]\n",
-             base[0], base[1], base[2], x[0], x[1], x[2], c[0], c[1], c[2] );
+            base[0], base[1], base[2], x[0], x[1], x[2], c[0], c[1], c[2] );
 #endif
 
     *gc = &( g->cells[c[0]][c[1]][c[2]] );
@@ -548,11 +598,11 @@ void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
     rvec_Sum( *cur_max, (*gc)->max, loosen );
 #if defined(DEBUG)
     fprintf( stderr, "get_bndry_gc: gcmin=[%f %f %f] gcmax=[%f %f %f]\n",
-             (*gc)->min[0], (*gc)->min[1], (*gc)->min[2],
-             (*gc)->max[0], (*gc)->max[1], (*gc)->max[2] );
+            (*gc)->min[0], (*gc)->min[1], (*gc)->min[2],
+            (*gc)->max[0], (*gc)->max[1], (*gc)->max[2] );
     fprintf( stderr, "get_bndry_gc: curmin=[%f %f %f] curmax=[%f %f %f]\n",
-             (*cur_min)[0], (*cur_min)[1], (*cur_min)[2],
-             (*cur_max)[0], (*cur_max)[1], (*cur_max)[2] );
+            (*cur_min)[0], (*cur_min)[1], (*cur_min)[2],
+            (*cur_max)[0], (*cur_max)[1], (*cur_max)[2] );
 #endif
 }
 
@@ -599,8 +649,8 @@ void Bin_Boundary_Atoms( reax_system *system )
     if ( !is_Within_GCell( atoms[start].x, ext_box->min, ext_box->max ) )
     {
         fprintf( stderr, "p%d: ghost atom%d [%f %f %f] is out of my box!\n",
-                 system->my_rank, start,
-                 atoms[start].x[0], atoms[start].x[1], atoms[start].x[2] );
+                system->my_rank, start,
+                atoms[start].x[0], atoms[start].x[1], atoms[start].x[2] );
         MPI_Abort( MPI_COMM_WORLD, -1 );
     }
 
@@ -613,8 +663,8 @@ void Bin_Boundary_Atoms( reax_system *system )
         if ( !is_Within_GCell( atoms[i].x, ext_box->min, ext_box->max ) )
         {
             fprintf( stderr, "p%d: ghost atom%d [%f %f %f] is out of my box!\n",
-                     system->my_rank, i,
-                     atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
+                    system->my_rank, i,
+                    atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
             MPI_Abort( MPI_COMM_WORLD, -1 );
         }
 
@@ -628,11 +678,11 @@ void Bin_Boundary_Atoms( reax_system *system )
             if ( gc->top != 0 )
             {
                 fprintf( stderr, "p%d bin_boundary_atoms: atom%d map was unexpected! ",
-                         system->my_rank, i );
+                        system->my_rank, i );
                 fprintf( stderr, "[%f %f %f] --> [%f %f %f] to [%f %f %f]\n",
-                         atoms[i].x[0], atoms[i].x[1], atoms[i].x[2],
-                         gc->min[0], gc->min[1], gc->min[2],
-                         gc->max[0], gc->max[1], gc->max[2] );
+                        atoms[i].x[0], atoms[i].x[1], atoms[i].x[2],
+                        gc->min[0], gc->min[1], gc->min[2],
+                        gc->max[0], gc->max[1], gc->max[2] );
                 MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
             }
             gc->str = i;
@@ -646,4 +696,4 @@ void Bin_Boundary_Atoms( reax_system *system )
 #if defined(DEBUG)
     fprintf( stderr, "p%d bin_boundary_atoms: done\n", system->my_rank );
 #endif
-}
+}
diff --git a/PuReMD/src/hydrogen_bonds.c b/PuReMD/src/hydrogen_bonds.c
index cba267fffa71ef7a11ac23def3c72cae92838653..699e7bc104f7a98f6bb533b90b5ae94026ba0795 100644
--- a/PuReMD/src/hydrogen_bonds.c
+++ b/PuReMD/src/hydrogen_bonds.c
@@ -39,12 +39,13 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                      simulation_data *data, storage *workspace,
                      reax_list **lists, output_controls *out_control )
 {
-    int  i, j, k, pi, pk;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, hb_start_j, hb_end_j;
-    int  hblist[MAX_BONDS];
-    int  itr, top;
-    int  num_hb_intrs = 0;
+    int i, j, k, pi, pk;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    int itr, top;
+    int num_hb_intrs = 0;
+    int nbr_jk;
     ivec rel_jk;
     real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
     real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
@@ -54,11 +55,11 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
     hbond_parameters *hbp;
     bond_order_data *bo_ij;
     bond_data *pbond_ij;
-    far_neighbor_data *nbr_jk;
-    reax_list *bonds, *hbonds;
+    reax_list *far_nbrs, *bonds, *hbonds;
     bond_data *bond_list;
     hbond_data *hbond_list;
 
+    far_nbrs = lists[FAR_NBRS];
     bonds = lists[BONDS];
     bond_list = bonds->bond_list;
     hbonds = lists[HBONDS];
@@ -102,8 +103,8 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
                 k = hbond_list[pk].nbr;
                 type_k = system->my_atoms[k].type;
                 nbr_jk = hbond_list[pk].ptr;
-                r_jk = nbr_jk->d;
-                rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+                r_jk = far_nbrs->far_nbr_list.d[nbr_jk];
+                rvec_Scale( dvec_jk, hbond_list[pk].scl, far_nbrs->far_nbr_list.dvec[nbr_jk] );
 
                 for ( itr = 0; itr < top; ++itr )
                 {
@@ -174,7 +175,8 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
 
                             rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
 
-                            ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                            ivec_Scale( rel_jk, hbond_list[pk].scl,
+                                    far_nbrs->far_nbr_list.rel_box[nbr_jk] );
                             rvec_Scale( force, +CEhb2, dcos_theta_dk );
                             rvec_Add( workspace->f[k], force );
                             rvec_iMultiply( ext_press, rel_jk, force );
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index da661aa4f42f2e3ccd893975cfd684876339edc8..b202227f2ead91caab11badd8f900b6f9c7d8be8 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -54,8 +54,8 @@
 #if defined(PURE_REAX)
 /************************ initialize system ************************/
 int Reposition_Atoms( reax_system *system, control_params *control,
-                      simulation_data *data, mpi_datatypes *mpi_data,
-                      char *msg )
+        simulation_data *data, mpi_datatypes *mpi_data,
+        char *msg )
 {
     int   i;
     rvec  dx;
@@ -130,8 +130,8 @@ void Generate_Initial_Velocities( reax_system *system, real T )
 
 
 int Init_System( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, char *msg )
 {
     int i;
     reax_atom *atom;
@@ -152,9 +152,12 @@ int Init_System( reax_system *system, control_params *control,
     for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = 0;
     system->max_recved = 0;
     system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-                          Estimate_Boundary_Atoms, Unpack_Estimate_Message, 1 );
+            Estimate_Boundary_Atoms, Unpack_Estimate_Message, 1 );
     system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
     Bin_Boundary_Atoms( system );
+#if defined(NEUTRAL_TERRITORY)
+    Estimate_NT_Atoms( system, mpi_data );
+#endif
 
     //fprintf( stderr, "p%d SEND RECV SEND!\n", system->my_rank );
     //MPI_Barrier( mpi_data->world );
@@ -177,11 +180,11 @@ int Init_System( reax_system *system, control_params *control,
     //Allocate_System( system, system->local_cap, system->total_cap, msg );
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
+            system->my_rank, system->n, system->local_cap );
     fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
+            system->my_rank, system->N, system->total_cap );
     fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
+            system->my_rank, system->numH, system->Hcap );
     MPI_Barrier( mpi_data->world );
 #endif
 
@@ -198,8 +201,8 @@ int Init_System( reax_system *system, control_params *control,
 
 /************************ initialize simulation data ************************/
 int Init_Simulation_Data( reax_system *system, control_params *control,
-                          simulation_data *data, mpi_datatypes *mpi_data,
-                          char *msg )
+        simulation_data *data, mpi_datatypes *mpi_data,
+        char *msg )
 {
     Reset_Simulation_Data( data, control->virial );
 
@@ -212,74 +215,95 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
 
     switch ( control->ensemble )
     {
-    case NVE:
-        data->N_f = 3 * system->bigN;
-        Evolve = Velocity_Verlet_NVE;
-        break;
-
-    case bNVT:
-        data->N_f = 3 * system->bigN + 1;
-        Evolve = Velocity_Verlet_Berendsen_NVT;
-        break;
-
-    case nhNVT:
-        fprintf( stderr, "WARNING: Nose-Hoover NVT is still under testing.\n" );
-        //return FAILURE;
-        data->N_f = 3 * system->bigN + 1;
-        Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-        if ( !control->restart || (control->restart && control->random_vel) )
-        {
-            data->therm.G_xi = control->Tau_T *
-                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
-            data->therm.v_xi = data->therm.G_xi * control->dt;
-            data->therm.v_xi_old = 0;
-            data->therm.xi = 0;
-        }
-        break;
-
-    case sNPT: /* Semi-Isotropic NPT */
-        data->N_f = 3 * system->bigN + 4;
-        Evolve = Velocity_Verlet_Berendsen_NPT;
-        if ( !control->restart )
-            Reset_Pressures( data );
-        break;
-
-    case iNPT: /* Isotropic NPT */
-        data->N_f = 3 * system->bigN + 2;
-        Evolve = Velocity_Verlet_Berendsen_NPT;
-        if ( !control->restart )
-            Reset_Pressures( data );
-        break;
-
-    case NPT: /* Anisotropic NPT */
-        strcpy( msg, "init_simulation_data: option not yet implemented" );
-        return FAILURE;
-
-        data->N_f = 3 * system->bigN + 9;
-        Evolve = Velocity_Verlet_Berendsen_NPT;
-        /*if( !control->restart ) {
-          data->therm.G_xi = control->Tau_T *
-          (2.0 * data->my_en.e_Kin - data->N_f * K_B * control->T );
-          data->therm.v_xi = data->therm.G_xi * control->dt;
-          data->iso_bar.eps = 0.33333 * log(system->box.volume);
-          data->inv_W = 1.0 /
-          ( data->N_f * K_B * control->T * SQR(control->Tau_P) );
-          Compute_Pressure( system, control, data, out_control );
-          }*/
-        break;
-
-    default:
-        strcpy( msg, "init_simulation_data: ensemble not recognized" );
-        return FAILURE;
+        case NVE:
+            data->N_f = 3 * system->bigN;
+            Evolve = Velocity_Verlet_NVE;
+            break;
+
+        case bNVT:
+            data->N_f = 3 * system->bigN + 1;
+            Evolve = Velocity_Verlet_Berendsen_NVT;
+            break;
+
+        case nhNVT:
+            fprintf( stderr, "WARNING: Nose-Hoover NVT is still under testing.\n" );
+            //return FAILURE;
+            data->N_f = 3 * system->bigN + 1;
+            Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+            if ( !control->restart || (control->restart && control->random_vel) )
+            {
+                data->therm.G_xi = control->Tau_T *
+                    (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->therm.v_xi_old = 0;
+                data->therm.xi = 0;
+            }
+            break;
+
+        case sNPT: /* Semi-Isotropic NPT */
+            data->N_f = 3 * system->bigN + 4;
+            Evolve = Velocity_Verlet_Berendsen_NPT;
+            if ( !control->restart )
+                Reset_Pressures( data );
+            break;
+
+        case iNPT: /* Isotropic NPT */
+            data->N_f = 3 * system->bigN + 2;
+            Evolve = Velocity_Verlet_Berendsen_NPT;
+            if ( !control->restart )
+                Reset_Pressures( data );
+            break;
+
+        case NPT: /* Anisotropic NPT */
+            strcpy( msg, "init_simulation_data: option not yet implemented" );
+            return FAILURE;
+
+            data->N_f = 3 * system->bigN + 9;
+            Evolve = Velocity_Verlet_Berendsen_NPT;
+            /*if( !control->restart ) {
+              data->therm.G_xi = control->Tau_T *
+              (2.0 * data->my_en.e_Kin - data->N_f * K_B * control->T );
+              data->therm.v_xi = data->therm.G_xi * control->dt;
+              data->iso_bar.eps = 0.33333 * log(system->box.volume);
+              data->inv_W = 1.0 /
+              ( data->N_f * K_B * control->T * SQR(control->Tau_P) );
+              Compute_Pressure( system, control, data, out_control );
+              }*/
+            break;
+
+        default:
+            strcpy( msg, "init_simulation_data: ensemble not recognized" );
+            return FAILURE;
     }
 
     /* initialize the timer(s) */
     MPI_Barrier( mpi_data->world );  // wait for everyone to come here
     if ( system->my_rank == MASTER_NODE )
     {
-        data->timing.start = Get_Time( );
+        data->timing.start = MPI_Wtime();
 #if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
+        //Reset_Timing( &data->timing );
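+        /* init timing info inline (replaces the Reset_Timing call): total starts at the start time, all other counters are reset */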
+        data->timing.total = data->timing.start;
+        data->timing.comm = ZERO;
+        data->timing.nbrs = 0;
+        data->timing.init_forces = 0;
+        data->timing.bonded = 0;
+        data->timing.nonb = 0;
+        data->timing.init_dist = ZERO;
+        data->timing.init_cm = ZERO;
+        data->timing.init_bond = ZERO;
+        data->timing.cm = ZERO;
+        data->timing.cm_sort = ZERO;
+        data->timing.cm_solver_comm = ZERO;
+        data->timing.cm_solver_allreduce = ZERO;
+        data->timing.cm_solver_pre_comp = ZERO;
+        data->timing.cm_solver_pre_app = ZERO;
+        data->timing.cm_solver_iters = 0;
+        data->timing.cm_solver_spmv = ZERO;
+        data->timing.cm_solver_vector_ops = ZERO;
+        data->timing.cm_solver_orthog = ZERO;
+        data->timing.cm_solver_tri_solve = ZERO;
 #endif
     }
 
@@ -314,11 +338,11 @@ int Init_System( reax_system *system, control_params *control, char *msg )
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
+            system->my_rank, system->n, system->local_cap );
     fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
+            system->my_rank, system->N, system->total_cap );
     fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
+            system->my_rank, system->numH, system->Hcap );
 #endif
 
     return SUCCESS;
@@ -326,16 +350,37 @@ int Init_System( reax_system *system, control_params *control, char *msg )
 
 
 int Init_Simulation_Data( reax_system *system, control_params *control,
-                          simulation_data *data, char *msg )
+        simulation_data *data, char *msg )
 {
     Reset_Simulation_Data( data, control->virial );
 
     /* initialize the timer(s) */
     if ( system->my_rank == MASTER_NODE )
     {
-        data->timing.start = Get_Time( );
+        data->timing.start = MPI_Wtime();
 #if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
+        //Reset_Timing( &data->timing );
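+        /* init timing info inline (replaces the Reset_Timing call): total starts at the start time, all other counters are reset */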
+        data->timing.total = data->timing.start;
+        data->timing.comm = ZERO;
+        data->timing.nbrs = 0;
+        data->timing.init_forces = 0;
+        data->timing.bonded = 0;
+        data->timing.nonb = 0;
+        data->timing.init_dist = ZERO;
+        data->timing.init_cm = ZERO;
+        data->timing.init_bond = ZERO;
+        data->timing.cm = ZERO;
+        data->timing.cm_sort = ZERO;
+        data->timing.cm_solver_comm = ZERO;
+        data->timing.cm_solver_allreduce = ZERO;
+        data->timing.cm_solver_pre_comp = ZERO;
+        data->timing.cm_solver_pre_app = ZERO;
+        data->timing.cm_solver_iters = 0;
+        data->timing.cm_solver_spmv = ZERO;
+        data->timing.cm_solver_vector_ops = ZERO;
+        data->timing.cm_solver_orthog = ZERO;
+        data->timing.cm_solver_tri_solve = ZERO;
 #endif
     }
 
@@ -385,17 +430,17 @@ void Init_Taper( control_params *control,  storage *workspace, MPI_Comm comm )
     workspace->Tap[2] = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
     workspace->Tap[1] = 140.0 * swa3 * swb3 / d7;
     workspace->Tap[0] = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 +
-                         7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
+            7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
 }
 
 
 int Init_Workspace( reax_system *system, control_params *control,
-                    storage *workspace, MPI_Comm comm, char *msg )
+        storage *workspace, MPI_Comm comm, char *msg )
 {
     int ret;
 
     ret = Allocate_Workspace( system, control, workspace,
-                              system->local_cap, system->total_cap, comm, msg );
+            system->local_cap, system->total_cap, comm, msg );
     if ( ret != SUCCESS )
         return ret;
 
@@ -411,7 +456,7 @@ int Init_Workspace( reax_system *system, control_params *control,
 
 /************** setup communication data structures  **************/
 int Init_MPI_Datatypes( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data, MPI_Comm comm, char *msg )
+        mpi_datatypes *mpi_data, MPI_Comm comm, char *msg )
 {
 #if defined(PURE_REAX)
     int           i, block[11];
@@ -432,9 +477,15 @@ int Init_MPI_Datatypes( reax_system *system, storage *workspace,
     /* init mpi buffers  */
     mpi_data->in1_buffer = NULL;
     mpi_data->in2_buffer = NULL;
+#if defined(NEUTRAL_TERRITORY)
+    for ( i = 0; i < REAX_MAX_NT_NBRS; ++i )
+    {
+        mpi_data->in_nt_buffer[i] = NULL;
+    }
+#endif
 
     /* mpi_atom - [orig_id, imprt_id, type, num_bonds, num_hbonds, name,
-                   x, v, f_old, s, t] */
+       x, v, f_old, s, t] */
     block[0] = block[1] = block[2] = block[3] = block[4] = 1;
     block[5] = 8;
     block[6] = block[7] = block[8] = 3;
@@ -529,74 +580,95 @@ int Init_MPI_Datatypes( reax_system *system, storage *workspace,
 /********************** allocate lists *************************/
 #if defined(PURE_REAX)
 int  Init_Lists( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
 {
-    int i, num_nbrs;
+    int i, num_nbrs, far_nbr_list_format, cm_format, matrix_dim;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
     int *hb_top, *bond_top;
     MPI_Comm comm;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: before est_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     comm = mpi_data->world;
+
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        far_nbr_list_format = FULL_LIST;
+        cm_format = SYM_FULL_MATRIX;
+    }
+    else
+    {
+#if defined(NEUTRAL_TERRITORY)
+        far_nbr_list_format = FULL_LIST;
+        cm_format = SYM_HALF_MATRIX;
+#else
+        far_nbr_list_format = HALF_LIST;
+        cm_format = SYM_HALF_MATRIX;
+#endif
+    }
+
     //for( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = system->my_nbrs[i].est_recv;
     //system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
     //        Sort_Boundary_Atoms, Unpack_Exchange_Message, 1 );
-    num_nbrs = Estimate_NumNeighbors( system, lists );
+
+    num_nbrs = Estimate_NumNeighbors( system, lists, far_nbr_list_format );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: after est_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     if ( !Make_List( system->total_cap, num_nbrs, TYP_FAR_NEIGHBOR,
-                lists[FAR_NBRS], comm ) )
+                far_nbr_list_format, lists[FAR_NBRS], comm ) )
     {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
+
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
-             system->my_rank, num_nbrs,
-             (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
+            system->my_rank, num_nbrs,
+            (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
 #endif
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: before gen_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     Generate_Neighbor_Lists( system, data, workspace, lists );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: after gen_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     bond_top = (int*) calloc( system->total_cap, sizeof(int) );
     hb_top = (int*) calloc( system->local_cap, sizeof(int) );
     //bond_top = (int*) malloc( system->total_cap * sizeof(int) );
     //hb_top = (int*) malloc( system->local_cap * sizeof(int) );
-    Estimate_Storages( system, control, lists,
-                       &Htop, hb_top, bond_top, &num_3body, comm );
+    
+    Estimate_Storages( system, control, lists, &Htop, hb_top, 
+            bond_top, &num_3body, comm, &matrix_dim, cm_format );
 
-    Allocate_Matrix( &(workspace->H), system->local_cap, Htop, comm );
+#if defined(NEUTRAL_TERRITORY)
+    Allocate_Matrix( &workspace->H, matrix_dim, Htop, cm_format, comm );
+#else
+    Allocate_Matrix( &workspace->H, system->local_cap, Htop, cm_format, comm );
+#endif
     workspace->L = NULL;
     workspace->U = NULL;
-    
-    //TODO: uncomment for SAI
-//    Allocate_Matrix( &(workspace->H_spar_patt), workspace->H->n, workspace->H->m );
-//    Allocate_Matrix( &(workspace->H_spar_patt_full), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
-//    Allocate_Matrix( &(workspace->H_app_inv), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
+    workspace->H_spar_patt = NULL;
+    workspace->H_app_inv = NULL;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
-             system->my_rank, Htop,
-             (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+            system->my_rank, Htop,
+            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
 #endif
 
     if ( control->hbond_cut > 0 )
@@ -611,15 +683,15 @@ int  Init_Lists( reax_system *system, control_params *control,
         total_hbonds = MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS );
 
         if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND,
-                         lists[HBONDS], comm ) )
+                    HALF_LIST, lists[HBONDS], comm ) )
         {
             fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
             MPI_Abort( comm, INSUFFICIENT_MEMORY );
         }
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                 system->my_rank, total_hbonds,
-                 (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+                system->my_rank, total_hbonds,
+                (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
 #endif
     }
 
@@ -635,50 +707,50 @@ int  Init_Lists( reax_system *system, control_params *control,
     bond_cap = MAX( total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS );
 
     if ( !Make_List( system->total_cap, bond_cap, TYP_BOND,
-                     lists[BONDS], comm ) )
+                HALF_LIST, lists[BONDS], comm ) )
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-             system->my_rank, bond_cap,
-             (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
+            system->my_rank, bond_cap,
+            (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
 #endif
 
     /* 3bodies list */
     cap_3body = MAX( num_3body * SAFE_ZONE, MIN_3BODIES );
     if ( !Make_List( bond_cap, cap_3body, TYP_THREE_BODY,
-                     lists[THREE_BODIES], comm ) )
+                HALF_LIST, lists[THREE_BODIES], comm ) )
     {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated 3-body list: num_3body=%d, space=%dMB\n",
-             system->my_rank, cap_3body,
-             (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
+            system->my_rank, cap_3body,
+            (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
 #endif
 
 #if defined(TEST_FORCES)
     if ( !Make_List( system->total_cap, bond_cap * 8, TYP_DDELTA,
-                     lists[DDELTAS], comm ) )
+                HALF_LIST, lists[DDELTAS], comm ) )
     {
         fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dDelta list: num_ddelta=%d space=%ldMB\n",
-             system->my_rank, bond_cap * 30,
-             bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * 30,
+            bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
 
-    if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, lists[DBOS], comm ) )
+    if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, HALF_LIST, lists[DBOS], comm ) )
     {
         fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dbond list: num_dbonds=%d space=%ldMB\n",
-             system->my_rank, bond_cap * MAX_BONDS * 3,
-             bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * MAX_BONDS * 3,
+            bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
     sfree( hb_top, "hb_top" );
@@ -686,22 +758,26 @@ int  Init_Lists( reax_system *system, control_params *control,
 
     return SUCCESS;
 }
+
+
 #elif defined(LAMMPS_REAX)
 int  Init_Lists( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
 {
-    int i, num_nbrs;
+    int i, num_nbrs, matrix_dim;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
     int *hb_top, *bond_top;
     int nrecv[MAX_NBRS];
     MPI_Comm comm;
 
     comm = mpi_data->world;
+
     bond_top = (int*) calloc( system->total_cap, sizeof(int) );
     hb_top = (int*) calloc( system->local_cap, sizeof(int) );
-    Estimate_Storages( system, control, lists,
-                       &Htop, hb_top, bond_top, &num_3body, comm );
+    //TODO: add one parameter at the end for charge matrix format - half or full
+    Estimate_Storages( system, control, lists, &Htop, hb_top, 
+            bond_top, &num_3body, comm, &matrix_dim );
 
     if ( control->hbond_cut > 0 )
     {
@@ -715,15 +791,15 @@ int  Init_Lists( reax_system *system, control_params *control,
         total_hbonds = (int)(MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS ));
 
         if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND,
-                         lists[HBONDS], comm ) )
+                    HALF_LIST, lists[HBONDS], comm ) )
         {
             fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
             MPI_Abort( comm, INSUFFICIENT_MEMORY );
         }
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                 system->my_rank, total_hbonds,
-                 (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+                system->my_rank, total_hbonds,
+                (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
 #endif
     }
 
@@ -739,50 +815,50 @@ int  Init_Lists( reax_system *system, control_params *control,
     bond_cap = (int)(MAX( total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS ));
 
     if ( !Make_List( system->total_cap, bond_cap, TYP_BOND,
-                     lists[BONDS], comm ) )
+                HALF_LIST, lists[BONDS], comm ) )
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-             system->my_rank, bond_cap,
-             (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
+            system->my_rank, bond_cap,
+            (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
 #endif
 
     /* 3bodies list */
     cap_3body = (int)(MAX( num_3body * SAFE_ZONE, MIN_3BODIES ));
     if ( !Make_List( bond_cap, cap_3body, TYP_THREE_BODY,
-                     lists[THREE_BODIES], comm ) )
+                HALF_LIST, lists[THREE_BODIES], comm ) )
     {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated 3-body list: num_3body=%d, space=%dMB\n",
-             system->my_rank, cap_3body,
-             (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
+            system->my_rank, cap_3body,
+            (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
 #endif
 
 #if defined(TEST_FORCES)
     if ( !Make_List( system->total_cap, bond_cap * 8, TYP_DDELTA,
-                     lists[DDELTAS], comm ) )
+                HALF_LIST, lists[DDELTAS], comm ) )
     {
         fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dDelta list: num_ddelta=%d space=%ldMB\n",
-             system->my_rank, bond_cap * 30,
-             bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * 30,
+            bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
 
-    if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, lists[DBOS], comm ) )
+    if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, HALF_LIST, lists[DBOS], comm ) )
     {
         fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dbond list: num_dbonds=%d space=%ldMB\n",
-             system->my_rank, bond_cap * MAX_BONDS * 3,
-             bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * MAX_BONDS * 3,
+            bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
     sfree( hb_top, "hb_top" );
@@ -796,9 +872,9 @@ int  Init_Lists( reax_system *system, control_params *control,
 
 #if defined(PURE_REAX)
 void Initialize( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control,
-                 mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     char msg[MAX_STR];
 
@@ -806,9 +882,9 @@ void Initialize( reax_system *system, control_params *control,
             FAILURE )
     {
         fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -819,7 +895,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -830,7 +906,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -841,9 +917,9 @@ void Initialize( reax_system *system, control_params *control,
             FAILURE )
     {
         fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -855,7 +931,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -866,7 +942,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -879,7 +955,7 @@ void Initialize( reax_system *system, control_params *control,
         {
             fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
             fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
+                    system->my_rank );
             MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
         }
 #if defined(DEBUG)
@@ -894,14 +970,14 @@ void Initialize( reax_system *system, control_params *control,
     /*#ifdef TEST_FORCES
       Init_Force_Test_Functions();
       fprintf(stderr,"p%d: initialized force test functions\n",system->my_rank);
-      #endif */
+#endif */
 }
 
 #elif defined(LAMMPS_REAX)
 void Initialize( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control,
-                 mpi_datatypes *mpi_data, MPI_Comm comm )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data, MPI_Comm comm )
 {
     char msg[MAX_STR];
 
@@ -909,9 +985,9 @@ void Initialize( reax_system *system, control_params *control,
     if ( Init_MPI_Datatypes(system, workspace, mpi_data, comm, msg) == FAILURE )
     {
         fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -922,7 +998,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -933,7 +1009,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -944,9 +1020,9 @@ void Initialize( reax_system *system, control_params *control,
             FAILURE )
     {
         fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -958,7 +1034,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -969,7 +1045,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -982,7 +1058,7 @@ void Initialize( reax_system *system, control_params *control,
         {
             fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
             fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
+                    system->my_rank );
             MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
         }
 #if defined(DEBUG)
@@ -999,5 +1075,5 @@ void Initialize( reax_system *system, control_params *control,
       Init_Force_Test_Functions();
       fprintf(stderr,"p%d: initialized force test functions\n",system->my_rank);
 #endif*/
-    }
+}
 #endif
diff --git a/PuReMD/src/integrate.c b/PuReMD/src/integrate.c
index 3672651c01b27cb6b96a77bf94624278d899901a..6b9d6b5ae11cda6c847f604d3a1f202927afbd6c 100644
--- a/PuReMD/src/integrate.c
+++ b/PuReMD/src/integrate.c
@@ -51,6 +51,22 @@ void Velocity_Verlet_NVE( reax_system* system, control_params* control,
     dt_sqr = SQR(dt);
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        /* HACK: currently required that preconditioner (re)computation step
+         * and reneighbor step (i.e., (re)construct far nbr list)
+         * are the same value, so use reneighbor for now */
+//        if ( renbr )
+//        {
+//            lists[FAR_NBRS]->format = FULL_LIST;
+//            workspace->H->format = SYM_FULL_MATRIX;
+//        }
+//        else
+//        {
+//            lists[FAR_NBRS]->format = HALF_LIST;
+//            workspace->H->format = SYM_HALF_MATRIX;
+//        }
+    }
 
     for ( i = 0; i < system->n; i++ )
     {
@@ -114,6 +130,22 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
     therm = &( data->therm );
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        /* HACK: currently required that preconditioner (re)computation step
+         * and reneighbor step (i.e., (re)construct far nbr list)
+         * are the same value, so use reneighbor for now */
+//        if ( renbr )
+//        {
+//            lists[FAR_NBRS]->format = FULL_LIST;
+//            workspace->H->format = SYM_FULL_MATRIX;
+//        }
+//        else
+//        {
+//            lists[FAR_NBRS]->format = HALF_LIST;
+//            workspace->H->format = SYM_HALF_MATRIX;
+//        }
+    }
 
     for ( i = 0; i < system->n; i++ )
     {
@@ -209,6 +241,22 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        /* HACK: currently required that preconditioner (re)computation step
+         * and reneighbor step (i.e., (re)construct far nbr list)
+         * are the same value, so use reneighbor for now */
+//        if ( renbr )
+//        {
+//            lists[FAR_NBRS]->format = FULL_LIST;
+//            workspace->H->format = SYM_FULL_MATRIX;
+//        }
+//        else
+//        {
+//            lists[FAR_NBRS]->format = HALF_LIST;
+//            workspace->H->format = SYM_HALF_MATRIX;
+//        }
+    }
 
     /* velocity verlet, 1st part */
     for ( i = 0; i < system->n; i++ )
@@ -300,6 +348,22 @@ void Velocity_Verlet_Berendsen_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        /* HACK: currently required that preconditioner (re)computation step
+         * and reneighbor step (i.e., (re)construct far nbr list)
+         * are the same value, so use reneighbor for now */
+//        if ( renbr )
+//        {
+//            lists[FAR_NBRS]->format = FULL_LIST;
+//            workspace->H->format = SYM_FULL_MATRIX;
+//        }
+//        else
+//        {
+//            lists[FAR_NBRS]->format = HALF_LIST;
+//            workspace->H->format = SYM_HALF_MATRIX;
+//        }
+    }
 
     /* velocity verlet, 1st part */
     for ( i = 0; i < system->n; i++ )
diff --git a/PuReMD/src/io_tools.c b/PuReMD/src/io_tools.c
index 7f3c4c02bf04f0836169bd6048b6bd8f2b0a1846..1c878f6efa665cd7d79fbde4c87424dbf66645d7 100644
--- a/PuReMD/src/io_tools.c
+++ b/PuReMD/src/io_tools.c
@@ -44,8 +44,8 @@ print_interaction Print_Interactions[NUM_INTRS];
 
 /************************ initialize output controls ************************/
 int Init_Output_Files( reax_system *system, control_params *control,
-                       output_controls *out_control, mpi_datatypes *mpi_data,
-                       char *msg )
+        output_controls *out_control, mpi_datatypes *mpi_data,
+        char *msg )
 {
     char temp[MAX_STR];
     int ret;
@@ -64,65 +64,46 @@ int Init_Output_Files( reax_system *system, control_params *control,
         {
             /* init out file */
             sprintf( temp, "%s.out", control->sim_name );
-            if ( (out_control->out = fopen( temp, "w" )) != NULL )
-            {
+            out_control->out = sfopen( temp, "w", "Init_Output_Files" );
 #if !defined(DEBUG) && !defined(DEBUG_FOCUS)
-                fprintf( out_control->out, "%-6s%14s%14s%14s%11s%13s%13s\n",
-                         "step", "total energy", "potential", "kinetic",
-                         "T(K)", "V(A^3)", "P(Gpa)" );
+            fprintf( out_control->out, "%-6s%14s%14s%14s%11s%13s%13s\n",
+                    "step", "total energy", "potential", "kinetic",
+                    "T(K)", "V(A^3)", "P(Gpa)" );
 #else
-                fprintf( out_control->out, "%-6s%24s%24s%24s%13s%16s%13s\n",
-                         "step", "total energy", "potential", "kinetic",
-                         "T(K)", "V(A^3)", "P(GPa)" );
+            fprintf( out_control->out, "%-6s%24s%24s%24s%13s%16s%13s\n",
+                    "step", "total energy", "potential", "kinetic",
+                    "T(K)", "V(A^3)", "P(GPa)" );
 #endif
-                fflush( out_control->out );
-            }
-            else
-            {
-                strcpy( msg, "init_out_controls: .out file could not be opened\n" );
-                return FAILURE;
-            }
+            fflush( out_control->out );
 
             /* init potentials file */
             sprintf( temp, "%s.pot", control->sim_name );
-            if ( (out_control->pot = fopen( temp, "w" )) != NULL )
-            {
+            out_control->pot = sfopen( temp, "w", "Init_Output_Files" );
 #if !defined(DEBUG) && !defined(DEBUG_FOCUS)
-                fprintf( out_control->pot,
-                         "%-6s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s\n",
-                         "step", "ebond", "eatom", "elp",
-                         "eang", "ecoa", "ehb", "etor", "econj",
-                         "evdw", "ecoul", "epol" );
+            fprintf( out_control->pot,
+                    "%-6s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s\n",
+                    "step", "ebond", "eatom", "elp",
+                    "eang", "ecoa", "ehb", "etor", "econj",
+                    "evdw", "ecoul", "epol" );
 #else
-                fprintf( out_control->pot,
-                         "%-6s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s\n",
-                         "step", "ebond", "eatom", "elp",
-                         "eang", "ecoa", "ehb", "etor", "econj",
-                         "evdw", "ecoul", "epol" );
+            fprintf( out_control->pot,
+                    "%-6s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s\n",
+                    "step", "ebond", "eatom", "elp",
+                    "eang", "ecoa", "ehb", "etor", "econj",
+                    "evdw", "ecoul", "epol" );
 #endif
-                fflush( out_control->pot );
-            }
-            else
-            {
-                strcpy( msg, "init_out_controls: .pot file could not be opened\n" );
-                return FAILURE;
-            }
+            fflush( out_control->pot );
 
             /* init log file */
 #if defined(LOG_PERFORMANCE)
             sprintf( temp, "%s.log", control->sim_name );
-            if ( (out_control->log = fopen( temp, "w" )) != NULL )
-            {
-                fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s\n",
-                         "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
-                         "qeq", "matvecs" );
-                fflush( out_control->log );
-            }
-            else
-            {
-                strcpy( msg, "init_out_controls: .log file could not be opened\n" );
-                return FAILURE;
-            }
+            out_control->log = sfopen( temp, "w", "Init_Output_Files" );
+            fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+                    "step", "total", "comm", "neighbors", "init",
+                    "init_dist", "init_cm", "init_bond", "bonded", "nonbonded",  
+                    "cm", "cm_sort", "s_iters", "pre_comp", "pre_app", "s_comm", "s_allr",
+                    "s_spmv", "s_vec_ops", "s_orthog", "s_tsolve" );
+            fflush( out_control->log );
 #endif
         }
 
@@ -132,52 +113,31 @@ int Init_Output_Files( reax_system *system, control_params *control,
                 control->ensemble == sNPT )
         {
             sprintf( temp, "%s.prs", control->sim_name );
-            if ( (out_control->prs = fopen( temp, "w" )) != NULL )
-            {
-                fprintf(out_control->prs, "%8s%13s%13s%13s%13s%13s%13s%13s\n",
-                        "step", "Pint/norm[x]", "Pint/norm[y]", "Pint/norm[z]",
-                        "Pext/Ptot[x]", "Pext/Ptot[y]", "Pext/Ptot[z]", "Pkin/V" );
-                fflush( out_control->prs );
-            }
-            else
-            {
-                strcpy(msg, "init_out_controls: .prs file couldn't be opened\n");
-                return FAILURE;
-            }
+            out_control->prs = sfopen( temp, "w", "Init_Output_Files" );
+            fprintf(out_control->prs, "%8s%13s%13s%13s%13s%13s%13s%13s\n",
+                    "step", "Pint/norm[x]", "Pint/norm[y]", "Pint/norm[z]",
+                    "Pext/Ptot[x]", "Pext/Ptot[y]", "Pext/Ptot[z]", "Pkin/V" );
+            fflush( out_control->prs );
         }
 
         /* init electric dipole moment analysis file */
         if ( control->dipole_anal )
         {
             sprintf( temp, "%s.dpl", control->sim_name );
-            if ( (out_control->dpl = fopen( temp, "w" )) != NULL )
-            {
-                fprintf( out_control->dpl, "%6s%20s%30s",
-                         "step", "molecule count", "avg dipole moment norm" );
-                fflush( out_control->dpl );
-            }
-            else
-            {
-                strcpy(msg, "init_out_controls: .dpl file couldn't be opened\n");
-                return FAILURE;
-            }
+            out_control->dpl = sfopen( temp, "w", "Init_Output_Files" );
+            fprintf( out_control->dpl, "%6s%20s%30s",
+                    "step", "molecule count", "avg dipole moment norm" );
+            fflush( out_control->dpl );
         }
 
         /* init diffusion coef analysis file */
         if ( control->diffusion_coef )
         {
             sprintf( temp, "%s.drft", control->sim_name );
-            if ( (out_control->drft = fopen( temp, "w" )) != NULL )
-            {
-                fprintf( out_control->drft, "%7s%20s%20s\n",
-                         "step", "type count", "avg disp^2" );
-                fflush( out_control->drft );
-            }
-            else
-            {
-                strcpy(msg, "init_out_controls: .drft file couldn't be opened\n");
-                return FAILURE;
-            }
+            out_control->drft = sfopen( temp, "w", "Init_Output_Files" );
+            fprintf( out_control->drft, "%7s%20s%20s\n",
+                    "step", "type count", "avg disp^2" );
+            fflush( out_control->drft );
         }
     }
 
@@ -188,11 +148,9 @@ int Init_Output_Files( reax_system *system, control_params *control,
        fashion controlled by their rank */
     /*if( control->molecular_analysis ) {
       if( system->my_rank == MASTER_NODE ) {
-        sprintf( temp, "%s.mol", control->sim_name );
-        if( (out_control->mol = fopen( temp, "w" )) == NULL ) {
-    strcpy(msg,"init_out_controls: .mol file could not be opened\n");
-    return FAILURE;
-        }
+      sprintf( temp, "%s.mol", control->sim_name );
+      out_control->mol = sfopen( temp, "w", "Init_Output_Files" );
+      }
       }
 
       MPI_Bcast( &(out_control->mol), 1, MPI_LONG, 0, MPI_COMM_WORLD );
@@ -202,233 +160,121 @@ int Init_Output_Files( reax_system *system, control_params *control,
 #ifdef TEST_ENERGY
     /* open bond energy file */
     sprintf( temp, "%s.ebond.%d", control->sim_name, system->my_rank );
-    if ( (out_control->ebond = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .ebond file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->ebond = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open lone-pair energy file */
     sprintf( temp, "%s.elp.%d", control->sim_name, system->my_rank );
-    if ( (out_control->elp = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .elp file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->elp = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open overcoordination energy file */
     sprintf( temp, "%s.eov.%d", control->sim_name, system->my_rank );
-    if ( (out_control->eov = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .eov file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->eov = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open undercoordination energy file */
     sprintf( temp, "%s.eun.%d", control->sim_name, system->my_rank );
-    if ( (out_control->eun = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .eun file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->eun = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open angle energy file */
     sprintf( temp, "%s.eval.%d", control->sim_name, system->my_rank );
-    if ( (out_control->eval = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .eval file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->eval = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open coalition energy file */
     sprintf( temp, "%s.ecoa.%d", control->sim_name, system->my_rank );
-    if ( (out_control->ecoa = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .ecoa file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->ecoa = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open penalty energy file */
     sprintf( temp, "%s.epen.%d", control->sim_name, system->my_rank );
-    if ( (out_control->epen = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .epen file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->epen = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open torsion energy file */
     sprintf( temp, "%s.etor.%d", control->sim_name, system->my_rank );
-    if ( (out_control->etor = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .etor file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->etor = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open conjugation energy file */
     sprintf( temp, "%s.econ.%d", control->sim_name, system->my_rank );
-    if ( (out_control->econ = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .econ file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->econ = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open hydrogen bond energy file */
     sprintf( temp, "%s.ehb.%d", control->sim_name, system->my_rank );
-    if ( (out_control->ehb = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .ehb file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->ehb = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open vdWaals energy file */
     sprintf( temp, "%s.evdw.%d", control->sim_name, system->my_rank );
-    if ( (out_control->evdw = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .evdw file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->evdw = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open coulomb energy file */
     sprintf( temp, "%s.ecou.%d", control->sim_name, system->my_rank );
-    if ( (out_control->ecou = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .ecou file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->ecou = sfopen( temp, "w", "Init_Output_Files" );
 #endif
 
 
 #ifdef TEST_FORCES
     /* open bond orders file */
     sprintf( temp, "%s.fbo.%d", control->sim_name, system->my_rank );
-    if ( (out_control->fbo = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .fbo file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->fbo = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open bond orders derivatives file */
     sprintf( temp, "%s.fdbo.%d", control->sim_name, system->my_rank );
-    if ( (out_control->fdbo = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .fdbo file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->fdbo = sfopen( temp, "w", "Init_Output_Files" );
 
     /* produce a single force file - to be written by p0 */
     if ( system->my_rank == MASTER_NODE )
     {
         /* open bond forces file */
         sprintf( temp, "%s.fbond", control->sim_name );
-        if ( (out_control->fbond = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fbond file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fbond = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open lone-pair forces file */
         sprintf( temp, "%s.flp", control->sim_name );
-        if ( (out_control->flp = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .flp file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->flp = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open overcoordination forces file */
         sprintf( temp, "%s.fov", control->sim_name );
-        if ( (out_control->fov = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fov file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fov = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open undercoordination forces file */
         sprintf( temp, "%s.fun", control->sim_name );
-        if ( (out_control->fun = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fun file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fun = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open angle forces file */
         sprintf( temp, "%s.fang", control->sim_name );
-        if ( (out_control->fang = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fang file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fang = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open coalition forces file */
         sprintf( temp, "%s.fcoa", control->sim_name );
-        if ( (out_control->fcoa = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fcoa file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fcoa = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open penalty forces file */
         sprintf( temp, "%s.fpen", control->sim_name );
-        if ( (out_control->fpen = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fpen file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fpen = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open torsion forces file */
         sprintf( temp, "%s.ftor", control->sim_name );
-        if ( (out_control->ftor = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .ftor file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->ftor = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open conjugation forces file */
         sprintf( temp, "%s.fcon", control->sim_name );
-        if ( (out_control->fcon = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fcon file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fcon = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open hydrogen bond forces file */
         sprintf( temp, "%s.fhb", control->sim_name );
-        if ( (out_control->fhb = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fhb file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fhb = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open vdw forces file */
         sprintf( temp, "%s.fvdw", control->sim_name );
-        if ( (out_control->fvdw = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fvdw file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fvdw = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open nonbonded forces file */
         sprintf( temp, "%s.fele", control->sim_name );
-        if ( (out_control->fele = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fele file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fele = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open total force file */
         sprintf( temp, "%s.ftot", control->sim_name );
-        if ( (out_control->ftot = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .ftot file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->ftot = sfopen( temp, "w", "Init_Output_Files" );
 
         /* open force comprison file */
         sprintf( temp, "%s.fcomp", control->sim_name );
-        if ( (out_control->fcomp = fopen( temp, "w" )) == NULL )
-        {
-            strcpy(msg, "Init_Out_Files: .fcomp file couldn't be opened\n");
-            return FAILURE;
-        }
+        out_control->fcomp = sfopen( temp, "w", "Init_Output_Files" );
     }
 #endif
 
@@ -436,27 +282,15 @@ int Init_Output_Files( reax_system *system, control_params *control,
 #if defined(TEST_FORCES) || defined(TEST_ENERGY)
     /* open far neighbor list file */
     sprintf( temp, "%s.far_nbrs_list.%d", control->sim_name, system->my_rank );
-    if ( (out_control->flist = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .far_nbrs_list file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->flist = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open bond list file */
     sprintf( temp, "%s.bond_list.%d", control->sim_name, system->my_rank );
-    if ( (out_control->blist = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .bond_list file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->blist = sfopen( temp, "w", "Init_Output_Files" );
 
     /* open near neighbor list file */
     sprintf( temp, "%s.near_nbrs_list.%d", control->sim_name, system->my_rank );
-    if ( (out_control->nlist = fopen( temp, "w" )) == NULL )
-    {
-        strcpy(msg, "Init_Out_Files: .near_nbrs_list file couldn't be opened\n");
-        return FAILURE;
-    }
+    out_control->nlist = sfopen( temp, "w", "Init_Output_Files" );
 #endif
 #endif
 
@@ -466,7 +300,7 @@ int Init_Output_Files( reax_system *system, control_params *control,
 
 /************************ close output files ************************/
 int Close_Output_Files( reax_system *system, control_params *control,
-                        output_controls *out_control, mpi_datatypes *mpi_data )
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     if ( out_control->write_steps > 0 )
         End_Traj( system->my_rank, out_control );
@@ -475,65 +309,68 @@ int Close_Output_Files( reax_system *system, control_params *control,
     {
         if ( out_control->energy_update_freq > 0 )
         {
-            fclose( out_control->out );
-            fclose( out_control->pot );
+            sfclose( out_control->out, "Close_Output_Files" );
+            sfclose( out_control->pot, "Close_Output_Files" );
 #if defined(LOG_PERFORMANCE)
-            fclose( out_control->log );
+            sfclose( out_control->log, "Close_Output_Files" );
 #endif
         }
 
         if ( control->ensemble == NPT || control->ensemble == iNPT ||
                 control->ensemble == sNPT )
-            fclose( out_control->prs );
+            sfclose( out_control->prs, "Close_Output_Files" );
 
-        if ( control->dipole_anal ) fclose( out_control->dpl );
-        if ( control->diffusion_coef ) fclose( out_control->drft );
-        if ( control->molecular_analysis ) fclose( out_control->mol );
+        if ( control->dipole_anal )
+            sfclose( out_control->dpl, "Close_Output_Files" );
+        if ( control->diffusion_coef )
+            sfclose( out_control->drft, "Close_Output_Files" );
+        if ( control->molecular_analysis )
+            sfclose( out_control->mol, "Close_Output_Files" );
     }
 
 #ifdef TEST_ENERGY
-    fclose( out_control->ebond );
-    fclose( out_control->elp );
-    fclose( out_control->eov );
-    fclose( out_control->eun );
-    fclose( out_control->eval );
-    fclose( out_control->epen );
-    fclose( out_control->ecoa );
-    fclose( out_control->ehb );
-    fclose( out_control->etor );
-    fclose( out_control->econ );
-    fclose( out_control->evdw );
-    fclose( out_control->ecou );
+    sfclose( out_control->ebond, "Close_Output_Files" );
+    sfclose( out_control->elp, "Close_Output_Files" );
+    sfclose( out_control->eov, "Close_Output_Files" );
+    sfclose( out_control->eun, "Close_Output_Files" );
+    sfclose( out_control->eval, "Close_Output_Files" );
+    sfclose( out_control->epen, "Close_Output_Files" );
+    sfclose( out_control->ecoa, "Close_Output_Files" );
+    sfclose( out_control->ehb, "Close_Output_Files" );
+    sfclose( out_control->etor, "Close_Output_Files" );
+    sfclose( out_control->econ, "Close_Output_Files" );
+    sfclose( out_control->evdw, "Close_Output_Files" );
+    sfclose( out_control->ecou, "Close_Output_Files" );
 #endif
 
 #ifdef TEST_FORCES
-    fclose( out_control->fbo );
-    fclose( out_control->fdbo );
+    sfclose( out_control->fbo, "Close_Output_Files" );
+    sfclose( out_control->fdbo, "Close_Output_Files" );
 
     if ( system->my_rank == MASTER_NODE )
     {
-        fclose( out_control->fbond );
-        fclose( out_control->flp );
-        fclose( out_control->fov );
-        fclose( out_control->fun );
-        fclose( out_control->fang );
-        fclose( out_control->fcoa );
-        fclose( out_control->fpen );
-        fclose( out_control->ftor );
-        fclose( out_control->fcon );
-        fclose( out_control->fhb );
-        fclose( out_control->fvdw );
-        fclose( out_control->fele );
-        fclose( out_control->ftot );
-        fclose( out_control->fcomp );
+        sfclose( out_control->fbond, "Close_Output_Files" );
+        sfclose( out_control->flp, "Close_Output_Files" );
+        sfclose( out_control->fov, "Close_Output_Files" );
+        sfclose( out_control->fun, "Close_Output_Files" );
+        sfclose( out_control->fang, "Close_Output_Files" );
+        sfclose( out_control->fcoa, "Close_Output_Files" );
+        sfclose( out_control->fpen, "Close_Output_Files" );
+        sfclose( out_control->ftor, "Close_Output_Files" );
+        sfclose( out_control->fcon, "Close_Output_Files" );
+        sfclose( out_control->fhb, "Close_Output_Files" );
+        sfclose( out_control->fvdw, "Close_Output_Files" );
+        sfclose( out_control->fele, "Close_Output_Files" );
+        sfclose( out_control->ftot, "Close_Output_Files" );
+        sfclose( out_control->fcomp, "Close_Output_Files" );
     }
 #endif
 
 #if defined(PURE_REAX)
 #if defined(TEST_FORCES) || defined(TEST_ENERGY)
-    fclose( out_control->flist );
-    fclose( out_control->blist );
-    fclose( out_control->nlist );
+    sfclose( out_control->flist, "Close_Output_Files" );
+    sfclose( out_control->blist, "Close_Output_Files" );
+    sfclose( out_control->nlist, "Close_Output_Files" );
 #endif
 #endif
 
@@ -548,11 +385,11 @@ void Print_Box( simulation_box* box, char *name, FILE *out )
 
     fprintf( out, "%s:\n", name );
     fprintf( out, "\tmin[%8.3f %8.3f %8.3f]\n",
-             box->min[0], box->min[1], box->min[2] );
+            box->min[0], box->min[1], box->min[2] );
     fprintf( out, "\tmax[%8.3f %8.3f %8.3f]\n",
-             box->max[0], box->max[1], box->max[2] );
+            box->max[0], box->max[1], box->max[2] );
     fprintf( out, "\tdims[%8.3f%8.3f%8.3f]\n",
-             box->box_norms[0], box->box_norms[1], box->box_norms[2] );
+            box->box_norms[0], box->box_norms[1], box->box_norms[2] );
 
     // fprintf( out, "box: {" );
     // for( i = 0; i < 3; ++i )
@@ -598,34 +435,34 @@ void Print_Grid( grid* g, FILE *out )
     };
 
     fprintf( out, "\tnumber of grid cells: %d %d %d\n",
-             g->ncells[0], g->ncells[1], g->ncells[2] );
+            g->ncells[0], g->ncells[1], g->ncells[2] );
     fprintf( out, "\tgcell lengths: %8.3f %8.3f %8.3f\n",
-             g->cell_len[0], g->cell_len[1], g->cell_len[2] );
+            g->cell_len[0], g->cell_len[1], g->cell_len[2] );
     fprintf( out, "\tinverses of gcell lengths: %8.3f %8.3f %8.3f\n",
-             g->inv_len[0], g->inv_len[1], g->inv_len[2] );
+            g->inv_len[0], g->inv_len[1], g->inv_len[2] );
     fprintf( out, "\t---------------------------------\n" );
     fprintf( out, "\tnumber of native gcells: %d %d %d\n",
-             g->native_cells[0], g->native_cells[1], g->native_cells[2] );
+            g->native_cells[0], g->native_cells[1], g->native_cells[2] );
     fprintf( out, "\tnative gcell span: %d-%d  %d-%d  %d-%d\n",
-             g->native_str[0], g->native_end[0],
-             g->native_str[1], g->native_end[1],
-             g->native_str[2], g->native_end[2] );
+            g->native_str[0], g->native_end[0],
+            g->native_str[1], g->native_end[1],
+            g->native_str[2], g->native_end[2] );
     fprintf( out, "\t---------------------------------\n" );
     fprintf( out, "\tvlist gcell stretch: %d %d %d\n",
-             g->vlist_span[0], g->vlist_span[1], g->vlist_span[2] );
+            g->vlist_span[0], g->vlist_span[1], g->vlist_span[2] );
     fprintf( out, "\tnonbonded nbrs gcell stretch: %d %d %d\n",
-             g->nonb_span[0], g->nonb_span[1], g->nonb_span[2] );
+            g->nonb_span[0], g->nonb_span[1], g->nonb_span[2] );
     fprintf( out, "\tbonded nbrs gcell stretch: %d %d %d\n",
-             g->bond_span[0], g->bond_span[1], g->bond_span[2] );
+            g->bond_span[0], g->bond_span[1], g->bond_span[2] );
     fprintf( out, "\t---------------------------------\n" );
     fprintf( out, "\tghost gcell span: %d %d %d\n",
-             g->ghost_span[0], g->ghost_span[1], g->ghost_span[2] );
+            g->ghost_span[0], g->ghost_span[1], g->ghost_span[2] );
     fprintf( out, "\tnonbonded ghost gcell span: %d %d %d\n",
-             g->ghost_nonb_span[0], g->ghost_nonb_span[1], g->ghost_nonb_span[2]);
+            g->ghost_nonb_span[0], g->ghost_nonb_span[1], g->ghost_nonb_span[2]);
     fprintf(out, "\thbonded ghost gcell span: %d %d %d\n",
             g->ghost_hbond_span[0], g->ghost_hbond_span[1], g->ghost_hbond_span[2]);
     fprintf( out, "\tbonded ghost gcell span: %d %d %d\n",
-             g->ghost_bond_span[0], g->ghost_bond_span[1], g->ghost_bond_span[2]);
+            g->ghost_bond_span[0], g->ghost_bond_span[1], g->ghost_bond_span[2]);
     //fprintf(out, "\t---------------------------------\n" );
     //fprintf(out, "\tmax number of gcells at the boundary: %d\n", g->gcell_cap);
     fprintf( out, "\t---------------------------------\n" );
@@ -641,17 +478,17 @@ void Print_Grid( grid* g, FILE *out )
                 if ( g->cells[x][y][z].type != gc_type )
                 {
                     fprintf( stderr,
-                             "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
-                             gc_str[0], gc_str[1], gc_str[2], x, y, z,
-                             gc_type, gcell_type_text[gc_type] );
+                            "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
+                            gc_str[0], gc_str[1], gc_str[2], x, y, z,
+                            gc_type, gcell_type_text[gc_type] );
                     gc_type = g->cells[x][y][z].type;
                     gc_str[0] = x;
                     gc_str[1] = y;
                     gc_str[2] = z;
                 }
     fprintf( stderr, "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
-             gc_str[0], gc_str[1], gc_str[2], x, y, z,
-             gc_type, gcell_type_text[gc_type] );
+            gc_str[0], gc_str[1], gc_str[2], x, y, z,
+            gc_type, gcell_type_text[gc_type] );
     fprintf( out, "-------------------------------------\n" );
 }
 
@@ -667,7 +504,7 @@ void Print_GCell_Exchange_Bounds( int my_rank, neighbor_proc *my_nbrs )
     char exch[3][10] = { "NONE", "NEAR_EXCH", "FULL_EXCH" };
 
     sprintf( fname, "gcell_exchange_bounds%d", my_rank );
-    f = fopen( fname, "w" );
+    f = sfopen( fname, "w", "Print_GCell_Exchange_Bounds" );
 
     /* loop over neighbor processes */
     for ( r[0] = -1; r[0] <= 1; ++r[0])
@@ -678,24 +515,24 @@ void Print_GCell_Exchange_Bounds( int my_rank, neighbor_proc *my_nbrs )
                     nbr_pr = &(my_nbrs[nbr]);
 
                     fprintf( f, "p%-2d GCELL BOUNDARIES with r(%2d %2d %2d):\n",
-                             my_rank, r[0], r[1], r[2] );
+                            my_rank, r[0], r[1], r[2] );
 
                     fprintf( f, "\tsend_type %s: send(%d %d %d) to (%d %d %d)\n",
-                             exch[nbr_pr->send_type],
-                             nbr_pr->str_send[0], nbr_pr->str_send[1],
-                             nbr_pr->str_send[2],
-                             nbr_pr->end_send[0], nbr_pr->end_send[1],
-                             nbr_pr->end_send[2] );
+                            exch[nbr_pr->send_type],
+                            nbr_pr->str_send[0], nbr_pr->str_send[1],
+                            nbr_pr->str_send[2],
+                            nbr_pr->end_send[0], nbr_pr->end_send[1],
+                            nbr_pr->end_send[2] );
 
                     fprintf( f, "\trecv_type %s: recv(%d %d %d) to (%d %d %d)\n",
-                             exch[nbr_pr->recv_type],
-                             nbr_pr->str_recv[0], nbr_pr->str_recv[1],
-                             nbr_pr->str_recv[2],
-                             nbr_pr->end_recv[0], nbr_pr->end_recv[1],
-                             nbr_pr->end_recv[2] );
+                            exch[nbr_pr->recv_type],
+                            nbr_pr->str_recv[0], nbr_pr->str_recv[1],
+                            nbr_pr->str_recv[2],
+                            nbr_pr->end_recv[0], nbr_pr->end_recv[1],
+                            nbr_pr->end_recv[2] );
                 }
 
-    fclose(f);
+    sfclose( f, "Print_GCell_Exchange_Bounds" );
 }
 
 
@@ -714,7 +551,7 @@ void Print_Native_GCells( reax_system *system )
     };
 
     sprintf( fname, "native_gcells.%d", system->my_rank );
-    f = fopen( fname, "w" );
+    f = sfopen( fname, "w", "Print_Native_GCells" );
     g = &(system->my_grid);
 
     for ( i = g->native_str[0]; i < g->native_end[0]; i++ )
@@ -724,8 +561,8 @@ void Print_Native_GCells( reax_system *system )
                 gc = &( g->cells[i][j][k] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
-                         system->my_rank, i, j, k,
-                         gc->type, gcell_type_text[gc->type] );
+                        system->my_rank, i, j, k,
+                        gc->type, gcell_type_text[gc->type] );
 
                 fprintf( f, "\tatom list start: %d, end: %d\n\t", gc->str, gc->end );
 
@@ -734,7 +571,7 @@ void Print_Native_GCells( reax_system *system )
                 fprintf( f, "\n" );
             }
 
-    fclose(f);
+    sfclose( f, "Print_Native_GCells" );
 }
 
 
@@ -753,7 +590,7 @@ void Print_All_GCells( reax_system *system )
     };
 
     sprintf( fname, "all_gcells.%d", system->my_rank );
-    f = fopen( fname, "w" );
+    f = sfopen( fname, "w", "Print_All_GCells" );
     g = &(system->my_grid);
 
     for ( i = 0; i < g->ncells[0]; i++ )
@@ -763,8 +600,8 @@ void Print_All_GCells( reax_system *system )
                 gc = &( g->cells[i][j][k] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
-                         system->my_rank, i, j, k,
-                         gc->type, gcell_type_text[gc->type] );
+                        system->my_rank, i, j, k,
+                        gc->type, gcell_type_text[gc->type] );
 
                 fprintf( f, "\tatom list start: %d, end: %d\n\t", gc->str, gc->end );
 
@@ -773,7 +610,7 @@ void Print_All_GCells( reax_system *system )
                 fprintf( f, "\n" );
             }
 
-    fclose(f);
+    sfclose( f, "Print_All_GCells" );
 }
 
 
@@ -785,24 +622,20 @@ void Print_My_Atoms( reax_system *system, control_params *control, int step )
     FILE *fh;
 
     sprintf( fname, "%s.my_atoms.%d.%d", control->sim_name, step, system->my_rank );
-    if ( (fh = fopen( fname, "w" )) == NULL )
-    {
-        fprintf( stderr, "error in opening my_atoms file" );
-        MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
-    }
+    fh = sfopen( fname, "w", "Print_My_Atoms" );
 
     // fprintf( stderr, "p%d had %d atoms\n",
     //   system->my_rank, system->n );
 
     for ( i = 0; i < system->n; ++i )
         fprintf( fh, "p%-2d %-5d %2d %24.15e%24.15e%24.15e\n",
-                 system->my_rank,
-                 system->my_atoms[i].orig_id, system->my_atoms[i].type,
-                 system->my_atoms[i].x[0],
-                 system->my_atoms[i].x[1],
-                 system->my_atoms[i].x[2] );
+                system->my_rank,
+                system->my_atoms[i].orig_id, system->my_atoms[i].type,
+                system->my_atoms[i].x[0],
+                system->my_atoms[i].x[1],
+                system->my_atoms[i].x[2] );
 
-    fclose( fh );
+    sfclose( fh, "Print_My_Atoms" );
 }
 
 
@@ -813,29 +646,25 @@ void Print_My_Ext_Atoms( reax_system *system )
     FILE *fh;
 
     sprintf( fname, "my_ext_atoms.%d", system->my_rank );
-    if ( (fh = fopen( fname, "w" )) == NULL )
-    {
-        fprintf( stderr, "error in opening my_ext_atoms file" );
-        MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
-    }
+    fh = sfopen( fname, "w", "Print_My_Ext_Atoms" );
 
     // fprintf( stderr, "p%d had %d atoms\n",
     //   system->my_rank, system->n );
 
     for ( i = 0; i < system->N; ++i )
         fprintf( fh, "p%-2d %-5d imprt%-5d %2d %24.15e%24.15e%24.15e\n",
-                 system->my_rank, system->my_atoms[i].orig_id,
-                 system->my_atoms[i].imprt_id, system->my_atoms[i].type,
-                 system->my_atoms[i].x[0],
-                 system->my_atoms[i].x[1],
-                 system->my_atoms[i].x[2] );
+                system->my_rank, system->my_atoms[i].orig_id,
+                system->my_atoms[i].imprt_id, system->my_atoms[i].type,
+                system->my_atoms[i].x[0],
+                system->my_atoms[i].x[1],
+                system->my_atoms[i].x[2] );
 
-    fclose( fh );
+    sfclose( fh, "Print_My_Ext_Atoms" );
 }
 
 
 void Print_Far_Neighbors( reax_system *system, reax_list **lists,
-                          control_params *control )
+        control_params *control )
 {
     char  fname[100];
     int   i, j, id_i, id_j, nbr, natoms;
@@ -843,7 +672,7 @@ void Print_Far_Neighbors( reax_system *system, reax_list **lists,
     reax_list *far_nbrs;
 
     sprintf( fname, "%s.far_nbrs.%d", control->sim_name, system->my_rank );
-    fout      = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_Far_Neighbors" );
     far_nbrs = lists[FAR_NBRS];
     natoms = system->N;
 
@@ -853,24 +682,24 @@ void Print_Far_Neighbors( reax_system *system, reax_list **lists,
 
         for ( j = Start_Index(i, far_nbrs); j < End_Index(i, far_nbrs); ++j )
         {
-            nbr = far_nbrs->far_nbr_list[j].nbr;
+            nbr = far_nbrs->far_nbr_list.nbr[j];
             id_j = system->my_atoms[nbr].orig_id;
 
             fprintf( fout, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                     id_i, id_j, far_nbrs->far_nbr_list[j].d,
-                     far_nbrs->far_nbr_list[j].dvec[0],
-                     far_nbrs->far_nbr_list[j].dvec[1],
-                     far_nbrs->far_nbr_list[j].dvec[2] );
+                    id_i, id_j, far_nbrs->far_nbr_list.d[j],
+                    far_nbrs->far_nbr_list.dvec[j][0],
+                    far_nbrs->far_nbr_list.dvec[j][1],
+                    far_nbrs->far_nbr_list.dvec[j][2] );
 
             fprintf( fout, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                     id_j, id_i, far_nbrs->far_nbr_list[j].d,
-                     -far_nbrs->far_nbr_list[j].dvec[0],
-                     -far_nbrs->far_nbr_list[j].dvec[1],
-                     -far_nbrs->far_nbr_list[j].dvec[2] );
+                    id_j, id_i, far_nbrs->far_nbr_list.d[j],
+                    -far_nbrs->far_nbr_list.dvec[j][0],
+                    -far_nbrs->far_nbr_list.dvec[j][1],
+                    -far_nbrs->far_nbr_list.dvec[j][2] );
         }
     }
 
-    fclose( fout );
+    sfclose( fout, "Print_Far_Neighbors" );
 }
 
 
@@ -881,25 +710,29 @@ void Print_Sparse_Matrix( reax_system *system, sparse_matrix *A )
     for ( i = 0; i < A->n; ++i )
         for ( j = A->start[i]; j < A->end[i]; ++j )
             fprintf( stderr, "%d %d %.15e\n",
-                     system->my_atoms[i].orig_id,
-                     system->my_atoms[A->entries[j].j].orig_id,
-                     A->entries[j].val );
+                    system->my_atoms[i].orig_id,
+                    system->my_atoms[A->entries[j].j].orig_id,
+                    A->entries[j].val );
 }
 
 
 void Print_Sparse_Matrix2( reax_system *system, sparse_matrix *A, char *fname )
 {
     int i, j;
-    FILE *f = fopen( fname, "w" );
+    FILE *f = sfopen( fname, "w", "Print_Sparse_Matrix2" );
 
-    for ( i = 0; i < A->n; ++i )
-        for ( j = A->start[i]; j < A->end[i]; ++j )
-            fprintf( f, "%d %d %.15e\n",
-                     system->my_atoms[i].orig_id,
-                     system->my_atoms[A->entries[j].j].orig_id,
-                     A->entries[j].val );
+    if( system->my_rank == 0 )
+    {
+        for ( i = 0; i < A->n; ++i )
+            for ( j = A->start[i]; j < A->end[i]; ++j )
+                fprintf( f, "%d %d %.15e\n",
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[A->entries[j].j].orig_id,
+                        A->entries[j].val );
+
+    }
 
-    fclose(f);
+    sfclose( f, "Print_Sparse_Matrix2" );
 }
 
 
@@ -907,7 +740,7 @@ void Print_Symmetric_Sparse(reax_system *system, sparse_matrix *A, char *fname)
 {
     int i, j;
     reax_atom *ai, *aj;
-    FILE *f = fopen( fname, "w" );
+    FILE *f = sfopen( fname, "w", "Print_Symmetric_Sparse" );
 
     for ( i = 0; i < A->n; ++i )
     {
@@ -916,64 +749,75 @@ void Print_Symmetric_Sparse(reax_system *system, sparse_matrix *A, char *fname)
         {
             aj = &(system->my_atoms[A->entries[j].j]);
             fprintf( f, "%d %d %.15e\n",
-                     ai->renumber, aj->renumber, A->entries[j].val );
+                    ai->renumber, aj->renumber, A->entries[j].val );
             if ( A->entries[j].j < system->n && ai->renumber != aj->renumber )
                 fprintf( f, "%d %d %.15e\n",
-                         aj->renumber, ai->renumber, A->entries[j].val );
+                        aj->renumber, ai->renumber, A->entries[j].val );
         }
     }
 
-    fclose(f);
+    sfclose( f, "Print_Symmetric_Sparse" );
 }
 
 
 void Print_Linear_System( reax_system *system, control_params *control,
-                          storage *workspace, int step )
+        storage *workspace, int step )
 {
-    int   i, j;
-    char  fname[100];
-    reax_atom *ai, *aj;
-    sparse_matrix *H;
+    int i;
+//    int j;
+    char fname[100];
+    reax_atom *ai;
+//    reax_atom *aj;
+//    sparse_matrix *H;
     FILE *out;
 
     // print rhs and init guesses for QEq
     sprintf( fname, "%s.p%dstate%d", control->sim_name, system->my_rank, step );
-    out = fopen( fname, "w" );
+    out = sfopen( fname, "w", "Print_Linear_System" );
     for ( i = 0; i < system->n; i++ )
     {
         ai = &(system->my_atoms[i]);
         fprintf( out, "%6d%2d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                 ai->renumber, ai->type, ai->x[0], ai->x[1], ai->x[2],
-                 workspace->s[i], workspace->b_s[i],
-                 workspace->t[i], workspace->b_t[i] );
+                ai->renumber, ai->type, ai->x[0], ai->x[1], ai->x[2],
+                workspace->s[i], workspace->b_s[i],
+                workspace->t[i], workspace->b_t[i] );
     }
-    fclose( out );
+    sfclose( out, "Print_Linear_System" );
 
     // print QEq coef matrix
     sprintf( fname, "%s.p%dH%d", control->sim_name, system->my_rank, step );
     Print_Symmetric_Sparse( system, workspace->H, fname );
 
     // print the incomplete H matrix
-    /*sprintf( fname, "%s.p%dHinc%d", control->sim_name, system->my_rank, step );
-    out = fopen( fname, "w" );
-    H = workspace->H;
-    for( i = 0; i < H->n; ++i ) {
-      ai = &(system->my_atoms[i]);
-      for( j = H->start[i]; j < H->end[i]; ++j )
-        if( H->entries[j].j < system->n ) {
-    aj = &(system->my_atoms[H->entries[j].j]);
-    fprintf( out, "%d %d %.15e\n",
-       ai->orig_id, aj->orig_id, H->entries[j].val );
-    if( ai->orig_id != aj->orig_id )
-      fprintf( out, "%d %d %.15e\n",
-         aj->orig_id, ai->orig_id, H->entries[j].val );
-        }
-    }
-    fclose( out );*/
+//    sprintf( fname, "%s.p%dHinc%d", control->sim_name, system->my_rank, step );
+//    out = sfopen( fname, "w", "Print_Linear_System" );
+//    H = workspace->H;
+//
+//    for( i = 0; i < H->n; ++i )
+//    {
+//        ai = &(system->my_atoms[i]);
+//
+//        for( j = H->start[i]; j < H->end[i]; ++j )
+//        {
+//            if( H->entries[j].j < system->n ) {
+//                aj = &(system->my_atoms[H->entries[j].j]);
+//
+//                fprintf( out, "%d %d %.15e\n",
+//                        ai->orig_id, aj->orig_id, H->entries[j].val );
+//
+//                if( ai->orig_id != aj->orig_id )
+//                {
+//                    fprintf( out, "%d %d %.15e\n",
+//                            aj->orig_id, ai->orig_id, H->entries[j].val );
+//                }
+//            }
+//        }
+//    }
+//    sfclose( out, "Print_Linear_System" );
 
     // print the L from incomplete cholesky decomposition
-    /*sprintf( fname, "%s.p%dL%d", control->sim_name, system->my_rank, step );
-      Print_Sparse_Matrix2( system, workspace->L, fname );*/
+//    sprintf( fname, "%s.p%dL%d", control->sim_name, system->my_rank, step );
+//    Print_Sparse_Matrix2( system, workspace->L, fname );
 }
 
 
@@ -984,13 +828,13 @@ void Print_LinSys_Soln( reax_system *system, real *x, real *b_prm, real *b )
     FILE  *fout;
 
     sprintf( fname, "qeq.%d.out", system->my_rank );
-    fout = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_LinSys_Soln" );
 
     for ( i = 0; i < system->n; ++i )
         fprintf( fout, "%6d%10.4f%10.4f%10.4f\n",
-                 system->my_atoms[i].orig_id, x[i], b_prm[i], b[i] );
+                system->my_atoms[i].orig_id, x[i], b_prm[i], b[i] );
 
-    fclose( fout );
+    sfclose( fout, "Print_LinSys_Soln" );
 }
 
 
@@ -1001,16 +845,16 @@ void Print_Charges( reax_system *system )
     FILE  *fout;
 
     sprintf( fname, "q.%d.out", system->my_rank );
-    fout = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_Charges" );
 
     for ( i = 0; i < system->n; ++i )
         fprintf( fout, "%6d %10.7f %10.7f %10.7f\n",
-                 system->my_atoms[i].orig_id,
-                 system->my_atoms[i].s[0],
-                 system->my_atoms[i].t[0],
-                 system->my_atoms[i].q );
+                system->my_atoms[i].orig_id,
+                system->my_atoms[i].s[0],
+                system->my_atoms[i].t[0],
+                system->my_atoms[i].q );
 
-    fclose( fout );
+    sfclose( fout, "Print_Charges" );
 }
 
 
@@ -1021,10 +865,11 @@ void Print_HBonds( reax_system *system, reax_list **lists,
     char fname[MAX_STR]; 
     hbond_data *phbond;
     FILE *fout;
+    reax_list *far_nbrs = lists[FAR_NBRS];
     reax_list *hbonds = lists[HBONDS];
 
     sprintf( fname, "%s.hbonds.%d.%d", control->sim_name, step, system->my_rank );
-    fout = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_HBonds" );
 
     for ( i = 0; i < system->numH; ++i )
     {
@@ -1033,16 +878,18 @@ void Print_HBonds( reax_system *system, reax_list **lists,
             phbond = &hbonds->hbond_list[pj];
 
             fprintf( fout, "%8d%8d %24.15e %24.15e %24.15e\n", i, phbond->nbr,
-                    phbond->ptr->dvec[0], phbond->ptr->dvec[1], phbond->ptr->dvec[2] );
-//            fprintf( fout, "%8d%8d %8d %8d\n", i, phbond->nbr,
-//                  phbond->scl, phbond->sym_index );
+                    far_nbrs->far_nbr_list.dvec[phbond->ptr][0],
+                    far_nbrs->far_nbr_list.dvec[phbond->ptr][1],
+                    far_nbrs->far_nbr_list.dvec[phbond->ptr][2] );
+            //            fprintf( fout, "%8d%8d %8d %8d\n", i, phbond->nbr,
+            //                  phbond->scl, phbond->sym_index );
         }
     }
 
-    fclose( fout );
+    sfclose( fout, "Print_HBonds" );
 }
 
- 
+
 void Print_HBond_Indices( reax_system *system, reax_list **lists,
         control_params *control, int step )
 {
@@ -1052,7 +899,7 @@ void Print_HBond_Indices( reax_system *system, reax_list **lists,
     reax_list *hbonds = lists[HBONDS];
 
     sprintf( fname, "%s.hbonds_indices.%d.%d", control->sim_name, step, system->my_rank );
-    fout = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_HBond_Indices" );
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -1060,7 +907,7 @@ void Print_HBond_Indices( reax_system *system, reax_list **lists,
                 i, Start_Index(i, hbonds), End_Index(i, hbonds) );
     }
 
-    fclose( fout );
+    sfclose( fout, "Print_HBond_Indices" );
 }
 
 
@@ -1075,7 +922,7 @@ void Print_Bonds( reax_system *system, reax_list **lists,
     reax_list *bonds = lists[BONDS];
 
     sprintf( fname, "%s.bonds.%d.%d", control->sim_name, step, system->my_rank );
-    fout = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_Bonds" );
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -1083,16 +930,16 @@ void Print_Bonds( reax_system *system, reax_list **lists,
         {
             pbond = &bonds->bond_list[pj];
             bo_ij = &pbond->bo_data;
-//            fprintf( fout, "%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-//                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-//                    pbond->d, bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
+            //            fprintf( fout, "%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+            //                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+            //                    pbond->d, bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
             fprintf( fout, "%8d%8d %24.15f %24.15f\n",
                     i, pbond->nbr, //system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
                     pbond->d, bo_ij->BO );
         }
     }
 
-    fclose( fout );
+    sfclose( fout, "Print_Bonds" );
 }
 
 
@@ -1104,7 +951,7 @@ int fn_qsort_intcmp( const void *a, const void *b )
 void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname )
 {
     int i, j, id_i, id_j, nbr, pj;
-    FILE *f = fopen( fname, "w" );
+    FILE *f = sfopen( fname, "w", "Print_Bond_List2" );
     int temp[500];
     int num = 0;
 
@@ -1130,7 +977,7 @@ void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname )
 
 
 void Print_Total_Force( reax_system *system, simulation_data *data,
-                        storage *workspace )
+        storage *workspace )
 {
     int    i;
 
@@ -1139,9 +986,9 @@ void Print_Total_Force( reax_system *system, simulation_data *data,
 
     for ( i = 0; i < system->N; ++i )
         fprintf( stderr, "%6d %f %f %f\n",
-                 //"%6d%24.15e%24.15e%24.15e\n",
-                 system->my_atoms[i].orig_id,
-                 workspace->f[i][0], workspace->f[i][1], workspace->f[i][2] );
+                //"%6d%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                workspace->f[i][0], workspace->f[i][1], workspace->f[i][2] );
 }
 
 
@@ -1155,7 +1002,7 @@ void Print_Far_Neighbors_List_Adj_Format( reax_system *system,
     FILE *fout;
 
     sprintf( fname, "%s.far.%d.%d", control->sim_name, step, system->my_rank );
-    fout = fopen( fname, "w" );
+    fout = sfopen( fname, "w", "Print_Far_Neighbors_List_Adj_Format" );
 
     num_intrs = 0;
     intrs = NULL;
@@ -1180,7 +1027,7 @@ void Print_Far_Neighbors_List_Adj_Format( reax_system *system,
 
         for ( pj = Start_Index(i, list); pj < End_Index(i, list); ++pj )
         {
-            nbr = list->far_nbr_list[pj].nbr;
+            nbr = list->far_nbr_list.nbr[pj];
             id_j = system->my_atoms[nbr].orig_id;
             intrs[cnt++] = id_j;
         }
@@ -1202,19 +1049,19 @@ void Print_Far_Neighbors_List_Adj_Format( reax_system *system,
         free( intrs );
     }
 
-    fclose( fout );
+    sfclose( fout, "Print_Far_Neighbors_List_Adj_Format" );
 }
 
 void Output_Results( reax_system *system, control_params *control,
-                     simulation_data *data, reax_list **lists,
-                     output_controls *out_control, mpi_datatypes *mpi_data )
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
 #if defined(LOG_PERFORMANCE)
     real t_elapsed, denom;
 #endif
 
     if ((out_control->energy_update_freq > 0 &&
-            data->step % out_control->energy_update_freq == 0) ||
+                data->step % out_control->energy_update_freq == 0) ||
             (out_control->write_steps > 0 &&
              data->step % out_control->write_steps == 0))
     {
@@ -1228,74 +1075,111 @@ void Output_Results( reax_system *system, control_params *control,
         {
 #if !defined(DEBUG) && !defined(DEBUG_FOCUS)
             fprintf( out_control->out,
-                     "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n",
-                     data->step, data->sys_en.e_tot, data->sys_en.e_pot,
-                     E_CONV * data->sys_en.e_kin, data->therm.T,
-                     system->big_box.V, data->iso_bar.P );
+                    "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n",
+                    data->step, data->sys_en.e_tot, data->sys_en.e_pot,
+                    E_CONV * data->sys_en.e_kin, data->therm.T,
+                    system->big_box.V, data->iso_bar.P );
 
             fprintf( out_control->pot,
-                     "%-6d%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f\n",
-                     data->step,
-                     data->sys_en.e_bond,
-                     data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
-                     data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
-                     data->sys_en.e_hb,
-                     data->sys_en.e_tor, data->sys_en.e_con,
-                     data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
+                    "%-6d%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f\n",
+                    data->step,
+                    data->sys_en.e_bond,
+                    data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
+                    data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
+                    data->sys_en.e_hb,
+                    data->sys_en.e_tor, data->sys_en.e_con,
+                    data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
 #else
             fprintf( out_control->out,
-                     "%-6d%24.15e%24.15e%24.15e%13.5f%16.5f%13.5f\n",
-                     data->step, data->sys_en.e_tot, data->sys_en.e_pot,
-                     E_CONV * data->sys_en.e_kin, data->therm.T,
-                     system->big_box.V, data->iso_bar.P );
+                    "%-6d%24.15e%24.15e%24.15e%13.5f%16.5f%13.5f\n",
+                    data->step, data->sys_en.e_tot, data->sys_en.e_pot,
+                    E_CONV * data->sys_en.e_kin, data->therm.T,
+                    system->big_box.V, data->iso_bar.P );
 
             fprintf( out_control->pot,
-                     "%-6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                     data->step,
-                     data->sys_en.e_bond,
-                     data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
-                     data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
-                     data->sys_en.e_hb,
-                     data->sys_en.e_tor, data->sys_en.e_con,
-                     data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
+                    "%-6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                    data->step,
+                    data->sys_en.e_bond,
+                    data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
+                    data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
+                    data->sys_en.e_hb,
+                    data->sys_en.e_tor, data->sys_en.e_con,
+                    data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
 #endif //DEBUG
 
 #if defined(LOG_PERFORMANCE)
-            t_elapsed = Get_Timing_Info( data->timing.total );
+            t_elapsed = MPI_Wtime() - data->timing.total;
             if ( data->step - data->prev_steps > 0 )
+            {
                 denom = 1.0 / out_control->energy_update_freq;
-            else denom = 1;
-
-            fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%6d\n",
-                     data->step,
-                     t_elapsed * denom,
-                     data->timing.comm * denom,
-                     data->timing.nbrs * denom,
-                     data->timing.init_forces * denom,
-                     data->timing.bonded * denom,
-                     data->timing.nonb * denom,
-                     data->timing.cm * denom,
-                     (int)(data->timing.cm_solver_iters * denom) );
-
-            Reset_Timing( &(data->timing) );
+            }
+            else
+            {
+                denom = 1.0;
+            }
+
+            fprintf( out_control->log, "%6d %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
+                    data->step,
+                    t_elapsed * denom,
+                    data->timing.comm * denom,
+                    data->timing.nbrs * denom,
+                    data->timing.init_forces * denom,
+                    data->timing.init_dist * denom,
+                    data->timing.init_cm * denom,
+                    data->timing.init_bond * denom,
+                    data->timing.bonded * denom,
+                    (data->timing.nonb + data->timing.cm) * denom,
+                    data->timing.cm * denom,
+                    data->timing.cm_sort * denom,
+                    (double)(data->timing.cm_solver_iters * denom),
+                    data->timing.cm_solver_pre_comp * denom,
+                    data->timing.cm_solver_pre_app * denom,
+                    data->timing.cm_solver_comm * denom,
+                    data->timing.cm_solver_allreduce * denom,
+                    data->timing.cm_solver_spmv * denom,
+                    data->timing.cm_solver_vector_ops * denom,
+                    data->timing.cm_solver_orthog * denom,
+                    data->timing.cm_solver_tri_solve * denom );
+
+            /* reset the per-interval timers inline (previously Reset_Timing( &(data->timing) )) */
+            data->timing.total = MPI_Wtime( );
+            data->timing.comm = ZERO;
+            data->timing.nbrs = ZERO;
+            data->timing.init_forces = ZERO;
+            data->timing.init_dist = ZERO;
+            data->timing.init_cm = ZERO;
+            data->timing.init_bond = ZERO;
+            data->timing.bonded = ZERO;
+            data->timing.nonb = ZERO;
+            data->timing.cm = ZERO;
+            data->timing.cm_sort = ZERO;
+            data->timing.cm_solver_pre_comp = ZERO;
+            data->timing.cm_solver_pre_app = ZERO;
+            data->timing.cm_solver_comm = ZERO;
+            data->timing.cm_solver_allreduce = ZERO;
+            data->timing.cm_solver_iters = 0;
+            data->timing.cm_solver_spmv = ZERO;
+            data->timing.cm_solver_vector_ops = ZERO;
+            data->timing.cm_solver_orthog = ZERO;
+            data->timing.cm_solver_tri_solve = ZERO;
             fflush( out_control->log );
 #endif //LOG_PERFORMANCE
 
             if ( control->virial )
             {
                 fprintf( out_control->prs,
-                         "%8d%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
-                         data->step,
-                         data->int_press[0], data->int_press[1], data->int_press[2],
-                         data->ext_press[0], data->ext_press[1], data->ext_press[2],
-                         data->kin_press );
+                        "%8d%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
+                        data->step,
+                        data->int_press[0], data->int_press[1], data->int_press[2],
+                        data->ext_press[0], data->ext_press[1], data->ext_press[2],
+                        data->kin_press );
 
                 fprintf( out_control->prs,
-                         "%8s%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
-                         "", system->big_box.box_norms[0], system->big_box.box_norms[1],
-                         system->big_box.box_norms[2],
-                         data->tot_press[0], data->tot_press[1], data->tot_press[2],
-                         system->big_box.V );
+                        "%8s%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
+                        "", system->big_box.box_norms[0], system->big_box.box_norms[1],
+                        system->big_box.box_norms[2],
+                        data->tot_press[0], data->tot_press[1], data->tot_press[2],
+                        system->big_box.V );
 
                 fflush( out_control->prs);
             }
@@ -1322,39 +1206,39 @@ void Output_Results( reax_system *system, control_params *control,
 void Debug_Marker_Bonded( output_controls *out_control, int step )
 {
     fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n",
-             step, "atom1", "atom2", "bo", "ebond", "total" );
+            step, "atom1", "atom2", "bo", "ebond", "total" );
     fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n",
-             step, "atom", "nlp", "elp", "total" );
+            step, "atom", "nlp", "elp", "total" );
     fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n",
-             step, "atom", "eov", "total" );
+            step, "atom", "eov", "total" );
     fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n",
-             step, "atom", "eun", "total" );
+            step, "atom", "eun", "total" );
     fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "angle", "theta0",
-             "bo(12)", "bo(23)", "eval", "total" );
+            step, "atom1", "atom2", "atom3", "angle", "theta0",
+            "bo(12)", "bo(23)", "eval", "total" );
     fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
-             "epen", "total" );
+            step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
+            "epen", "total" );
     fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
-             "ecoa", "total" );
+            step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
+            "ecoa", "total" );
     fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "r(23)", "angle", "bo(12)",
-             "ehb", "total" );
+            step, "atom1", "atom2", "atom3", "r(23)", "angle", "bo(12)",
+            "ehb", "total" );
     fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "atom4", "phi", "bo(23)",
-             "etor", "total" );
+            step, "atom1", "atom2", "atom3", "atom4", "phi", "bo(23)",
+            "etor", "total" );
     fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "atom4",
-             "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
+            step, "atom1", "atom2", "atom3", "atom4",
+            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
 }
 
 void Debug_Marker_Nonbonded( output_controls *out_control, int step )
 {
     fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
-             step, "atom1", "atom2", "r12", "evdw", "total" );
+            step, "atom1", "atom2", "r12", "evdw", "total" );
     fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
+            step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
 }
 
 #endif
@@ -1362,16 +1246,16 @@ void Debug_Marker_Nonbonded( output_controls *out_control, int step )
 
 #ifdef TEST_FORCES
 void Dummy_Printer( reax_system *system, control_params *control,
-                    simulation_data *data, storage *workspace,
-                    reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
 }
 
 
 
 void Print_Bond_Orders( reax_system *system, control_params *control,
-                        simulation_data *data, storage *workspace,
-                        reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, pj, pk;
     bond_order_data *bo_ij;
@@ -1382,52 +1266,52 @@ void Print_Bond_Orders( reax_system *system, control_params *control,
     /* bond orders */
     fprintf( out_control->fbo, "step: %d\n", data->step );
     fprintf( out_control->fbo, "%6s%6s%12s%12s%12s%12s%12s\n",
-             "atom1", "atom2", "r_ij", "total_bo", "bo_s", "bo_p", "bo_pp" );
+            "atom1", "atom2", "r_ij", "total_bo", "bo_s", "bo_p", "bo_pp" );
 
     for ( i = 0; i < system->N; ++i )
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             bo_ij = &(bonds->bond_list[pj].bo_data);
             fprintf( out_control->fbo,
-                     "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                     system->my_atoms[i].orig_id,
-                     system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                     bonds->bond_list[pj].d,
-                     bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
+                    "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                    system->my_atoms[i].orig_id,
+                    system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                    bonds->bond_list[pj].d,
+                    bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
         }
 
 
     /* derivatives of bond orders */
     fprintf( out_control->fdbo, "step: %d\n", data->step );
     fprintf( out_control->fdbo, "%6s%6s%6s%24s%24s%24s\n",
-             "atom1", "atom2", "atom2", "dBO", "dBOpi", "dBOpi2" );
+            "atom1", "atom2", "atom2", "dBO", "dBOpi", "dBOpi2" );
     for ( i = 0; i < system->N; ++i )
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             /* fprintf( out_control->fdbo, "%6d %6d\tstart: %6d\tend: %6d\n",
-            system->my_atoms[i].orig_id,
-             system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-             Start_Index( pj, dBOs ), End_Index( pj, dBOs ) ); */
+               system->my_atoms[i].orig_id,
+               system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+               Start_Index( pj, dBOs ), End_Index( pj, dBOs ) ); */
             for ( pk = Start_Index(pj, dBOs); pk < End_Index(pj, dBOs); ++pk )
             {
                 dbo_k = &(dBOs->dbo_list[pk]);
                 fprintf( out_control->fdbo, "%6d%6d%6d%24.15e%24.15e%24.15e\n",
-                         system->my_atoms[i].orig_id,
-                         system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                         system->my_atoms[dbo_k->wrt].orig_id,
-                         dbo_k->dBO[0], dbo_k->dBO[1], dbo_k->dBO[2] );
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                        system->my_atoms[dbo_k->wrt].orig_id,
+                        dbo_k->dBO[0], dbo_k->dBO[1], dbo_k->dBO[2] );
 
                 fprintf( out_control->fdbo, "%6d%6d%6d%24.15e%24.15e%24.15e\n",
-                         system->my_atoms[i].orig_id,
-                         system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                         system->my_atoms[dbo_k->wrt].orig_id,
-                         dbo_k->dBOpi[0], dbo_k->dBOpi[1], dbo_k->dBOpi[2] );
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                        system->my_atoms[dbo_k->wrt].orig_id,
+                        dbo_k->dBOpi[0], dbo_k->dBOpi[1], dbo_k->dBOpi[2] );
 
                 fprintf( out_control->fdbo, "%6d%6d%6d%24.15e%24.15e%24.15e\n",
-                         system->my_atoms[i].orig_id,
-                         system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                         system->my_atoms[dbo_k->wrt].orig_id,
-                         dbo_k->dBOpi2[0], dbo_k->dBOpi2[1], dbo_k->dBOpi2[2] );
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                        system->my_atoms[dbo_k->wrt].orig_id,
+                        dbo_k->dBOpi2[0], dbo_k->dBOpi2[1], dbo_k->dBOpi2[2] );
             }
         }
 }
@@ -1442,15 +1326,15 @@ void Print_Forces( FILE *f, storage *workspace, int N, int step )
         //fprintf( f, "%6d %23.15e %23.15e %23.15e\n",
         //fprintf( f, "%6d%12.6f%12.6f%12.6f\n",
         fprintf( f, "%6d %19.9e %19.9e %19.9e\n",
-                 workspace->id_all[i], workspace->f_all[i][0],
-                 workspace->f_all[i][1], workspace->f_all[i][2] );
+                workspace->id_all[i], workspace->f_all[i][0],
+                workspace->f_all[i][1], workspace->f_all[i][2] );
 }
 
 
 void Print_Force_Files( reax_system *system, control_params *control,
-                        simulation_data *data, storage *workspace,
-                        reax_list **lists, output_controls *out_control,
-                        mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     int i, d;
 
@@ -1514,11 +1398,11 @@ void Print_Force_Files( reax_system *system, control_params *control,
     {
         for ( d = 0; d < 3; ++d )
             workspace->f_tot[i][d] = workspace->f_be[i][d] +
-                                     workspace->f_lp[i][d] + workspace->f_ov[i][d] + workspace->f_un[i][d] +
-                                     workspace->f_ang[i][d] + workspace->f_pen[i][d] + workspace->f_coa[i][d] +
-                                     workspace->f_tor[i][d] + workspace->f_con[i][d] +
-                                     workspace->f_vdw[i][d] + workspace->f_ele[i][d] +
-                                     workspace->f_hb[i][d];
+                workspace->f_lp[i][d] + workspace->f_ov[i][d] + workspace->f_un[i][d] +
+                workspace->f_ang[i][d] + workspace->f_pen[i][d] + workspace->f_coa[i][d] +
+                workspace->f_tor[i][d] + workspace->f_con[i][d] +
+                workspace->f_vdw[i][d] + workspace->f_ele[i][d] +
+                workspace->f_hb[i][d];
     }
 
     Coll_rvecs_at_Master( system, workspace, mpi_data, workspace->f_tot );
@@ -1531,8 +1415,8 @@ void Print_Force_Files( reax_system *system, control_params *control,
 #if defined(TEST_FORCES) || defined(TEST_ENERGY)
 
 void Print_Far_Neighbors_List( reax_system *system, reax_list **lists,
-                               control_params *control, simulation_data *data,
-                               output_controls *out_control )
+        control_params *control, simulation_data *data,
+        output_controls *out_control )
 {
     int   i, j, id_i, id_j, nbr, natoms;
     int num = 0;
@@ -1566,8 +1450,8 @@ void Print_Far_Neighbors_List( reax_system *system, reax_list **lists,
 }
 
 void Print_Bond_List( reax_system *system, control_params *control,
-                      simulation_data *data, reax_list **lists,
-                      output_controls *out_control)
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control)
 {
     int i, j, id_i, id_j, nbr, pj;
     reax_list *bonds = lists[BONDS];
@@ -1609,283 +1493,283 @@ void Print_Init_Atoms( reax_system *system, storage *workspace )
     int i;
 
     fprintf( stderr, "p%d had %d atoms\n",
-             system->my_rank, workspace->init_cnt );
+            system->my_rank, workspace->init_cnt );
 
     for ( i = 0; i < workspace->init_cnt; ++i )
         fprintf( stderr, "p%d, atom%d: %d  %s  %8.3f %8.3f %8.3f\n",
-                 system->my_rank, i,
-                 workspace->init_atoms[i].type, workspace->init_atoms[i].name,
-                 workspace->init_atoms[i].x[0],
-                 workspace->init_atoms[i].x[1],
-                 workspace->init_atoms[i].x[2] );
+                system->my_rank, i,
+                workspace->init_atoms[i].type, workspace->init_atoms[i].name,
+                workspace->init_atoms[i].x[0],
+                workspace->init_atoms[i].x[1],
+                workspace->init_atoms[i].x[2] );
 }
 #endif //OLD_VERSION
 
 
 /*void Print_Bond_Forces( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace,
-            reax_list **lists, output_controls *out_control )
-{
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->fbond, "step: %d\n", data->step );
   fprintf( out_control->fbond, "%6s%24s%24s%24s\n",
-       "atom", "f_be[0]", "f_be[1]", "f_be[2]" );
+  "atom", "f_be[0]", "f_be[1]", "f_be[2]" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf(out_control->fbond, "%6d%24.15e%24.15e%24.15e\n",
-        system->my_atoms[i].orig_id,
-        workspace->f_all[i][0], workspace->f_all[i][1],
-        workspace->f_all[i][2]);
-}
+  fprintf(out_control->fbond, "%6d%24.15e%24.15e%24.15e\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2]);
+  }
 
-void Print_LonePair_Forces( reax_system *system, control_params *control,
-                simulation_data *data, storage *workspace,
-                reax_list **lists, output_controls *out_control )
-{
+  void Print_LonePair_Forces( reax_system *system, control_params *control,
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->flp, "step: %d\n", data->step );
   fprintf( out_control->flp, "%6s%24s\n", "atom", "f_lonepair" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf(out_control->flp, "%6d%24.15e%24.15e%24.15e\n",
-        system->my_atoms[i].orig_id,
-        workspace->f_all[i][0], workspace->f_all[i][1],
-        workspace->f_all[i][2]);
-}
+  fprintf(out_control->flp, "%6d%24.15e%24.15e%24.15e\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2]);
+  }
 
 
-void Print_OverCoor_Forces( reax_system *system, control_params *control,
-                simulation_data *data, storage *workspace,
-                reax_list **lists, output_controls *out_control )
-{
+  void Print_OverCoor_Forces( reax_system *system, control_params *control,
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->fov, "step: %d\n", data->step );
   fprintf( out_control->fov, "%6s%-38s%-38s%-38s\n",
-       "atom","f_over[0]", "f_over[1]", "f_over[2]" );
+  "atom","f_over[0]", "f_over[1]", "f_over[2]" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf( out_control->fov,
-         "%6d %24.15e%24.15e%24.15e 0 0 0\n",
-         system->my_atoms[i].orig_id,
-         workspace->f_all[i][0], workspace->f_all[i][1],
-         workspace->f_all[i][2] );
-}
+  fprintf( out_control->fov,
+  "%6d %24.15e%24.15e%24.15e 0 0 0\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2] );
+  }
 
 
-void Print_UnderCoor_Forces( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control )
-{
+  void Print_UnderCoor_Forces( reax_system *system, control_params *control,
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->fun, "step: %d\n", data->step );
   fprintf( out_control->fun, "%6s%-38s%-38s%-38s\n",
-       "atom","f_under[0]", "f_under[1]", "f_under[2]" );
+  "atom","f_under[0]", "f_under[1]", "f_under[2]" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf( out_control->fun,
-         "%6d %24.15e%24.15e%24.15e 0 0 0\n",
-         system->my_atoms[i].orig_id,
-         workspace->f_all[i][0], workspace->f_all[i][1],
-         workspace->f_all[i][2] );
-}
+  fprintf( out_control->fun,
+  "%6d %24.15e%24.15e%24.15e 0 0 0\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2] );
+  }
 
 
 void Print_ValAngle_Forces( reax_system *system, control_params *control,
-                simulation_data *data, storage *workspace,
-                reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int j;
-
-  fprintf( out_control->f3body, "step: %d\n", data->step );
-  fprintf( out_control->f3body, "%6s%-37s%-37s%-37s%-38s\n",
-       "atom", "3-body total", "f_ang", "f_pen", "f_coa" );
-
-  for( j = 0; j < system->N; ++j ){
-    if( rvec_isZero(workspace->f_pen[j]) && rvec_isZero(workspace->f_coa[j]) )
-      fprintf( out_control->f3body,
-           "%6d %24.15e%24.15e%24.15e  0 0 0  0 0 0\n",
-           system->my_atoms[j].orig_id,
-           workspace->f_ang[j][0], workspace->f_ang[j][1],
-           workspace->f_ang[j][2] );
-    else if( rvec_isZero(workspace->f_coa[j]) )
-      fprintf( out_control->f3body,
-           "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "   \
-           "%24.15e%24.15e%24.15e\n",
-           system->my_atoms[j].orig_id,
-           workspace->f_ang[j][0] + workspace->f_pen[j][0],
-           workspace->f_ang[j][1] + workspace->f_pen[j][1],
-           workspace->f_ang[j][2] + workspace->f_pen[j][2],
-           workspace->f_ang[j][0], workspace->f_ang[j][1],
-           workspace->f_ang[j][2],
-           workspace->f_pen[j][0], workspace->f_pen[j][1],
-           workspace->f_pen[j][2] );
-    else{
-      fprintf( out_control->f3body, "%6d %24.15e%24.15e%24.15e ",
-         system->my_atoms[j].orig_id,
-           workspace->f_ang[j][0] + workspace->f_pen[j][0] +
-           workspace->f_coa[j][0],
-           workspace->f_ang[j][1] + workspace->f_pen[j][1] +
-           workspace->f_coa[j][1],
-           workspace->f_ang[j][2] + workspace->f_pen[j][2] +
-           workspace->f_coa[j][2] );
-
-      fprintf( out_control->f3body,
-           "%24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
-           "%24.15e%24.15e%24.15e\n",
-           workspace->f_ang[j][0], workspace->f_ang[j][1],
-           workspace->f_ang[j][2],
-           workspace->f_pen[j][0], workspace->f_pen[j][1],
-           workspace->f_pen[j][2],
-           workspace->f_coa[j][0], workspace->f_coa[j][1],
-           workspace->f_coa[j][2] );
+    int j;
+
+    fprintf( out_control->f3body, "step: %d\n", data->step );
+    fprintf( out_control->f3body, "%6s%-37s%-37s%-37s%-38s\n",
+            "atom", "3-body total", "f_ang", "f_pen", "f_coa" );
+
+    for( j = 0; j < system->N; ++j ){
+        if( rvec_isZero(workspace->f_pen[j]) && rvec_isZero(workspace->f_coa[j]) )
+            fprintf( out_control->f3body,
+                    "%6d %24.15e%24.15e%24.15e  0 0 0  0 0 0\n",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_ang[j][0], workspace->f_ang[j][1],
+                    workspace->f_ang[j][2] );
+        else if( rvec_isZero(workspace->f_coa[j]) )
+            fprintf( out_control->f3body,
+                    "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "   \
+                    "%24.15e%24.15e%24.15e\n",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_ang[j][0] + workspace->f_pen[j][0],
+                    workspace->f_ang[j][1] + workspace->f_pen[j][1],
+                    workspace->f_ang[j][2] + workspace->f_pen[j][2],
+                    workspace->f_ang[j][0], workspace->f_ang[j][1],
+                    workspace->f_ang[j][2],
+                    workspace->f_pen[j][0], workspace->f_pen[j][1],
+                    workspace->f_pen[j][2] );
+        else{
+            fprintf( out_control->f3body, "%6d %24.15e%24.15e%24.15e ",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_ang[j][0] + workspace->f_pen[j][0] +
+                    workspace->f_coa[j][0],
+                    workspace->f_ang[j][1] + workspace->f_pen[j][1] +
+                    workspace->f_coa[j][1],
+                    workspace->f_ang[j][2] + workspace->f_pen[j][2] +
+                    workspace->f_coa[j][2] );
+
+            fprintf( out_control->f3body,
+                    "%24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
+                    "%24.15e%24.15e%24.15e\n",
+                    workspace->f_ang[j][0], workspace->f_ang[j][1],
+                    workspace->f_ang[j][2],
+                    workspace->f_pen[j][0], workspace->f_pen[j][1],
+                    workspace->f_pen[j][2],
+                    workspace->f_coa[j][0], workspace->f_coa[j][1],
+                    workspace->f_coa[j][2] );
+        }
     }
-  }
 }
 
 
 void Print_Hydrogen_Bond_Forces( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control)
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control)
 {
-  int j;
+    int j;
 
-  fprintf( out_control->fhb, "step: %d\n", data->step );
-  fprintf( out_control->fhb, "%6s\t%-38s\n", "atom", "f_hb[0,1,2]" );
+    fprintf( out_control->fhb, "step: %d\n", data->step );
+    fprintf( out_control->fhb, "%6s\t%-38s\n", "atom", "f_hb[0,1,2]" );
 
-  for( j = 0; j < system->N; ++j )
-    fprintf(out_control->fhb, "%6d%24.15e%24.15e%24.15e\n",
-         system->my_atoms[j].orig_id,
-         workspace->f_hb[j][0],
-         workspace->f_hb[j][1],
-         workspace->f_hb[j][2] );
+    for( j = 0; j < system->N; ++j )
+        fprintf(out_control->fhb, "%6d%24.15e%24.15e%24.15e\n",
+                system->my_atoms[j].orig_id,
+                workspace->f_hb[j][0],
+                workspace->f_hb[j][1],
+                workspace->f_hb[j][2] );
 }
 
 
 void Print_Four_Body_Forces( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int j;
-
-  fprintf( out_control->f4body, "step: %d\n", data->step );
-  fprintf( out_control->f4body, "%6s\t%-38s%-38s%-38s\n",
-       "atom", "4-body total", "f_tor", "f_con" );
-
-  for( j = 0; j < system->N; ++j ){
-    if( !rvec_isZero( workspace->f_con[j] ) )
-      fprintf( out_control->f4body,
-           "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
-           "%24.15e%24.15e%24.15e\n",
-         system->my_atoms[j].orig_id,
-           workspace->f_tor[j][0] + workspace->f_con[j][0],
-           workspace->f_tor[j][1] + workspace->f_con[j][1],
-           workspace->f_tor[j][2] + workspace->f_con[j][2],
-           workspace->f_tor[j][0], workspace->f_tor[j][1],
-           workspace->f_tor[j][2],
-           workspace->f_con[j][0], workspace->f_con[j][1],
-           workspace->f_con[j][2] );
-    else
-      fprintf( out_control->f4body,
-           "%6d %24.15e%24.15e%24.15e  0 0 0\n",
-           system->my_atoms[j].orig_id, workspace->f_tor[j][0],
-           workspace->f_tor[j][1], workspace->f_tor[j][2] );
-  }
+    int j;
+
+    fprintf( out_control->f4body, "step: %d\n", data->step );
+    fprintf( out_control->f4body, "%6s\t%-38s%-38s%-38s\n",
+            "atom", "4-body total", "f_tor", "f_con" );
+
+    for( j = 0; j < system->N; ++j ){
+        if( !rvec_isZero( workspace->f_con[j] ) )
+            fprintf( out_control->f4body,
+                    "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
+                    "%24.15e%24.15e%24.15e\n",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_tor[j][0] + workspace->f_con[j][0],
+                    workspace->f_tor[j][1] + workspace->f_con[j][1],
+                    workspace->f_tor[j][2] + workspace->f_con[j][2],
+                    workspace->f_tor[j][0], workspace->f_tor[j][1],
+                    workspace->f_tor[j][2],
+                    workspace->f_con[j][0], workspace->f_con[j][1],
+                    workspace->f_con[j][2] );
+        else
+            fprintf( out_control->f4body,
+                    "%6d %24.15e%24.15e%24.15e  0 0 0\n",
+                    system->my_atoms[j].orig_id, workspace->f_tor[j][0],
+                    workspace->f_tor[j][1], workspace->f_tor[j][2] );
+    }
 
 }
 
 
 void Print_vdW_Coulomb_Forces( reax_system *system, control_params *control,
-                   simulation_data *data, storage *workspace,
-                   reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int  i;
-
-  return;
-
-  fprintf( out_control->fnonb, "step: %d\n", data->step );
-  fprintf( out_control->fnonb, "%6s\t%-38s%-38s%-38s\n",
-       "atom", "nonbonded_total[0,1,2]", "f_vdw[0,1,2]", "f_ele[0,1,2]" );
-
-  for( i = 0; i < system->N; ++i )
-    fprintf( out_control->fnonb,
-         "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-         system->my_atoms[i].orig_id,
-         workspace->f_vdw[i][0] + workspace->f_ele[i][0],
-         workspace->f_vdw[i][1] + workspace->f_ele[i][1],
-         workspace->f_vdw[i][2] + workspace->f_ele[i][2],
-         workspace->f_vdw[i][0],
-         workspace->f_vdw[i][1],
-         workspace->f_vdw[i][2],
-         workspace->f_ele[i][0],
-         workspace->f_ele[i][1],
-         workspace->f_ele[i][2] );
+    int  i;
+
+    return;
+
+    fprintf( out_control->fnonb, "step: %d\n", data->step );
+    fprintf( out_control->fnonb, "%6s\t%-38s%-38s%-38s\n",
+            "atom", "nonbonded_total[0,1,2]", "f_vdw[0,1,2]", "f_ele[0,1,2]" );
+
+    for( i = 0; i < system->N; ++i )
+        fprintf( out_control->fnonb,
+                "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                workspace->f_vdw[i][0] + workspace->f_ele[i][0],
+                workspace->f_vdw[i][1] + workspace->f_ele[i][1],
+                workspace->f_vdw[i][2] + workspace->f_ele[i][2],
+                workspace->f_vdw[i][0],
+                workspace->f_vdw[i][1],
+                workspace->f_vdw[i][2],
+                workspace->f_ele[i][0],
+                workspace->f_ele[i][1],
+                workspace->f_ele[i][2] );
 }
 
 
 void Print_Total_Force( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace,
-            reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int    i;
+    int    i;
 
-  return;
+    return;
 
-  fprintf( out_control->ftot, "step: %d\n", data->step );
-  fprintf( out_control->ftot, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]");
+    fprintf( out_control->ftot, "step: %d\n", data->step );
+    fprintf( out_control->ftot, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]");
 
-  for( i = 0; i < system->n; ++i )
-    fprintf( out_control->ftot, "%6d%24.15e%24.15e%24.15e\n",
-         system->my_atoms[i].orig_id,
-         system->my_atoms[i].f[0],
-         system->my_atoms[i].f[1],
-         system->my_atoms[i].f[2] );
+    for( i = 0; i < system->n; ++i )
+        fprintf( out_control->ftot, "%6d%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                system->my_atoms[i].f[0],
+                system->my_atoms[i].f[1],
+                system->my_atoms[i].f[2] );
 }
 
 
 void Compare_Total_Forces( reax_system *system, control_params *control,
-               simulation_data *data, storage *workspace,
-               reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int i;
+    int i;
 
-  return;
-
-  fprintf( out_control->ftot2, "step: %d\n", data->step );
-  fprintf( out_control->ftot2, "%6s\t%-38s%-38s\n",
-       "atom", "f_total[0,1,2]", "test_force_total[0,1,2]" );
-
-  for( i = 0; i < system->N; ++i )
-    fprintf( out_control->ftot2, "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-         system->my_atoms[i].orig_id,
-         system->my_atoms[i].f[0],
-         system->my_atoms[i].f[1],
-         system->my_atoms[i].f[2],
-         workspace->f_be[i][0] + workspace->f_lp[i][0] +
-         workspace->f_ov[i][0] + workspace->f_un[i][0] +
-         workspace->f_ang[i][0]+ workspace->f_pen[i][0]+
-         workspace->f_coa[i][0]+ + workspace->f_hb[i][0] +
-         workspace->f_tor[i][0] + workspace->f_con[i][0] +
-         workspace->f_vdw[i][0] + workspace->f_ele[i][0],
-         workspace->f_be[i][1] + workspace->f_lp[i][1] +
-         workspace->f_ov[i][1] + workspace->f_un[i][1] +
-             workspace->f_ang[i][1]+ workspace->f_pen[i][1]+
-         workspace->f_coa[i][1]+ + workspace->f_hb[i][1] +
-         workspace->f_tor[i][1] + workspace->f_con[i][1] +
-         workspace->f_vdw[i][1] + workspace->f_ele[i][1],
-         workspace->f_be[i][2] + workspace->f_lp[i][2] +
-         workspace->f_ov[i][2] + workspace->f_un[i][2] +
-             workspace->f_ang[i][2]+ workspace->f_pen[i][2] +
-         workspace->f_coa[i][2]+ + workspace->f_hb[i][2] +
-         workspace->f_tor[i][2] + workspace->f_con[i][2] +
-         workspace->f_vdw[i][2] + workspace->f_ele[i][2] );
+    return;
+
+    fprintf( out_control->ftot2, "step: %d\n", data->step );
+    fprintf( out_control->ftot2, "%6s\t%-38s%-38s\n",
+            "atom", "f_total[0,1,2]", "test_force_total[0,1,2]" );
+
+    for( i = 0; i < system->N; ++i )
+        fprintf( out_control->ftot2, "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                system->my_atoms[i].f[0],
+                system->my_atoms[i].f[1],
+                system->my_atoms[i].f[2],
+                workspace->f_be[i][0] + workspace->f_lp[i][0] +
+                workspace->f_ov[i][0] + workspace->f_un[i][0] +
+                workspace->f_ang[i][0]+ workspace->f_pen[i][0]+
+                workspace->f_coa[i][0]+ + workspace->f_hb[i][0] +
+                workspace->f_tor[i][0] + workspace->f_con[i][0] +
+                workspace->f_vdw[i][0] + workspace->f_ele[i][0],
+                workspace->f_be[i][1] + workspace->f_lp[i][1] +
+                workspace->f_ov[i][1] + workspace->f_un[i][1] +
+                workspace->f_ang[i][1]+ workspace->f_pen[i][1]+
+                workspace->f_coa[i][1]+ + workspace->f_hb[i][1] +
+                workspace->f_tor[i][1] + workspace->f_con[i][1] +
+                workspace->f_vdw[i][1] + workspace->f_ele[i][1],
+                workspace->f_be[i][2] + workspace->f_lp[i][2] +
+                workspace->f_ov[i][2] + workspace->f_un[i][2] +
+                workspace->f_ang[i][2]+ workspace->f_pen[i][2] +
+                workspace->f_coa[i][2]+ + workspace->f_hb[i][2] +
+                workspace->f_tor[i][2] + workspace->f_con[i][2] +
+                workspace->f_vdw[i][2] + workspace->f_ele[i][2] );
 }*/
 
 /*void Init_Force_Test_Functions( )
-{
+  {
   Print_Interactions[0] = Print_Bond_Orders;
   Print_Interactions[1] = Print_Bond_Forces;
   Print_Interactions[2] = Print_LonePair_Forces;
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 541a132be7bc18354069022785af438030c2e286..939b702bf4ff6f716de299fe5f0543cd9a911a72 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -24,511 +24,2555 @@
 #include "io_tools.h"
 #include "tool_box.h"
 #include "vector.h"
+#include "allocate.h"
+
+/* Intel MKL */
+#if defined(HAVE_LAPACKE_MKL)
+#include "mkl.h"
+/* reference LAPACK */
+#elif defined(HAVE_LAPACKE)
+#include "lapacke.h"
+#endif
+
+/*#if defined(CG_PERFORMANCE)
+real t_start, t_elapsed, matvec_time, dot_time;
+#endif*/
+
+
+static int compare_dbls( const void* arg1, const void* arg2 )
+{   /* three-way comparison of two doubles; passed to qsort by qsort_dbls */
+    int ret;
+    double a1, a2;
+
+    a1 = *(double *) arg1;
+    a2 = *(double *) arg2;
+
+    if ( a1 < a2 )
+    {   /* a1 orders before a2 */
+        ret = -1;
+    }
+    else if (a1 == a2)
+    {   /* equal values */
+        ret = 0;
+    }
+    else
+    {   /* a1 > a2; a NaN operand also ends up here because both tests above are false */
+        ret = 1;
+    }
+
+    return ret;
+}
+
+
+static void qsort_dbls( double *array, int array_len )
+{ /* sorts the first array_len doubles of array in place, ascending per compare_dbls */
+    qsort( array, (size_t) array_len, sizeof(double),
+            compare_dbls ); /* array_len is cast to size_t, so it must not be negative */
+}
+
+
+static int find_bucket( double *list, int len, double a )
+{ /* binary search for the index of the first entry of list that is not less than a */
+    int s, e, m;
+
+    if ( len == 0 )
+    { /* empty list: index 0 is the only possible answer */
+        return 0;
+    }
+
+    if ( a > list[len - 1] )
+    { /* a is greater than the last entry: return one past the end */
+        return len;
+    }
+
+    s = 0;
+    e = len - 1;
+
+    while ( s < e )
+    { /* NOTE(review): halving [s, e] presumes list is sorted ascending -- confirm at callers */
+        m = (s + e) / 2;
+
+        if ( list[m] < a )
+        { /* entry m is below a, so the answer lies to its right */
+            s = m + 1;
+        }
+        else
+        { /* entry m is >= a, so it stays in the candidate range */
+            e = m;
+        }
+    }
+
+    return s;
+}
+
+
+static void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
+{ /* accumulates b = A * x for both components ([0] and [1]) of each rvec2 entry */
+    int i, j, k, si, num_rows;
+    real val;
+
+    for ( i = 0; i < N; ++i )
+    { /* clear the first N entries of b; they stay zero if A->format matches no case below */
+        b[i][0] = 0.0;
+        b[i][1] = 0.0;
+    }
+
+#if defined(NEUTRAL_TERRITORY)
+    num_rows = A->NT;
+
+    if ( A->format == SYM_HALF_MATRIX )
+    { /* half storage: each stored off-diagonal entry is applied to row i and to row j */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            /* diagonal only contributes once */
+            if( i < A->n )
+            { /* NOTE(review): entries[si] is used as the diagonal of row i without a column check -- confirm */
+                b[i][0] += A->entries[si].val * x[i][0];
+                b[i][1] += A->entries[si].val * x[i][1];
+                k = si + 1;
+            }
+            /* zeros on the diagonal for i >= A->n,
+             * so skip the diagonal multiplication step as zeros
+             * are not stored (idea: keep the NNZ's the same
+             * for full shell and neutral territory half-stored
+             * charge matrices to make debugging easier) */
+            else
+            {
+                k = si;
+            }
+
+            for ( ; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i][0] += val * x[j][0];
+                b[i][1] += val * x[j][1];
+                /* mirrored contribution of the same stored entry to row j */
+                b[j][0] += val * x[i][0];
+                b[j][1] += val * x[i][1];
+            }
+        }
+    }
+    else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX )
+    { /* full storage: only row i is updated per stored entry, no mirroring */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            for ( k = si; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i][0] += val * x[j][0];
+                b[i][1] += val * x[j][1];
+            }
+        }
+    }
+#else
+    num_rows = A->n;
+
+    if ( A->format == SYM_HALF_MATRIX )
+    { /* half storage: each stored off-diagonal entry is applied to row i and to row j */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            /* diagonal only contributes once (entries[si] is used as the diagonal of row i) */
+            b[i][0] += A->entries[si].val * x[i][0];
+            b[i][1] += A->entries[si].val * x[i][1];
+
+            for ( k = si + 1; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i][0] += val * x[j][0];
+                b[i][1] += val * x[j][1];
+                /* mirrored contribution of the same stored entry to row j */
+                b[j][0] += val * x[i][0];
+                b[j][1] += val * x[i][1];
+            }
+        }
+    }
+    else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX )
+    { /* full storage: only row i is updated per stored entry, no mirroring */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            for ( k = si; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i][0] += val * x[j][0];
+                b[i][1] += val * x[j][1];
+            }
+        }
+    }
+#endif
+}
+
+
+static void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
+{ /* accumulates the sparse matrix-vector product b = A * x */
+    int i, j, k, si, num_rows;
+    real val;
+
+    for ( i = 0; i < N; ++i )
+    { /* clear the first N entries of b; they stay zero if A->format matches no case below */
+        b[i] = 0.0;
+    }
+
+#if defined(NEUTRAL_TERRITORY)
+    num_rows = A->NT;
+
+    if ( A->format == SYM_HALF_MATRIX )
+    { /* half storage: each stored off-diagonal entry is applied to row i and to row j */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            /* diagonal only contributes once */
+            if( i < A->n )
+            { /* NOTE(review): entries[si] is used as the diagonal of row i without a column check -- confirm */
+                b[i] += A->entries[si].val * x[i];
+                k = si + 1;
+            }
+            /* zeros on the diagonal for i >= A->n,
+             * so skip the diagonal multiplication step as zeros
+             * are not stored (idea: keep the NNZ's the same
+             * for full shell and neutral territory half-stored
+             * charge matrices to make debugging easier) */
+            else
+            {
+                k = si;
+            }
+
+            for ( ; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i] += val * x[j];
+                b[j] += val * x[i]; /* mirrored contribution to row j */
+            }
+        }
+    }
+    else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX )
+    { /* full storage: only row i is updated per stored entry, no mirroring */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            for ( k = si; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i] += val * x[j];
+            }
+        }
+    }
+#else
+    num_rows = A->n;
+
+    if ( A->format == SYM_HALF_MATRIX )
+    { /* half storage: each stored off-diagonal entry is applied to row i and to row j */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            /* diagonal only contributes once (entries[si] is used as the diagonal of row i) */
+            b[i] += A->entries[si].val * x[i];
+
+            for ( k = si + 1; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i] += val * x[j];
+                b[j] += val * x[i]; /* mirrored contribution to row j */
+            }
+        }
+    }
+    else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX )
+    { /* full storage: only row i is updated per stored entry, no mirroring */
+        for ( i = 0; i < num_rows; ++i )
+        {
+            si = A->start[i];
+
+            for ( k = si; k < A->end[i]; ++k )
+            {
+                j = A->entries[k].j;
+                val = A->entries[k].val;
+
+                b[i] += val * x[j];
+            }
+        }
+    }
+#endif
+}
+
+
+real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, sparse_matrix *A, sparse_matrix **A_spar_patt,
+        int nprocs, real filter )
+{
+    int i, bin, total, pos;
+    int n, n_gather, s_local, s, n_local;
+    int target_proc;
+    int k;
+    int pj, size;
+    int left, right, p, turn;
+    int num_rows;
+
+    real threshold, pivot, tmp;
+    real *input_array;
+    real *samplelist_local, *samplelist;
+    real *pivotlist;
+    real *bucketlist_local, *bucketlist;
+
+    int *srecv, *sdispls;
+    int *scounts_local, *scounts;
+    int *dspls_local, *dspls;
+    int *bin_elements;
+
+    MPI_Comm comm;
+
+    real start, t_start, t_comm;
+    real total_comm;
+
+    start = MPI_Wtime();
+    t_comm = 0.0;
+
+    srecv = NULL;
+    sdispls = NULL;
+    samplelist_local = NULL;
+    samplelist = NULL;
+    pivotlist = NULL;
+    input_array = NULL;
+    bucketlist_local = NULL;
+    bucketlist = NULL;
+    scounts_local = NULL;
+    scounts = NULL;
+    dspls_local = NULL;
+    dspls = NULL;
+    bin_elements = NULL;
+
+    comm = mpi_data->world;
+#if defined(NEUTRAL_TERRITORY)
+    num_rows = A->NT;
+    fprintf( stdout,"%d %d %d\n", A->n, A->NT, A->m );
+    fflush( stdout );
+#else
+    num_rows = A->n;
+#endif
+
+    if ( *A_spar_patt == NULL )
+    {
+#if defined(NEUTRAL_TERRITORY)
+        Allocate_Matrix2( A_spar_patt, A->n, A->NT, A->m,
+                A->format, comm );
+#else
+        Allocate_Matrix2( A_spar_patt, A->n, system->local_cap, A->m,
+                A->format, comm );
+#endif
+    }
+
+    else /*if ( (*A_spar_patt)->m < A->m )*/
+    {
+        Deallocate_Matrix( *A_spar_patt );
+#if defined(NEUTRAL_TERRITORY)
+        Allocate_Matrix2( A_spar_patt, A->n, A->NT, A->m,
+                A->format, comm );
+#else
+        Allocate_Matrix2( A_spar_patt, A->n, system->local_cap, A->m,
+                A->format, comm );
+#endif
+    }
+
+    n_local = 0;
+    for( i = 0; i < num_rows; ++i )
+    {
+        n_local += (A->end[i] - A->start[i] + 9)/10;
+    }
+    s_local = (int) (12.0 * (log2(n_local) + log2(nprocs)));
+    
+    t_start = MPI_Wtime();
+    MPI_Allreduce( &n_local, &n, 1, MPI_INT, MPI_SUM, comm );
+    MPI_Reduce( &s_local, &s, 1, MPI_INT, MPI_SUM, MASTER_NODE, comm );
+    t_comm += MPI_Wtime() - t_start;
+
+    /* count num. bin elements for each processor, uniform bin sizes */
+    input_array = smalloc( sizeof(real) * n_local,
+           "setup_sparse_approx_inverse::input_array", MPI_COMM_WORLD );
+    scounts_local = smalloc( sizeof(int) * nprocs,
+           "setup_sparse_approx_inverse::scounts_local", MPI_COMM_WORLD );
+    scounts = smalloc( sizeof(int) * nprocs,
+           "setup_sparse_approx_inverse::scounts", MPI_COMM_WORLD );
+    bin_elements = smalloc( sizeof(int) * nprocs,
+           "setup_sparse_approx_inverse::bin_elements", MPI_COMM_WORLD );
+    dspls_local = smalloc( sizeof(int) * nprocs,
+           "setup_sparse_approx_inverse::displs_local", MPI_COMM_WORLD );
+    bucketlist_local = smalloc( sizeof(real) * n_local,
+          "setup_sparse_approx_inverse::bucketlist_local", MPI_COMM_WORLD );
+    dspls = smalloc( sizeof(int) * nprocs,
+           "setup_sparse_approx_inverse::dspls", MPI_COMM_WORLD );
+    if ( nprocs > 1 )
+    {
+        pivotlist = smalloc( sizeof(real) *  (nprocs - 1),
+                "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD );
+    }
+    samplelist_local = smalloc( sizeof(real) * s_local,
+           "setup_sparse_approx_inverse::samplelist_local", MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        samplelist = smalloc( sizeof(real) * s,
+               "setup_sparse_approx_inverse::samplelist", MPI_COMM_WORLD );
+        srecv = smalloc( sizeof(int) * nprocs,
+               "setup_sparse_approx_inverse::srecv", MPI_COMM_WORLD );
+        sdispls = smalloc( sizeof(int) * nprocs,
+               "setup_sparse_approx_inverse::sdispls", MPI_COMM_WORLD );
+    }
+
+    n_local = 0;
+    for ( i = 0; i < num_rows; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->end[i]; pj += 10 )
+        {
+            input_array[n_local++] = A->entries[pj].val;
+        }
+    }
+
+    for ( i = 0; i < s_local; i++)
+    {
+        /* samplelist_local[i] = input_array[rand( ) % n_local]; */
+        samplelist_local[i] = input_array[ i ];
+    }
+
+    /* gather samples at the root process */
+    t_start = MPI_Wtime();
+    MPI_Gather( &s_local, 1, MPI_INT, srecv, 1, MPI_INT, MASTER_NODE, comm );
+    t_comm += MPI_Wtime() - t_start;
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        sdispls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            sdispls[i + 1] = sdispls[i] + srecv[i];
+        }
+    }
+
+    t_start = MPI_Wtime();
+    MPI_Gatherv( samplelist_local, s_local, MPI_DOUBLE,
+            samplelist, srecv, sdispls, MPI_DOUBLE, MASTER_NODE, comm);
+    t_comm += MPI_Wtime() - t_start;
+
+    /* sort samples at the root process and select pivots */
+    if ( system->my_rank == MASTER_NODE )
+    {
+        qsort_dbls( samplelist, s );
+
+        for ( i = 1; i < nprocs; ++i )
+        {
+            pivotlist[i - 1] = samplelist[(i * s) / nprocs];
+        }
+    }
+
+    /* broadcast pivots */
+    t_start = MPI_Wtime();
+    MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, comm );
+    t_comm += MPI_Wtime() - t_start;
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        scounts_local[i] = 0;
+    }
+
+    for ( i = 0; i < n_local; ++i )
+    {
+        pos = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        scounts_local[pos]++;
+    }
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        bin_elements[i] = scounts_local[i];
+        scounts[i] = scounts_local[i];
+    }
+
+    /* compute displacements for MPI comm */
+    dspls_local[0] = 0;
+    for ( i = 0; i < nprocs - 1; ++i )
+    {
+        dspls_local[i + 1] = dspls_local[i] + scounts_local[i];
+    }
+
+    /* bin elements */
+    for ( i = 0; i < n_local; ++i )
+    {
+        bin = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        pos = dspls_local[bin] + scounts_local[bin] - bin_elements[bin];
+        bucketlist_local[pos] = input_array[i];
+        bin_elements[bin]--;
+    }
+
+    /* determine counts for elements per process */
+    t_start = MPI_Wtime();
+    MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, comm );
+    t_comm += MPI_Wtime() - t_start;
+
+    /* find the target process */
+    target_proc = 0;
+    total = 0;
+    k = n * filter;
+    for (i = nprocs - 1; i >= 0; --i )
+    {
+        if ( total + scounts[i] >= k )
+        {
+            /* global k becomes local k*/
+            k -= total;
+            target_proc = i;
+            break;
+        }
+        total += scounts[i];
+    }
+
+    n_gather = scounts[target_proc];
+    if ( system->my_rank == target_proc )
+    {
+        bucketlist = smalloc( sizeof( real ) * n_gather,
+               "setup_sparse_approx_inverse::bucketlist", MPI_COMM_WORLD );
+    }
+
+    /* send local buckets to target processor for quickselect */
+    t_start = MPI_Wtime();
+    MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts,
+            1, MPI_INT, target_proc, comm );
+    t_comm += MPI_Wtime() - t_start;
+
+    if ( system->my_rank == target_proc )
+    {
+        dspls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            dspls[i + 1] = dspls[i] + scounts[i];
+        }
+    }
+
+    t_start = MPI_Wtime();
+    MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE,
+            bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, comm);
+    t_comm += MPI_Wtime() - t_start;
+
+    /* apply quick select algorithm at the target process */
+    if ( system->my_rank == target_proc )
+    {
+        left = 0;
+        right = n_gather-1;
+
+        turn = 0;
+        while( k )
+        {
+            p  = left;
+            turn = 1 - turn;
+
+            /* alternating pivots in order to handle corner cases */
+            if ( turn == 1 )
+            {
+                pivot = bucketlist[right];
+            }
+            else
+            {
+                pivot = bucketlist[left];
+            }
+            for ( i = left + 1 - turn; i <= right-turn; ++i )
+            {
+                if ( bucketlist[i] > pivot )
+                {
+                    tmp = bucketlist[i];
+                    bucketlist[i] = bucketlist[p];
+                    bucketlist[p] = tmp;
+                    p++;
+                }
+            }
+            if ( turn == 1 )
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[right];
+                bucketlist[right] = tmp;
+            }
+            else
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[left];
+                bucketlist[left] = tmp;
+            }
+
+            if( p == k - 1)
+            {
+                threshold = bucketlist[p];
+                break;
+            }
+            else if( p > k - 1 )
+            {
+                right = p - 1;
+            }
+            else
+            {
+                left = p + 1;
+            }
+        }
+        /* comment out if ACKS2 and/or EE is not an option
+           if(threshold < 1.000000)
+           {
+           threshold = 1.000001;
+           } */
+    }
+
+    /* broadcast the filtering value */
+    t_start = MPI_Wtime();
+    MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
+    t_comm += MPI_Wtime() - t_start;
+
+#if defined(DEBUG)
+    int nnz = 0;
+#endif
+
+    /* build entries of that pattern*/
+    for ( i = 0; i < num_rows; ++i )
+    {
+        (*A_spar_patt)->start[i] = A->start[i];
+        size = A->start[i];
+
+        for ( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            if ( ( A->entries[pj].val >= threshold )  || ( A->entries[pj].j == i ) )
+            {
+                (*A_spar_patt)->entries[size].val = A->entries[pj].val;
+                (*A_spar_patt)->entries[size].j = A->entries[pj].j;
+                size++;
+
+#if defined(DEBUG)
+                nnz++;
+#endif
+            }
+        }
+        (*A_spar_patt)->end[i] = size;
+    }
+
+#if defined(DEBUG)
+    MPI_Allreduce( MPI_IN_PLACE, &nnz, 1, MPI_INT, MPI_SUM, comm );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        fprintf( stdout, "    [INFO] \ntotal nnz in all charge matrices = %d\ntotal nnz in all sparsity patterns = %d\nthreshold = %.15lf\n",
+                n, nnz, threshold );
+        fprintf( stdout, "SAI SETUP takes %.2f seconds\n", MPI_Wtime() - start );
+        fflush( stdout );
+    }
+#endif
+ 
+    MPI_Reduce( &t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE,
+            mpi_data->world );
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_comm += total_comm / nprocs;
+    }
+
+    sfree( input_array, "setup_sparse_approx_inverse::input_array" );
+    sfree( scounts_local, "setup_sparse_approx_inverse::scounts_local" );
+    sfree( scounts, "setup_sparse_approx_inverse::scounts" );
+    sfree( bin_elements, "setup_sparse_approx_inverse::bin_elements" );
+    sfree( dspls_local, "setup_sparse_approx_inverse::displs_local" );
+    sfree( bucketlist_local, "setup_sparse_approx_inverse::bucketlist_local" );
+    sfree( dspls, "setup_sparse_approx_inverse::dspls" );
+    if ( nprocs > 1)
+    {
+        sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" );
+    }
+    sfree( samplelist_local, "setup_sparse_approx_inverse::samplelist_local" );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        sfree( samplelist, "setup_sparse_approx_inverse::samplelist" );
+        sfree( srecv, "setup_sparse_approx_inverse::srecv" );
+        sfree( sdispls, "setup_sparse_approx_inverse::sdispls" );
+    }
+    if ( system->my_rank == target_proc )
+    {
+        sfree( bucketlist, "setup_sparse_approx_inverse::bucketlist" );
+    }
+
+    return MPI_Wtime() - start;
+}
+
+
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+#if defined(NEUTRAL_TERRITORY)
+/* Sparse approximate inverse of A, neutral-territory variant: rows of A held by
+ * neighboring processes are received first; then, for each row of A_spar_patt,
+ * a dense least-squares problem is solved with LAPACKE_dgels and the solution
+ * becomes that row of *A_app_inv (reallocated on every call). The averaged
+ * communication time is added to data->timing.cm_solver_comm on MASTER_NODE.
+ * workspace is not used. Returns the wall-clock time spent in this function. */
+real sparse_approx_inverse( reax_system *system, simulation_data *data,
+        storage *workspace, mpi_datatypes *mpi_data, 
+        sparse_matrix *A, sparse_matrix *A_spar_patt,
+        sparse_matrix **A_app_inv, int nprocs )
+{
+    int N, M, d_i, d_j;
+    int i, k, pj, j_temp;
+    int local_pos, atom_pos, identity_pos;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *pos_x, *X;
+    real *e_j, *dense_matrix;
+    int cnt;
+    reax_atom *atom;
+    int *row_nnz;
+    int **j_list;
+    real **val_list;
+    int d, count, index;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req[12];
+    MPI_Status stat[12];
+    neighbor_proc *nbr;
+    int *j_send, *j_recv[6];
+    real *val_send, *val_recv[6];
+    real start, t_start, t_comm;
+    real total_comm;
+
+    start = MPI_Wtime();
+    t_comm = 0.0;
+
+    comm = mpi_data->world;
+
+    if ( *A_app_inv == NULL)
+    {
+        //TODO: FULL_MATRIX?
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, A->NT, A_spar_patt->m,
+                SYM_FULL_MATRIX, comm );
+    }
+    else /* if ( (*A_app_inv)->m < A_spar_patt->m ) */
+    {
+        Deallocate_Matrix( *A_app_inv );
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, A->NT, A_spar_patt->m,
+                SYM_FULL_MATRIX, comm );
+    }
+
+    j_send = NULL;
+    val_send = NULL;
+    for( d = 0; d < 6; ++d )
+    {
+        j_recv[d] = NULL;
+        val_recv[d] = NULL;
+        req[2 * d] = MPI_REQUEST_NULL; /* unused slots are skipped by MPI_Waitany */
+        req[2 * d + 1] = MPI_REQUEST_NULL;
+    }
+
+    /* zero-filled: rows never received keep nnz = 0, unused list slots stay NULL */
+    row_nnz = (int *) calloc( A->NT, sizeof(int) );
+    //TODO: allocation size
+    j_list = (int **) calloc( system->N, sizeof(int *) );
+    val_list = (real **) calloc( system->N, sizeof(real *) );
+
+    /* mark the atoms that already have their row stored in the local matrix */
+    for ( i = 0; i < A->n; ++i )
+    {
+        row_nnz[i] = A->end[i] - A->start[i];
+    }
+
+    /* Announce the nnz's in each row that will be communicated later */
+    t_start = MPI_Wtime();
+    Dist( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT );
+    t_comm += MPI_Wtime() - t_start;
+    fprintf( stdout,"SAI after Dist call\n");
+    fflush( stdout );
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_nt_buffers;
+    count = 0;
+
+    /*  use a Dist-like approach to send the row information */
+    for ( d = 0; d < 6; ++d)
+    {
+        /* initiate recvs */
+        nbr = &(system->my_nt_nbrs[d]);
+        if ( nbr->atoms_cnt )
+        {
+            /* calculate the total data that will be received */
+            cnt = 0;
+            for( i = nbr->atoms_str; i < (nbr->atoms_str + nbr->atoms_cnt); ++i )
+            {
+                cnt += row_nnz[i];
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                count += 2;
+                j_recv[d] = (int *) malloc( sizeof(int) * cnt );
+                val_recv[d] = (real *) malloc( sizeof(real) * cnt );
+
+                fprintf( stdout,"Dist communication receive phase direction %d will receive %d\n", d, cnt);
+                fflush( stdout );
+                t_start = MPI_Wtime();
+                /* receive into the allocated buffers, not into the pointer arrays */
+                MPI_Irecv( j_recv[d], cnt, MPI_INT, nbr->receive_rank, d, comm, &req[2 * d] );
+                MPI_Irecv( val_recv[d], cnt, MPI_DOUBLE, nbr->receive_rank, d, comm, &req[2 * d + 1] );
+                t_comm += MPI_Wtime() - t_start;
+            }
+        }
+    }
+
+    for( d = 0; d < 6; ++d)
+    {
+        nbr = &(system->my_nt_nbrs[d]);
+        /* send both messages in dimension d */
+        if( out_bufs[d].cnt )
+        {
+            cnt = 0;
+            for( i = 0; i < out_bufs[d].cnt; ++i )
+            {
+                cnt += A->end[ out_bufs[d].index[i] ] - A->start[ out_bufs[d].index[i] ];
+                if(out_bufs[d].index[i] < 0 || out_bufs[d].index[i] >= A->n)
+                {
+                    fprintf( stdout, "INDEXING ERROR %d > %d\n", out_bufs[d].index[i], A->n );
+                    fflush( stdout );
+                }
+            }
+            fprintf( stdout,"Dist communication    send phase direction %d should  send %d\n", d, cnt);
+            fflush( stdout );
+
+            if( cnt )
+            {
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
+                cnt = 0;
+                for( i = 0; i < out_bufs[d].cnt; ++i )
+                {
+                    for( pj = A->start[ out_bufs[d].index[i] ]; pj < A->end[ out_bufs[d].index[i] ]; ++pj )
+                    {
+                        atom = &system->my_atoms[ A->entries[pj].j ];
+                        j_send[cnt] = atom->orig_id;
+                        val_send[cnt] = A->entries[pj].val;
+                        cnt++;
+                    }
+                }
+
+                fprintf( stdout,"Dist communication    send phase direction %d will    send %d\n", d, cnt );
+                fflush( stdout );
+
+                t_start = MPI_Wtime();
+                MPI_Send( j_send, cnt, MPI_INT, nbr->rank, d, comm );
+                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt);
+                fflush( stdout );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr->rank, d, comm );
+                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt);
+                fflush( stdout );
+                t_comm += MPI_Wtime() - t_start;
+                /* MPI_Send has returned, so the send buffers can be released */
+                free( j_send );
+                free( val_send );
+            }
+        }
+    }
+    fprintf( stdout," Dist communication for sending row info before waitany\n");
+    fflush( stdout );
+
+    for ( d = 0; d < count; ++d )
+    {
+        t_start = MPI_Wtime();
+        MPI_Waitany( 12, req, &index, stat ); /* all request slots, 2 per direction */
+        t_comm += MPI_Wtime() - t_start;
+
+        nbr = &(system->my_nt_nbrs[index/2]);
+        cnt = 0;
+        for( i = nbr->atoms_str; i < (nbr->atoms_str + nbr->atoms_cnt); ++i )
+        {
+            if( (index%2) == 0 )
+            {
+                j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                for( pj = 0; pj < row_nnz[i]; ++pj )
+                {
+                    j_list[i][pj] = j_recv[index/2][cnt];
+                    cnt++;
+                }
+            }
+            else
+            {
+                val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+                for( pj = 0; pj < row_nnz[i]; ++pj )
+                {
+                    val_list[i][pj] = val_recv[index/2][cnt];
+                    cnt++;
+                }
+            }
+        }
+    }
+
+    fprintf( stdout," wow wow wow, Dist communication for sending row info worked\n");
+    fflush( stdout );
+    //TODO: size?
+    X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+    pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+
+    for ( i = 0; i < A_spar_patt->NT; ++i )
+    {
+        N = 0;
+        M = 0;
+        for ( k = 0; k <= system->bigN; ++k )
+        {
+            X[k] = 0;
+            pos_x[k] = 0;
+        }
+
+        /* find column indices of nonzeros (which will be the columns indices of the dense matrix) */
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            j_temp = A_spar_patt->entries[pj].j;
+            atom = &system->my_atoms[j_temp];
+            ++N;
+
+            /* for each of those indices
+             * search through the row of full A of that index */
+            /* the case where the local matrix has that index's row */
+            if( j_temp < A->NT )
+            {
+                for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    atom = &system->my_atoms[ A->entries[k].j ];
+                    X[atom->orig_id] = 1;
+                }
+            }
+
+            /* the case where we communicated that index's row */
+            else
+            {
+                for ( k = 0; k < row_nnz[j_temp]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    X[ j_list[j_temp][k] ] = 1;
+                }
+            }
+        }
+
+        /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
+        identity_pos = M;
+        atom = &system->my_atoms[ i ];
+        atom_pos = atom->orig_id;
+
+        for ( k = 0; k <= system->bigN; k++)
+        {
+            if ( X[k] != 0 )
+            {
+                pos_x[k] = M;
+                if ( k == atom_pos )
+                {
+                    identity_pos = M;
+                }
+                ++M;
+            }
+        }
+
+        /* allocate memory for NxM dense matrix */
+        dense_matrix = (real *) malloc( sizeof(real) * N * M );
+
+        /* fill in the entries of dense matrix */
+        for ( d_j = 0; d_j < N; ++d_j)
+        {
+            /* all rows are initialized to zero */
+            for ( d_i = 0; d_i < M; ++d_i )
+            {
+                dense_matrix[d_i * N + d_j] = 0.0;
+            }
+            /* change the value if any of the column indices is seen */
+            /* it is in the original list */
+            local_pos = A_spar_patt->entries[ A_spar_patt->start[i] + d_j ].j;
+            if( local_pos < 0 || local_pos >= system->N )
+            {
+                fprintf( stderr, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n");
+                fflush( stderr );
+            }
+            if( local_pos < A->NT )
+            {
+                for ( d_i = A->start[local_pos]; d_i < A->end[local_pos]; ++d_i )
+                {
+                    atom = &system->my_atoms[ A->entries[d_i].j ];
+                    if (pos_x[ atom->orig_id ] >= M || d_j >=  N )
+                    {
+                        fprintf( stderr, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, orig_id = %d, i =  %d, j = %d, M = %d N = %d\n", atom->orig_id, pos_x[ atom->orig_id ], d_j, M, N );
+                        fflush( stderr );
+                    }
+                    if ( X[ atom->orig_id ] == 1 )
+                    {
+                        dense_matrix[ pos_x[ atom->orig_id ] * N + d_j ] = A->entries[d_i].val;
+                    }
+                }
+            }
+            else
+            {
+                for ( d_i = 0; d_i < row_nnz[ local_pos ]; ++d_i )
+                {
+                    if (pos_x[ j_list[local_pos][d_i] ] >= M || d_j  >= N )
+                    {
+                        fprintf( stderr, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, %d %d\n", pos_x[ j_list[local_pos][d_i] ], d_j);
+                        fflush( stderr );
+                    }
+                    if ( X[ j_list[local_pos][d_i] ] == 1 )
+                    {
+                        dense_matrix[ pos_x[ j_list[local_pos][d_i] ] * N + d_j ] = val_list[local_pos][d_i];
+                    }
+                }
+            }
+        }
+
+        /* create the right hand side of the linear equation
+         * that is the full column of the identity matrix */
+        e_j = (real *) malloc( sizeof(real) * M );
+        for ( k = 0; k < M; ++k )
+        {
+            e_j[k] = 0.0;
+        }
+        e_j[identity_pos] = 1.0;
+
+        /* Solve the overdetermined system AX = B through the least-squares problem:
+         * min ||B - AX||_2 */
+        m = M;
+        n = N;
+        nrhs = 1;
+        lda = N;
+        ldb = nrhs;
+
+        info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                e_j, ldb );
+
+        /* Check for the full rank */
+        if ( info > 0 )
+        {
+            fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
+            fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
+            fprintf( stderr, "the least squares solution could not be computed.\n" );
+            exit( INVALID_INPUT );
+        }
+
+        /* accumulate the resulting vector to build A_app_inv */
+        (*A_app_inv)->start[i] = A_spar_patt->start[i];
+        (*A_app_inv)->end[i] = A_spar_patt->end[i];
+        for ( k = (*A_app_inv)->start[i]; k < (*A_app_inv)->end[i]; ++k)
+        {
+            (*A_app_inv)->entries[k].j = A_spar_patt->entries[k].j;
+            (*A_app_inv)->entries[k].val = e_j[k - A_spar_patt->start[i]];
+        }
+        free( dense_matrix );
+        free( e_j );
+    }
+
+    free( pos_x);
+    free( X );
+    /* release the exchange buffers; slots that were never filled are NULL */
+    for ( d = 0; d < 6; ++d )
+    {
+        free( j_recv[d] );
+        free( val_recv[d] );
+    }
+    for ( i = 0; i < system->N; ++i )
+    {
+        free( j_list[i] );
+        free( val_list[i] );
+    }
+    free( j_list );
+    free( val_list );
+    free( row_nnz );
+
+    MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_comm += total_comm / nprocs;
+    }
+
+    return MPI_Wtime() - start;
+}
+
+
+#else
+real sparse_approx_inverse( reax_system *system, simulation_data *data,
+        storage *workspace, mpi_datatypes *mpi_data, 
+        sparse_matrix *A, sparse_matrix *A_spar_patt,
+        sparse_matrix **A_app_inv, int nprocs )
+{
+    int N, M, d_i, d_j, mark;
+    int i, k, pj, j_temp, push;
+    int local_pos, atom_pos, identity_pos;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *X, *q;
+    real *e_j, *dense_matrix;
+    int size_e, size_dense;
+    int cnt;
+    reax_atom *atom;
+    int *row_nnz;
+    int **j_list;
+    real **val_list;
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2, req3, req4;
+    int flag1, flag2;
+    MPI_Status stat1, stat2, stat3, stat4;
+    const neighbor_proc *nbr1, *nbr2;
+    int *j_send, *j_recv1, *j_recv2;
+    int size_send, size_recv1, size_recv2;
+    real *val_send, *val_recv1, *val_recv2;
+    real start, t_start, t_comm;
+    real total_comm;
+
+    start = MPI_Wtime();
+    t_comm = 0.0;
+
+    comm = mpi_data->world;
+
+    if ( *A_app_inv == NULL)
+    {
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m,
+                SYM_FULL_MATRIX, comm );
+    }
+    else /* if ( (*A_app_inv)->m < A_spar_patt->m ) */
+    {
+        Deallocate_Matrix( *A_app_inv );
+        Allocate_Matrix2( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m,
+                SYM_FULL_MATRIX, comm );
+    }
+
+    X = NULL;
+    j_send = NULL;
+    val_send = NULL;
+    j_recv1 = NULL;
+    j_recv2 = NULL;
+    val_recv1 = NULL;
+    val_recv2 = NULL;
+    size_send = 0;
+    size_recv1 = 0;
+    size_recv2 = 0;
+
+    e_j = NULL;
+    dense_matrix = NULL;
+    size_e = 0;
+    size_dense = 0;
+
+
+    row_nnz = smalloc( sizeof(int) * system->total_cap,
+           "sparse_approx_inverse::row_nnz", MPI_COMM_WORLD );
+    j_list = smalloc( sizeof(int *) * system->N,
+           "sparse_approx_inverse::j_list", MPI_COMM_WORLD );
+    val_list = smalloc( sizeof(real *) * system->N,
+           "sparse_approx_inverse::val_list", MPI_COMM_WORLD );
+
+    for ( i = 0; i < system->total_cap; ++i )
+    {
+        row_nnz[i] = 0;
+    }
+
+    /* mark the atoms that already have their row stored in the local matrix */
+    for ( i = 0; i < system->n; ++i )
+    {
+        row_nnz[i] = A->end[i] - A->start[i];
+    }
+
+    /* Announce the nnz's in each row that will be communicated later */
+    t_start = MPI_Wtime();
+    Dist( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT );
+    t_comm += MPI_Wtime() - t_start;
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+
+    /* use a Dist-like approach to send the row information */
+    for ( d = 0; d < 3; ++d)
+    {
+        flag1 = 0;
+        flag2 = 0;
+        cnt = 0;
+
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+        if ( nbr1->atoms_cnt )
+        {
+            cnt = 0;
+
+            /* calculate the total data that will be received */
+            for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
+            {
+                cnt += row_nnz[i];
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                flag1 = 1;
+                
+                if ( size_recv1 < cnt )
+                {
+                    if ( size_recv1 )
+                    {
+                        sfree( j_recv1, "sparse_approx_inverse::j_recv1" );
+                        sfree( val_recv1, "sparse_approx_inverse::val_recv1" );
+                    }
+
+                    size_recv1 = cnt * SAFE_ZONE;
+
+                    j_recv1 = smalloc( sizeof(int) * size_recv1,
+                            "sparse_approx_inverse::j_recv1", MPI_COMM_WORLD );
+                    val_recv1 = smalloc( sizeof(real) * size_recv1,
+                            "sparse_approx_inverse::val_recv1", MPI_COMM_WORLD );
+                }
+
+                t_start = MPI_Wtime();
+                MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
+                MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
+                t_comm += MPI_Wtime() - t_start;
+            }
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+        if ( nbr2->atoms_cnt )
+        {
+            /* calculate the total data that will be received */
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
+            {
+                cnt += row_nnz[i];
+            }
+
+            /* initiate Irecv */
+            if( cnt )
+            {
+                flag2 = 1;
+
+                if ( size_recv2 < cnt )
+                {
+                    if ( size_recv2 )
+                    {
+                        sfree( j_recv2, "sparse_approx_inverse::j_recv2" );
+                        sfree( val_recv2, "sparse_approx_inverse::val_recv2" );
+                    }
+
+                    size_recv2 = cnt * SAFE_ZONE;
+
+                    j_recv2 = smalloc( sizeof(int) * size_recv2,
+                            "sparse_approx_inverse::j_recv2", MPI_COMM_WORLD );
+                    val_recv2 = smalloc( sizeof(real) * size_recv2,
+                            "sparse_approx_inverse::val_recv2", MPI_COMM_WORLD );
+                }
+
+                t_start = MPI_Wtime();
+                MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
+                MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
+                t_comm += MPI_Wtime() - t_start;
+            }
+        }
+
+        /* send both messages in dimension d */
+        if ( out_bufs[2 * d].cnt )
+        {
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d].index[i] ];
+            }
+
+            if ( cnt > 0 )
+            {
+                if ( size_send < cnt )
+                {
+                    if ( size_send )
+                    {
+                        sfree( j_send, "sparse_approx_inverse::j_send" );
+                        sfree( val_send, "sparse_approx_inverse::val_send" );
+                    }
+
+                    size_send = cnt * SAFE_ZONE;
+
+                    j_send = smalloc( sizeof(int) * size_send,
+                            "sparse_approx_inverse::j_send", MPI_COMM_WORLD );
+                    val_send = smalloc( sizeof(real) * size_send,
+                            "sparse_approx_inverse::j_send", MPI_COMM_WORLD );
+                }
+
+                cnt = 0;
+                for ( i = 0; i < out_bufs[2 * d].cnt; ++i )
+                {
+                    if ( out_bufs[2 * d].index[i] < A->n )
+                    {
+                        for ( pj = A->start[ out_bufs[2 * d].index[i] ]; pj < A->end[ out_bufs[2 * d].index[i] ]; ++pj )
+                        {
+                            atom = &system->my_atoms[ A->entries[pj].j ];
+                            j_send[cnt] = atom->orig_id;
+                            val_send[cnt] = A->entries[pj].val;
+                            cnt++;
+                        }
+                    }
+                    else
+                    {
+                        for ( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj )
+                        {
+                            j_send[cnt] = j_list[ out_bufs[2 * d].index[i] ][pj];
+                            val_send[cnt] = val_list[ out_bufs[2 * d].index[i] ][pj];
+                            cnt++;
+                        }
+                    }
+                }
+
+                t_start = MPI_Wtime();
+                MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
+                t_comm += MPI_Wtime() - t_start;
+            }
+        }
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ];
+            }
+
+            if ( cnt > 0 )
+            {
+
+                if ( size_send < cnt )
+                {
+                    if ( size_send )
+                    {
+                        sfree( j_send, "sparse_approx_inverse::j_send" );
+                        sfree( val_send, "sparse_approx_inverse::j_send" );
+                    }
+
+                    size_send = cnt * SAFE_ZONE;
+
+                    j_send = smalloc( sizeof(int) * size_send,
+                            "sparse_approx_inverse::j_send", MPI_COMM_WORLD );
+                    val_send = smalloc( sizeof(real) * size_send,
+                            "sparse_approx_inverse::val_send", MPI_COMM_WORLD );
+                }
+
+                cnt = 0;
+                for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+                {
+                    if ( out_bufs[2 * d + 1].index[i] < A->n )
+                    {
+                        for ( pj = A->start[ out_bufs[2 * d + 1].index[i] ]; pj < A->end[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                        {
+                            atom = &system->my_atoms[ A->entries[pj].j ];
+                            j_send[cnt] = atom->orig_id;
+                            val_send[cnt] = A->entries[pj].val;
+                            cnt++;
+                        }
+                    }
+                    else
+                    {
+                        for ( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                        {
+                            j_send[cnt] = j_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                            val_send[cnt] = val_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                            cnt++;
+                        }
+                    }
+                }
+
+                t_start = MPI_Wtime();
+                MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm );
+                t_comm += MPI_Wtime() - t_start;
+            }
+
+        }
+
+        if ( flag1 )
+        {
+            t_start = MPI_Wtime();
+            MPI_Wait( &req1, &stat1 );
+            MPI_Wait( &req2, &stat2 );
+            t_comm += MPI_Wtime() - t_start;
+
+            cnt = 0;
+            for ( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
+            {
+                j_list[i] = smalloc( sizeof(int) *  row_nnz[i],
+                       "sparse_approx_inverse::j_list[i]", MPI_COMM_WORLD );
+                val_list[i] = smalloc( sizeof(real) * row_nnz[i],
+                       "sparse_approx_inverse::val_list[i]", MPI_COMM_WORLD );
+
+                for ( pj = 0; pj < row_nnz[i]; ++pj )
+                {
+                    j_list[i][pj] = j_recv1[cnt];
+                    val_list[i][pj] = val_recv1[cnt];
+                    cnt++;
+                }
+            }
+        }
+
+        if ( flag2 )
+        {
+            t_start = MPI_Wtime();
+            MPI_Wait( &req3, &stat3 );
+            MPI_Wait( &req4, &stat4 );
+            t_comm += MPI_Wtime() - t_start;
+
+            cnt = 0;
+            for ( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
+            {
+                j_list[i] = smalloc( sizeof(int) *  row_nnz[i],
+                       "sparse_approx_inverse::j_list[i]", MPI_COMM_WORLD );
+                val_list[i] = smalloc( sizeof(real) * row_nnz[i],
+                       "sparse_approx_inverse::val_list[i]", MPI_COMM_WORLD );
+
+                for ( pj = 0; pj < row_nnz[i]; ++pj )
+                {
+                    j_list[i][pj] = j_recv2[cnt];
+                    val_list[i][pj] = val_recv2[cnt];
+                    cnt++;
+                }
+            }
+        }
+    }
+
+    sfree( j_send, "sparse_approx_inverse::j_send" );
+    sfree( val_send, "sparse_approx_inverse::val_send" );
+    sfree( j_recv1, "sparse_approx_inverse::j_recv1" );
+    sfree( j_recv2, "sparse_approx_inverse::j_recv2" );
+    sfree( val_recv1, "sparse_approx_inverse::val_recv1" );
+    sfree( val_recv2, "sparse_approx_inverse::val_recv2" );
+
+    X = smalloc( sizeof(int) * (system->bigN + 1),
+            "sparse_approx_inverse::X", MPI_COMM_WORLD );
+    // The size of q should equal the maximum possible cardinality
+    // of the set formed by the neighbors of neighbors of an atom,
+    // i.e., the maximum number of rows of the dense matrix.
+    // For water systems, this number is 34000;
+    // for silica systems, it is 12000.
+    q = smalloc( sizeof(int) * 50000,
+            "sparse_approx_inverse::q", MPI_COMM_WORLD );
+
+    for ( i = 0; i <= system->bigN; ++i )
+    {
+        X[i] = -1;
+    }
+
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        N = 0;
+        M = 0;
+        push = 0;
+        mark = i + system->bigN;
+        
+        /* find the column indices of the nonzeros (which will be the column indices of the dense matrix) */
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            j_temp = A_spar_patt->entries[pj].j;
+            atom = &system->my_atoms[j_temp];
+            ++N;
+
+            /* for each of those indices,
+             * search through the row of the full matrix A with that index */
+
+            /* the case where the local matrix has that index's row */
+            if( j_temp < A->n )
+            {
+                for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    atom = &system->my_atoms[ A->entries[k].j ];
+                    X[atom->orig_id] = mark;
+                    q[push++] = atom->orig_id;
+                }
+            }
+
+            /* the case where we communicated that index's row */
+            else
+            {
+                for ( k = 0; k < row_nnz[j_temp]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    X[ j_list[j_temp][k] ] = mark;
+                    q[push++] = j_list[j_temp][k];
+                }
+            }
+        }
+
+        /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
+        identity_pos = M;
+        atom = &system->my_atoms[ i ];
+        atom_pos = atom->orig_id;
+
+        for ( k = 0; k < push; k++)
+        {
+            if ( X[ q[k] ] == mark )
+            {
+                X[ q[k] ] = M;
+                ++M;
+            }
+        }
+        identity_pos = X[atom_pos];
+
+        /* allocate memory for NxM dense matrix */
+        if ( size_dense < N * M )
+        {
+            if ( size_dense )
+            {
+                sfree( dense_matrix, "sparse_approx_inverse::dense_matrix" );
+            }
+            
+            size_dense = N * M * SAFE_ZONE;
+
+            dense_matrix = smalloc( sizeof(real) * size_dense,
+                "sparse_approx_inverse::dense_matrix", MPI_COMM_WORLD );
+        }
 
-#if defined(CG_PERFORMANCE)
-real t_start, t_elapsed, matvec_time, dot_time;
-#endif
+        /* fill in the entries of dense matrix */
+        for ( d_j = 0; d_j < N; ++d_j)
+        {
+            /* all rows are initialized to zero */
+            for ( d_i = 0; d_i < M; ++d_i )
+            {
+                dense_matrix[d_i * N + d_j] = 0.0;
+            }
+            /* change the value if any of the column indices is seen */
 
+            /* it is in the original list */
+            local_pos = A_spar_patt->entries[ A_spar_patt->start[i] + d_j ].j;
 
-void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
-{
-    int  i, j, k, si;
-    real H;
+            if ( local_pos < A->n )
+            {
+                for ( d_i = A->start[local_pos]; d_i < A->end[local_pos]; ++d_i )
+                {
+                    atom = &system->my_atoms[ A->entries[d_i].j ];
+                    dense_matrix[ X[ atom->orig_id ] * N + d_j ] = A->entries[d_i].val;
+                }
+            }
+            else
+            {
+                for ( d_i = 0; d_i < row_nnz[ local_pos ]; ++d_i )
+                {
+                    dense_matrix[ X[ j_list[local_pos][d_i] ] * N + d_j ] = val_list[local_pos][d_i];
+                }
+            }
+        }
 
-    for ( i = 0; i < N; ++i )
-    {
-        b[i][0] = b[i][1] = 0;
-    }
+        /* create the right-hand side of the linear system,
+         * which is the corresponding full column of the identity matrix */
+        if ( size_e < M )
+        {
+            if ( size_e )
+            {
+                sfree( e_j, "sparse_approx_inverse::e_j" );
+            }
 
-    /* perform multiplication */
-    for ( i = 0; i < A->n; ++i )
-    {
-        si = A->start[i];
-        b[i][0] += A->entries[si].val * x[i][0];
-        b[i][1] += A->entries[si].val * x[i][1];
+            size_e = M * SAFE_ZONE;
 
-        for ( k = si + 1; k < A->end[i]; ++k )
+            e_j = smalloc( sizeof(real) * size_e, "sparse_approx_inverse::e_j", MPI_COMM_WORLD );
+        }
+
+        for ( k = 0; k < M; ++k )
         {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
+            e_j[k] = 0.0;
+        }
+        e_j[identity_pos] = 1.0;
+
+        /* Solve the overdetermined system AX = B through the least-squares problem:
+         * min ||B - AX||_2 */
+        m = M;
+        n = N;
+        nrhs = 1;
+        lda = N;
+        ldb = nrhs;
 
-            b[i][0] += H * x[j][0];
-            b[i][1] += H * x[j][1];
+        info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                e_j, ldb );
 
-            // comment out for tryQEq
-            //if( j < A->n ) {
-            b[j][0] += H * x[i][0];
-            b[j][1] += H * x[i][1];
-            //}
+        /* Check for full rank */
+        if ( info > 0 )
+        {
+            fprintf( stderr, "[ERROR] The diagonal element %i of the triangular factor ", info );
+            fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
+            fprintf( stderr, "the least squares solution could not be computed.\n" );
+            MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
+        }
+
+        /* accumulate the resulting vector to build A_app_inv */
+        (*A_app_inv)->start[i] = A_spar_patt->start[i];
+        (*A_app_inv)->end[i] = A_spar_patt->end[i];
+        for ( k = (*A_app_inv)->start[i]; k < (*A_app_inv)->end[i]; ++k)
+        {
+            (*A_app_inv)->entries[k].j = A_spar_patt->entries[k].j;
+            (*A_app_inv)->entries[k].val = e_j[k - A_spar_patt->start[i]];
         }
     }
+
+    sfree( dense_matrix, "sparse_approx_inverse::dense_matrix" );
+    sfree( e_j, "sparse_approx_inverse::e_j" );
+    sfree( X, "sparse_approx_inverse::X" );
+    /*for ( i = 0; i < system->N; ++i )
+    {
+        sfree( j_list[i], "sparse_approx_inverse::j_list" );
+        sfree( val_list[i], "sparse_approx_inverse::val_list" );
+    }
+    sfree( j_list, "sparse_approx_inverse::j_list" );
+    sfree( val_list, "sparse_approx_inverse::val_list" );*/
+    sfree( row_nnz, "sparse_approx_inverse::row_nnz" );
+
+    MPI_Reduce( &t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE,
+            mpi_data->world );
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_comm += total_comm / nprocs;
+    }
+
+    return MPI_Wtime() - start;
 }
+#endif
+#endif
 
 
-int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
+int dual_CG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, rvec2 *b,
+        real tol, rvec2 *x, mpi_datatypes* mpi_data )
 {
-    int  i, j, n, N, matvecs, scale;
+    int  i, j;
     rvec2 tmp, alpha, beta;
-    rvec2 my_sum, norm_sqr, b_norm, my_dot;
+    rvec2 norm, b_norm;
     rvec2 sig_old, sig_new;
-    MPI_Comm comm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[6];
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, x, workspace->q2, H->NT );
+#else
+    dual_Sparse_MatVec( H, x, workspace->q2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    n = system->n;
-    N = system->N;
-    comm = mpi_data->world;
-    matvecs = 0;
-    scale = sizeof(rvec2) / sizeof(void);
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        matvecs = 0;
-        t_start = matvec_time = dot_time = 0;
-        t_start = Get_Time( );
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
     }
-#endif
-
-    Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
-    dual_Sparse_MatVec( H, x, workspace->q2, N );
-    // tryQEq
-    Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+#if defined(NEUTRAL_TERRITORY)
+    else
     {
-        Update_Timing_Info( &t_start, &matvec_time );
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
     }
 #endif
 
+    t_start = MPI_Wtime( );
     for ( j = 0; j < system->n; ++j )
     {
-        /* residual */
+        // residual
         workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
         workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
-        /* apply diagonal pre-conditioner */
-        workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-        workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
     }
+    t_vops += MPI_Wtime( ) - t_start;
 
-    /* norm of b */
-    my_sum[0] = my_sum[1] = 0;
-    for ( j = 0; j < n; ++j )
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
-        my_sum[0] += SQR( b[j][0] );
-        my_sum[1] += SQR( b[j][1] );
+         t_start = MPI_Wtime( );
+         Dist( system, mpi_data, workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+         t_comm += MPI_Wtime( ) - t_start;
+
+         t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+         dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->d2, H->NT );
+#else
+         dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->d2, system->n );
+#endif
+         t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC)
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+            workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
     }
-    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
-    b_norm[0] = sqrt( norm_sqr[0] );
-    b_norm[1] = sqrt( norm_sqr[1] );
-    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
 
-    /* dot product: r.d */
-    my_dot[0] = my_dot[1] = 0;
-    for ( j = 0; j < n; ++j )
+    t_start = MPI_Wtime( );
+    for ( j = 0; j < 6; ++j )
+    {
+        redux[j] = 0;
+    }
+    for ( j = 0; j < system->n; ++j )
     {
-        my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-        my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
+        redux[0] += workspace->r2[j][0] * workspace->d2[j][0];
+        redux[1] += workspace->r2[j][1] * workspace->d2[j][1];
+        
+        redux[2] += workspace->d2[j][0] * workspace->d2[j][0];
+        redux[3] += workspace->d2[j][1] * workspace->d2[j][1];
+
+        redux[4] += b[j][0] * b[j][0];
+        redux[5] += b[j][1] * b[j][1];
     }
-    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-    //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
+    t_vops += MPI_Wtime( ) - t_start;
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &dot_time );
-#endif
+    t_start = MPI_Wtime( );
+    MPI_Allreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+    t_allreduce += MPI_Wtime( ) - t_start;
+
+    sig_new[0] = redux[0];
+    sig_new[1] = redux[1];
+    norm[0] = sqrt( redux[2] );
+    norm[1] = sqrt( redux[3] );
+    b_norm[0] = sqrt( redux[4] );
+    b_norm[1] = sqrt( redux[5] );
 
-    for ( i = 1; i < 300; ++i )
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
     {
-        Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
-        dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
-        // tryQEq
-        Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+        if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->d2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
 
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( H, workspace->d2, workspace->q2, H->NT );
+#else
+        dual_Sparse_MatVec( H, workspace->d2, workspace->q2, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
+
+        if ( H->format == SYM_HALF_MATRIX )
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#if defined(NEUTRAL_TERRITORY)
+        else
         {
-            Update_Timing_Info( &t_start, &matvec_time );
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
         }
 #endif
 
-        /* dot product: d.q */
-        my_dot[0] = my_dot[1] = 0;
-        for ( j = 0; j < n; ++j )
+        // dot product: d.q
+        t_start =  MPI_Wtime( );
+        redux[0] = redux[1] = 0;
+        for ( j = 0; j < system->n; ++j )
         {
-            my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
-            my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
+            redux[0] += workspace->d2[j][0] * workspace->q2[j][0];
+            redux[1] += workspace->d2[j][1] * workspace->q2[j][1];
         }
-        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
+        t_vops += MPI_Wtime( ) - t_start;
+
+        t_start =  MPI_Wtime( );
+        MPI_Allreduce( &redux, &tmp, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+        t_allreduce += MPI_Wtime( ) - t_start;
 
+        t_start = MPI_Wtime( );
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
-        my_dot[0] = my_dot[1] = 0;
         for ( j = 0; j < system->n; ++j )
         {
-            /* update x */
+            // update x
             x[j][0] += alpha[0] * workspace->d2[j][0];
             x[j][1] += alpha[1] * workspace->d2[j][1];
-            /* update residual */
+            // update residual
             workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
             workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-            /* apply diagonal pre-conditioner */
-            workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-            workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-            /* dot product: r.p */
-            my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
-            my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
         }
-        sig_old[0] = sig_new[0];
-        sig_old[1] = sig_new[1];
-        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
+        t_vops += MPI_Wtime( ) - t_start;
 
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
+        if ( control->cm_solver_pre_comp_type == SAI_PC )
         {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
+             t_start = MPI_Wtime( );
+             Dist( system, mpi_data, workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+             t_comm += MPI_Wtime( ) - t_start;
+
+             t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+             dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->p2, H->NT );
+#else
+             dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->p2, system->n );
 #endif
+             t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC)
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+                workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
 
-        if ( sqrt(sig_new[0]) / b_norm[0] <= tol || sqrt(sig_new[1]) / b_norm[1] <= tol )
+        t_start = MPI_Wtime( );
+        redux[0] = 0.0;
+        redux[1] = 0.0;
+        redux[2] = 0.0;
+        redux[3] = 0.0;
+        for ( j = 0; j < system->n; ++j )
         {
-            break;
+            // dot product: r.p
+            redux[0] += workspace->r2[j][0] * workspace->p2[j][0];
+            redux[1] += workspace->r2[j][1] * workspace->p2[j][1];
+
+            // dot product: p.p
+            redux[2] += workspace->p2[j][0] * workspace->p2[j][0];
+            redux[3] += workspace->p2[j][1] * workspace->p2[j][1];
         }
+        t_vops += MPI_Wtime( ) - t_start;
 
+        t_start = MPI_Wtime( );
+        MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+        sig_old[0] = sig_new[0];
+        sig_old[1] = sig_new[1];
+        sig_new[0] = redux[0];
+        sig_new[1] = redux[1];
+        norm[0] = sqrt( redux[2] );
+        norm[1] = sqrt( redux[3] );
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
         for ( j = 0; j < system->n; ++j )
         {
-            /* d = p + beta * d */
+            // d = p + beta * d
             workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
             workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
         }
+        t_vops += MPI_Wtime( ) - t_start;
+    }
+
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
+    
+    if ( system->my_rank == MASTER_NODE )
+    {
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+ 
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
+    {
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
     }
 
-    if ( sqrt(sig_new[0]) / b_norm[0] <= tol )
+    // continue to solve the system that has not converged yet
+    if ( norm[0] / b_norm[0] > tol )
     {
-        for ( j = 0; j < n; ++j )
+        for ( j = 0; j < system->n; ++j )
         {
-            workspace->t[j] = workspace->x[j][1];
+            workspace->s[j] = workspace->x[j][0];
         }
-        matvecs = CG( system, workspace, H, workspace->b_t, tol,
-                workspace->t,mpi_data, fout );
-        for ( j = 0; j < n; ++j )
+
+        i += CG( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
         {
-            workspace->x[j][1] = workspace->t[j];
+            workspace->x[j][0] = workspace->s[j];
         }
     }
-    else if ( sqrt(sig_new[1]) / b_norm[1] <= tol )
+    else if ( norm[1] / b_norm[1] > tol )
     {
-        for ( j = 0; j < n; ++j )
+        for ( j = 0; j < system->n; ++j )
         {
-            workspace->s[j] = workspace->x[j][0];
+            workspace->t[j] = workspace->x[j][1];
         }
-        matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-                mpi_data, fout );
+
+        i += CG( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data );
+
         for ( j = 0; j < system->n; ++j )
         {
-            workspace->x[j][0] = workspace->s[j];
+            workspace->x[j][1] = workspace->t[j];
         }
     }
 
-    if ( i >= 300 )
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "CG convergence failed!\n" );
+        fprintf( stderr, "[WARNING] CG convergence failed!\n" );
+        return i;
     }
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n", i + 1,
-                matvecs, matvec_time, dot_time );
-    }
-#endif
+    return i;
 
-    return (i + 1) + matvecs;
 }
 
 
-void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
+/* Preconditioned Conjugate Gradient Method */
+int CG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data )
 {
-    int  i, j, k, si;
-    real H;
+    int i, j;
+    real tmp, alpha, beta, norm, b_norm;
+    real sig_old, sig_new;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[3];
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, x, workspace->q, H->NT );
+#else
+    Sparse_MatVec( H, x, workspace->q, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    for ( i = 0; i < N; ++i )
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
     {
-        b[i] = 0;
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
     }
+#endif
 
-    /* perform multiplication */
-    for ( i = 0; i < A->n; ++i )
+    t_start = MPI_Wtime( );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
+    t_vops += MPI_Wtime( ) - t_start;
+
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT );
+#else
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC)
     {
-        si = A->start[i];
-        b[i] += A->entries[si].val * x[i];
-        for ( k = si + 1; k < A->end[i]; ++k )
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
         {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
-            b[i] += H * x[j];
-            //if( j < A->n ) // comment out for tryQEq
-            b[j] += H * x[i];
+            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
         }
+        t_pa += MPI_Wtime( ) - t_start;
     }
-}
 
+    t_start = MPI_Wtime( );
+    redux[0] = Dot_local( workspace->r, workspace->d, system->n );
+    redux[1] = Dot_local( workspace->d, workspace->d, system->n );
+    redux[2] = Dot_local( b, b, system->n );
+    t_vops += MPI_Wtime( ) - t_start;
 
-/* sparse matrix-vector product Ax = b
- * where:
- *   A: matrix, stored in CSR format
- *   x: vector
- *   b: vector (result) */
-static void Sparse_MatVec_full( const sparse_matrix * const A,
-                                const real * const x, real * const b )
-{
-    //TODO: implement full SpMV in MPI
-//    int i, pj;
-//
-//    Vector_MakeZero( b, A->n );
-//
-//#ifdef _OPENMP
-//    #pragma omp for schedule(static)
-//#endif
-//    for ( i = 0; i < A->n; ++i )
-//    {
-//        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-//        {
-//            b[i] += A->val[pj] * x[A->j[pj]];
-//        }
-//    }
+    t_start = MPI_Wtime( );
+    MPI_Allreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+    t_allreduce += MPI_Wtime( ) - t_start;
+    sig_new = redux[0];
+    norm = sqrt( redux[1] );
+    b_norm = sqrt( redux[2] );
+
+    for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( H, workspace->d, workspace->q, H->NT );
+#else
+        Sparse_MatVec( H, workspace->d, workspace->q, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
+
+        if ( H->format == SYM_HALF_MATRIX )
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#if defined(NEUTRAL_TERRITORY)
+        else
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#endif
+
+        t_start =  MPI_Wtime( );
+        tmp = Parallel_Dot( workspace->d, workspace->q, system->n, mpi_data->world );
+        t_allreduce += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+        alpha = sig_new / tmp;
+        Vector_Add( x, alpha, workspace->d, system->n );
+        Vector_Add( workspace->r, -alpha, workspace->q, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
+
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT );
+#else
+            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n );
+#endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+
+        t_start = MPI_Wtime( );
+        redux[0] = Dot_local( workspace->r, workspace->p, system->n );
+        redux[1] = Dot_local( workspace->p, workspace->p, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+        MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        sig_old = sig_new;
+        sig_new = redux[0];
+        norm = sqrt( redux[1] );
+
+        t_start = MPI_Wtime( );
+        beta = sig_new / sig_old;
+        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
+    }
+
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
+    {
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+    }
+
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] CG convergence failed!\n" );
+        return i;
+    }
+
+    return i;
 }
 
 
-int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
-        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+/* Pipelined Preconditioned Conjugate Gradient Method
+ *
+ * References:
+ * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
+ *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
+ * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
+ *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
+ *  for High Performance Computing, Networking, Storage and Analysis, 2016.
+ *  */
+int dual_PIPECG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, rvec2 *b,
+        real tol, rvec2 *x, mpi_datatypes* mpi_data )
 {
-    int  i, j, scale;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
+    int i, j;
+    rvec2 alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[8];
+    MPI_Request req;
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, x, workspace->u2, H->NT );
+#else
+    dual_Sparse_MatVec( H, x, workspace->u2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        t_start = matvec_time = dot_time = 0;
-        t_start = Get_Time( );
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
     }
 #endif
 
-    scale = sizeof(real) / sizeof(void);
-    Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
-    Sparse_MatVec( H, x, workspace->q, system->N );
-    // tryQEq
-    Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
+    t_start = MPI_Wtime( );
+    //Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );
+    for ( j = 0; j < system->n; ++j )
+    {
+        workspace->r2[j][0] = b[j][0] - workspace->u2[j][0];
+        workspace->r2[j][1] = b[j][1] - workspace->u2[j][1];
+    }
+    t_vops += MPI_Wtime( ) - t_start;
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        //Vector_Copy( workspace->u, workspace->r, system->n );
+        for ( j = 0; j < system->n ; ++j )
+        {
+            workspace->u2[j][0] = workspace->r2[j][0];
+            workspace->u2[j][1] = workspace->r2[j][1];
+        }
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+            workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
-        Update_Timing_Info( &t_start, &matvec_time );
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, H->NT );
+#else
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
     }
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, workspace->u2, workspace->w2, H->NT );
+#else
+    dual_Sparse_MatVec( H, workspace->u2, workspace->w2, system->N );
 #endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
 
-    for ( j = 0; j < system->n; ++j )
+    t_start = MPI_Wtime( );
+    //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+    //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+    //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+    //redux[3] = Dot_local( b, b, system->n );
+    for ( j = 0; j < 8; ++j )
     {
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+        redux[j] = 0.0;
     }
-    //TODO: apply SAI preconditioner here, comment out diagonal preconditioning above
-//    Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
+    for( j = 0; j < system->n; ++j )
+    {
+        redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
+        redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
 
-    b_norm = Parallel_Norm( b, system->n, mpi_data->world );
-    sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
-    sig0 = sig_new;
+        redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
+        redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+        redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
+        redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
+
+        redux[6] += b[j][0] * b[j][0];
+        redux[7] += b[j][1] * b[j][1];
+    }
+    t_vops += MPI_Wtime( ) - t_start;
+
+    MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
     {
-        Update_Timing_Info( &t_start, &dot_time );
+        //Vector_Copy( workspace->m, workspace->w, system->n );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m2[j][0] = workspace->w2[j][0];
+            workspace->m2[j][1] = workspace->w2[j][1];
+        }
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
+            workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
     }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
+#else
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT );
+#else
+    dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N );
 #endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
-        Sparse_MatVec( H, workspace->d, workspace->q, system->N );
-        //tryQEq
-        Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
+
+    t_start = MPI_Wtime( );
+    MPI_Wait( &req, MPI_STATUS_IGNORE );
+    t_allreduce += MPI_Wtime( ) - t_start;
+    delta[0] = redux[0];
+    delta[1] = redux[1];
+    gamma_new[0] = redux[2];
+    gamma_new[1] = redux[3];
+    norm[0] = sqrt( redux[4] );
+    norm[1] = sqrt( redux[5] );
+    b_norm[0] = sqrt( redux[6] );
+    b_norm[1] = sqrt( redux[7] );
+
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
+        if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+        if ( i > 0 )
+        {
+            beta[0] = gamma_new[0] / gamma_old[0];
+            beta[1] = gamma_new[1] / gamma_old[1];
+            alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
+            alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
+        }
+        else
+        {
+            beta[0] = 0.0;
+            beta[1] = 0.0;
+            alpha[0] = gamma_new[0] / delta[0];
+            alpha[1] = gamma_new[1] / delta[1];
+        }
+
+        t_start = MPI_Wtime( );
+        //Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        //Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        //Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        //Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        //Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        //Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        //Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        //Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
+        //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+        //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+        //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+        for ( j = 0; j < 6; ++j )
+        {
+            redux[j] = 0.0;
+        }
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0];
+            workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1];
 
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
+            workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0];
+            workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1];
+
+            workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0];
+            workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1];
+
+            workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0];
+            workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1];
+
+            x[j][0] += alpha[0] * workspace->p2[j][0];
+            x[j][1] += alpha[1] * workspace->p2[j][1];
+
+            workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0];
+            workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1];
+
+            workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0];
+            workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1];
+
+            workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0];
+            workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1];
+
+            redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
+            redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
+            
+            redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
+            redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
+            
+            redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
+            redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
+
+        }
+        t_vops += MPI_Wtime( ) - t_start;
+
+        MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            //Vector_Copy( workspace->m, workspace->w, system->n );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m2[j][0] = workspace->w2[j][0];
+                workspace->m2[j][1] = workspace->w2[j][1];
+            }
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
         {
-            Update_Timing_Info( &t_start, &matvec_time );
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
+                workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
         }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+            
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
+#else
+            dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
 #endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
 
-        tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
-        alpha = sig_new / tmp;
-        Vector_Add( x, alpha, workspace->d, system->n );
-        Vector_Add( workspace->r, -alpha, workspace->q, system->n );
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2);
+        t_comm += MPI_Wtime( ) - t_start;
 
-        /* pre-conditioning */
-        for ( j = 0; j < system->n; ++j )
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT );
+#else
+        dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
+
+        if ( H->format == SYM_HALF_MATRIX )
         {
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
         }
-        //TODO: apply SAI preconditioner here, comment out diagonal preconditioning above
-//        Sparse_MatVec_full( workspace->H_app_inv, workspace->r, workspace->d );
-
-        sig_old = sig_new;
-        sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
+#if defined(NEUTRAL_TERRITORY)
+        else
         {
-            Update_Timing_Info( &t_start, &dot_time );
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2);
+            t_comm += MPI_Wtime( ) - t_start;
         }
 #endif
-    }
 
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
+        gamma_old[0] = gamma_new[0];
+        gamma_old[1] = gamma_new[1];
+
+        t_start = MPI_Wtime( );
+        MPI_Wait( &req, MPI_STATUS_IGNORE );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        delta[0] = redux[0];
+        delta[1] = redux[1];
+        gamma_new[0] = redux[2];
+        gamma_new[1] = redux[3];
+        norm[0] = sqrt( redux[4] );
+        norm[1] = sqrt( redux[5] );
     }
 
-#if defined(CG_PERFORMANCE)
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
+
     if ( system->my_rank == MASTER_NODE )
     {
-        fprintf( fout, "QEq %d iters. matvecs: %f  dot: %f\n", i, matvec_time,
-                dot_time );
-    }
-#endif
-
-    return i;
-}
-
-
-int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
-        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
-{
-    int  i, j, scale;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
 
-    scale = sizeof(real) / sizeof(void);
-    b_norm = Parallel_Norm( b, system->n, mpi_data->world );
-#if defined(DEBUG)
-    if ( system->my_rank == MASTER_NODE )
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
     {
-        fprintf( stderr, "n=%d, N=%d\n", system->n, system->N );
-        fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm );
-        //Vector_Print( stderr, "d", workspace->d, system->N );
-        //Vector_Print( stderr, "q", workspace->q, system->N );
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
     }
-    MPI_Barrier( mpi_data->world );
-#endif
 
-    Sparse_MatVec( H, x, workspace->q, system->N );
-    //Coll( system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker );
-
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
-    for ( j = 0; j < system->n; ++j )
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
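+    // continue to solve the system that has not converged yet; NOTE(review): if both are still unconverged (max iters reached) only the first is continued, and workspace->x / b_s / b_t are used rather than the x and b arguments -- confirm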
+    if ( norm[0] / b_norm[0] > tol )
+    {
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
 
-    sig_new = Parallel_Dot( workspace->r, workspace->d, system->n,
-                            mpi_data->world );
-    sig0 = sig_new;
-#if defined(DEBUG)
-    //if( system->my_rank == MASTER_NODE ) {
-    fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
-             system->my_rank, sqrt(sig_new),
-             Parallel_Norm(workspace->d, system->n, mpi_data->world),
-             Parallel_Norm(workspace->q, system->n, mpi_data->world) );
-    //Vector_Print( stderr, "d", workspace->d, system->N );
-    //Vector_Print( stderr, "q", workspace->q, system->N );
-    //}
-    MPI_Barrier( mpi_data->world );
-#endif
+        i += PIPECG( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data );
 
-    for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i )
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+    }
+    else if ( norm[1] / b_norm[1] > tol )
     {
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-            t_start = Get_Time( );
-#endif
-        Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
-        Sparse_MatVec( H, workspace->d, workspace->q, system->N );
-        //tryQEq
-        //Coll(system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker);
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
+        for ( j = 0; j < system->n; ++j )
         {
-            t_elapsed = Get_Timing_Info( t_start );
-            matvec_time += t_elapsed;
+            workspace->t[j] = workspace->x[j][1];
         }
-#endif
 
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-            t_start = Get_Time( );
-#endif
-        tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
-        alpha = sig_new / tmp;
-#if defined(DEBUG)
-        //if( system->my_rank == MASTER_NODE ){
-        fprintf(stderr,
-                "p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n",
-                system->my_rank, i,
-                //Parallel_Norm(workspace->d, system->n, mpi_data->world),
-                //Parallel_Norm(workspace->q, system->n, mpi_data->world),
-                Norm(workspace->d, system->n), Norm(workspace->q, system->n), tmp);
-        //Vector_Print( stderr, "d", workspace->d, system->N );
-        //for( j = 0; j < system->N; ++j )
-        //  fprintf( stdout, "%d  %24.15e\n",
-        //     system->my_atoms[j].orig_id, workspace->q[j] );
-        //fprintf( stdout, "\n" );
-        //}
-        MPI_Barrier( mpi_data->world );
-#endif
+        i += PIPECG( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data );
 
-        Vector_Add( x, alpha, workspace->d, system->n );
-        Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        /* pre-conditioning */
         for ( j = 0; j < system->n; ++j )
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-        sig_old = sig_new;
-        sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
-#if defined(DEBUG)
-        if ( system->my_rank == MASTER_NODE )
-            fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
-                    system->my_rank, i, sqrt(sig_new) );
-        MPI_Barrier( mpi_data->world );
-#endif
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
         {
-            t_elapsed = Get_Timing_Info( t_start );
-            dot_time += t_elapsed;
+            workspace->x[j][1] = workspace->t[j];
         }
-#endif
     }
 
-#if defined(DEBUG)
-    if ( system->my_rank == MASTER_NODE )
-        fprintf( stderr, "CG took %d iterations\n", i );
-#endif
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-        fprintf( stderr, "%f  %f\n", matvec_time, dot_time );
-#endif
-    if ( i >= 300 )
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "CG convergence failed!\n" );
+        fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
         return i;
     }
 
@@ -536,506 +2580,555 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
 }
 
 
-void Forward_Subs( sparse_matrix *L, real *b, real *y )
+/* Pipelined Preconditioned Conjugate Gradient Method
+ *
+ * References:
+ * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
+ *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
+ * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
+ *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
+ *  for High Performance Computing, Networking, Storage and Analysis, 2016.
+ *  */
+int PIPECG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data )
 {
-    int i, pj, j, si, ei;
-    real val;
+    int i, j;
+    real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[4];
+    MPI_Request req;
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, x, workspace->u, H->NT );
+#else
+    Sparse_MatVec( H, x, workspace->u, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    for ( i = 0; i < L->n; ++i )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        y[i] = b[i];
-        si = L->start[i];
-        ei = L->end[i];
-        for ( pj = si; pj < ei - 1; ++pj )
-        {
-            j = L->entries[pj].j;
-            val = L->entries[pj].val;
-            y[i] -= val * y[j];
-        }
-        y[i] /= L->entries[pj].val;
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
     }
-}
-
-
-void Backward_Subs( sparse_matrix *U, real *y, real *x )
-{
-    int i, pj, j, si, ei;
-    real val;
-
-    for ( i = U->n - 1; i >= 0; --i )
+#if defined(NEUTRAL_TERRITORY)
+    else
     {
-        x[i] = y[i];
-        si = U->start[i];
-        ei = U->end[i];
-        for ( pj = si + 1; pj < ei; ++pj )
-        {
-            j = U->entries[pj].j;
-            val = U->entries[pj].val;
-            x[i] -= val * x[j];
-        }
-        x[i] /= U->entries[si].val;
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
     }
-}
-
-
-int PCG( reax_system *system, storage *workspace,
-         sparse_matrix *H, real *b, real tol,
-         sparse_matrix *L, sparse_matrix *U, real *x,
-         mpi_datatypes* mpi_data, FILE *fout )
-{
-    int  i, me, n, N, scale;
-    real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new;
-    MPI_Comm world;
-
-    me = system->my_rank;
-    n = system->n;
-    N = system->N;
-    world = mpi_data->world;
-    scale = sizeof(real) / sizeof(void);
-    b_norm = Parallel_Norm( b, n, world );
-#if defined(DEBUG_FOCUS)
-    if ( me == MASTER_NODE )
-    {
-        fprintf( stderr, "init_PCG: n=%d, N=%d\n", n, N );
-        fprintf( stderr, "init_PCG: |b|=%24.15e\n", b_norm );
-    }
-    MPI_Barrier( world );
 #endif
 
-    Sparse_MatVec( H, x, workspace->q, N );
-    //Coll( system, workspace, mpi_data, workspace->q );
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, n );
-    r_norm = Parallel_Norm( workspace->r, n, world );
+    t_start = MPI_Wtime( );
+    Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );
+    t_vops += MPI_Wtime( ) - t_start;
 
-    Forward_Subs( L, workspace->r, workspace->d );
-    Backward_Subs( U, workspace->d, workspace->p );
-    sig_new = Parallel_Dot( workspace->r, workspace->p, n, world );
-#if defined(DEBUG_FOCUS)
-    if ( me == MASTER_NODE )
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
     {
-        fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm );
-        fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n",
-                 Parallel_Norm(workspace->d, n, world),
-                 Parallel_Norm(workspace->q, n, world) );
+        Vector_Copy( workspace->u, workspace->r, system->n );
     }
-    MPI_Barrier( world );
-#endif
-
-    for ( i = 1; i < 100 && r_norm / b_norm > tol; ++i )
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
     {
-        Dist( system, mpi_data, workspace->p, MPI_DOUBLE, scale, real_packer );
-        Sparse_MatVec( H, workspace->p, workspace->q, N );
-        // tryQEq
-        //Coll(system,mpi_data,workspace->q, MPI_DOUBLE, real_unpacker);
-        tmp = Parallel_Dot( workspace->q, workspace->p, n, world );
-        alpha = sig_new / tmp;
-        Vector_Add( x, alpha, workspace->p, n );
-#if defined(DEBUG_FOCUS)
-        if ( me == MASTER_NODE )
-            fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
-                    i, Parallel_Norm(workspace->p, n, world),
-                    Parallel_Norm(workspace->q, n, world), tmp );
-        MPI_Barrier( world );
-#endif
-
-        Vector_Add( workspace->r, -alpha, workspace->q, n );
-        r_norm = Parallel_Norm( workspace->r, n, world );
-#if defined(DEBUG_FOCUS)
-        if ( me == MASTER_NODE )
-            fprintf( stderr, "iter%d: res=%.15e\n", i, r_norm );
-        MPI_Barrier( world );
-#endif
-
-        Forward_Subs( L, workspace->r, workspace->d );
-        Backward_Subs( U, workspace->d, workspace->d );
-        sig_old = sig_new;
-        sig_new = Parallel_Dot( workspace->r, workspace->d, n, world );
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, n );
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
     }
-
-#if defined(DEBUG_FOCUS)
-    if ( me == MASTER_NODE )
-        fprintf( stderr, "PCG took %d iterations\n", i );
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, H->NT );
+#else
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, system->n );
 #endif
-    if ( i >= 100 )
-        fprintf( stderr, "PCG convergence failed!\n" );
-
-    return i;
-}
+        t_pa += MPI_Wtime( ) - t_start;
+    }
 
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
 
-#if defined(OLD_STUFF)
-int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
-         real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
-{
-    int  i, j;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, workspace->u, workspace->w, H->NT );
+#else
+    Sparse_MatVec( H, workspace->u, workspace->w, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    b_norm = Norm( b, system->n );
-#if defined(DEBUG)
-    if ( system->my_rank == MASTER_NODE )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        fprintf( stderr, "n=%d, N=%d\n", system->n, system->N );
-        fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm );
-        //Vector_Print( stderr, "d", workspace->d, system->N );
-        //Vector_Print( stderr, "q", workspace->q, system->N );
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
     }
-    MPI_Barrier( mpi_data->world );
 #endif
 
-    Sparse_MatVec( H, x, workspace->q, system->N );
-    //Coll_Vector( system, workspace, mpi_data, workspace->q );
+    t_start = MPI_Wtime( );
+    redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+    redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+    redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+    redux[3] = Dot_local( b, b, system->n );
+    t_vops += MPI_Wtime( ) - t_start;
 
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
-    for ( j = 0; j < system->n; ++j )
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
 
-    sig_new = Dot( workspace->r, workspace->d, system->n );
-    sig0 = sig_new;
-#if defined(DEBUG)
-    if ( system->my_rank == MASTER_NODE )
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
     {
-        fprintf( stderr, "p%d CGinit:sig_new=%24.15e\n", system->my_rank, sig_new );
-        //Vector_Print( stderr, "d", workspace->d, system->N );
-        //Vector_Print( stderr, "q", workspace->q, system->N );
+        Vector_Copy( workspace->m, workspace->w, system->n );
     }
-    MPI_Barrier( mpi_data->world );
-#endif
-
-    for ( i = 1; i < 100 && sqrt(sig_new) / b_norm > tol; ++i )
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
     {
-        //Dist_Vector( system, mpi_data, workspace->d );
-        Sparse_MatVec( H, workspace->d, workspace->q, system->N );
-        //Coll_Vector( system, workspace, mpi_data, workspace->q );
-
-        tmp = Dot( workspace->d, workspace->q, system->n );
-        alpha = sig_new / tmp;
-#if defined(DEBUG)
-        if ( system->my_rank == MASTER_NODE )
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
         {
-            fprintf(stderr,
-                    "p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n",
-                    system->my_rank, i,
-                    Parallel_Norm(workspace->d, system->n, mpi_data->world),
-                    Parallel_Norm(workspace->q, system->n, mpi_data->world), tmp );
-            //Vector_Print( stderr, "d", workspace->d, system->N );
-            //Vector_Print( stderr, "q", workspace->q, system->N );
+            workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
         }
-        MPI_Barrier( mpi_data->world );
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT );
+#else
+        Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n );
 #endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
 
-        Vector_Add( x, alpha, workspace->d, system->n );
-        Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        /* pre-conditioning */
-        for ( j = 0; j < system->n; ++j )
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-        sig_old = sig_new;
-        sig_new = Dot( workspace->r, workspace->p, system->n );
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
 
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
-#if defined(DEBUG)
-        if ( system->my_rank == MASTER_NODE )
-            fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
-                    system->my_rank, i, sig_new );
-        MPI_Barrier( mpi_data->world );
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, workspace->m, workspace->n, H->NT );
+#else
+    Sparse_MatVec( H, workspace->m, workspace->n, system->N );
 #endif
-    }
+    t_spmv += MPI_Wtime( ) - t_start;
 
-#if defined(DEBUG)
-    if ( system->my_rank == MASTER_NODE )
-        fprintf( stderr, "CG took %d iterations\n", i );
-#endif
-    if ( i >= 100 )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
     }
-
-    return i;
-}
-
-
-int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
-           real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
-{
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-
-    N = system->N;
-    bnorm = Norm( b, N );
-
-    /* apply the diagonal pre-conditioner to rhs */
-    for ( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
-
-    /* GMRES outer-loop */
-    for ( itr = 0; itr < MAX_ITR; ++itr )
+#if defined(NEUTRAL_TERRITORY)
+    else
     {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm, N );
-        for ( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner
-
-        Vector_Sum( workspace->v[0],
-                    1.,  workspace->b_prc, -1., workspace->b_prm, N );
-        workspace->g[0] = Norm( workspace->v[0], N );
-        Vector_Scale( workspace->v[0],
-                      1. / workspace->g[0], workspace->v[0], N );
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
 
-        // fprintf( stderr, "%10.6f\n", workspace->g[0] );
+    t_start = MPI_Wtime( );
+    MPI_Wait( &req, MPI_STATUS_IGNORE );
+    t_allreduce += MPI_Wtime( ) - t_start;
+    delta = redux[0];
+    gamma_new = redux[1];
+    norm = sqrt( redux[2] );
+    b_norm = sqrt( redux[3] );
 
-        /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
+    for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
+    {
+        if ( i > 0 )
+        {
+            beta = gamma_new / gamma_old;
+            alpha = gamma_new / (delta - beta / alpha * gamma_new);
+        }
+        else
         {
-            /* matvec */
-            Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1], N );
+            beta = 0.0;
+            alpha = gamma_new / delta;
+        }
 
-            for ( k = 0; k < N; ++k )
-                workspace->v[j + 1][k] *= workspace->Hdia_inv[k]; // pre-conditioner
-            // fprintf( stderr, "%d-%d: matvec done.\n", itr, j );
+        t_start = MPI_Wtime( );
+        Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
+        redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+        redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+        redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
+
+        MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
 
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for ( i = 0; i <= j; i++ )
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            Vector_Copy( workspace->m, workspace->w, system->n );
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
             {
-                workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N);
-                Vector_Add( workspace->v[j + 1],
-                            -workspace->h[i][j], workspace->v[i], N );
+                workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
             }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+            
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT );
+#else
+            Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n );
+#endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
 
-            workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
-            Vector_Scale( workspace->v[j + 1],
-                          1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
-            // fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j);
-
-            /* Givens rotations on the H matrix to make it U */
-            for ( i = 0; i <= j; i++ )
-            {
-                if ( i == j )
-                {
-                    cc = sqrt(SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]));
-                    workspace->hc[j] = workspace->h[j][j] / cc;
-                    workspace->hs[j] = workspace->h[j + 1][j] / cc;
-                }
-
-                tmp1 =  workspace->hc[i] * workspace->h[i][j] +
-                        workspace->hs[i] * workspace->h[i + 1][j];
-                tmp2 = -workspace->hs[i] * workspace->h[i][j] +
-                       workspace->hc[i] * workspace->h[i + 1][j];
-
-                workspace->h[i][j] = tmp1;
-                workspace->h[i + 1][j] = tmp2;
-            }
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
 
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j + 1] = tmp2;
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( H, workspace->m, workspace->n, H->NT );
+#else
+        Sparse_MatVec( H, workspace->m, workspace->n, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
 
-            // fprintf( stderr, "%10.6f\n", fabs(workspace->g[j+1]) );
+        if ( H->format == SYM_HALF_MATRIX )
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
         }
-
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
-        for ( i = j - 1; i >= 0; i-- )
+#if defined(NEUTRAL_TERRITORY)
+        else
         {
-            temp = workspace->g[i];
-            for ( k = j - 1; k > i; k-- )
-                temp -= workspace->h[i][k] * workspace->y[k];
-            workspace->y[i] = temp / workspace->h[i][i];
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
         }
+#endif
 
-        /* update x = x_0 + Vy */
-        for ( i = 0; i < j; i++ )
-            Vector_Add( x, workspace->y[i], workspace->v[i], N );
+        gamma_old = gamma_new;
 
-        /* stopping condition */
-        if ( fabs(workspace->g[j]) / bnorm <= tol )
-            break;
+        t_start = MPI_Wtime( );
+        MPI_Wait( &req, MPI_STATUS_IGNORE );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        delta = redux[0];
+        gamma_new = redux[1];
+        norm = sqrt( redux[2] );
     }
 
-    /*Sparse_MatVec( system, H, x, workspace->b_prm, mpi_data );
-      for( i = 0; i < N; ++i )
-      workspace->b_prm[i] *= workspace->Hdia_inv[i];
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
 
-      fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-      for( i = 0; i < N; ++i )
-      fprintf( fout, "%10.5f%15.12f%15.12f\n",
-      workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
+    if ( system->my_rank == MASTER_NODE )
+    {
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
 
-    fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
+    {
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+    }
 
-    if ( itr >= MAX_ITR )
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        return FAILURE;
+        fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
+        return i;
     }
 
-    return SUCCESS;
+    return i;
 }
 
 
-int GMRES_HouseHolder( reax_system *system, storage *workspace,
-                       sparse_matrix *H, real *b, real tol, real *x,
-                       mpi_datatypes* mpi_data, FILE *fout )
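+/* Pipelined Preconditioned Conjugate Residual (PIPECR) method: iterates on the residual r = b - H x, updating x in place.
+ * Returns the number of iterations performed; reaching control->cm_solver_max_iters prints a warning on the master rank.
+ *
+ * References:
+ * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
+ *  P. Ghysels and W. Vanroose, Parallel Computing, 2014. */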
+int PIPECR( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data )
 {
-    int  i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    real v[10000], z[RESTART + 2][10000], w[RESTART + 2];
-    real u[RESTART + 2][10000];
-
-    N = system->N;
-    bnorm = Norm( b, N );
-
-    /* apply the diagonal pre-conditioner to rhs */
-    for ( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
+    int i, j;
+    real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[4];
+    MPI_Request req;
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, x, workspace->u, H->NT );
+#else
+    Sparse_MatVec( H, x, workspace->u, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-    /* GMRES outer-loop */
-    for ( itr = 0; itr < MAX_ITR; ++itr )
+    if ( H->format == SYM_HALF_MATRIX )
     {
-        /* compute z = r0 */
-        Sparse_MatVec( H, x, workspace->b_prm, N );
-
-        for ( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
-
-        Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
-
-        Vector_MakeZero( w, RESTART + 1 );
-        w[0] = Norm( z[0], N );
-
-        Vector_Copy( u[0], z[0], N );
-        u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
-        Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
 
-        w[0]    *= ( u[0][0] < 0.0 ?  1 : -1 );
-        // fprintf( stderr, "\n\n%12.6f\n", w[0] );
+    t_start = MPI_Wtime( );
+    Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );
+    t_vops += MPI_Wtime( ) - t_start;
 
-        /* GMRES inner-loop */
-        for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        Vector_Copy( workspace->u, workspace->r, system->n );
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
         {
-            /* compute v_j */
-            Vector_Scale( z[j], -2 * u[j][j], u[j], N );
-            z[j][j] += 1.; /* due to e_j */
-
-            for ( i = j - 1; i >= 0; --i )
-                Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
-
-            /* matvec */
-            Sparse_MatVec( H, z[j], v, N );
-
-            for ( k = 0; k < N; ++k )
-                v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
-
-            for ( i = 0; i <= j; ++i )
-                Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
-
-            if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
-            {
-                /* compute the HouseHolder unit vector u_j+1 */
-                for ( i = 0; i <= j; ++i )
-                    u[j + 1][i] = 0;
-
-                Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
+            workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+        
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, H->NT );
+#else
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
 
-                u[j + 1][j + 1] +=
-                    ( v[j + 1] < 0.0 ? -1 : 1 ) * Norm( v + (j + 1), N - (j + 1) );
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
 
-                Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, workspace->u, workspace->w, H->NT );
+#else
+    Sparse_MatVec( H, workspace->u, workspace->w, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
 
-                /* overwrite v with P_m+1 * v */
-                v[j + 1] -=
-                    2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
-                Vector_MakeZero( v + (j + 2), N - (j + 2) );
-            }
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
 
+    //TODO: better loop unrolling and termination condition check
+    norm = tol + 1.0;
 
-            /* previous Givens rotations on H matrix to make it U */
-            for ( i = 0; i < j; i++ )
+    b_norm = 1.0; /* placeholder so the first test reads a defined value; the real ||b|| is set from redux[3] inside the loop, and norm = tol + 1.0 forces iteration 0 */
+    for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
+    {
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            Vector_Copy( workspace->m, workspace->w, system->n );
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
             {
-                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
-                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
-
-                v[i]   = tmp1;
-                v[i + 1] = tmp2;
+                workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
             }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+            
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT );
+#else
+            Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n );
+#endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
 
-            /* apply the new Givens rotation to H and right-hand side */
-            if ( fabs(v[j + 1]) >= ALMOST_ZERO )
-            {
-                cc = sqrt( SQR( v[j] ) + SQR( v[j + 1] ) );
-                workspace->hc[j] = v[j] / cc;
-                workspace->hs[j] = v[j + 1] / cc;
-
-                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
-                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
-
-                v[j]   = tmp1;
-                v[j + 1] = tmp2;
+        t_start = MPI_Wtime( );
+        redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+        redux[1] = Dot_local( workspace->m, workspace->w, system->n );
+        redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+        redux[3] = Dot_local( b, b, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
 
-                /* Givens rotations to rhs */
-                tmp1 =  workspace->hc[j] * w[j];
-                tmp2 = -workspace->hs[j] * w[j];
-                w[j]   = tmp1;
-                w[j + 1] = tmp2;
-            }
+        MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
 
-            /* extend R */
-            for ( i = 0; i <= j; ++i )
-                workspace->h[i][j] = v[i];
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
 
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( H, workspace->m, workspace->n, H->NT );
+#else
+        Sparse_MatVec( H, workspace->m, workspace->n, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
 
-            // fprintf( stderr, "h:" );
-            // for( i = 0; i <= j+1 ; ++i )
-            // fprintf( stderr, "%.6f ", h[i][j] );
-            // fprintf( stderr, "\n" );
-            // fprintf( stderr, "%12.6f\n", w[j+1] );
+        if ( H->format == SYM_HALF_MATRIX )
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
         }
-
-
-        /* solve Hy = w.
-           H is now upper-triangular, do back-substitution */
-        for ( i = j - 1; i >= 0; i-- )
+#if defined(NEUTRAL_TERRITORY)
+        else
         {
-            temp = w[i];
-            for ( k = j - 1; k > i; k-- )
-                temp -= workspace->h[i][k] * workspace->y[k];
-
-            workspace->y[i] = temp / workspace->h[i][i];
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#endif
+        t_start = MPI_Wtime( );
+        MPI_Wait( &req, MPI_STATUS_IGNORE );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        gamma_new = redux[0];
+        delta = redux[1];
+        norm = sqrt( redux[2] );
+        b_norm = sqrt( redux[3] );
+
+        if ( i > 0 )
+        {
+            beta = gamma_new / gamma_old;
+            alpha = gamma_new / (delta - beta / alpha * gamma_new);
+        }
+        else
+        {
+            beta = 0.0;
+            alpha = gamma_new / delta;
         }
 
-        for ( i = j - 1; i >= 0; i-- )
-            Vector_Add( x, workspace->y[i], z[i], N );
-
-        /* stopping condition */
-        if ( fabs( w[j] ) / bnorm <= tol )
-            break;
+        t_start = MPI_Wtime( );
+        Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
+
+        gamma_old = gamma_new;
     }
 
-    // Sparse_MatVec( system, H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
 
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n",
-    //          workspace->b_prc[i], workspace->b_prm[i], x[i] );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
 
-    fprintf( fout, "GMRES outer:%d  inner:%d iters, |rel residual| = %15.10f\n",
-             itr, j, fabs( workspace->g[j] ) / bnorm );
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
+    {
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+    }
 
-    if ( itr >= MAX_ITR )
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        return FAILURE;
+        fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" );
+        return i;
     }
 
-    return SUCCESS;
+    return i;
 }
-#endif
diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h
index 87c2f0ade19586169029b566aa8871d5a0794a77..701ee82d695150b7f4a8660ee843cf46cc6fa873 100644
--- a/PuReMD/src/linear_solvers.h
+++ b/PuReMD/src/linear_solvers.h
@@ -24,17 +24,27 @@
 
 #include "reax_types.h"
 
-int GMRES( reax_system*, storage*, sparse_matrix*,
-           real*, real, real*, mpi_datatypes*, FILE* );
-int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
-                       real*, real, real*, mpi_datatypes*, FILE* );
-int dual_CG( reax_system*, storage*, sparse_matrix*,
-             rvec2*, real, rvec2*, mpi_datatypes*, FILE* );
-int CG( reax_system*, storage*, sparse_matrix*,
-        real*, real, real*, mpi_datatypes*, FILE* );
-int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
-int sCG( reax_system*, storage*, sparse_matrix*,
-         real*, real, real*, mpi_datatypes*, FILE* );
+
+real setup_sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, 
+        sparse_matrix *, sparse_matrix **, int, double );
+
+real sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, 
+        sparse_matrix*, sparse_matrix*, sparse_matrix**, int );
+
+int dual_CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+             rvec2*, real, rvec2*, mpi_datatypes* );
+
+int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes* );
+
+int dual_PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+        rvec2*, real, rvec2*, mpi_datatypes* );
+
+int PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes* );
+
+int PIPECR( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes* );
+
 
 #endif
diff --git a/PuReMD/src/list.c b/PuReMD/src/list.c
index 4ccb03ed94174c6a5bb184ae48687a6e40c73515..922f42cc62ad5b4ccf1494970c5e576bc72f766e 100644
--- a/PuReMD/src/list.c
+++ b/PuReMD/src/list.c
@@ -30,17 +30,19 @@
 
 
 /************* allocate list space ******************/
-int Make_List(int n, int num_intrs, int type, reax_list *l, MPI_Comm comm)
+int Make_List( int n, int num_intrs, int type, int format,
+        reax_list *l, MPI_Comm comm )
 {
     l->allocated = 1;
 
     l->n = n;
     l->num_intrs = num_intrs;
 
-    l->index = (int*) smalloc( n * sizeof(int), "list:index", comm );
-    l->end_index = (int*) smalloc( n * sizeof(int), "list:end_index", comm );
+    l->index = smalloc( n * sizeof(int), "Make_List:index", comm );
+    l->end_index = smalloc( n * sizeof(int), "Make_List:end_index", comm );
 
     l->type = type;
+    l->format = format;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", l->n, l->num_intrs, l->type );
@@ -49,42 +51,48 @@ int Make_List(int n, int num_intrs, int type, reax_list *l, MPI_Comm comm)
     switch ( l->type )
     {
     case TYP_VOID:
-        l->v = (void*) smalloc(l->num_intrs * sizeof(void*), "list:v", comm);
+        l->v = smalloc( l->num_intrs * sizeof(void*),
+                "Make_List:v", comm );
         break;
 
     case TYP_THREE_BODY:
-        l->three_body_list = (three_body_interaction_data*)
-                             smalloc( l->num_intrs * sizeof(three_body_interaction_data),
-                                      "list:three_bodies", comm );
+        l->three_body_list = smalloc( l->num_intrs * sizeof(three_body_interaction_data),
+                "Make_List:three_bodies", comm );
         break;
 
     case TYP_BOND:
-        l->bond_list = (bond_data*)
-                       smalloc( l->num_intrs * sizeof(bond_data), "list:bonds", comm );
+        l->bond_list = smalloc( l->num_intrs * sizeof(bond_data),
+                "Make_List:bonds", comm );
         break;
 
     case TYP_DBO:
-        l->dbo_list = (dbond_data*)
-                      smalloc( l->num_intrs * sizeof(dbond_data), "list:dbonds", comm );
+        l->dbo_list = smalloc( l->num_intrs * sizeof(dbond_data),
+                "Make_List:dbonds", comm );
         break;
 
     case TYP_DDELTA:
-        l->dDelta_list = (dDelta_data*)
-                         smalloc( l->num_intrs * sizeof(dDelta_data), "list:dDeltas", comm );
+        l->dDelta_list = smalloc( l->num_intrs * sizeof(dDelta_data),
+                "Make_List:dDeltas", comm );
         break;
 
     case TYP_FAR_NEIGHBOR:
-        l->far_nbr_list = (far_neighbor_data*)
-                          smalloc(l->num_intrs * sizeof(far_neighbor_data), "list:far_nbrs", comm);
+        l->far_nbr_list.nbr = smalloc( l->num_intrs * sizeof(int),
+                "Make_List:far_nbr_list.nbr", comm );
+        l->far_nbr_list.rel_box = smalloc( l->num_intrs * sizeof(ivec),
+                "Make_List:far_nbr_list.rel_box", comm );
+        l->far_nbr_list.d = smalloc( l->num_intrs * sizeof(real),
+                "Make_List:far_nbr_list.d", comm );
+        l->far_nbr_list.dvec = smalloc( l->num_intrs * sizeof(rvec),
+                "Make_List:far_nbr_list.dvec", comm );
         break;
 
     case TYP_HBOND:
-        l->hbond_list = (hbond_data*)
-                        smalloc( l->num_intrs * sizeof(hbond_data), "list:hbonds", comm );
+        l->hbond_list = smalloc( l->num_intrs * sizeof(hbond_data),
+                "Make_List:hbonds", comm );
         break;
 
     default:
-        fprintf( stderr, "ERROR: no %d list type defined!\n", l->type );
+        fprintf( stderr, "[ERROR]: no %d list type defined!\n", l->type );
         MPI_Abort( comm, INVALID_INPUT );
     }
 
@@ -98,31 +106,34 @@ void Delete_List( reax_list *l, MPI_Comm comm )
         return;
     l->allocated = 0;
 
-    sfree( l->index, "list:index" );
-    sfree( l->end_index, "list:end_index" );
+    sfree( l->index, "Delete_List:index" );
+    sfree( l->end_index, "Delete_List:end_index" );
 
     switch (l->type)
     {
     case TYP_VOID:
-        sfree( l->v, "list:v" );
+        sfree( l->v, "Delete_List:v" );
         break;
     case TYP_HBOND:
-        sfree( l->hbond_list, "list:hbonds" );
+        sfree( l->hbond_list, "Delete_List:hbonds" );
         break;
     case TYP_FAR_NEIGHBOR:
-        sfree( l->far_nbr_list, "list:far_nbrs" );
+        sfree( l->far_nbr_list.nbr, "Delete_List:far_nbr_list.nbr" );
+        sfree( l->far_nbr_list.rel_box, "Delete_List:far_nbr_list.rel_box" );
+        sfree( l->far_nbr_list.d, "Delete_List:far_nbr_list.d" );
+        sfree( l->far_nbr_list.dvec, "Delete_List:far_nbr_list.dvec" );
         break;
     case TYP_BOND:
-        sfree( l->bond_list, "list:bonds" );
+        sfree( l->bond_list, "Delete_List:bonds" );
         break;
     case TYP_DBO:
-        sfree( l->dbo_list, "list:dbos" );
+        sfree( l->dbo_list, "Delete_List:dbos" );
         break;
     case TYP_DDELTA:
-        sfree( l->dDelta_list, "list:dDeltas" );
+        sfree( l->dDelta_list, "Delete_List:dDeltas" );
         break;
     case TYP_THREE_BODY:
-        sfree( l->three_body_list, "list:three_bodies" );
+        sfree( l->three_body_list, "Delete_List:three_bodies" );
         break;
 
     default:
diff --git a/PuReMD/src/list.h b/PuReMD/src/list.h
index da400b76e1303bf5e587e949f664bc93225de3dc..918256f595cc6994626693b5b74db1d14b4af0fc 100644
--- a/PuReMD/src/list.h
+++ b/PuReMD/src/list.h
@@ -25,7 +25,7 @@
 #include "reax_types.h"
 
 
-int Make_List( int, int, int, reax_list*, MPI_Comm );
+int Make_List( int, int, int, int, reax_list*, MPI_Comm );
 
 void Delete_List( reax_list*, MPI_Comm );
 
diff --git a/PuReMD/src/neighbors.c b/PuReMD/src/neighbors.c
index 5be0016eda64a6e4ec56f02d77695cd212fcdd55..a0701698b2f3ad1e5cfec0b7c5c903c5c7470eb1 100644
--- a/PuReMD/src/neighbors.c
+++ b/PuReMD/src/neighbors.c
@@ -74,14 +74,13 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
     grid *g;
     grid_cell *gci, *gcj;
     reax_list *far_nbrs;
-    far_neighbor_data *nbr_data;
     reax_atom *atom1, *atom2;
 
 #if defined(LOG_PERFORMANCE)
     real t_start = 0, t_elapsed = 0;
 
     if ( system->my_rank == MASTER_NODE )
-        t_start = Get_Time( );
+        t_start = MPI_Wtime();
 #endif
 
     // fprintf( stderr, "\n\tentered nbrs - " );
@@ -91,7 +90,9 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
 
     /* first pick up a cell in the grid */
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
                 gci = &(g->cells[i][j][k]);
@@ -99,20 +100,37 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
 
                 /* pick up an atom from the current cell */
-                for (l = gci->str; l < gci->end; ++l )
+                for ( l = gci->str; l < gci->end; ++l )
                 {
-                    atom1 = &(system->my_atoms[l]);
+                    atom1 = &system->my_atoms[l];
+#if defined(NEUTRAL_TERRITORY)
+                    if( gci->type >= NT_NBRS && gci->type < NT_NBRS + 6 )
+                    {
+                        atom1->nt_dir = gci->type - NT_NBRS;
+                    }
+                    else
+                    {
+                        atom1->nt_dir = -1;
+                    }
+#endif
                     Set_Start_Index( l, num_far, far_nbrs );
                     //fprintf( stderr, "\tatom %d\n", atom1 );
 
                     itr = 0;
                     while ( (gcj = gci->nbrs[itr]) != NULL )
                     {
-                        if ( gci->str <= gcj->str &&
-                                (DistSqr_to_Special_Point(gci->nbrs_cp[itr], atom1->x) <= cutoff) )
+                        if ( ((far_nbrs->format == HALF_LIST && gci->str <= gcj->str)
+                                    || far_nbrs->format == FULL_LIST)
+                            && (DistSqr_to_Special_Point(gci->nbrs_cp[itr], atom1->x) <= cutoff) )
+                        {
                             /* pick up another atom from the neighbor cell */
                             for ( m = gcj->str; m < gcj->end; ++m )
-                                if ( l < m )  // prevent recounting same pairs within a gcell
+                            {
+                                /* HALF_LIST: keep only pairs with l < m, so each pair is
+                                 *  stored once and an atom is never paired with itself
+                                 * FULL_LIST: keep both (l,m) and (m,l); skip only l == m */
+                                if ( (far_nbrs->format == HALF_LIST && l < m)
+                                  || (far_nbrs->format == FULL_LIST && l != m) )
                                 {
                                     atom2 = &(system->my_atoms[m]);
                                     dvec[0] = atom2->x[0] - atom1->x[0];
@@ -121,31 +139,32 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
                                     d = rvec_Norm_Sqr( dvec );
                                     if ( d <= cutoff )
                                     {
-                                        nbr_data = &(far_nbrs->far_nbr_list[num_far]);
-                                        nbr_data->nbr = m;
-                                        nbr_data->d = sqrt(d);
-                                        rvec_Copy( nbr_data->dvec, dvec );
-                                        //ivec_Copy( nbr_data->rel_box, gcj->rel_box );
-                                        ivec_ScaledSum( nbr_data->rel_box,
-                                                        1, gcj->rel_box, -1, gci->rel_box );
+                                        far_nbrs->far_nbr_list.nbr[num_far] = m;
+                                        far_nbrs->far_nbr_list.d[num_far] = sqrt(d);
+                                        rvec_Copy( far_nbrs->far_nbr_list.dvec[num_far], dvec );
+                                        ivec_ScaledSum( far_nbrs->far_nbr_list.rel_box[num_far],
+                                                1, gcj->rel_box, -1, gci->rel_box );
                                         ++num_far;
                                     }
                                 }
+                            }
+                        }
+
                         ++itr;
                     }
+
                     Set_End_Index( l, num_far, far_nbrs );
-                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n",
-                    //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
-                    //  itr);
                 }
             }
+        }
+    }
 
     workspace->realloc.num_far = num_far;
 
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
-        t_elapsed = Get_Timing_Info( t_start );
+        t_elapsed = MPI_Wtime() - t_start;
         data->timing.nbrs += t_elapsed;
     }
 #endif
@@ -165,7 +184,8 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
 }
 
 
-int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
+int Estimate_NumNeighbors( reax_system *system, reax_list **lists,
+       int far_nbr_list_format )
 {
     int  i, j, k, l, m, itr, num_far; //, tmp, tested;
     real d, cutoff;
@@ -180,7 +200,9 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
 
     /* first pick up a cell in the grid */
     for ( i = 0; i < g->ncells[0]; i++ )
+    {
         for ( j = 0; j < g->ncells[1]; j++ )
+        {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
                 gci = &(g->cells[i][j][k]);
@@ -190,18 +212,34 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
                 /* pick up an atom from the current cell */
                 for ( l = gci->str; l < gci->end; ++l )
                 {
-                    atom1 = &(system->my_atoms[l]);
+                    atom1 = &system->my_atoms[l];
+#if defined(NEUTRAL_TERRITORY)
+                    if( gci->type >= NT_NBRS && gci->type < NT_NBRS + 6 )
+                    {
+                        atom1->nt_dir = gci->type - NT_NBRS;
+                    }
+                    else
+                    {
+                        atom1->nt_dir = -1;
+                    }
+#endif
                     //fprintf( stderr, "\tatom %d: ", l );
                     //tmp = num_far; tested = 0;
                     itr = 0;
                     while ( (gcj = gci->nbrs[itr]) != NULL )
                     {
-                        if (gci->str <= gcj->str &&
-                                (DistSqr_to_Special_Point(gci->nbrs_cp[itr], atom1->x) <= cutoff))
-                            //fprintf( stderr, "\t\tgcell2: %d\n", itr );
+                        if ( ((far_nbr_list_format == HALF_LIST && gci->str <= gcj->str)
+                                    || far_nbr_list_format == FULL_LIST)
+                                && (DistSqr_to_Special_Point(gci->nbrs_cp[itr], atom1->x) <= cutoff))
+                        {
                             /* pick up another atom from the neighbor cell */
                             for ( m = gcj->str; m < gcj->end; ++m )
-                                if ( l < m )
+                            {
+                                /* HALF_LIST: keep only pairs with l < m, so each pair is
+                                 *  counted once and an atom is never paired with itself
+                                 * FULL_LIST: count both (l,m) and (m,l); skip only l == m */
+                                if ( (far_nbr_list_format == HALF_LIST && l < m)
+                                  || (far_nbr_list_format == FULL_LIST && l != m) )
                                 {
                                     //fprintf( stderr, "\t\t\tatom2=%d\n", m );
                                     atom2 = &(system->my_atoms[m]);
@@ -212,13 +250,15 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
                                     if ( d <= cutoff )
                                         ++num_far;
                                 }
+                            }
+                        }
 
                         ++itr;
                     }
-                    //fprintf( stderr, "itr: %d, tested: %d, num_nbrs: %d\n",
-                    //   itr, tested, num_far-tmp );
                 }
             }
+        }
+    }
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: estimate nbrs done - num_far=%d\n",
diff --git a/PuReMD/src/neighbors.h b/PuReMD/src/neighbors.h
index 0a1e3daf289883268e77fbefd7f6a24deaa582dd..818fc7245cf39e132a7198907372f002784d8273 100644
--- a/PuReMD/src/neighbors.h
+++ b/PuReMD/src/neighbors.h
@@ -33,6 +33,6 @@
 
 void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*,
                               reax_list** );
-int Estimate_NumNeighbors( reax_system*, reax_list** );
+int Estimate_NumNeighbors( reax_system*, reax_list**, int );
 
 #endif
diff --git a/PuReMD/src/nonbonded.c b/PuReMD/src/nonbonded.c
index ab25b807d5e0a0263a4d8f89ecc475c8615b8bdd..174bfada844ca271ac2e68c48aacd7014091540a 100644
--- a/PuReMD/src/nonbonded.c
+++ b/PuReMD/src/nonbonded.c
@@ -47,7 +47,6 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
     real e_ele, e_vdW, e_core;
     rvec temp, ext_press;
     two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
     reax_list *far_nbrs;
     // rtensor temp_rtensor, total_rtensor;
 
@@ -60,25 +59,25 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
     for ( i = 0; i < natoms; ++i )
     {
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
         orig_i  = system->my_atoms[i].orig_id;
         //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i );
 
         for ( pj = start_i; pj < end_i; ++pj )
         {
-            nbr_pj = &(far_nbrs->far_nbr_list[pj]);
-            j = nbr_pj->nbr;
-            orig_j  = system->my_atoms[j].orig_id;
+            j = far_nbrs->far_nbr_list.nbr[pj];
+            orig_j = system->my_atoms[j].orig_id;
 
-            if ( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) )
+            if ( far_nbrs->far_nbr_list.d[pj] <= control->nonb_cut
+                    && ((far_nbrs->format == HALF_LIST && (j < natoms || orig_i < orig_j))
+                        || (far_nbrs->format == FULL_LIST && orig_i < orig_j)) )
             {
-                r_ij = nbr_pj->d;
-                twbp = &(system->reax_param.tbp[ system->my_atoms[i].type ]
-                         [ system->my_atoms[j].type ]);
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                twbp = &system->reax_param.tbp[ 
+                    system->my_atoms[i].type ][ system->my_atoms[j].type ];
 
                 /* Calculate Taper and its derivative */
-                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
                 Tap = workspace->Tap[7] * r_ij + workspace->Tap[6];
                 Tap = Tap * r_ij + workspace->Tap[5];
                 Tap = Tap * r_ij + workspace->Tap[4];
@@ -94,12 +93,13 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                 dTap = dTap * r_ij + 2 * workspace->Tap[2];
                 dTap += workspace->Tap[1] / r_ij;
 
-                /*vdWaals Calculations*/
-                if (system->reax_param.gp.vdw_type == 1 || system->reax_param.gp.vdw_type == 3)
+                /* vdWaals Calculations */
+                if ( system->reax_param.gp.vdw_type == 1
+                        || system->reax_param.gp.vdw_type == 3 )
                 {
                     // shielding
-                    powr_vdW1 = pow(r_ij, p_vdW1);
-                    powgi_vdW1 = pow( 1.0 / twbp->gamma_w, p_vdW1);
+                    powr_vdW1 = pow( r_ij, p_vdW1 );
+                    powgi_vdW1 = pow( 1.0 / twbp->gamma_w, p_vdW1 );
 
                     fn13 = pow( powr_vdW1 + powgi_vdW1, p_vdW1i );
                     exp1 = exp( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
@@ -108,11 +108,11 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                     e_vdW = twbp->D * (exp1 - 2.0 * exp2);
                     data->my_en.e_vdW += Tap * e_vdW;
 
-                    dfn13 = pow( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
-                            pow(r_ij, p_vdW1 - 2.0);
+                    dfn13 = pow( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0 )
+                        * pow( r_ij, p_vdW1 - 2.0 );
 
-                    CEvd = dTap * e_vdW -
-                           Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+                    CEvd = dTap * e_vdW - Tap * twbp->D
+                        * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
                 }
                 else  // no shielding
                 {
@@ -154,24 +154,30 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
                 if ( control->virial == 0 )
                 {
-                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb),
+                            far_nbrs->far_nbr_list.dvec[pj] );
+                    rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb),
+                            far_nbrs->far_nbr_list.dvec[pj] );
                 }
                 else   /* NPT, iNPT or sNPT */
                 {
                     /* for pressure coupling, terms not related to bond order
                        derivatives are added directly into pressure vector/tensor */
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    rvec_Scale( temp, CEvd + CEclmb,
+                            far_nbrs->far_nbr_list.dvec[pj] );
 
                     rvec_ScaledAdd( workspace->f[i], -1., temp );
                     rvec_Add( workspace->f[j], temp );
 
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_iMultiply( ext_press,
+                            far_nbrs->far_nbr_list.rel_box[pj], temp );
                     rvec_Add( data->my_ext_press, ext_press );
 
                     // fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)
                     //   force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n",
-                    //   i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2],
+                    //   i, j, far_nbrs->far_nbr_list.rel_box[pj][0],
+                    //   far_nbrs->far_nbr_list.rel_box[pj][1],
+                    //   far_nbrs->far_nbr_list.rel_box[pj][2],
                     //   temp[0], temp[1], temp[2],
                     //   data->ext_press[0], data->ext_press[1], data->ext_press[2] );
                 }
@@ -193,10 +199,14 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
                          e_ele, data->my_en.e_ele );
 #endif
 #ifdef TEST_FORCES
-                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd,
+                        far_nbrs->far_nbr_list.dvec[pj] );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd,
+                        far_nbrs->far_nbr_list.dvec[pj] );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb,
+                        far_nbrs->far_nbr_list.dvec[pj] );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb,
+                        far_nbrs->far_nbr_list.dvec[pj] );
 #endif
             }
         }
@@ -225,7 +235,6 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
     real e_vdW, e_ele;
     real CEvd, CEclmb;
     rvec temp, ext_press;
-    far_neighbor_data *nbr_pj;
     reax_list *far_nbrs;
     LR_lookup_table *t;
 
@@ -245,19 +254,19 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
         for ( pj = start_i; pj < end_i; ++pj )
         {
-            nbr_pj = &(far_nbrs->far_nbr_list[pj]);
-            j = nbr_pj->nbr;
-            orig_j  = system->my_atoms[j].orig_id;
+            j = far_nbrs->far_nbr_list.nbr[pj];
+            orig_j = system->my_atoms[j].orig_id;
 
-            if ( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) )
+            if ( far_nbrs->far_nbr_list.d[pj] <= control->nonb_cut
+                    && ((far_nbrs->format == HALF_LIST && (j < natoms || orig_i < orig_j))
+                        || (far_nbrs->format == FULL_LIST && orig_i < orig_j)) )
             {
-                j = nbr_pj->nbr;
                 type_j = system->my_atoms[j].type;
-                r_ij   = nbr_pj->d;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( LR[tmin][tmax] );
-                // table = &( LR[type_i][type_j] );
+                r_ij = far_nbrs->far_nbr_list.d[pj];
+                tmin = MIN( type_i, type_j );
+                tmax = MAX( type_i, type_j );
+                t = &LR[tmin][tmax];
+                // table = &LR[type_i][type_j];
 
                 /* Cubic Spline Interpolation */
                 r = (int)(r_ij * t->inv_dx);
@@ -288,19 +297,21 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
                 if ( control->virial == 0 )
                 {
-                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb),
+                            far_nbrs->far_nbr_list.dvec[pj] );
+                    rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb),
+                            far_nbrs->far_nbr_list.dvec[pj] );
                 }
                 else   // NPT, iNPT or sNPT
                 {
                     /* for pressure coupling, terms not related to bond order derivatives
                        are added directly into pressure vector/tensor */
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    rvec_Scale( temp, CEvd + CEclmb, far_nbrs->far_nbr_list.dvec[pj] );
 
                     rvec_ScaledAdd( workspace->f[i], -1., temp );
                     rvec_Add( workspace->f[j], temp );
 
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_iMultiply( ext_press, far_nbrs->far_nbr_list.rel_box[pj], temp );
                     rvec_Add( data->my_ext_press, ext_press );
                 }
 
@@ -316,10 +327,14 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
                          e_ele, data->my_en.e_ele );
 #endif
 #ifdef TEST_FORCES
-                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd,
+                        far_nbrs->far_nbr_list.dvec[pj] );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd,
+                        far_nbrs->far_nbr_list.dvec[pj] );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb,
+                        far_nbrs->far_nbr_list.dvec[pj] );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb,
+                        far_nbrs->far_nbr_list.dvec[pj] );
 #endif
             }
         }
diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c
index 4b401ad603db53f5b982a2c7fc24861a9b9a6495..359aa1683f9489e0e11f0da53ed52804f2af338f 100644
--- a/PuReMD/src/parallelreax.c
+++ b/PuReMD/src/parallelreax.c
@@ -134,21 +134,15 @@ int main( int argc, char* argv[] )
     }
 
     /* allocated main datastructures */
-    system = (reax_system *)
-        smalloc( sizeof(reax_system), "system", MPI_COMM_WORLD );
-    control = (control_params *)
-        smalloc( sizeof(control_params), "control", MPI_COMM_WORLD );
-    data = (simulation_data *)
-        smalloc( sizeof(simulation_data), "data", MPI_COMM_WORLD );
-
-    workspace = (storage *)
-        smalloc( sizeof(storage), "workspace", MPI_COMM_WORLD );
-    lists = (reax_list **)
-        smalloc( LIST_N * sizeof(reax_list*), "lists", MPI_COMM_WORLD );
+    system = smalloc( sizeof(reax_system), "system", MPI_COMM_WORLD );
+    control = smalloc( sizeof(control_params), "control", MPI_COMM_WORLD );
+    data = smalloc( sizeof(simulation_data), "data", MPI_COMM_WORLD );
+
+    workspace = smalloc( sizeof(storage), "workspace", MPI_COMM_WORLD );
+    lists = smalloc( LIST_N * sizeof(reax_list*), "lists", MPI_COMM_WORLD );
     for ( i = 0; i < LIST_N; ++i )
     {
-        lists[i] = (reax_list *)
-            smalloc( sizeof(reax_list), "lists[i]", MPI_COMM_WORLD );
+        lists[i] = smalloc( sizeof(reax_list), "lists[i]", MPI_COMM_WORLD );
         lists[i]->allocated = 0;
         lists[i]->n = 0;
         lists[i]->num_intrs = 0;
@@ -160,25 +154,22 @@ int main( int argc, char* argv[] )
         lists[i]->bond_list = NULL;
         lists[i]->dbo_list = NULL;
         lists[i]->dDelta_list = NULL;
-        lists[i]->far_nbr_list = NULL;
         lists[i]->hbond_list = NULL;
     }
-    out_control = (output_controls *)
-        smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD );
-    mpi_data = (mpi_datatypes *)
-        smalloc( sizeof(mpi_datatypes), "mpi_data", MPI_COMM_WORLD );
+    out_control = smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD );
+    mpi_data = smalloc( sizeof(mpi_datatypes), "mpi_data", MPI_COMM_WORLD );
 
     /* setup the parallel environment */
-    MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) );
-    MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) );
+    MPI_Comm_size( MPI_COMM_WORLD, &control->nprocs );
+    MPI_Comm_rank( MPI_COMM_WORLD, &system->my_rank );
     system->wsize = control->nprocs;
-    system->global_offset = (int*)
-        scalloc( system->wsize + 1, sizeof(int), "global_offset", MPI_COMM_WORLD );
+    system->global_offset = scalloc( system->wsize + 1, sizeof(int),
+            "global_offset", MPI_COMM_WORLD );
 
     /* read system description files */
     Read_System( argv[1], argv[2], argv[3], system, control,
             data, workspace, out_control, mpi_data );
-
+    
 #if defined(DEBUG)
     fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
     MPI_Barrier( MPI_COMM_WORLD );
@@ -186,7 +177,9 @@ int main( int argc, char* argv[] )
 
     /* measure total simulation time after input is read */
     if ( system->my_rank == MASTER_NODE )
-        t_start = Get_Time( );
+    {
+        t_start = MPI_Wtime( );
+    }
 
     /* initialize datastructures */
     Initialize( system, control, data, workspace, lists, out_control, mpi_data );
@@ -211,26 +204,46 @@ int main( int argc, char* argv[] )
 #endif
 
     /* start the simulation */
+    int total_itr = data->timing.cm_solver_iters;
     for ( ++data->step; data->step <= control->nsteps; data->step++ )
     {
         if ( control->T_mode )
+        {
             Temperature_Control( control, data );
+        }
 
         Evolve( system, control, data, workspace, lists, out_control, mpi_data );
         Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
+
+        if ( system->my_rank == MASTER_NODE 
+                && out_control->energy_update_freq > 0
+                && data->step % out_control->energy_update_freq == 0 )
+        {
+            total_itr += data->timing.cm_solver_iters;
+        }
+
         Output_Results( system, control, data, lists, out_control, mpi_data );
         //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
 
         /* dump restart info */
-        if ( out_control->restart_freq &&
-                (data->step - data->prev_steps) % out_control->restart_freq == 0 )
+        if ( out_control->restart_freq
+                && (data->step - data->prev_steps) % out_control->restart_freq == 0 )
         {
             if ( out_control->restart_format == WRITE_ASCII )
+            {
                 Write_Restart( system, control, data, out_control, mpi_data );
+            }
             else if ( out_control->restart_format == WRITE_BINARY )
+            {
                 Write_Binary_Restart( system, control, data, out_control, mpi_data );
+            }
         }
 
+//        if ( data->step == 1 || data->step == control->nsteps )
+//        {
+//            Write_PDB( system, lists, data, control, mpi_data, out_control );
+//        }
+
 #if defined(DEBUG)
         fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
         MPI_Barrier( mpi_data->world );
@@ -240,14 +253,15 @@ int main( int argc, char* argv[] )
     /* end of the simulation, write total simulation time */
     if ( system->my_rank == MASTER_NODE )
     {
-        t_elapsed = Get_Timing_Info( t_start );
+        t_elapsed = MPI_Wtime() - t_start;
         fprintf( out_control->out, "Total Simulation Time: %.2f secs\n", t_elapsed );
+        fprintf( out_control->log, "Avg. # of Solver Itrs: %.2f\n", total_itr/((double)control->nsteps) );
     }
 
-    // Write_PDB( &system, &(lists[BOND]), &out_control );
+    //Write_PDB( system, lists, data, control, mpi_data, out_control );
     Close_Output_Files( system, control, out_control, mpi_data );
 
-    MPI_Finalize();
+    MPI_Finalize( );
 
     /* de-allocate data structures */
     sfree( system, "system" );
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index 15cc0249cab467b4813e1d6edeacc17de460472f..3ba18eeb8bb78533f4b894d7b28ff8b29b86f0a1 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -41,7 +41,7 @@ void Sort_Matrix_Rows( sparse_matrix *A )
         si = A->start[i];
         ei = A->end[i];
         qsort( &(A->entries[si]), ei - si,
-               sizeof(sparse_matrix_entry), compare_matrix_entry );
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
     }
 }
 
@@ -103,7 +103,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
 
 
 void ICHOLT( sparse_matrix *A, real *droptol,
-             sparse_matrix *L, sparse_matrix *U )
+        sparse_matrix *L, sparse_matrix *U )
 {
     sparse_matrix_entry tmp[1000];
     int i, j, pj, k1, k2, tmptop, Utop;
@@ -231,52 +231,55 @@ void ICHOLT( sparse_matrix *A, real *droptol,
 
 
 void Init_MatVec( reax_system *system, simulation_data *data,
-                  control_params *control,  storage *workspace,
-                  mpi_datatypes *mpi_data )
+        control_params *control,  storage *workspace,
+        mpi_datatypes *mpi_data )
 {
     int i; //, fillin;
     reax_atom *atom;
 
-    /*if( (data->step - data->prev_steps) % control->refactor == 0 ||
-        workspace->L == NULL ) {
-      //Print_Linear_System( system, control, workspace, data->step );
-      Sort_Matrix_Rows( workspace->H );
-      fprintf( stderr, "H matrix sorted\n" );
-      Calculate_Droptol( workspace->H, workspace->droptol, control->droptol );
-      fprintf( stderr, "drop tolerances calculated\n" );
-      if( workspace->L == NULL ) {
-        fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
-
-        if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin ) == 0 ||
-      Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin ) == 0 ) {
-    fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-    MPI_Abort( mpi_data->world, INSUFFICIENT_MEMORY );
+    /*if( (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0 ||
+      workspace->L == NULL ) 
+    {
+        //Print_Linear_System( system, control, workspace, data->step );
+        Sort_Matrix_Rows( workspace->H );
+        fprintf( stderr, "H matrix sorted\n" );
+        Calculate_Droptol( workspace->H, workspace->droptol, control->cm_solver_pre_comp_droptol );
+        fprintf( stderr, "drop tolerances calculated\n" );
+        if( workspace->L == NULL ) 
+        {
+            fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
+
+            if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin, FULL_MATRIX, comm ) == 0 ||
+            Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin, FULL_MATRIX, comm ) == 0 ) 
+            {
+                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
+                MPI_Abort( mpi_data->world, INSUFFICIENT_MEMORY );
+            }
+
+            workspace->L->n = workspace->H->n;
+            workspace->U->n = workspace->H->n;
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "p%d: n=%d, fillin = %d\n",
+            system->my_rank, workspace->L->n, fillin );
+            fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
+            system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
+#endif
         }
 
-        workspace->L->n = workspace->H->n;
-        workspace->U->n = workspace->H->n;
-    #if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: n=%d, fillin = %d\n",
-           system->my_rank, workspace->L->n, fillin );
-        fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
-                 system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
-    #endif
-      }
-
-      ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
-    #if defined(DEBUG_FOCUS)
-      fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
-      //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-      //Print_Sparse_Matrix2( workspace->L, fname );
-      //Print_Sparse_Matrix( U );
-    #endif
+        ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
+        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        //Print_Sparse_Matrix2( workspace->L, fname );
+        //Print_Sparse_Matrix( U );
+#endif
     }*/
 
     //TODO: fill in code for setting up and computing SAI, see sPuReMD code,
     //  and remove diagonal preconditioner computation below (workspace->Hdia_inv)
-//    setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//            &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//            control->cm_solver_pre_comp_sai_thres );
+    //    setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+    //            &workspace->H_spar_patt_full, &workspace->H_app_inv,
+    //            control->cm_solver_pre_comp_sai_thres );
 
     for ( i = 0; i < system->n; ++i )
     {
@@ -313,16 +316,15 @@ void Init_MatVec( reax_system *system, simulation_data *data,
 
 
 void Calculate_Charges( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
-    int        i, scale;
-    real       u;//, s_sum, t_sum;
-    rvec2      my_sum, all_sum;
+    int i;
+    real u;//, s_sum, t_sum;
+    rvec2 my_sum, all_sum;
     reax_atom *atom;
     real *q;
 
-    scale = sizeof(real) / sizeof(void);
-    q = (real*) malloc(system->N * sizeof(real));
+    q = malloc( system->N * sizeof(real) );
 
     //s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world);
     //t_sum = Parallel_Vector_Acc(workspace->t, system->n, mpi_data->world);
@@ -347,73 +349,210 @@ void Calculate_Charges( reax_system *system, storage *workspace,
         atom->s[3] = atom->s[2];
         atom->s[2] = atom->s[1];
         atom->s[1] = atom->s[0];
-        //atom->s[0] = workspace->s[i];
         atom->s[0] = workspace->x[i][0];
 
         atom->t[3] = atom->t[2];
         atom->t[2] = atom->t[1];
         atom->t[1] = atom->t[0];
-        //atom->t[0] = workspace->t[i];
         atom->t[0] = workspace->x[i][1];
     }
 
-    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
+    Dist_FS( system, mpi_data, q, REAL_PTR_TYPE, MPI_DOUBLE );
+
     for ( i = system->n; i < system->N; ++i )
+    {
         system->my_atoms[i].q = q[i];
+    }
 
-    sfree(q, "q");
+    sfree( q, "q" );
+}
+
+
+static void Setup_Preconditioner_QEq( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
+{
+    real time, t_sort, t_pc, total_sort, total_pc;
+
+    /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    time = MPI_Wtime();
+    Sort_Matrix_Rows( workspace->H );
+    t_sort = MPI_Wtime() - time;
+
+    t_pc = setup_sparse_approx_inverse( system, data, workspace, mpi_data, workspace->H, &workspace->H_spar_patt, 
+            control->nprocs, control->cm_solver_pre_comp_sai_thres );
+
+
+    MPI_Reduce(&t_sort, &total_sort, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    MPI_Reduce(&t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_sort += total_sort / control->nprocs;
+        data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
+    }
+}
+
+static void Compute_Preconditioner_QEq( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
+{
+    real t_pc, total_pc;
+#if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+    t_pc = sparse_approx_inverse( system, data, workspace, mpi_data,
+            workspace->H, workspace->H_spar_patt, &workspace->H_app_inv, control->nprocs );
+
+    MPI_Reduce( &t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
+    }
+#else
+    fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+    exit( INVALID_INPUT );
+#endif
 }
 
 
 void QEq( reax_system *system, control_params *control, simulation_data *data,
-          storage *workspace, output_controls *out_control,
-          mpi_datatypes *mpi_data )
+        storage *workspace, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
-    int j, s_matvecs, t_matvecs;
+    int j, iters;
+
+    iters = 0;
 
     Init_MatVec( system, data, control, workspace, mpi_data );
 
-    //if( data->step == 50010 ) {
-    //  Print_Linear_System( system, control, workspace, data->step );
-    // }
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
     //Print_Linear_System( system, control, workspace, data->step );
 #endif
 
-    //s_matvecs = dual_CG(system, workspace, workspace->H, workspace->b,
-    //          control->cm_solver_q_err, workspace->x, mpi_data, out_control->log);
-    //t_matvecs = 0;
+    if( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        if( control->cm_solver_pre_comp_refactor > 0
+                && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0))
+        {
+            Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
 
-    for ( j = 0; j < system->n; ++j )
-        workspace->s[j] = workspace->x[j][0];
-    s_matvecs = CG(system, workspace, workspace->H, workspace->b_s,//newQEq sCG
-                   control->cm_solver_q_err, workspace->s, mpi_data, out_control->log );
-    for ( j = 0; j < system->n; ++j )
-        workspace->x[j][0] = workspace->s[j];
+            Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+        }
+    }
+    
+    //TODO: used for timing to sync processors going into the linear solve, but remove for production code
+    MPI_Barrier( mpi_data->world ); 
 
-    //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s,
-    //   control->cm_solver_q_err, workspace->L, workspace->U, workspace->s,
-    //   mpi_data, out_control->log );
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
+    switch ( control->cm_solver_type )
+    {
+    case CG_S:
+#if defined(DUAL_SOLVER)
+        iters = dual_CG( system, control, data, workspace, workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data );
+#else
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
+
+        iters = CG( system, control, data, workspace, workspace->H, workspace->b_s,
+                control->cm_solver_q_err, workspace->s, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->t[j] = workspace->x[j][1];
+        }
+
+        iters += CG( system, control, data, workspace, workspace->H, workspace->b_t,
+                control->cm_solver_q_err, workspace->t, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][1] = workspace->t[j];
+        }
 #endif
+        break;
+
+    case PIPECG_S:
+#if defined(DUAL_SOLVER)
+        iters = dual_PIPECG( system, control, data, workspace, workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data );
+#else
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
 
-    for ( j = 0; j < system->n; ++j )
-        workspace->t[j] = workspace->x[j][1];
-    t_matvecs = CG(system, workspace, workspace->H, workspace->b_t,//newQEq sCG
-                   control->cm_solver_q_err, workspace->t, mpi_data, out_control->log );
-    for ( j = 0; j < system->n; ++j )
-        workspace->x[j][1] = workspace->t[j];
+        iters = PIPECG( system, control, data, workspace, workspace->H, workspace->b_s,
+                control->cm_solver_q_err, workspace->s, mpi_data );
 
-    //t_matvecs = PCG( system, workspace, workspace->H, workspace->b_t,
-    //   control->cm_solver_q_err, workspace->L, workspace->U, workspace->t,
-    //   mpi_data, out_control->log );
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: second CG completed\n", system->my_rank );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->t[j] = workspace->x[j][1];
+        }
+
+        iters += PIPECG( system, control, data, workspace, workspace->H, workspace->b_t,
+                control->cm_solver_q_err, workspace->t, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][1] = workspace->t[j];
+        }
 #endif
+        break;
+
+    case PIPECR_S:
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
+
+        iters = PIPECR( system, control, data, workspace, workspace->H, workspace->b_s,
+                control->cm_solver_q_err, workspace->s, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->t[j] = workspace->x[j][1];
+        }
+
+        iters += PIPECR( system, control, data, workspace, workspace->H, workspace->b_t,
+                control->cm_solver_q_err, workspace->t, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][1] = workspace->t[j];
+        }
+        break;
+
+    case GMRES_S:
+    case GMRES_H_S:
+    case SDM_S:
+    case BiCGStab_S:
+        fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" );
+        exit( INVALID_INPUT ); /* terminate as the message states, matching the default case, instead of continuing to Calculate_Charges */
+
+    default:
+        fprintf( stderr, "[ERROR] Unrecognized solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
 
     Calculate_Charges( system, workspace, mpi_data );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: computed charges\n", system->my_rank );
     //Print_Charges( system );
@@ -422,7 +561,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
-        data->timing.cm_solver_iters += s_matvecs + t_matvecs;
+        data->timing.cm_solver_iters += iters;
     }
 #endif
 }
diff --git a/PuReMD/src/reax_defs.h b/PuReMD/src/reax_defs.h
index a61ce245a5fc29c580158f4f425805dd16506c5a..4bd9d740b23d0f5d90d118e0783808ebcb901e7e 100644
--- a/PuReMD/src/reax_defs.h
+++ b/PuReMD/src/reax_defs.h
@@ -26,10 +26,10 @@
 #define inline __inline__
 #endif /*IBMC*/
 
-#define SUCCESS  1
-#define FAILURE  0
-#define TRUE  1
-#define FALSE 0
+#define SUCCESS  (1)
+#define FAILURE  (0)
+#define TRUE  (1)
+#define FALSE (0)
 
 #define SQR(x)        ((x)*(x))
 #define CUBE(x)       ((x)*(x)*(x))
@@ -39,66 +39,67 @@
 #define MIN(x,y)      (((x) < (y)) ? (x) : (y))
 #define MAX3(x,y,z)   MAX( MAX(x,y), z)
 
-#define constPI        3.14159265
-#define C_ele          332.06371
-//#define K_B         503.398008   // kcal/mol/K
-#define K_B             0.831687   // amu A^2 / ps^2 / K
-#define F_CONV          1e6 / 48.88821291 / 48.88821291   // --> amu A / ps^2
-#define E_CONV          0.002391   // amu A^2 / ps^2 --> kcal/mol
-#define EV_to_KCALpMOL 14.400000   // ElectronVolt --> KCAL per MOLe
-#define KCALpMOL_to_EV 23.02       // 23.060549 //KCAL per MOLe --> ElectronVolt
-#define ECxA_to_DEBYE   4.803204   // elem. charge * Ang -> debye
-#define CAL_to_JOULES   4.184000   // CALories --> JOULES
-#define JOULES_to_CAL   1/4.184000 // JOULES --> CALories
-#define AMU_to_GRAM     1.6605e-24
-#define ANG_to_CM       1e-8
-#define AVOGNR          6.0221367e23
-#define P_CONV          1e-24 * AVOGNR * JOULES_to_CAL
-
-#define MAX_STR             1024
-#define MAX_LINE            1024
-#define MAX_TOKENS          1024
-#define MAX_TOKEN_LEN       1024
-
-#define MAX_ATOM_ID         100000
-#define MAX_RESTRICT        15
-#define MAX_MOLECULE_SIZE   20
-#define MAX_ATOM_TYPES      25
-
-#define NUM_INTRS      10
-#define ALMOST_ZERO    1e-10
-#define NEG_INF       -1e10
-#define NO_BOND        1e-3  // 0.001
-#define HB_THRESHOLD   1e-2  // 0.01
-
-#define MIN_CAP        50
-#define MIN_NBRS       100
-#define MIN_HENTRIES   100
-#define MAX_BONDS      30
-#define MIN_BONDS      15
-#define MIN_HBONDS     25
-#define MIN_3BODIES    1000
-#define MIN_GCELL_POPL 50
-#define MIN_SEND       100
-#define SAFE_ZONE      1.2
-#define SAFER_ZONE     1.4
-#define DANGER_ZONE    0.90
-#define LOOSE_ZONE     0.75
-#define MAX_3BODY_PARAM     5
-#define MAX_4BODY_PARAM     5
-
-#define MAX_dV              1.01
-#define MIN_dV              0.99
-#define MAX_dT              4.00
-#define MIN_dT              0.00
-
-#define MASTER_NODE 0
-#define MAX_NBRS 6 //27
-#define MYSELF   13  // encoding of relative coordinate (0,0,0)
-
-#define MAX_ITR 10
-#define RESTART 30
-
+#define constPI        (3.14159265)
+#define C_ele          (332.06371)
+//#define K_B         (503.398008)   // kcal/mol/K
+#define K_B             (0.831687)   // amu A^2 / ps^2 / K
+#define F_CONV          (1e6 / 48.88821291 / 48.88821291)   // --> amu A / ps^2
+#define E_CONV          (0.002391)   // amu A^2 / ps^2 --> kcal/mol
+#define EV_to_KCALpMOL (14.400000)   // ElectronVolt --> KCAL per MOLe
+#define KCALpMOL_to_EV (23.02)       // 23.060549 //KCAL per MOLe --> ElectronVolt
+#define ECxA_to_DEBYE   (4.803204)   // elem. charge * Ang -> debye
+#define CAL_to_JOULES   (4.184000)   // CALories --> JOULES
+#define JOULES_to_CAL   (1/4.184000) // JOULES --> CALories
+#define AMU_to_GRAM     (1.6605e-24)
+#define ANG_to_CM       (1e-8)
+#define AVOGNR          (6.0221367e23)
+#define P_CONV          (1e-24 * AVOGNR * JOULES_to_CAL)
+
+#define MAX_STR             (1024)
+#define MAX_LINE            (1024)
+#define MAX_TOKENS          (1024)
+#define MAX_TOKEN_LEN       (1024)
+
+#define MAX_ATOM_ID         (100000)
+#define MAX_RESTRICT        (15)
+#define MAX_MOLECULE_SIZE   (20)
+#define MAX_ATOM_TYPES      (25)
+
+#define NUM_INTRS      (10)
+#define ALMOST_ZERO    (1e-10)
+#define NEG_INF       (-1e10)
+#define NO_BOND        (1e-3)  // 0.001
+#define HB_THRESHOLD   (1e-2)  // 0.01
+
+#define MIN_CAP        (50)
+#define MIN_NBRS       (100)
+#define MIN_HENTRIES   (100)
+#define MAX_BONDS      (30)
+#define MIN_BONDS      (15)
+#define MIN_HBONDS     (25)
+#define MIN_3BODIES    (1000)
+#define MIN_GCELL_POPL (50)
+#define MIN_SEND       (100)
+#define SAFE_ZONE      (1.2)
+#define SAFER_ZONE     (1.4)
+#define SAFE_ZONE_NT   (2.0)
+#define SAFER_ZONE_NT  (2.5)
+#define DANGER_ZONE    (0.90)
+#define LOOSE_ZONE     (0.75)
+#define MAX_3BODY_PARAM     (5)
+#define MAX_4BODY_PARAM     (5)
+
+#define MAX_dV              (1.01)
+#define MIN_dV              (0.99)
+#define MAX_dT              (4.00)
+#define MIN_dT              (0.00)
+
+#define MASTER_NODE (0)
+#define MAX_NBRS (6) //27
+#define MYSELF   (13)  // encoding of relative coordinate (0,0,0)
+
+#define MAX_ITR (10)
+#define RESTART (30)
 
 
 /******************* ENUMERATIONS *************************/
@@ -125,15 +126,17 @@ enum message_tags { INIT = 0, UPDATE = 1, BNDRY = 2, UPDATE_BNDRY = 3,
 enum errors { FILE_NOT_FOUND = -10, UNKNOWN_ATOM_TYPE = -11,
               CANNOT_OPEN_FILE = -12, CANNOT_INITIALIZE = -13,
               INSUFFICIENT_MEMORY = -14, UNKNOWN_OPTION = -15,
-              INVALID_INPUT = -16, INVALID_GEO = -17
+              INVALID_INPUT = -16, INVALID_GEO = -17,
+              RUNTIME_ERROR = -18,
             };
 
 enum exchanges { NONE = 0, NEAR_EXCH = 1, FULL_EXCH = 2 };
 
 enum gcell_types { NO_NBRS = 0, NEAR_ONLY = 1, HBOND_ONLY = 2, FAR_ONLY = 4,
                    NEAR_HBOND = 3, NEAR_FAR = 5, HBOND_FAR = 6, FULL_NBRS = 7,
-                   NATIVE = 8
+                   NATIVE = 8, NT_NBRS = 9 // 9 through 14
                  };
+enum nt_atom_type { TOWER = 1, PLATE = 2 };
 
 enum atoms { C_ATOM = 0, H_ATOM = 1, O_ATOM = 2, N_ATOM = 3,
              S_ATOM = 4, SI_ATOM = 5, GE_ATOM = 6, X_ATOM = 7
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index dfaa9c37af08cb5ddbc695d44555c2accd673ac7..136a5ff5728b9ee859355cfdea4a0bbe58614e65 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -40,6 +40,8 @@
 /************* SOME DEFS - crucial for reax_types.h *********/
 
 #define PURE_REAX
+#define DUAL_SOLVER
+//#define NEUTRAL_TERRITORY
 //#define LAMMPS_REAX
 //#define DEBUG
 //#define DEBUG_FOCUS
@@ -85,6 +87,8 @@ enum solver
     CG_S = 2,
     SDM_S = 3,
     BiCGStab_S = 4,
+    PIPECG_S = 5,
+    PIPECR_S = 6,
 };
 
 /* preconditioner computation type for charge method linear solver */
@@ -196,8 +200,8 @@ typedef struct
 
 typedef struct
 {
-    MPI_Comm     world;
-    MPI_Comm     comm_mesh3D;
+    MPI_Comm world;
+    MPI_Comm comm_mesh3D;
 
     MPI_Datatype sys_info;
     MPI_Datatype mpi_atom_type;
@@ -219,6 +223,11 @@ typedef struct
 
     void *in1_buffer;
     void *in2_buffer;
+
+#if defined(NEUTRAL_TERRITORY)
+    mpi_out_data out_nt_buffers[REAX_MAX_NT_NBRS];
+    void *in_nt_buffer[REAX_MAX_NT_NBRS];
+#endif
 } mpi_datatypes;
 
 
@@ -431,6 +440,10 @@ typedef struct
     int num_bonds;
     int num_hbonds;
     int renumber;
+#if defined(NEUTRAL_TERRITORY)
+    int nt_dir;
+    int pos;
+#endif
 } reax_atom;
 
 
@@ -496,6 +509,9 @@ typedef struct
 typedef struct
 {
     int  rank;
+#if defined(NEUTRAL_TERRITORY)
+    int  receive_rank;
+#endif
     int  est_send, est_recv;
     int  atoms_str, atoms_cnt;
     ivec rltv, prdc;
@@ -540,12 +556,17 @@ typedef struct
     int              wsize, my_rank, num_nbrs;
     ivec             my_coords;
     neighbor_proc    my_nbrs[REAX_MAX_NBRS];
+    neighbor_proc    my_nt_nbrs[REAX_MAX_NT_NBRS];
     int             *global_offset;
     simulation_box   big_box, my_box, my_ext_box;
     grid             my_grid;
     boundary_cutoff  bndry_cuts;
 
     reax_atom       *my_atoms;
+
+#if defined(NEUTRAL_TERRITORY)
+    int num_nt_nbrs;
+#endif
 } reax_system;
 
 
@@ -771,10 +792,16 @@ typedef struct
     real bonded;
     /* non-bonded force calculation time */
     real nonb;
+    /* distance between pairs calculation time */
+    real init_dist;
+    /* charge matrix calculation time */
+    real init_cm;
+    /* bonded interactions calculation time */
+    real init_bond;
     /* atomic charge distribution calculation time */
     real cm;
     /**/
-    real cm_sort_mat_rows;
+    real cm_sort;
     /**/
     real cm_solver_comm;
     /**/
@@ -782,13 +809,13 @@ typedef struct
     /**/
     real cm_solver_pre_comp;
     /**/
-    real cm_solver_pre_app; // update CG()
+    real cm_solver_pre_app;
     /* num. of steps in iterative linear solver for charge distribution */
     int cm_solver_iters;
     /**/
-    real cm_solver_spmv; // update CG()
+    real cm_solver_spmv;
     /**/
-    real cm_solver_vector_ops; // update CG()
+    real cm_solver_vector_ops;
     /**/
     real cm_solver_orthog;
     /**/
@@ -870,20 +897,40 @@ typedef struct
 } three_body_interaction_data;
 
 
+typedef struct
+{
+    /* neighbor atom IDs */
+    int *nbr;
+    /* set of three integers which determine if the neighbor
+     * atom is a non-periodic neighbor (all zeros) or a periodic
+     * neighbor and which periodic image this neighbor comes from */
+    ivec *rel_box;
+    /* distance to the neighboring atom */
+    real *d;
+    /* difference between positions of this atom and its neighboring atom */
+    rvec *dvec;
+} far_neighbor_data;
+
+
+#if defined(NEUTRAL_TERRITORY)
 typedef struct
 {
     int nbr;
     ivec rel_box;
     real d;
     rvec dvec;
-} far_neighbor_data;
+} nt_neighbor_data;
+#endif
 
 
 typedef struct
 {
+    /* neighbor atom ID */
     int nbr;
+    /* ??? */
     int scl;
-    far_neighbor_data *ptr;
+    /* position of neighbor in far neighbor list */
+    int ptr;
 } hbond_data;
 
 
@@ -950,6 +997,9 @@ typedef struct
     /* matrix storage format */
     int format;
     int cap, n, m;
+#if defined(NEUTRAL_TERRITORY)
+    int NT;
+#endif
     int *start, *end;
     sparse_matrix_entry *entries;
 } sparse_matrix;
@@ -983,9 +1033,9 @@ typedef struct
     real *dDelta_lp, *dDelta_lp_temp;
     real *nlp, *nlp_temp, *Clp, *vlpex;
     rvec *dDeltap_self;
-    int *bond_mark, *done_after;
+    int *bond_mark;
 
-    /* QEq storage */
+    /* charge matrix storage */
     sparse_matrix *H;
     sparse_matrix *L;
     sparse_matrix *U;
@@ -998,21 +1048,28 @@ typedef struct
     rvec2 *b, *x;
 
     /* GMRES storage */
-    real *y, *z, *g;
+    real *y, *g;
     real *hc, *hs;
     real **h, **v;
-    /* CG storage */
-    real *r, *d, *q, *p;
-    rvec2 *r2, *d2, *q2, *p2;
+    /* GMRES, PIPECG, PIPECR storage */
+    real *z;
+    /* CG, PIPECG, PIPECR storage */
+    real *d, *p, *q, *r;
+    /* PIPECG, PIPECR storage */
+    real *m, *n, *u, *w;
+    /* dual-CG storage */
+    rvec2 *d2, *p2, *q2, *r2;
+    /* dual-PIPECG storage */
+    rvec2 *m2, *n2, *u2, *w2, *z2;
     /* Taper */
-    real Tap[8]; //Tap7, Tap6, Tap5, Tap4, Tap3, Tap2, Tap1, Tap0;
+    real Tap[8];
 
     /* storage for analysis */
-    int  *mark, *old_mark;
+    int *mark, *old_mark;
     rvec *x_old;
 
     /* storage space for bond restrictions */
-    int  *restricted;
+    int *restricted;
     int **restricted_list;
 
     /* integrator */
@@ -1088,7 +1145,10 @@ typedef struct
     bond_data *bond_list;
     dbond_data *dbo_list;
     dDelta_data *dDelta_list;
-    far_neighbor_data *far_nbr_list;
+    far_neighbor_data far_nbr_list;
+#if defined(NEUTRAL_TERRITORY)
+    nt_neighbor_data *nt_nbr_list;
+#endif
     hbond_data *hbond_list;
 } reax_list;
 
@@ -1122,7 +1182,7 @@ typedef struct
     int   write_steps;
     int   traj_compress;
     int   traj_method;
-    char  traj_title[81];
+    char  traj_title[REAX_MAX_STR];
     int   atom_info;
     int   bond_info;
     int   angle_info;
diff --git a/PuReMD/src/restart.c b/PuReMD/src/restart.c
index b687d82abc0a2618b0cc7d496f39b8630d51d450..bdd2476f74fdbe797c009643b021ef69f1fd78a6 100644
--- a/PuReMD/src/restart.c
+++ b/PuReMD/src/restart.c
@@ -49,11 +49,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
     {
         /* master handles the restart file */
         sprintf( fname, "%s.res%d", control->sim_name, data->step );
-        if ( (fres = fopen( fname, "wb" )) == NULL )
-        {
-            fprintf( stderr, "ERROR: can't open the restart file! terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
-        }
+        fres = sfopen( fname, "wb", "Write_Binary_Restart" );
 
         /* master can write the header by itself */
         res_header.step = data->step;
@@ -108,7 +104,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control,
     if ( me == MASTER_NODE )
     {
         fwrite( buffer, system->bigN, sizeof(restart_atom), fres );
-        fclose( fres );
+        sfclose( fres, "Write_Binary_Restart" );
     }
 
     sfree(buffer, "buffer");
@@ -139,11 +135,7 @@ void Write_Restart( reax_system *system, control_params *control,
     if ( me == MASTER_NODE )
     {
         sprintf( fname, "%s.res%d", control->sim_name, data->step );
-        if ( (fres = fopen( fname, "w" )) == NULL )
-        {
-            fprintf( stderr, "ERROR: can't open the restart file! terminating...\n" );
-            MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND );
-        }
+        fres = sfopen( fname, "w", "Write_Restart" );
 
         /* write the header - only master writes it */
         fprintf( fres, RESTART_HEADER,
@@ -204,7 +196,7 @@ void Write_Restart( reax_system *system, control_params *control,
     if ( me == MASTER_NODE )
     {
         fprintf( fres, "%s", buffer );
-        fclose( fres );
+        sfclose( fres, "Write_Restart" );
     }
     sfree(buffer, "buffer");
     sfree(line, "line");
@@ -250,11 +242,7 @@ void Read_Binary_Restart( char *res_file, reax_system *system,
 
     comm = MPI_COMM_WORLD;
 
-    if ( (fres = fopen(res_file, "rb")) == NULL )
-    {
-        fprintf( stderr, "ERROR: cannot open the restart file! terminating...\n" );
-        MPI_Abort( comm, FILE_NOT_FOUND );
-    }
+    fres = sfopen( res_file, "rb", "Read_Binary_Restart" );
 
     /* first read the header lines */
     fread(&res_header, sizeof(restart_header), 1, fres);
@@ -313,7 +301,7 @@ void Read_Binary_Restart( char *res_file, reax_system *system,
         }
     }
 
-    fclose( fres );
+    sfclose( fres, "Read_Binary_Restart" );
 
     data->step = data->prev_steps;
     // nsteps is updated based on the number of steps in the previous run
@@ -368,11 +356,7 @@ void Read_Restart( char *res_file, reax_system *system,
 
     comm = MPI_COMM_WORLD;
 
-    if ( (fres = fopen(res_file, "r")) == NULL )
-    {
-        fprintf( stderr, "ERROR: cannot open the restart file! terminating...\n" );
-        MPI_Abort( comm, FILE_NOT_FOUND );
-    }
+    fres = sfopen( res_file, "r", "Read_Restart" );
 
     s = (char*) malloc(sizeof(char) * MAX_LINE);
     tmp = (char**) malloc(sizeof(char*)*MAX_TOKENS);
@@ -464,7 +448,7 @@ void Read_Restart( char *res_file, reax_system *system,
             top++;
         }
     }
-    fclose( fres );
+    sfclose( fres, "Read_Restart" );
     /* free memory allocations at the top */
     for ( i = 0; i < MAX_TOKENS; i++ )
         sfree( tmp[i], "tmp[i]" );
diff --git a/PuReMD/src/tool_box.c b/PuReMD/src/tool_box.c
index d898f2190d72de1424eea2730ed506e9e04ec610..9374fc3c5fe04aeedb1838fde0288258ff8f30c4 100644
--- a/PuReMD/src/tool_box.c
+++ b/PuReMD/src/tool_box.c
@@ -318,6 +318,8 @@ void Trim_Spaces( char *element )
 struct timeval tim;
 real t_end;
 
+// NOTE: these timing functions are not being used
+// replaced by MPI_Wtime()
 real Get_Time( )
 {
     gettimeofday(&tim, NULL );
diff --git a/PuReMD/src/torsion_angles.c b/PuReMD/src/torsion_angles.c
index f915fdb8325de91f5c5ea72f47e36ae0ab306992..4719aa33c2d79b6c07636afb69cd8dfa5857c63f 100644
--- a/PuReMD/src/torsion_angles.c
+++ b/PuReMD/src/torsion_angles.c
@@ -197,7 +197,7 @@ void Torsion_Angles( reax_system *system, control_params *control,
     // FILE *ftor;
 
     // sprintf( fname, "tor%d.out", system->my_rank );
-    // ftor = fopen( fname, "w" );
+    // ftor = sfopen( fname, "w", "Torsion_Angles" );
 
     natoms = system->n;
 
@@ -342,7 +342,8 @@ void Torsion_Angles( reax_system *system, control_params *control,
                                                  fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
                                                  fbp->V3 * (1.0 + cos3omega) );
 
-                                    data->my_en.e_tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                    e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                    data->my_en.e_tor += e_tor;
 
                                     dfn11 = (-p_tor3 * exp_tor3_DjDk +
                                              (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
@@ -375,9 +376,8 @@ void Torsion_Angles( reax_system *system, control_params *control,
 
                                     /* 4-body conjugation energy */
                                     fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                    data->my_en.e_con += e_con =
-                                                             fbp->p_cot1 * fn12 *
-                                                             (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+                                    e_con = fbp->p_cot1 * fn12 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+                                    data->my_en.e_con += e_con;
 
                                     Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 *
                                             (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
diff --git a/PuReMD/src/traj.c b/PuReMD/src/traj.c
index d03a8289f22639477db5c9c7b3af919d37adba9e..e189c4fc2c479c6bb0a5b7a7180e267c820d29d2 100644
--- a/PuReMD/src/traj.c
+++ b/PuReMD/src/traj.c
@@ -527,7 +527,7 @@ int Init_Traj( reax_system *system, control_params *control,
     else if ( out_control->traj_method == REG_TRAJ)
     {
         if ( system->my_rank == MASTER_NODE )
-            out_control->strj = fopen( fname, "w" );
+            out_control->strj = sfopen( fname, "w", "Init_Traj" );
     }
     else
     {
@@ -538,7 +538,7 @@ int Init_Traj( reax_system *system, control_params *control,
     if ( out_control->traj_method == REG_TRAJ)
     {
         if ( system->my_rank == MASTER_NODE )
-            out_control->strj = fopen( fname, "w" );
+            out_control->strj = sfopen( fname, "w", "Init_Traj" );
     }
     else
     {
@@ -1116,10 +1116,10 @@ int End_Traj( int my_rank, output_controls *out_control )
     if ( out_control->traj_method == MPI_TRAJ )
         MPI_File_close( &(out_control->trj) );
     else if ( my_rank == MASTER_NODE )
-        fclose( out_control->strj );
+        sfclose( out_control->strj, "End_Traj" );
 #elif defined(LAMMPS_REAX)
     if ( my_rank == MASTER_NODE )
-        fclose( out_control->strj );
+        sfclose( out_control->strj, "End_Traj" );
 #endif
 
     sfree( out_control->buffer, "out_control->buffer" );
diff --git a/PuReMD/src/vector.c b/PuReMD/src/vector.c
index 27c33db810ca7e1458f4226b35a1f885072baf17..ee4cf8ee96595150683526f34c26f1b1ed01251f 100644
--- a/PuReMD/src/vector.c
+++ b/PuReMD/src/vector.c
@@ -27,68 +27,102 @@
 #include "reax_vector.h"
 #endif
 
+
 int Vector_isZero( real* v, int k )
 {
-    for ( --k; k >= 0; --k )
-        if ( fabs( v[k] ) > ALMOST_ZERO )
-            return 0;
+    int i, ret;
 
-    return 1;
+    ret = 1;
+
+    for ( i = 0; i < k; ++i )
+    {
+        if ( fabs( v[i] ) > ALMOST_ZERO )
+        {
+            ret = 0;
+            break;
+        }
+    }
+
+    return ret;
 }
 
 
 void Vector_MakeZero( real *v, int k )
 {
-    for ( --k; k >= 0; --k )
-        v[k] = 0;
+    int i;
+
+    for ( i = 0; i < k; ++i )
+    {
+        v[i] = 0;
+    }
 }
 
 
 void Vector_Copy( real* dest, real* v, int k )
 {
-    for ( --k; k >= 0; --k )
-        dest[k] = v[k];
+    int i;
+
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = v[i];
+    }
 }
 
 
 void Vector_Scale( real* dest, real c, real* v, int k )
 {
-    for ( --k; k >= 0; --k )
-        dest[k] = c * v[k];
+    int i;
+
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i];
+    }
 }
 
 
-void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+real Dot( real* v1, real* v2, int k )
 {
-    for ( --k; k >= 0; --k )
-        dest[k] = c * v[k] + d * y[k];
-}
+    int i;
+    real ret;
 
+    ret = 0.0;
 
-void Vector_Add( real* dest, real c, real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] += c * v[k];
+    for ( i = 0; i < k; ++i )
+    {
+        ret += v1[i] * v2[i];
+    }
+
+    return ret;
 }
 
 
-real Dot( real* v1, real* v2, int k )
+real Dot_local( real *v1, real *v2, int k )
 {
-    real ret = 0;
+    int i;
+    real sum;
 
-    for ( --k; k >= 0; --k )
-        ret +=  v1[k] * v2[k];
+    sum = 0.0;
 
-    return ret;
+    for ( i = 0; i < k; ++i )
+    {
+        sum += v1[i] * v2[i];
+    }
+
+    return sum;
 }
 
 
 real Norm( real* v1, int k )
 {
-    real ret = 0;
+    int i;
+    real ret;
+
+    ret = 0.0;
 
-    for ( --k; k >= 0; --k )
-        ret +=  SQR( v1[k] );
+    for ( i = 0; i < k; ++i )
+    {
+        ret +=  SQR( v1[i] );
+    }
 
     return sqrt( ret );
 }
diff --git a/PuReMD/src/vector.h b/PuReMD/src/vector.h
index 199810eb9203a983870b12b4686702256bd81288..1b12342307e180eac1ecdfb96d894ce9f6be41bb 100644
--- a/PuReMD/src/vector.h
+++ b/PuReMD/src/vector.h
@@ -25,65 +25,139 @@
 #include "reax_types.h"
 #include "reax_defs.h"
 
+
 int  Vector_isZero( real*, int );
+/* Vector_MakeZero( v, k ): sets v[0] .. v[k-1] to 0. */
 void Vector_MakeZero( real*, int );
+/* Vector_Copy( dest, v, k ): copies v[0] .. v[k-1] into dest. */
 void Vector_Copy( real*, real*, int );
+/* Vector_Scale( dest, c, v, k ): dest[i] = c * v[i] for i in [0, k). */
 void Vector_Scale( real*, real, real*, int );
-void Vector_Sum( real*, real, real*, real, real*, int );
-void Vector_Add( real*, real, real*, int );
+
+
+static inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+{
+    int i;
+    /* dest[i] = c * v[i] + d * y[i] for every i in [0, k); dest is overwritten, not accumulated into. */
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i] + d * y[i];
+    }
+}
+
+
+static inline void Vector_Add( real* dest, real c, real* v, int k )
+{
+    int i;
+    /* dest[i] += c * v[i] for every i in [0, k); accumulates into the existing contents of dest. */
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] += c * v[i];
+    }
+}
+
+
 real Dot( real*, real*, int );
+/* Dot_local( v1, v2, k ): sum of v1[i] * v2[i] for i in [0, k); its body in vector.c is the same loop as Dot's. */
+real Dot_local( real*, real*, int );
+/* Norm( v1, k ): sqrt of the sum of SQR( v1[i] ) for i in [0, k). */
 real Norm( real*, int );
+
 void Vector_Print( FILE*, char*, real*, int );
 
 void rvec_Copy( rvec, rvec );
+
 void rvec_Scale( rvec, real, rvec );
+
 void rvec_Add( rvec, rvec );
+
 void rvec_ScaledAdd( rvec, real, rvec );
+
 void rvec_Sum( rvec, rvec, rvec );
+
 void rvec_ScaledSum( rvec, real, rvec, real, rvec );
+
 real rvec_Dot( rvec, rvec );
+
 real rvec_ScaledDot( real, rvec, real, rvec );
+
 void rvec_Multiply( rvec, rvec, rvec );
+
 void rvec_iMultiply( rvec, ivec, rvec );
+
 void rvec_Divide( rvec, rvec, rvec );
+
 void rvec_iDivide( rvec, rvec, ivec );
+
 void rvec_Invert( rvec, rvec );
+
 void rvec_Cross( rvec, rvec, rvec );
+
 void rvec_OuterProduct( rtensor, rvec, rvec );
+
 real rvec_Norm_Sqr( rvec );
+
 real rvec_Norm( rvec );
+
 int  rvec_isZero( rvec );
+
 void rvec_MakeZero( rvec );
+
 void rvec_Random( rvec );
 
 void rtensor_MakeZero( rtensor );
+
 void rtensor_Multiply( rtensor, rtensor, rtensor );
+
 void rtensor_MatVec( rvec, rtensor, rvec );
+
 void rtensor_Scale( rtensor, real, rtensor );
+
 void rtensor_Add( rtensor, rtensor );
+
 void rtensor_ScaledAdd( rtensor, real, rtensor );
+
 void rtensor_Sum( rtensor, rtensor, rtensor );
+
 void rtensor_ScaledSum( rtensor, real, rtensor, real, rtensor );
+/* NOTE(review): rtensor_Scale is already declared earlier in this prototype list; the repeated declaration below is redundant. */
 void rtensor_Scale( rtensor, real, rtensor );
+
 void rtensor_Copy( rtensor, rtensor );
+
 void rtensor_Identity( rtensor );
+
 void rtensor_Transpose( rtensor, rtensor );
+
 real rtensor_Det( rtensor );
+
 real rtensor_Trace( rtensor );
 
 void Print_rTensor(FILE*, rtensor);
 
-int  ivec_isZero( ivec );
-int  ivec_isEqual( ivec, ivec );
+int ivec_isZero( ivec );
+
+int ivec_isEqual( ivec, ivec );
+
 void ivec_MakeZero( ivec );
+
 void ivec_Copy( ivec, ivec );
+
 void ivec_Scale( ivec, real, ivec );
+
 void ivec_rScale( ivec, real, rvec );
+
 void ivec_Sum( ivec, ivec, ivec );
+
 void ivec_ScaledSum( ivec, int, ivec, int, ivec );
+
 void ivec_Add( ivec, ivec );
+
 void ivec_ScaledAdd( ivec, int, ivec );
+
 void ivec_Max( ivec, ivec, ivec );
+
 void ivec_Max3( ivec, ivec, ivec, ivec );
 
+
 #endif