diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index 0d9bcc4006e4d6e3aa51dea91700bbb4b391a877..ef991d27c87620bc272f6c50c6e82e387819b756 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -290,6 +290,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         data->timing.cm = ZERO;
         data->timing.cm_sort_mat_rows = ZERO;
         data->timing.cm_solver_comm = ZERO;
+        data->timing.cm_solver_allreduce = ZERO;
         data->timing.cm_solver_pre_comp = ZERO;
         data->timing.cm_solver_pre_app = ZERO;
         data->timing.cm_solver_iters = 0;
@@ -363,6 +364,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         data->timing.cm = ZERO;
         data->timing.cm_sort_mat_rows = ZERO;
         data->timing.cm_solver_comm = ZERO;
+        data->timing.cm_solver_allreduce = ZERO;
         data->timing.cm_solver_pre_comp = ZERO;
         data->timing.cm_solver_pre_app = ZERO;
         data->timing.cm_solver_iters = 0;
diff --git a/PuReMD/src/io_tools.c b/PuReMD/src/io_tools.c
index 122b4e4eb0c73e6c29b2400e761a90577e5f4698..ef7a76c73d24496aad99cef2d7b60693fa9c98b8 100644
--- a/PuReMD/src/io_tools.c
+++ b/PuReMD/src/io_tools.c
@@ -113,9 +113,9 @@ int Init_Output_Files( reax_system *system, control_params *control,
             sprintf( temp, "%s.log", control->sim_name );
             if ( (out_control->log = fopen( temp, "w" )) != NULL )
             {
-                fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
-                        "step", "total", "comm", "neighbors", "init", "bonded",
-                        "nonbonded", "CM", "CM Sort", "S iters", "S comm", "Pre Comp", "Pre App",
+                fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+                        "step", "total", "comm", "neighbors", "init", "bonded", "nonbonded", 
+                        "CM", "CM Sort", "S iters", "Pre Comp", "Pre App", "S comm", "S allr",
                         "S spmv", "S vec ops", "S orthog", "S tsolve" );
                 fflush( out_control->log );
             }
@@ -1267,7 +1267,7 @@ void Output_Results( reax_system *system, control_params *control,
                 denom = 1.0 / out_control->energy_update_freq;
             else denom = 1;
 
-            fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
+            fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
                     data->step,
                     t_elapsed * denom,
                     data->timing.comm * denom,
@@ -1278,9 +1278,10 @@ void Output_Results( reax_system *system, control_params *control,
                     data->timing.cm * denom,
                     data->timing.cm_sort_mat_rows * denom,
                     (double)( data->timing.cm_solver_iters * denom),
-                    data->timing.cm_solver_comm * denom,
                     data->timing.cm_solver_pre_comp * denom,
                     data->timing.cm_solver_pre_app * denom,
+                    data->timing.cm_solver_comm * denom,
+                    data->timing.cm_solver_allreduce * denom,
                     data->timing.cm_solver_spmv * denom,
                     data->timing.cm_solver_vector_ops * denom,
                     data->timing.cm_solver_orthog * denom,
@@ -1295,9 +1296,10 @@ void Output_Results( reax_system *system, control_params *control,
             data->timing.nonb = 0;
             data->timing.cm = ZERO;
             data->timing.cm_sort_mat_rows = ZERO;
-            data->timing.cm_solver_comm = ZERO;
             data->timing.cm_solver_pre_comp = ZERO;
             data->timing.cm_solver_pre_app = ZERO;
+            data->timing.cm_solver_comm = ZERO;
+            data->timing.cm_solver_allreduce = ZERO;
             data->timing.cm_solver_iters = 0;
             data->timing.cm_solver_spmv = ZERO;
             data->timing.cm_solver_vector_ops = ZERO;
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 34b3f8e9b38157abd5797d9b33beace90a6df492..2d09f7daf4f570b4d52e7641cabc558dc3361d92 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -387,7 +387,7 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st
     MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
     t_comm += Get_Timing_Info( t_start );
 
-    // int nnz = 0; uncomment to check the nnz's in the sparsity pattern
+    //int nnz = 0; //uncomment to check the nnz's in the sparsity pattern
 
     /* build entries of that pattern*/
     for ( i = 0; i < A->n; ++i )
@@ -509,6 +509,8 @@ real sparse_approx_inverse(reax_system *system, simulation_data *data, storage *
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
 
+    //fprintf(stderr, "Before dist call, p%d\n", system->my_rank );
+
     /*  use a Dist-like approach to send the row information */
     for ( d = 0; d < 3; ++d)
     {
@@ -654,8 +656,8 @@ real sparse_approx_inverse(reax_system *system, simulation_data *data, storage *
                 }
 
                 t_start = Get_Time( );
-                MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm );
-                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm );
+                MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm );
                 t_comm += Get_Timing_Info( t_start );
             }
 
@@ -711,6 +713,9 @@ real sparse_approx_inverse(reax_system *system, simulation_data *data, storage *
             }
         }
     }
+    
+    //fprintf(stderr, "After dist call, p%d\n", system->my_rank );
+    //fflush(stderr);
 
     X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
     pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) );
@@ -1188,13 +1193,14 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;
     real sig_old, sig_new;
-    real t_start, t_pa, t_spmv, t_vops, t_comm;
-    real total_pa, total_spmv, total_vops, total_comm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real total_pa, total_spmv, total_vops, total_comm, total_allreduce;
 
     t_pa = 0.0;
     t_spmv = 0.0;
     t_vops = 0.0;
     t_comm = 0.0;
+    t_allreduce = 0.0;
 
     t_start = Get_Time( );
     scale = sizeof(real) / sizeof(void);
@@ -1260,6 +1266,10 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
 
         t_start = Get_Time( );
         tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
+        //TODO: t_allreduce currently accumulates the whole Parallel_Dot call; time only the all-reduce
+        t_allreduce += Get_Timing_Info ( t_start );
+
+        t_start = Get_Time( );
         alpha = sig_new / tmp;
         Vector_Add( x, alpha, workspace->d, system->n );
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
@@ -1290,6 +1300,10 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
         t_start = Get_Time( );
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
+        //TODO: t_allreduce currently accumulates the whole Parallel_Dot call; time only the all-reduce
+        t_allreduce += Get_Timing_Info( t_start );
+
+        t_start = Get_Time( );
         beta = sig_new / sig_old;
         Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
         t_vops += Get_Timing_Info( t_start );
@@ -1299,6 +1313,7 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
     MPI_Reduce(&t_spmv, &total_spmv, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
     MPI_Reduce(&t_vops, &total_vops, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
     MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    MPI_Reduce(&t_allreduce, &total_allreduce, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
 
     if( system->my_rank == MASTER_NODE )
     {
@@ -1306,6 +1321,7 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
         data->timing.cm_solver_spmv += total_spmv / nprocs;
         data->timing.cm_solver_vector_ops += total_vops / nprocs;
         data->timing.cm_solver_comm += total_comm / nprocs;
+        data->timing.cm_solver_allreduce += total_allreduce / nprocs;
     }
 
     MPI_Barrier(mpi_data->world);
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index f9933ffe9b3181c2555dd6708af2614a1608e506..56a8c1c9841d0f6ea67f74d364277ff9a768f5d3 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -766,6 +766,8 @@ typedef struct
     /**/
     real cm_solver_comm;
     /**/
+    real cm_solver_allreduce;
+    /**/
     real cm_solver_pre_comp;
     /**/
     real cm_solver_pre_app; // update CG()