diff --git a/PuReMD/src/forces.c b/PuReMD/src/forces.c
index cb30f01d67a567c4145b14bf618f0588f1eb7bbc..21823d49de3dd295bad85bfa20dbc7f73c540bbd 100644
--- a/PuReMD/src/forces.c
+++ b/PuReMD/src/forces.c
@@ -328,7 +328,7 @@ real Compute_tabH( real r_ij, int ti, int tj )
 
 void Init_Forces( reax_system *system, control_params *control,
                   simulation_data *data, storage *workspace, reax_list **lists,
-                  output_controls *out_control, MPI_Comm comm )
+                  output_controls *out_control, MPI_Comm comm, mpi_datatypes *mpi_data )
 {
     int i, j, pj;
     int start_i, end_i;
@@ -353,6 +353,9 @@ void Init_Forces( reax_system *system, control_params *control,
     int total_sum[6];
     int nt_flag;
 #endif
+    real t_start, t_cm_init, total_cm_init;
+
+    t_cm_init = 0;
 
     far_nbrs = lists[FAR_NBRS];
     bonds = lists[BONDS];
@@ -376,6 +379,7 @@ void Init_Forces( reax_system *system, control_params *control,
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
 
 #if defined(NEUTRAL_TERRITORY)
+    t_start = Get_Time( );
     nt_flag = 1;
     if( renbr )
     {
@@ -417,6 +421,7 @@ void Init_Forces( reax_system *system, control_params *control,
 
     mark[0] = mark[1] = 1;
     mark[2] = mark[3] = mark[4] = mark[5] = 2;
+    t_cm_init += Get_Timing_Info( t_start );
 #endif
 
     for ( i = 0; i < system->N; ++i )
@@ -461,10 +466,12 @@ void Init_Forces( reax_system *system, control_params *control,
         ihb_top = -1;
         if ( local == 1 )
         {
+            t_start = Get_Time( );
             H->start[i] = Htop;
             H->entries[Htop].j = i;
             H->entries[Htop].val = sbp_i->eta;
             ++Htop;
+            t_cm_init += Get_Timing_Info( t_start );
 
             if ( control->hbond_cut > 0 )
             {
@@ -527,6 +534,7 @@ void Init_Forces( reax_system *system, control_params *control,
 
                 if ( local == 1 )
                 {
+                    t_start = Get_Time( );
                     /* H matrix entry */
 #if defined(NEUTRAL_TERRITORY)
                     if ( atom_j->nt_dir > 0 || (j < system->n
@@ -572,6 +580,7 @@ void Init_Forces( reax_system *system, control_params *control,
                         ++Htop;
                     }
 #endif
+                    t_cm_init += Get_Timing_Info( t_start );
 
                     /* hydrogen bond lists */
                     if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
@@ -603,6 +612,7 @@ void Init_Forces( reax_system *system, control_params *control,
 #if defined(NEUTRAL_TERRITORY)
                 else if ( local == 2 )
                 {
+                    t_start = Get_Time( );
                     /* H matrix entry */
                     if( ( atom_j->nt_dir != -1 && mark[atom_i->nt_dir] != mark[atom_j->nt_dir] 
                                 && ( H->format == SYM_FULL_MATRIX
@@ -635,6 +645,7 @@ void Init_Forces( reax_system *system, control_params *control,
 
                         ++Htop;
                     }
+                    t_cm_init += Get_Timing_Info( t_start );
                 }
 #endif
 
@@ -659,6 +670,7 @@ void Init_Forces( reax_system *system, control_params *control,
         }
 
         Set_End_Index( i, btop_i, bonds );
+        t_start = Get_Time( );
         if ( local == 1 )
         {
             H->end[i] = Htop;
@@ -679,6 +691,7 @@ void Init_Forces( reax_system *system, control_params *control,
             }
         }
 #endif
+        t_cm_init += Get_Timing_Info( t_start );
     }
 
     if ( far_nbrs->format == FULL_LIST )
@@ -712,6 +725,12 @@ void Init_Forces( reax_system *system, control_params *control,
         }
     }
 
+    MPI_Reduce(&t_cm_init, &total_cm_init, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.init_qeq += total_cm_init / control->nprocs;
+    }
+
 #if defined(DEBUG)
     Print_Sparse_Matrix2( system, H, NULL );
 #endif
@@ -1178,16 +1197,16 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ estimate storages: Htop = %d, num_3body = %d\n",
-             system->my_rank, *Htop, *num_3body );
+            system->my_rank, *Htop, *num_3body );
     MPI_Barrier( comm );
 #endif
 }
 
 
 void Compute_Forces( reax_system *system, control_params *control,
-                     simulation_data *data, storage *workspace,
-                     reax_list **lists, output_controls *out_control,
-                     mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     MPI_Comm comm;
     int qeq_flag;
@@ -1209,7 +1228,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     qeq_flag = 0;
 #endif
     if ( qeq_flag )
-        Init_Forces( system, control, data, workspace, lists, out_control, comm );
+        Init_Forces( system, control, data, workspace, lists, out_control, comm, mpi_data );
     else
         Init_Forces_noQEq( system, control, data, workspace,
                            lists, out_control, comm );
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index 2df6f5e92ba7849047264cb39ae4a9d211007a20..f3efe9d1ac25acde1d4ea1bb0d58bf64fb834449 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -291,7 +291,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         data->timing.bonded = 0;
         data->timing.nonb = 0;
         data->timing.cm = ZERO;
-        data->timing.cm_sort_mat_rows = ZERO;
+        data->timing.init_qeq = ZERO;
         data->timing.cm_solver_comm = ZERO;
         data->timing.cm_solver_allreduce = ZERO;
         data->timing.cm_solver_pre_comp = ZERO;
@@ -365,7 +365,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
         data->timing.bonded = 0;
         data->timing.nonb = 0;
         data->timing.cm = ZERO;
-        data->timing.cm_sort_mat_rows = ZERO;
+        data->timing.init_qeq = ZERO;
         data->timing.cm_solver_comm = ZERO;
         data->timing.cm_solver_allreduce = ZERO;
         data->timing.cm_solver_pre_comp = ZERO;
diff --git a/PuReMD/src/io_tools.c b/PuReMD/src/io_tools.c
index eaa5503cb32c2bd88c82ed1dab1ba8dfd359585b..cf62b27b124b072b1a8a5358fa5e074545fe7ee1 100644
--- a/PuReMD/src/io_tools.c
+++ b/PuReMD/src/io_tools.c
@@ -100,7 +100,7 @@ int Init_Output_Files( reax_system *system, control_params *control,
             out_control->log = sfopen( temp, "w", "Init_Output_Files" );
             fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
                     "step", "total", "comm", "neighbors", "init", "bonded", "nonbonded", 
-                    "CM", "CM Sort", "S iters", "Pre Comp", "Pre App", "S comm", "S allr",
+                    "CM", "QEq Init", "S iters", "Pre Comp", "Pre App", "S comm", "S allr",
                     "S spmv", "S vec ops", "S orthog", "S tsolve" );
             fflush( out_control->log );
 #endif
@@ -1107,7 +1107,7 @@ void Output_Results( reax_system *system, control_params *control,
                     data->timing.bonded * denom,
                     data->timing.nonb * denom,
                     data->timing.cm * denom,
-                    data->timing.cm_sort_mat_rows * denom,
+                    data->timing.init_qeq * denom,
                     (double)( data->timing.cm_solver_iters * denom),
                     data->timing.cm_solver_pre_comp * denom,
                     data->timing.cm_solver_pre_app * denom,
@@ -1126,7 +1126,7 @@ void Output_Results( reax_system *system, control_params *control,
             data->timing.bonded = 0;
             data->timing.nonb = 0;
             data->timing.cm = ZERO;
-            data->timing.cm_sort_mat_rows = ZERO;
+            data->timing.init_qeq = ZERO;
             data->timing.cm_solver_pre_comp = ZERO;
             data->timing.cm_solver_pre_app = ZERO;
             data->timing.cm_solver_comm = ZERO;
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index f22abd889073db65fa51f42e112c2c1e7aa83bf9..2ff188c20b6a9d7431cbb8c9327d7c477b70b845 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -1915,7 +1915,7 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
     MPI_Barrier(mpi_data->world);
 
     //TODO only master node should report it
-    if ( i >= control->cm_solver_max_iters )
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
         fprintf( stderr, "CG convergence failed!\n" );
         return i;
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index 066993d700b08dbb1c6ac0ce67d8f107cc1467f9..29cccf43ca375d07a91f01b28625911b09026a50 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -238,39 +238,42 @@ void Init_MatVec( reax_system *system, simulation_data *data,
     reax_atom *atom;
 
     /*if( (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0 ||
-      workspace->L == NULL ) {
-    //Print_Linear_System( system, control, workspace, data->step );
-    Sort_Matrix_Rows( workspace->H );
-    fprintf( stderr, "H matrix sorted\n" );
-    Calculate_Droptol( workspace->H, workspace->droptol, control->cm_solver_pre_comp_droptol );
-    fprintf( stderr, "drop tolerances calculated\n" );
-    if( workspace->L == NULL ) {
-    fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
-
-    if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin, FULL_MATRIX, comm ) == 0 ||
-    Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin, FULL_MATRIX, comm ) == 0 ) {
-    fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-    MPI_Abort( mpi_data->world, INSUFFICIENT_MEMORY );
-    }
+      workspace->L == NULL ) 
+    {
+        //Print_Linear_System( system, control, workspace, data->step );
+        Sort_Matrix_Rows( workspace->H );
+        fprintf( stderr, "H matrix sorted\n" );
+        Calculate_Droptol( workspace->H, workspace->droptol, control->cm_solver_pre_comp_droptol );
+        fprintf( stderr, "drop tolerances calculated\n" );
+        if( workspace->L == NULL ) 
+        {
+            fillin = Estimate_LU_Fill( workspace->H, workspace->droptol );
 
-    workspace->L->n = workspace->H->n;
-    workspace->U->n = workspace->H->n;
+            if( Allocate_Matrix( &(workspace->L), workspace->H->cap, fillin, FULL_MATRIX, comm ) == 0 ||
+            Allocate_Matrix( &(workspace->U), workspace->H->cap, fillin, FULL_MATRIX, comm ) == 0 ) 
+            {
+                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
+                MPI_Abort( mpi_data->world, INSUFFICIENT_MEMORY );
+            }
+
+            workspace->L->n = workspace->H->n;
+            workspace->U->n = workspace->H->n;
 #if defined(DEBUG_FOCUS)
-fprintf( stderr, "p%d: n=%d, fillin = %d\n",
-system->my_rank, workspace->L->n, fillin );
-fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
-system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
+            fprintf( stderr, "p%d: n=%d, fillin = %d\n",
+            system->my_rank, workspace->L->n, fillin );
+            fprintf( stderr, "p%d: allocated memory: L = U = %ldMB\n",
+            system->my_rank,fillin*sizeof(sparse_matrix_entry)/(1024*1024) );
 #endif
-}
+        }
 
-ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
+        ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U );
 #if defined(DEBUG_FOCUS)
-fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
-    //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-    //Print_Sparse_Matrix2( workspace->L, fname );
-    //Print_Sparse_Matrix( U );
+        fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
+        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        //Print_Sparse_Matrix2( workspace->L, fname );
+        //Print_Sparse_Matrix( U );
 #endif
-}*/
+    }*/
 
     //TODO: fill in code for setting up and computing SAI, see sPuReMD code,
     //  and remove diagonal preconditioner computation below (workspace->Hdia_inv)
@@ -279,35 +282,35 @@ fprintf( stderr, "p%d: icholt finished\n", system->my_rank );
     //            control->cm_solver_pre_comp_sai_thres );
 
     for ( i = 0; i < system->n; ++i )
-{
-    atom = &( system->my_atoms[i] );
-
-    /* init pre-conditioner for H and init solution vectors */
-    workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
-    workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
-    workspace->b_t[i] = -1.0;
-    workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
-    workspace->b[i][1] = -1.0;
-
-    /* linear extrapolation for s and for t */
-    // newQEq: no extrapolation!
-    //workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
-    //workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
-    //workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
-    //workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
-
-    /* quadratic extrapolation for s and t */
-    // workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
-    // workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-    //workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
-    workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
-
-    /* cubic extrapolation for s and t */
-    workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
-    //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
-
-    // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
-}
+    {
+        atom = &( system->my_atoms[i] );
+
+        /* init pre-conditioner for H and init solution vectors */
+        workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta;
+        workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+        workspace->b_t[i] = -1.0;
+        workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+        workspace->b[i][1] = -1.0;
+
+        /* linear extrapolation for s and for t */
+        // newQEq: no extrapolation!
+        //workspace->s[i] = 2 * atom->s[0] - atom->s[1]; //0;
+        //workspace->t[i] = 2 * atom->t[0] - atom->t[1]; //0;
+        //workspace->x[i][0] = 2 * atom->s[0] - atom->s[1]; //0;
+        //workspace->x[i][1] = 2 * atom->t[0] - atom->t[1]; //0;
+
+        /* quadratic extrapolation for s and t */
+        // workspace->s[i] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
+        // workspace->t[i] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+        //workspace->x[i][0] = atom->s[2] + 3 * ( atom->s[0] - atom->s[1] );
+        workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+
+        /* cubic extrapolation for s and t */
+        workspace->x[i][0] = 4 * (atom->s[0] + atom->s[2]) - (6 * atom->s[1] + atom->s[3]);
+        //workspace->x[i][1] = 4*(atom->t[0]+atom->t[2])-(6*atom->t[1]+atom->t[3]);
+
+        // fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
+    }
 }
 
 
@@ -324,6 +327,9 @@ void Calculate_Charges( reax_system *system, storage *workspace,
     scale = sizeof(real) / sizeof(void);
     q = (real*) malloc(system->N * sizeof(real));
 
+    //s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world);
+    q = (real*) malloc(system->N * sizeof(real));
+
     //s_sum = Parallel_Vector_Acc(workspace->s, system->n, mpi_data->world);
     //t_sum = Parallel_Vector_Acc(workspace->t, system->n, mpi_data->world);
     my_sum[0] = my_sum[1] = 0;
@@ -346,8 +352,7 @@ void Calculate_Charges( reax_system *system, storage *workspace,
         /* backup s & t */
         atom->s[3] = atom->s[2];
         atom->s[2] = atom->s[1];
-        atom->s[1] = atom->s[0];
-        //atom->s[0] = workspace->s[i];
+
         atom->s[0] = workspace->x[i][0];
 
         atom->t[3] = atom->t[2];
@@ -384,7 +389,7 @@ static void Setup_Preconditioner_QEq( reax_system *system, control_params *contr
 
     if( system->my_rank == MASTER_NODE )
     {
-        data->timing.cm_sort_mat_rows += total_sort / control->nprocs;
+        data->timing.init_qeq += total_sort / control->nprocs;
         data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
     }
 }
@@ -427,8 +432,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     {
         if( control->cm_solver_pre_comp_refactor > 0 && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0))
         {
-            Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
-
+            Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
             Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
         }
     }
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index c711320303b11c50a1713b446eb1ffaaea189683..644fceae9cca7cdf117b4b6292a77800bd710868 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -792,8 +792,8 @@ typedef struct
     real nonb;
     /* atomic charge distribution calculation time */
     real cm;
-    /**/
-    real cm_sort_mat_rows;
+    /* matrix initiation and sort time */
+    real init_qeq;
     /**/
     real cm_solver_comm;
     /**/