From 99c6a251d1eb12931525ce7e6177befbca87dd64 Mon Sep 17 00:00:00 2001
From: Abdullah Alperen <alperena@msu.edu>
Date: Wed, 19 Sep 2018 22:16:53 -0400
Subject: [PATCH] PuReMD: cleand code and add timing performance

---
 PuReMD/src/control.c        |  11 +
 PuReMD/src/forces.c         |  11 +-
 PuReMD/src/init_md.c        | 338 +++++++-------
 PuReMD/src/io_tools.c       | 877 ++++++++++++++++++------------------
 PuReMD/src/linear_solvers.c | 353 +++++++++------
 PuReMD/src/linear_solvers.h |   8 +-
 PuReMD/src/parallelreax.c   |   4 +-
 PuReMD/src/qEq.c            |  63 +--
 PuReMD/src/reax_types.h     |  53 ++-
 PuReMD/src/reset_tools.c    |   5 +-
 10 files changed, 963 insertions(+), 760 deletions(-)

diff --git a/PuReMD/src/control.c b/PuReMD/src/control.c
index cbdd4147..f0f6ea0c 100644
--- a/PuReMD/src/control.c
+++ b/PuReMD/src/control.c
@@ -74,6 +74,7 @@ char Read_Control_File( char *control_file, control_params* control,
     control->tabulate = 0;
 
     control->qeq_freq = 1;
+    control->cm_solver_max_iters = 100;
     control->q_err = 1e-6;
     control->sai_thres = 0.1;
     control->refactor = 100;
@@ -246,11 +247,21 @@ char Read_Control_File( char *control_file, control_params* control,
             ival = atoi( tmp[1] );
             control->qeq_freq = ival;
         }
+        else if ( strcmp(tmp[0], "cm_solver_max_iters") == 0 )
+        {
+            ival = atoi( tmp[1] );
+            control->cm_solver_max_iters = ival;
+        }
         else if ( strcmp(tmp[0], "q_err") == 0 )
         {
             val = atof( tmp[1] );
             control->q_err = val;
         }
+        else if ( strcmp(tmp[0], "cm_solver_pre_comp_type") == 0 )
+        {
+            val = atoi( tmp[1] );
+            control->cm_solver_pre_comp_type = val;
+        }
         else if ( strcmp(tmp[0], "sai_thres") == 0 )
         {
             val = atof( tmp[1] );
diff --git a/PuReMD/src/forces.c b/PuReMD/src/forces.c
index d157eb6f..bbbf110a 100644
--- a/PuReMD/src/forces.c
+++ b/PuReMD/src/forces.c
@@ -968,7 +968,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     MPI_Comm comm;
     int qeq_flag;
 #if defined(LOG_PERFORMANCE)
-    real t_start = 0;
+    real t_start = 0, t_elapsed;
 
     //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
@@ -1005,7 +1005,9 @@ void Compute_Forces( reax_system *system, control_params *control,
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.bonded) );
+    {
+        t_start = Get_Time( );
+    }
 #endif
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d: completed bonded\n",
@@ -1022,7 +1024,10 @@ void Compute_Forces( reax_system *system, control_params *control,
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
-        Update_Timing_Info( &t_start, &(data->timing.qEq) );
+    {
+        t_elapsed = Get_Timing_Info( t_start );
+        data->timing.cm += t_elapsed;
+    }
 #endif
 #if defined(DEBUG_FOCUS)
     fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index 0720b0f7..0d9bcc40 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -54,8 +54,8 @@
 #if defined(PURE_REAX)
 /************************ initialize system ************************/
 int Reposition_Atoms( reax_system *system, control_params *control,
-                      simulation_data *data, mpi_datatypes *mpi_data,
-                      char *msg )
+        simulation_data *data, mpi_datatypes *mpi_data,
+        char *msg )
 {
     int   i;
     rvec  dx;
@@ -130,8 +130,8 @@ void Generate_Initial_Velocities( reax_system *system, real T )
 
 
 int Init_System( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, char *msg )
 {
     int i;
     reax_atom *atom;
@@ -152,7 +152,7 @@ int Init_System( reax_system *system, control_params *control,
     for ( i = 0; i < MAX_NBRS; ++i ) nrecv[i] = 0;
     system->max_recved = 0;
     system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-                          Estimate_Boundary_Atoms, Unpack_Estimate_Message, 1 );
+            Estimate_Boundary_Atoms, Unpack_Estimate_Message, 1 );
     system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
     Bin_Boundary_Atoms( system );
 
@@ -177,11 +177,11 @@ int Init_System( reax_system *system, control_params *control,
     //Allocate_System( system, system->local_cap, system->total_cap, msg );
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
+            system->my_rank, system->n, system->local_cap );
     fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
+            system->my_rank, system->N, system->total_cap );
     fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
+            system->my_rank, system->numH, system->Hcap );
     MPI_Barrier( mpi_data->world );
 #endif
 
@@ -198,8 +198,8 @@ int Init_System( reax_system *system, control_params *control,
 
 /************************ initialize simulation data ************************/
 int Init_Simulation_Data( reax_system *system, control_params *control,
-                          simulation_data *data, mpi_datatypes *mpi_data,
-                          char *msg )
+        simulation_data *data, mpi_datatypes *mpi_data,
+        char *msg )
 {
     Reset_Simulation_Data( data, control->virial );
 
@@ -212,65 +212,65 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
 
     switch ( control->ensemble )
     {
-    case NVE:
-        data->N_f = 3 * system->bigN;
-        Evolve = Velocity_Verlet_NVE;
-        break;
-
-    case bNVT:
-        data->N_f = 3 * system->bigN + 1;
-        Evolve = Velocity_Verlet_Berendsen_NVT;
-        break;
-
-    case nhNVT:
-        fprintf( stderr, "WARNING: Nose-Hoover NVT is still under testing.\n" );
-        //return FAILURE;
-        data->N_f = 3 * system->bigN + 1;
-        Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-        if ( !control->restart || (control->restart && control->random_vel) )
-        {
-            data->therm.G_xi = control->Tau_T *
-                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
-            data->therm.v_xi = data->therm.G_xi * control->dt;
-            data->therm.v_xi_old = 0;
-            data->therm.xi = 0;
-        }
-        break;
-
-    case sNPT: /* Semi-Isotropic NPT */
-        data->N_f = 3 * system->bigN + 4;
-        Evolve = Velocity_Verlet_Berendsen_NPT;
-        if ( !control->restart )
-            Reset_Pressures( data );
-        break;
-
-    case iNPT: /* Isotropic NPT */
-        data->N_f = 3 * system->bigN + 2;
-        Evolve = Velocity_Verlet_Berendsen_NPT;
-        if ( !control->restart )
-            Reset_Pressures( data );
-        break;
-
-    case NPT: /* Anisotropic NPT */
-        strcpy( msg, "init_simulation_data: option not yet implemented" );
-        return FAILURE;
-
-        data->N_f = 3 * system->bigN + 9;
-        Evolve = Velocity_Verlet_Berendsen_NPT;
-        /*if( !control->restart ) {
-          data->therm.G_xi = control->Tau_T *
-          (2.0 * data->my_en.e_Kin - data->N_f * K_B * control->T );
-          data->therm.v_xi = data->therm.G_xi * control->dt;
-          data->iso_bar.eps = 0.33333 * log(system->box.volume);
-          data->inv_W = 1.0 /
-          ( data->N_f * K_B * control->T * SQR(control->Tau_P) );
-          Compute_Pressure( system, control, data, out_control );
-          }*/
-        break;
-
-    default:
-        strcpy( msg, "init_simulation_data: ensemble not recognized" );
-        return FAILURE;
+        case NVE:
+            data->N_f = 3 * system->bigN;
+            Evolve = Velocity_Verlet_NVE;
+            break;
+
+        case bNVT:
+            data->N_f = 3 * system->bigN + 1;
+            Evolve = Velocity_Verlet_Berendsen_NVT;
+            break;
+
+        case nhNVT:
+            fprintf( stderr, "WARNING: Nose-Hoover NVT is still under testing.\n" );
+            //return FAILURE;
+            data->N_f = 3 * system->bigN + 1;
+            Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+            if ( !control->restart || (control->restart && control->random_vel) )
+            {
+                data->therm.G_xi = control->Tau_T *
+                    (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->therm.v_xi_old = 0;
+                data->therm.xi = 0;
+            }
+            break;
+
+        case sNPT: /* Semi-Isotropic NPT */
+            data->N_f = 3 * system->bigN + 4;
+            Evolve = Velocity_Verlet_Berendsen_NPT;
+            if ( !control->restart )
+                Reset_Pressures( data );
+            break;
+
+        case iNPT: /* Isotropic NPT */
+            data->N_f = 3 * system->bigN + 2;
+            Evolve = Velocity_Verlet_Berendsen_NPT;
+            if ( !control->restart )
+                Reset_Pressures( data );
+            break;
+
+        case NPT: /* Anisotropic NPT */
+            strcpy( msg, "init_simulation_data: option not yet implemented" );
+            return FAILURE;
+
+            data->N_f = 3 * system->bigN + 9;
+            Evolve = Velocity_Verlet_Berendsen_NPT;
+            /*if( !control->restart ) {
+              data->therm.G_xi = control->Tau_T *
+              (2.0 * data->my_en.e_Kin - data->N_f * K_B * control->T );
+              data->therm.v_xi = data->therm.G_xi * control->dt;
+              data->iso_bar.eps = 0.33333 * log(system->box.volume);
+              data->inv_W = 1.0 /
+              ( data->N_f * K_B * control->T * SQR(control->Tau_P) );
+              Compute_Pressure( system, control, data, out_control );
+              }*/
+            break;
+
+        default:
+            strcpy( msg, "init_simulation_data: ensemble not recognized" );
+            return FAILURE;
     }
 
     /* initialize the timer(s) */
@@ -279,7 +279,24 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
     {
         data->timing.start = Get_Time( );
 #if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
+        //Reset_Timing( &data->timing );
+        /* init timing info */
+        data->timing.total = data->timing.start;
+        data->timing.comm = ZERO;
+        data->timing.nbrs = 0;
+        data->timing.init_forces = 0;
+        data->timing.bonded = 0;
+        data->timing.nonb = 0;
+        data->timing.cm = ZERO;
+        data->timing.cm_sort_mat_rows = ZERO;
+        data->timing.cm_solver_comm = ZERO;
+        data->timing.cm_solver_pre_comp = ZERO;
+        data->timing.cm_solver_pre_app = ZERO;
+        data->timing.cm_solver_iters = 0;
+        data->timing.cm_solver_spmv = ZERO;
+        data->timing.cm_solver_vector_ops = ZERO;
+        data->timing.cm_solver_orthog = ZERO;
+        data->timing.cm_solver_tri_solve = ZERO;
 #endif
     }
 
@@ -314,11 +331,11 @@ int Init_System( reax_system *system, control_params *control, char *msg )
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
+            system->my_rank, system->n, system->local_cap );
     fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
+            system->my_rank, system->N, system->total_cap );
     fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
+            system->my_rank, system->numH, system->Hcap );
 #endif
 
     return SUCCESS;
@@ -326,7 +343,7 @@ int Init_System( reax_system *system, control_params *control, char *msg )
 
 
 int Init_Simulation_Data( reax_system *system, control_params *control,
-                          simulation_data *data, char *msg )
+        simulation_data *data, char *msg )
 {
     Reset_Simulation_Data( data, control->virial );
 
@@ -335,7 +352,24 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
     {
         data->timing.start = Get_Time( );
 #if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
+        //Reset_Timing( &data->timing );
+        /* init timing info */
+        data->timing.total = data->timing.start;
+        data->timing.comm = ZERO;
+        data->timing.nbrs = 0;
+        data->timing.init_forces = 0;
+        data->timing.bonded = 0;
+        data->timing.nonb = 0;
+        data->timing.cm = ZERO;
+        data->timing.cm_sort_mat_rows = ZERO;
+        data->timing.cm_solver_comm = ZERO;
+        data->timing.cm_solver_pre_comp = ZERO;
+        data->timing.cm_solver_pre_app = ZERO;
+        data->timing.cm_solver_iters = 0;
+        data->timing.cm_solver_spmv = ZERO;
+        data->timing.cm_solver_vector_ops = ZERO;
+        data->timing.cm_solver_orthog = ZERO;
+        data->timing.cm_solver_tri_solve = ZERO;
 #endif
     }
 
@@ -385,17 +419,17 @@ void Init_Taper( control_params *control,  storage *workspace, MPI_Comm comm )
     workspace->Tap[2] = -210.0 * (swa3 * swb2 + swa2 * swb3) / d7;
     workspace->Tap[1] = 140.0 * swa3 * swb3 / d7;
     workspace->Tap[0] = (-35.0 * swa3 * swb2 * swb2 + 21.0 * swa2 * swb3 * swb2 +
-                         7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
+            7.0 * swa * swb3 * swb3 + swb3 * swb3 * swb ) / d7;
 }
 
 
 int Init_Workspace( reax_system *system, control_params *control,
-                    storage *workspace, MPI_Comm comm, char *msg )
+        storage *workspace, MPI_Comm comm, char *msg )
 {
     int ret;
 
     ret = Allocate_Workspace( system, control, workspace,
-                              system->local_cap, system->total_cap, comm, msg );
+            system->local_cap, system->total_cap, comm, msg );
     if ( ret != SUCCESS )
         return ret;
 
@@ -411,7 +445,7 @@ int Init_Workspace( reax_system *system, control_params *control,
 
 /************** setup communication data structures  **************/
 int Init_MPI_Datatypes( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data, MPI_Comm comm, char *msg )
+        mpi_datatypes *mpi_data, MPI_Comm comm, char *msg )
 {
 #if defined(PURE_REAX)
     int           i, block[11];
@@ -434,7 +468,7 @@ int Init_MPI_Datatypes( reax_system *system, storage *workspace,
     mpi_data->in2_buffer = NULL;
 
     /* mpi_atom - [orig_id, imprt_id, type, num_bonds, num_hbonds, name,
-                   x, v, f_old, s, t] */
+       x, v, f_old, s, t] */
     block[0] = block[1] = block[2] = block[3] = block[4] = 1;
     block[5] = 8;
     block[6] = block[7] = block[8] = 3;
@@ -529,8 +563,8 @@ int Init_MPI_Datatypes( reax_system *system, storage *workspace,
 /********************** allocate lists *************************/
 #if defined(PURE_REAX)
 int  Init_Lists( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
 {
     int i, num_nbrs;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
@@ -539,7 +573,7 @@ int  Init_Lists( reax_system *system, control_params *control,
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: before est_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     comm = mpi_data->world;
@@ -550,7 +584,7 @@ int  Init_Lists( reax_system *system, control_params *control,
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: after est_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     if ( !Make_List( system->total_cap, num_nbrs, TYP_FAR_NEIGHBOR,
@@ -561,20 +595,20 @@ int  Init_Lists( reax_system *system, control_params *control,
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
-             system->my_rank, num_nbrs,
-             (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
+            system->my_rank, num_nbrs,
+            (int)(num_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
 #endif
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: before gen_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     Generate_Neighbor_Lists( system, data, workspace, lists );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: after gen_nbrs - local_cap=%d, total_cap=%d\n",
-             system->my_rank, system->local_cap, system->total_cap );
+            system->my_rank, system->local_cap, system->total_cap );
 #endif
 
     bond_top = (int*) calloc( system->total_cap, sizeof(int) );
@@ -582,23 +616,23 @@ int  Init_Lists( reax_system *system, control_params *control,
     //bond_top = (int*) malloc( system->total_cap * sizeof(int) );
     //hb_top = (int*) malloc( system->local_cap * sizeof(int) );
     Estimate_Storages( system, control, lists,
-                       &Htop, hb_top, bond_top, &num_3body, comm );
+            &Htop, hb_top, bond_top, &num_3body, comm );
 
     Allocate_Matrix( &(workspace->H), system->local_cap, Htop, comm );
     workspace->L = NULL;
     workspace->U = NULL;
-    
+
     //TODO: uncomment for SAI
-//    Allocate_Matrix2( &(workspace->H_spar_patt), workspace->H->n, system->local_cap, workspace->H->m, comm );
-//    Allocate_Matrix( &(workspace->H_spar_patt_full), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
-//    Allocate_Matrix2( &(workspace->H_app_inv), workspace->H->n, system->local_cap, workspace->H->m, comm );
+    //    Allocate_Matrix2( &(workspace->H_spar_patt), workspace->H->n, system->local_cap, workspace->H->m, comm );
+    //    Allocate_Matrix( &(workspace->H_spar_patt_full), workspace->H->n, 2 * workspace->H->m - workspace->H->n );
+    //    Allocate_Matrix2( &(workspace->H_app_inv), workspace->H->n, system->local_cap, workspace->H->m, comm );
     workspace->H_spar_patt = NULL;
     workspace->H_app_inv = NULL;
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
-             system->my_rank, Htop,
-             (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+            system->my_rank, Htop,
+            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
 #endif
 
     if ( control->hbond_cut > 0 )
@@ -613,15 +647,15 @@ int  Init_Lists( reax_system *system, control_params *control,
         total_hbonds = MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS );
 
         if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND,
-                         lists[HBONDS], comm ) )
+                    lists[HBONDS], comm ) )
         {
             fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
             MPI_Abort( comm, INSUFFICIENT_MEMORY );
         }
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                 system->my_rank, total_hbonds,
-                 (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+                system->my_rank, total_hbonds,
+                (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
 #endif
     }
 
@@ -637,41 +671,41 @@ int  Init_Lists( reax_system *system, control_params *control,
     bond_cap = MAX( total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS );
 
     if ( !Make_List( system->total_cap, bond_cap, TYP_BOND,
-                     lists[BONDS], comm ) )
+                lists[BONDS], comm ) )
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-             system->my_rank, bond_cap,
-             (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
+            system->my_rank, bond_cap,
+            (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
 #endif
 
     /* 3bodies list */
     cap_3body = MAX( num_3body * SAFE_ZONE, MIN_3BODIES );
     if ( !Make_List( bond_cap, cap_3body, TYP_THREE_BODY,
-                     lists[THREE_BODIES], comm ) )
+                lists[THREE_BODIES], comm ) )
     {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated 3-body list: num_3body=%d, space=%dMB\n",
-             system->my_rank, cap_3body,
-             (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
+            system->my_rank, cap_3body,
+            (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
 #endif
 
 #if defined(TEST_FORCES)
     if ( !Make_List( system->total_cap, bond_cap * 8, TYP_DDELTA,
-                     lists[DDELTAS], comm ) )
+                lists[DDELTAS], comm ) )
     {
         fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dDelta list: num_ddelta=%d space=%ldMB\n",
-             system->my_rank, bond_cap * 30,
-             bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * 30,
+            bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
 
     if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, lists[DBOS], comm ) )
     {
@@ -679,8 +713,8 @@ int  Init_Lists( reax_system *system, control_params *control,
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dbond list: num_dbonds=%d space=%ldMB\n",
-             system->my_rank, bond_cap * MAX_BONDS * 3,
-             bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * MAX_BONDS * 3,
+            bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
     sfree( hb_top, "hb_top" );
@@ -690,8 +724,8 @@ int  Init_Lists( reax_system *system, control_params *control,
 }
 #elif defined(LAMMPS_REAX)
 int  Init_Lists( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace, reax_list **lists,
-                 mpi_datatypes *mpi_data, char *msg )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
 {
     int i, num_nbrs;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
@@ -703,7 +737,7 @@ int  Init_Lists( reax_system *system, control_params *control,
     bond_top = (int*) calloc( system->total_cap, sizeof(int) );
     hb_top = (int*) calloc( system->local_cap, sizeof(int) );
     Estimate_Storages( system, control, lists,
-                       &Htop, hb_top, bond_top, &num_3body, comm );
+            &Htop, hb_top, bond_top, &num_3body, comm );
 
     if ( control->hbond_cut > 0 )
     {
@@ -717,15 +751,15 @@ int  Init_Lists( reax_system *system, control_params *control,
         total_hbonds = (int)(MAX( total_hbonds * SAFER_ZONE, MIN_CAP * MIN_HBONDS ));
 
         if ( !Make_List( system->Hcap, total_hbonds, TYP_HBOND,
-                         lists[HBONDS], comm ) )
+                    lists[HBONDS], comm ) )
         {
             fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
             MPI_Abort( comm, INSUFFICIENT_MEMORY );
         }
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                 system->my_rank, total_hbonds,
-                 (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+                system->my_rank, total_hbonds,
+                (int)(total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
 #endif
     }
 
@@ -741,41 +775,41 @@ int  Init_Lists( reax_system *system, control_params *control,
     bond_cap = (int)(MAX( total_bonds * SAFE_ZONE, MIN_CAP * MIN_BONDS ));
 
     if ( !Make_List( system->total_cap, bond_cap, TYP_BOND,
-                     lists[BONDS], comm ) )
+                lists[BONDS], comm ) )
     {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-             system->my_rank, bond_cap,
-             (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
+            system->my_rank, bond_cap,
+            (int)(bond_cap * sizeof(bond_data) / (1024 * 1024)) );
 #endif
 
     /* 3bodies list */
     cap_3body = (int)(MAX( num_3body * SAFE_ZONE, MIN_3BODIES ));
     if ( !Make_List( bond_cap, cap_3body, TYP_THREE_BODY,
-                     lists[THREE_BODIES], comm ) )
+                lists[THREE_BODIES], comm ) )
     {
         fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: allocated 3-body list: num_3body=%d, space=%dMB\n",
-             system->my_rank, cap_3body,
-             (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
+            system->my_rank, cap_3body,
+            (int)(cap_3body * sizeof(three_body_interaction_data) / (1024 * 1024)) );
 #endif
 
 #if defined(TEST_FORCES)
     if ( !Make_List( system->total_cap, bond_cap * 8, TYP_DDELTA,
-                     lists[DDELTAS], comm ) )
+                lists[DDELTAS], comm ) )
     {
         fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dDelta list: num_ddelta=%d space=%ldMB\n",
-             system->my_rank, bond_cap * 30,
-             bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * 30,
+            bond_cap * 8 * sizeof(dDelta_data) / (1024 * 1024) );
 
     if ( !Make_List( bond_cap, bond_cap * 50, TYP_DBO, lists[DBOS], comm ) )
     {
@@ -783,8 +817,8 @@ int  Init_Lists( reax_system *system, control_params *control,
         MPI_Abort( comm, INSUFFICIENT_MEMORY );
     }
     fprintf( stderr, "p%d: allocated dbond list: num_dbonds=%d space=%ldMB\n",
-             system->my_rank, bond_cap * MAX_BONDS * 3,
-             bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
+            system->my_rank, bond_cap * MAX_BONDS * 3,
+            bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) );
 #endif
 
     sfree( hb_top, "hb_top" );
@@ -798,9 +832,9 @@ int  Init_Lists( reax_system *system, control_params *control,
 
 #if defined(PURE_REAX)
 void Initialize( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control,
-                 mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     char msg[MAX_STR];
 
@@ -808,9 +842,9 @@ void Initialize( reax_system *system, control_params *control,
             FAILURE )
     {
         fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -821,7 +855,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -832,7 +866,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -843,9 +877,9 @@ void Initialize( reax_system *system, control_params *control,
             FAILURE )
     {
         fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -857,7 +891,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -868,7 +902,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -881,7 +915,7 @@ void Initialize( reax_system *system, control_params *control,
         {
             fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
             fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
+                    system->my_rank );
             MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
         }
 #if defined(DEBUG)
@@ -896,14 +930,14 @@ void Initialize( reax_system *system, control_params *control,
     /*#ifdef TEST_FORCES
       Init_Force_Test_Functions();
       fprintf(stderr,"p%d: initialized force test functions\n",system->my_rank);
-      #endif */
+#endif */
 }
 
 #elif defined(LAMMPS_REAX)
 void Initialize( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control,
-                 mpi_datatypes *mpi_data, MPI_Comm comm )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data, MPI_Comm comm )
 {
     char msg[MAX_STR];
 
@@ -911,9 +945,9 @@ void Initialize( reax_system *system, control_params *control,
     if ( Init_MPI_Datatypes(system, workspace, mpi_data, comm, msg) == FAILURE )
     {
         fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -924,7 +958,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -935,7 +969,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -946,9 +980,9 @@ void Initialize( reax_system *system, control_params *control,
             FAILURE )
     {
         fprintf( stderr, "p%d:init_workspace: not enough memory\n",
-                 system->my_rank );
+                system->my_rank );
         fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -960,7 +994,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -971,7 +1005,7 @@ void Initialize( reax_system *system, control_params *control,
     {
         fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
         fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
+                system->my_rank );
         MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
     }
 #if defined(DEBUG)
@@ -984,7 +1018,7 @@ void Initialize( reax_system *system, control_params *control,
         {
             fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
             fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
+                    system->my_rank );
             MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
         }
 #if defined(DEBUG)
@@ -1001,5 +1035,5 @@ void Initialize( reax_system *system, control_params *control,
       Init_Force_Test_Functions();
       fprintf(stderr,"p%d: initialized force test functions\n",system->my_rank);
 #endif*/
-    }
+}
 #endif
diff --git a/PuReMD/src/io_tools.c b/PuReMD/src/io_tools.c
index 560bb5a1..122b4e4e 100644
--- a/PuReMD/src/io_tools.c
+++ b/PuReMD/src/io_tools.c
@@ -44,8 +44,8 @@ print_interaction Print_Interactions[NUM_INTRS];
 
 /************************ initialize output controls ************************/
 int Init_Output_Files( reax_system *system, control_params *control,
-                       output_controls *out_control, mpi_datatypes *mpi_data,
-                       char *msg )
+        output_controls *out_control, mpi_datatypes *mpi_data,
+        char *msg )
 {
     char temp[MAX_STR];
     int ret;
@@ -68,12 +68,12 @@ int Init_Output_Files( reax_system *system, control_params *control,
             {
 #if !defined(DEBUG) && !defined(DEBUG_FOCUS)
                 fprintf( out_control->out, "%-6s%14s%14s%14s%11s%13s%13s\n",
-                         "step", "total energy", "potential", "kinetic",
-                         "T(K)", "V(A^3)", "P(Gpa)" );
+                        "step", "total energy", "potential", "kinetic",
+                        "T(K)", "V(A^3)", "P(Gpa)" );
 #else
                 fprintf( out_control->out, "%-6s%24s%24s%24s%13s%16s%13s\n",
-                         "step", "total energy", "potential", "kinetic",
-                         "T(K)", "V(A^3)", "P(GPa)" );
+                        "step", "total energy", "potential", "kinetic",
+                        "T(K)", "V(A^3)", "P(GPa)" );
 #endif
                 fflush( out_control->out );
             }
@@ -89,16 +89,16 @@ int Init_Output_Files( reax_system *system, control_params *control,
             {
 #if !defined(DEBUG) && !defined(DEBUG_FOCUS)
                 fprintf( out_control->pot,
-                         "%-6s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s\n",
-                         "step", "ebond", "eatom", "elp",
-                         "eang", "ecoa", "ehb", "etor", "econj",
-                         "evdw", "ecoul", "epol" );
+                        "%-6s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s%14s\n",
+                        "step", "ebond", "eatom", "elp",
+                        "eang", "ecoa", "ehb", "etor", "econj",
+                        "evdw", "ecoul", "epol" );
 #else
                 fprintf( out_control->pot,
-                         "%-6s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s\n",
-                         "step", "ebond", "eatom", "elp",
-                         "eang", "ecoa", "ehb", "etor", "econj",
-                         "evdw", "ecoul", "epol" );
+                        "%-6s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s%24s\n",
+                        "step", "ebond", "eatom", "elp",
+                        "eang", "ecoa", "ehb", "etor", "econj",
+                        "evdw", "ecoul", "epol" );
 #endif
                 fflush( out_control->pot );
             }
@@ -113,9 +113,10 @@ int Init_Output_Files( reax_system *system, control_params *control,
             sprintf( temp, "%s.log", control->sim_name );
             if ( (out_control->log = fopen( temp, "w" )) != NULL )
             {
-                fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s\n",
-                         "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
-                         "qeq", "matvecs" );
+                fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+                        "step", "total", "comm", "neighbors", "init", "bonded",
+                        "nonbonded", "CM", "CM Sort", "S iters", "S comm", "Pre Comp", "Pre App",
+                        "S spmv", "S vec ops", "S orthog", "S tsolve" );
                 fflush( out_control->log );
             }
             else
@@ -153,7 +154,7 @@ int Init_Output_Files( reax_system *system, control_params *control,
             if ( (out_control->dpl = fopen( temp, "w" )) != NULL )
             {
                 fprintf( out_control->dpl, "%6s%20s%30s",
-                         "step", "molecule count", "avg dipole moment norm" );
+                        "step", "molecule count", "avg dipole moment norm" );
                 fflush( out_control->dpl );
             }
             else
@@ -170,7 +171,7 @@ int Init_Output_Files( reax_system *system, control_params *control,
             if ( (out_control->drft = fopen( temp, "w" )) != NULL )
             {
                 fprintf( out_control->drft, "%7s%20s%20s\n",
-                         "step", "type count", "avg disp^2" );
+                        "step", "type count", "avg disp^2" );
                 fflush( out_control->drft );
             }
             else
@@ -188,11 +189,11 @@ int Init_Output_Files( reax_system *system, control_params *control,
        fashion controlled by their rank */
     /*if( control->molecular_analysis ) {
       if( system->my_rank == MASTER_NODE ) {
-        sprintf( temp, "%s.mol", control->sim_name );
-        if( (out_control->mol = fopen( temp, "w" )) == NULL ) {
-    strcpy(msg,"init_out_controls: .mol file could not be opened\n");
-    return FAILURE;
-        }
+      sprintf( temp, "%s.mol", control->sim_name );
+      if( (out_control->mol = fopen( temp, "w" )) == NULL ) {
+      strcpy(msg,"init_out_controls: .mol file could not be opened\n");
+      return FAILURE;
+      }
       }
 
       MPI_Bcast( &(out_control->mol), 1, MPI_LONG, 0, MPI_COMM_WORLD );
@@ -466,7 +467,7 @@ int Init_Output_Files( reax_system *system, control_params *control,
 
 /************************ close output files ************************/
 int Close_Output_Files( reax_system *system, control_params *control,
-                        output_controls *out_control, mpi_datatypes *mpi_data )
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     if ( out_control->write_steps > 0 )
         End_Traj( system->my_rank, out_control );
@@ -548,11 +549,11 @@ void Print_Box( simulation_box* box, char *name, FILE *out )
 
     fprintf( out, "%s:\n", name );
     fprintf( out, "\tmin[%8.3f %8.3f %8.3f]\n",
-             box->min[0], box->min[1], box->min[2] );
+            box->min[0], box->min[1], box->min[2] );
     fprintf( out, "\tmax[%8.3f %8.3f %8.3f]\n",
-             box->max[0], box->max[1], box->max[2] );
+            box->max[0], box->max[1], box->max[2] );
     fprintf( out, "\tdims[%8.3f%8.3f%8.3f]\n",
-             box->box_norms[0], box->box_norms[1], box->box_norms[2] );
+            box->box_norms[0], box->box_norms[1], box->box_norms[2] );
 
     // fprintf( out, "box: {" );
     // for( i = 0; i < 3; ++i )
@@ -598,34 +599,34 @@ void Print_Grid( grid* g, FILE *out )
     };
 
     fprintf( out, "\tnumber of grid cells: %d %d %d\n",
-             g->ncells[0], g->ncells[1], g->ncells[2] );
+            g->ncells[0], g->ncells[1], g->ncells[2] );
     fprintf( out, "\tgcell lengths: %8.3f %8.3f %8.3f\n",
-             g->cell_len[0], g->cell_len[1], g->cell_len[2] );
+            g->cell_len[0], g->cell_len[1], g->cell_len[2] );
     fprintf( out, "\tinverses of gcell lengths: %8.3f %8.3f %8.3f\n",
-             g->inv_len[0], g->inv_len[1], g->inv_len[2] );
+            g->inv_len[0], g->inv_len[1], g->inv_len[2] );
     fprintf( out, "\t---------------------------------\n" );
     fprintf( out, "\tnumber of native gcells: %d %d %d\n",
-             g->native_cells[0], g->native_cells[1], g->native_cells[2] );
+            g->native_cells[0], g->native_cells[1], g->native_cells[2] );
     fprintf( out, "\tnative gcell span: %d-%d  %d-%d  %d-%d\n",
-             g->native_str[0], g->native_end[0],
-             g->native_str[1], g->native_end[1],
-             g->native_str[2], g->native_end[2] );
+            g->native_str[0], g->native_end[0],
+            g->native_str[1], g->native_end[1],
+            g->native_str[2], g->native_end[2] );
     fprintf( out, "\t---------------------------------\n" );
     fprintf( out, "\tvlist gcell stretch: %d %d %d\n",
-             g->vlist_span[0], g->vlist_span[1], g->vlist_span[2] );
+            g->vlist_span[0], g->vlist_span[1], g->vlist_span[2] );
     fprintf( out, "\tnonbonded nbrs gcell stretch: %d %d %d\n",
-             g->nonb_span[0], g->nonb_span[1], g->nonb_span[2] );
+            g->nonb_span[0], g->nonb_span[1], g->nonb_span[2] );
     fprintf( out, "\tbonded nbrs gcell stretch: %d %d %d\n",
-             g->bond_span[0], g->bond_span[1], g->bond_span[2] );
+            g->bond_span[0], g->bond_span[1], g->bond_span[2] );
     fprintf( out, "\t---------------------------------\n" );
     fprintf( out, "\tghost gcell span: %d %d %d\n",
-             g->ghost_span[0], g->ghost_span[1], g->ghost_span[2] );
+            g->ghost_span[0], g->ghost_span[1], g->ghost_span[2] );
     fprintf( out, "\tnonbonded ghost gcell span: %d %d %d\n",
-             g->ghost_nonb_span[0], g->ghost_nonb_span[1], g->ghost_nonb_span[2]);
+            g->ghost_nonb_span[0], g->ghost_nonb_span[1], g->ghost_nonb_span[2]);
     fprintf(out, "\thbonded ghost gcell span: %d %d %d\n",
             g->ghost_hbond_span[0], g->ghost_hbond_span[1], g->ghost_hbond_span[2]);
     fprintf( out, "\tbonded ghost gcell span: %d %d %d\n",
-             g->ghost_bond_span[0], g->ghost_bond_span[1], g->ghost_bond_span[2]);
+            g->ghost_bond_span[0], g->ghost_bond_span[1], g->ghost_bond_span[2]);
     //fprintf(out, "\t---------------------------------\n" );
     //fprintf(out, "\tmax number of gcells at the boundary: %d\n", g->gcell_cap);
     fprintf( out, "\t---------------------------------\n" );
@@ -641,17 +642,17 @@ void Print_Grid( grid* g, FILE *out )
                 if ( g->cells[x][y][z].type != gc_type )
                 {
                     fprintf( stderr,
-                             "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
-                             gc_str[0], gc_str[1], gc_str[2], x, y, z,
-                             gc_type, gcell_type_text[gc_type] );
+                            "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
+                            gc_str[0], gc_str[1], gc_str[2], x, y, z,
+                            gc_type, gcell_type_text[gc_type] );
                     gc_type = g->cells[x][y][z].type;
                     gc_str[0] = x;
                     gc_str[1] = y;
                     gc_str[2] = z;
                 }
     fprintf( stderr, "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n",
-             gc_str[0], gc_str[1], gc_str[2], x, y, z,
-             gc_type, gcell_type_text[gc_type] );
+            gc_str[0], gc_str[1], gc_str[2], x, y, z,
+            gc_type, gcell_type_text[gc_type] );
     fprintf( out, "-------------------------------------\n" );
 }
 
@@ -678,21 +679,21 @@ void Print_GCell_Exchange_Bounds( int my_rank, neighbor_proc *my_nbrs )
                     nbr_pr = &(my_nbrs[nbr]);
 
                     fprintf( f, "p%-2d GCELL BOUNDARIES with r(%2d %2d %2d):\n",
-                             my_rank, r[0], r[1], r[2] );
+                            my_rank, r[0], r[1], r[2] );
 
                     fprintf( f, "\tsend_type %s: send(%d %d %d) to (%d %d %d)\n",
-                             exch[nbr_pr->send_type],
-                             nbr_pr->str_send[0], nbr_pr->str_send[1],
-                             nbr_pr->str_send[2],
-                             nbr_pr->end_send[0], nbr_pr->end_send[1],
-                             nbr_pr->end_send[2] );
+                            exch[nbr_pr->send_type],
+                            nbr_pr->str_send[0], nbr_pr->str_send[1],
+                            nbr_pr->str_send[2],
+                            nbr_pr->end_send[0], nbr_pr->end_send[1],
+                            nbr_pr->end_send[2] );
 
                     fprintf( f, "\trecv_type %s: recv(%d %d %d) to (%d %d %d)\n",
-                             exch[nbr_pr->recv_type],
-                             nbr_pr->str_recv[0], nbr_pr->str_recv[1],
-                             nbr_pr->str_recv[2],
-                             nbr_pr->end_recv[0], nbr_pr->end_recv[1],
-                             nbr_pr->end_recv[2] );
+                            exch[nbr_pr->recv_type],
+                            nbr_pr->str_recv[0], nbr_pr->str_recv[1],
+                            nbr_pr->str_recv[2],
+                            nbr_pr->end_recv[0], nbr_pr->end_recv[1],
+                            nbr_pr->end_recv[2] );
                 }
 
     fclose(f);
@@ -724,8 +725,8 @@ void Print_Native_GCells( reax_system *system )
                 gc = &( g->cells[i][j][k] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
-                         system->my_rank, i, j, k,
-                         gc->type, gcell_type_text[gc->type] );
+                        system->my_rank, i, j, k,
+                        gc->type, gcell_type_text[gc->type] );
 
                 fprintf( f, "\tatom list start: %d, end: %d\n\t", gc->str, gc->end );
 
@@ -763,8 +764,8 @@ void Print_All_GCells( reax_system *system )
                 gc = &( g->cells[i][j][k] );
 
                 fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n",
-                         system->my_rank, i, j, k,
-                         gc->type, gcell_type_text[gc->type] );
+                        system->my_rank, i, j, k,
+                        gc->type, gcell_type_text[gc->type] );
 
                 fprintf( f, "\tatom list start: %d, end: %d\n\t", gc->str, gc->end );
 
@@ -796,11 +797,11 @@ void Print_My_Atoms( reax_system *system, control_params *control, int step )
 
     for ( i = 0; i < system->n; ++i )
         fprintf( fh, "p%-2d %-5d %2d %24.15e%24.15e%24.15e\n",
-                 system->my_rank,
-                 system->my_atoms[i].orig_id, system->my_atoms[i].type,
-                 system->my_atoms[i].x[0],
-                 system->my_atoms[i].x[1],
-                 system->my_atoms[i].x[2] );
+                system->my_rank,
+                system->my_atoms[i].orig_id, system->my_atoms[i].type,
+                system->my_atoms[i].x[0],
+                system->my_atoms[i].x[1],
+                system->my_atoms[i].x[2] );
 
     fclose( fh );
 }
@@ -824,18 +825,18 @@ void Print_My_Ext_Atoms( reax_system *system )
 
     for ( i = 0; i < system->N; ++i )
         fprintf( fh, "p%-2d %-5d imprt%-5d %2d %24.15e%24.15e%24.15e\n",
-                 system->my_rank, system->my_atoms[i].orig_id,
-                 system->my_atoms[i].imprt_id, system->my_atoms[i].type,
-                 system->my_atoms[i].x[0],
-                 system->my_atoms[i].x[1],
-                 system->my_atoms[i].x[2] );
+                system->my_rank, system->my_atoms[i].orig_id,
+                system->my_atoms[i].imprt_id, system->my_atoms[i].type,
+                system->my_atoms[i].x[0],
+                system->my_atoms[i].x[1],
+                system->my_atoms[i].x[2] );
 
     fclose( fh );
 }
 
 
 void Print_Far_Neighbors( reax_system *system, reax_list **lists,
-                          control_params *control )
+        control_params *control )
 {
     char  fname[100];
     int   i, j, id_i, id_j, nbr, natoms;
@@ -857,16 +858,16 @@ void Print_Far_Neighbors( reax_system *system, reax_list **lists,
             id_j = system->my_atoms[nbr].orig_id;
 
             fprintf( fout, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                     id_i, id_j, far_nbrs->far_nbr_list[j].d,
-                     far_nbrs->far_nbr_list[j].dvec[0],
-                     far_nbrs->far_nbr_list[j].dvec[1],
-                     far_nbrs->far_nbr_list[j].dvec[2] );
+                    id_i, id_j, far_nbrs->far_nbr_list[j].d,
+                    far_nbrs->far_nbr_list[j].dvec[0],
+                    far_nbrs->far_nbr_list[j].dvec[1],
+                    far_nbrs->far_nbr_list[j].dvec[2] );
 
             fprintf( fout, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                     id_j, id_i, far_nbrs->far_nbr_list[j].d,
-                     -far_nbrs->far_nbr_list[j].dvec[0],
-                     -far_nbrs->far_nbr_list[j].dvec[1],
-                     -far_nbrs->far_nbr_list[j].dvec[2] );
+                    id_j, id_i, far_nbrs->far_nbr_list[j].d,
+                    -far_nbrs->far_nbr_list[j].dvec[0],
+                    -far_nbrs->far_nbr_list[j].dvec[1],
+                    -far_nbrs->far_nbr_list[j].dvec[2] );
         }
     }
 
@@ -881,9 +882,9 @@ void Print_Sparse_Matrix( reax_system *system, sparse_matrix *A )
     for ( i = 0; i < A->n; ++i )
         for ( j = A->start[i]; j < A->end[i]; ++j )
             fprintf( stderr, "%d %d %.15e\n",
-                     system->my_atoms[i].orig_id,
-                     system->my_atoms[A->entries[j].j].orig_id,
-                     A->entries[j].val );
+                    system->my_atoms[i].orig_id,
+                    system->my_atoms[A->entries[j].j].orig_id,
+                    A->entries[j].val );
 }
 
 
@@ -895,9 +896,9 @@ void Print_Sparse_Matrix2( reax_system *system, sparse_matrix *A, char *fname )
     for ( i = 0; i < A->n; ++i )
         for ( j = A->start[i]; j < A->end[i]; ++j )
             fprintf( f, "%d %d %.15e\n",
-                     system->my_atoms[i].orig_id,
-                     system->my_atoms[A->entries[j].j].orig_id,
-                     A->entries[j].val );
+                    system->my_atoms[i].orig_id,
+                    system->my_atoms[A->entries[j].j].orig_id,
+                    A->entries[j].val );
 
     fclose(f);
 }
@@ -916,10 +917,10 @@ void Print_Symmetric_Sparse(reax_system *system, sparse_matrix *A, char *fname)
         {
             aj = &(system->my_atoms[A->entries[j].j]);
             fprintf( f, "%d %d %.15e\n",
-                     ai->renumber, aj->renumber, A->entries[j].val );
+                    ai->renumber, aj->renumber, A->entries[j].val );
             if ( A->entries[j].j < system->n && ai->renumber != aj->renumber )
                 fprintf( f, "%d %d %.15e\n",
-                         aj->renumber, ai->renumber, A->entries[j].val );
+                        aj->renumber, ai->renumber, A->entries[j].val );
         }
     }
 
@@ -928,7 +929,7 @@ void Print_Symmetric_Sparse(reax_system *system, sparse_matrix *A, char *fname)
 
 
 void Print_Linear_System( reax_system *system, control_params *control,
-                          storage *workspace, int step )
+        storage *workspace, int step )
 {
     int   i, j;
     char  fname[100];
@@ -943,9 +944,9 @@ void Print_Linear_System( reax_system *system, control_params *control,
     {
         ai = &(system->my_atoms[i]);
         fprintf( out, "%6d%2d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                 ai->renumber, ai->type, ai->x[0], ai->x[1], ai->x[2],
-                 workspace->s[i], workspace->b_s[i],
-                 workspace->t[i], workspace->b_t[i] );
+                ai->renumber, ai->type, ai->x[0], ai->x[1], ai->x[2],
+                workspace->s[i], workspace->b_s[i],
+                workspace->t[i], workspace->b_t[i] );
     }
     fclose( out );
 
@@ -955,21 +956,21 @@ void Print_Linear_System( reax_system *system, control_params *control,
 
     // print the incomplete H matrix
     /*sprintf( fname, "%s.p%dHinc%d", control->sim_name, system->my_rank, step );
-    out = fopen( fname, "w" );
-    H = workspace->H;
-    for( i = 0; i < H->n; ++i ) {
+      out = fopen( fname, "w" );
+      H = workspace->H;
+      for( i = 0; i < H->n; ++i ) {
       ai = &(system->my_atoms[i]);
       for( j = H->start[i]; j < H->end[i]; ++j )
-        if( H->entries[j].j < system->n ) {
-    aj = &(system->my_atoms[H->entries[j].j]);
-    fprintf( out, "%d %d %.15e\n",
-       ai->orig_id, aj->orig_id, H->entries[j].val );
-    if( ai->orig_id != aj->orig_id )
+      if( H->entries[j].j < system->n ) {
+      aj = &(system->my_atoms[H->entries[j].j]);
       fprintf( out, "%d %d %.15e\n",
-         aj->orig_id, ai->orig_id, H->entries[j].val );
-        }
-    }
-    fclose( out );*/
+      ai->orig_id, aj->orig_id, H->entries[j].val );
+      if( ai->orig_id != aj->orig_id )
+      fprintf( out, "%d %d %.15e\n",
+      aj->orig_id, ai->orig_id, H->entries[j].val );
+      }
+      }
+      fclose( out );*/
 
     // print the L from incomplete cholesky decomposition
     /*sprintf( fname, "%s.p%dL%d", control->sim_name, system->my_rank, step );
@@ -988,7 +989,7 @@ void Print_LinSys_Soln( reax_system *system, real *x, real *b_prm, real *b )
 
     for ( i = 0; i < system->n; ++i )
         fprintf( fout, "%6d%10.4f%10.4f%10.4f\n",
-                 system->my_atoms[i].orig_id, x[i], b_prm[i], b[i] );
+                system->my_atoms[i].orig_id, x[i], b_prm[i], b[i] );
 
     fclose( fout );
 }
@@ -1005,10 +1006,10 @@ void Print_Charges( reax_system *system )
 
     for ( i = 0; i < system->n; ++i )
         fprintf( fout, "%6d %10.7f %10.7f %10.7f\n",
-                 system->my_atoms[i].orig_id,
-                 system->my_atoms[i].s[0],
-                 system->my_atoms[i].t[0],
-                 system->my_atoms[i].q );
+                system->my_atoms[i].orig_id,
+                system->my_atoms[i].s[0],
+                system->my_atoms[i].t[0],
+                system->my_atoms[i].q );
 
     fclose( fout );
 }
@@ -1034,15 +1035,15 @@ void Print_HBonds( reax_system *system, reax_list **lists,
 
             fprintf( fout, "%8d%8d %24.15e %24.15e %24.15e\n", i, phbond->nbr,
                     phbond->ptr->dvec[0], phbond->ptr->dvec[1], phbond->ptr->dvec[2] );
-//            fprintf( fout, "%8d%8d %8d %8d\n", i, phbond->nbr,
-//                  phbond->scl, phbond->sym_index );
+            //            fprintf( fout, "%8d%8d %8d %8d\n", i, phbond->nbr,
+            //                  phbond->scl, phbond->sym_index );
         }
     }
 
     fclose( fout );
 }
 
- 
+
 void Print_HBond_Indices( reax_system *system, reax_list **lists,
         control_params *control, int step )
 {
@@ -1083,9 +1084,9 @@ void Print_Bonds( reax_system *system, reax_list **lists,
         {
             pbond = &bonds->bond_list[pj];
             bo_ij = &pbond->bo_data;
-//            fprintf( fout, "%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-//                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-//                    pbond->d, bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
+            //            fprintf( fout, "%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+            //                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+            //                    pbond->d, bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
             fprintf( fout, "%8d%8d %24.15f %24.15f\n",
                     i, pbond->nbr, //system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
                     pbond->d, bo_ij->BO );
@@ -1130,7 +1131,7 @@ void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname )
 
 
 void Print_Total_Force( reax_system *system, simulation_data *data,
-                        storage *workspace )
+        storage *workspace )
 {
     int    i;
 
@@ -1139,9 +1140,9 @@ void Print_Total_Force( reax_system *system, simulation_data *data,
 
     for ( i = 0; i < system->N; ++i )
         fprintf( stderr, "%6d %f %f %f\n",
-                 //"%6d%24.15e%24.15e%24.15e\n",
-                 system->my_atoms[i].orig_id,
-                 workspace->f[i][0], workspace->f[i][1], workspace->f[i][2] );
+                //"%6d%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                workspace->f[i][0], workspace->f[i][1], workspace->f[i][2] );
 }
 
 
@@ -1206,15 +1207,15 @@ void Print_Far_Neighbors_List_Adj_Format( reax_system *system,
 }
 
 void Output_Results( reax_system *system, control_params *control,
-                     simulation_data *data, reax_list **lists,
-                     output_controls *out_control, mpi_datatypes *mpi_data )
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
 #if defined(LOG_PERFORMANCE)
     real t_elapsed, denom;
 #endif
 
     if ((out_control->energy_update_freq > 0 &&
-            data->step % out_control->energy_update_freq == 0) ||
+                data->step % out_control->energy_update_freq == 0) ||
             (out_control->write_steps > 0 &&
              data->step % out_control->write_steps == 0))
     {
@@ -1228,36 +1229,36 @@ void Output_Results( reax_system *system, control_params *control,
         {
 #if !defined(DEBUG) && !defined(DEBUG_FOCUS)
             fprintf( out_control->out,
-                     "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n",
-                     data->step, data->sys_en.e_tot, data->sys_en.e_pot,
-                     E_CONV * data->sys_en.e_kin, data->therm.T,
-                     system->big_box.V, data->iso_bar.P );
+                    "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n",
+                    data->step, data->sys_en.e_tot, data->sys_en.e_pot,
+                    E_CONV * data->sys_en.e_kin, data->therm.T,
+                    system->big_box.V, data->iso_bar.P );
 
             fprintf( out_control->pot,
-                     "%-6d%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f\n",
-                     data->step,
-                     data->sys_en.e_bond,
-                     data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
-                     data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
-                     data->sys_en.e_hb,
-                     data->sys_en.e_tor, data->sys_en.e_con,
-                     data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
+                    "%-6d%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f%14.2f\n",
+                    data->step,
+                    data->sys_en.e_bond,
+                    data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
+                    data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
+                    data->sys_en.e_hb,
+                    data->sys_en.e_tor, data->sys_en.e_con,
+                    data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
 #else
             fprintf( out_control->out,
-                     "%-6d%24.15e%24.15e%24.15e%13.5f%16.5f%13.5f\n",
-                     data->step, data->sys_en.e_tot, data->sys_en.e_pot,
-                     E_CONV * data->sys_en.e_kin, data->therm.T,
-                     system->big_box.V, data->iso_bar.P );
+                    "%-6d%24.15e%24.15e%24.15e%13.5f%16.5f%13.5f\n",
+                    data->step, data->sys_en.e_tot, data->sys_en.e_pot,
+                    E_CONV * data->sys_en.e_kin, data->therm.T,
+                    system->big_box.V, data->iso_bar.P );
 
             fprintf( out_control->pot,
-                     "%-6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                     data->step,
-                     data->sys_en.e_bond,
-                     data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
-                     data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
-                     data->sys_en.e_hb,
-                     data->sys_en.e_tor, data->sys_en.e_con,
-                     data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
+                    "%-6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                    data->step,
+                    data->sys_en.e_bond,
+                    data->sys_en.e_ov + data->sys_en.e_un,  data->sys_en.e_lp,
+                    data->sys_en.e_ang + data->sys_en.e_pen, data->sys_en.e_coa,
+                    data->sys_en.e_hb,
+                    data->sys_en.e_tor, data->sys_en.e_con,
+                    data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
 #endif //DEBUG
 
 #if defined(LOG_PERFORMANCE)
@@ -1266,36 +1267,60 @@ void Output_Results( reax_system *system, control_params *control,
                 denom = 1.0 / out_control->energy_update_freq;
             else denom = 1;
 
-            fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%6d\n",
-                     data->step,
-                     t_elapsed * denom,
-                     data->timing.comm * denom,
-                     data->timing.nbrs * denom,
-                     data->timing.init_forces * denom,
-                     data->timing.bonded * denom,
-                     data->timing.nonb * denom,
-                     data->timing.qEq * denom,
-                     (int)((data->timing.s_matvecs + data->timing.t_matvecs)*denom) );
-
-            Reset_Timing( &(data->timing) );
+            fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
+                    data->step,
+                    t_elapsed * denom,
+                    data->timing.comm * denom,
+                    data->timing.nbrs * denom,
+                    data->timing.init_forces * denom,
+                    data->timing.bonded * denom,
+                    data->timing.nonb * denom,
+                    data->timing.cm * denom,
+                    data->timing.cm_sort_mat_rows * denom,
+                    (double)( data->timing.cm_solver_iters * denom),
+                    data->timing.cm_solver_comm * denom,
+                    data->timing.cm_solver_pre_comp * denom,
+                    data->timing.cm_solver_pre_app * denom,
+                    data->timing.cm_solver_spmv * denom,
+                    data->timing.cm_solver_vector_ops * denom,
+                    data->timing.cm_solver_orthog * denom,
+                    data->timing.cm_solver_tri_solve * denom );
+
+            //Reset_Timing( &(data->timing) );
+            data->timing.total = Get_Time( );
+            data->timing.comm = ZERO;
+            data->timing.nbrs = 0;
+            data->timing.init_forces = 0;
+            data->timing.bonded = 0;
+            data->timing.nonb = 0;
+            data->timing.cm = ZERO;
+            data->timing.cm_sort_mat_rows = ZERO;
+            data->timing.cm_solver_comm = ZERO;
+            data->timing.cm_solver_pre_comp = ZERO;
+            data->timing.cm_solver_pre_app = ZERO;
+            data->timing.cm_solver_iters = 0;
+            data->timing.cm_solver_spmv = ZERO;
+            data->timing.cm_solver_vector_ops = ZERO;
+            data->timing.cm_solver_orthog = ZERO;
+            data->timing.cm_solver_tri_solve = ZERO;
             fflush( out_control->log );
 #endif //LOG_PERFORMANCE
 
             if ( control->virial )
             {
                 fprintf( out_control->prs,
-                         "%8d%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
-                         data->step,
-                         data->int_press[0], data->int_press[1], data->int_press[2],
-                         data->ext_press[0], data->ext_press[1], data->ext_press[2],
-                         data->kin_press );
+                        "%8d%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
+                        data->step,
+                        data->int_press[0], data->int_press[1], data->int_press[2],
+                        data->ext_press[0], data->ext_press[1], data->ext_press[2],
+                        data->kin_press );
 
                 fprintf( out_control->prs,
-                         "%8s%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
-                         "", system->big_box.box_norms[0], system->big_box.box_norms[1],
-                         system->big_box.box_norms[2],
-                         data->tot_press[0], data->tot_press[1], data->tot_press[2],
-                         system->big_box.V );
+                        "%8s%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f%13.6f\n",
+                        "", system->big_box.box_norms[0], system->big_box.box_norms[1],
+                        system->big_box.box_norms[2],
+                        data->tot_press[0], data->tot_press[1], data->tot_press[2],
+                        system->big_box.V );
 
                 fflush( out_control->prs);
             }
@@ -1322,39 +1347,39 @@ void Output_Results( reax_system *system, control_params *control,
 void Debug_Marker_Bonded( output_controls *out_control, int step )
 {
     fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n",
-             step, "atom1", "atom2", "bo", "ebond", "total" );
+            step, "atom1", "atom2", "bo", "ebond", "total" );
     fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n",
-             step, "atom", "nlp", "elp", "total" );
+            step, "atom", "nlp", "elp", "total" );
     fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n",
-             step, "atom", "eov", "total" );
+            step, "atom", "eov", "total" );
     fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n",
-             step, "atom", "eun", "total" );
+            step, "atom", "eun", "total" );
     fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "angle", "theta0",
-             "bo(12)", "bo(23)", "eval", "total" );
+            step, "atom1", "atom2", "atom3", "angle", "theta0",
+            "bo(12)", "bo(23)", "eval", "total" );
     fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
-             "epen", "total" );
+            step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
+            "epen", "total" );
     fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
-             "ecoa", "total" );
+            step, "atom1", "atom2", "atom3", "angle", "bo(12)", "bo(23)",
+            "ecoa", "total" );
     fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "r(23)", "angle", "bo(12)",
-             "ehb", "total" );
+            step, "atom1", "atom2", "atom3", "r(23)", "angle", "bo(12)",
+            "ehb", "total" );
     fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "atom4", "phi", "bo(23)",
-             "etor", "total" );
+            step, "atom1", "atom2", "atom3", "atom4", "phi", "bo(23)",
+            "etor", "total" );
     fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "atom3", "atom4",
-             "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
+            step, "atom1", "atom2", "atom3", "atom4",
+            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
 }
 
 void Debug_Marker_Nonbonded( output_controls *out_control, int step )
 {
     fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
-             step, "atom1", "atom2", "r12", "evdw", "total" );
+            step, "atom1", "atom2", "r12", "evdw", "total" );
     fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
-             step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
+            step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
 }
 
 #endif
@@ -1362,16 +1387,16 @@ void Debug_Marker_Nonbonded( output_controls *out_control, int step )
 
 #ifdef TEST_FORCES
 void Dummy_Printer( reax_system *system, control_params *control,
-                    simulation_data *data, storage *workspace,
-                    reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
 }
 
 
 
 void Print_Bond_Orders( reax_system *system, control_params *control,
-                        simulation_data *data, storage *workspace,
-                        reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, pj, pk;
     bond_order_data *bo_ij;
@@ -1382,52 +1407,52 @@ void Print_Bond_Orders( reax_system *system, control_params *control,
     /* bond orders */
     fprintf( out_control->fbo, "step: %d\n", data->step );
     fprintf( out_control->fbo, "%6s%6s%12s%12s%12s%12s%12s\n",
-             "atom1", "atom2", "r_ij", "total_bo", "bo_s", "bo_p", "bo_pp" );
+            "atom1", "atom2", "r_ij", "total_bo", "bo_s", "bo_p", "bo_pp" );
 
     for ( i = 0; i < system->N; ++i )
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             bo_ij = &(bonds->bond_list[pj].bo_data);
             fprintf( out_control->fbo,
-                     "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                     system->my_atoms[i].orig_id,
-                     system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                     bonds->bond_list[pj].d,
-                     bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
+                    "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                    system->my_atoms[i].orig_id,
+                    system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                    bonds->bond_list[pj].d,
+                    bo_ij->BO, bo_ij->BO_s, bo_ij->BO_pi, bo_ij->BO_pi2 );
         }
 
 
     /* derivatives of bond orders */
     fprintf( out_control->fdbo, "step: %d\n", data->step );
     fprintf( out_control->fdbo, "%6s%6s%6s%24s%24s%24s\n",
-             "atom1", "atom2", "atom2", "dBO", "dBOpi", "dBOpi2" );
+            "atom1", "atom2", "atom2", "dBO", "dBOpi", "dBOpi2" );
     for ( i = 0; i < system->N; ++i )
         for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
         {
             /* fprintf( out_control->fdbo, "%6d %6d\tstart: %6d\tend: %6d\n",
-            system->my_atoms[i].orig_id,
-             system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-             Start_Index( pj, dBOs ), End_Index( pj, dBOs ) ); */
+               system->my_atoms[i].orig_id,
+               system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+               Start_Index( pj, dBOs ), End_Index( pj, dBOs ) ); */
             for ( pk = Start_Index(pj, dBOs); pk < End_Index(pj, dBOs); ++pk )
             {
                 dbo_k = &(dBOs->dbo_list[pk]);
                 fprintf( out_control->fdbo, "%6d%6d%6d%24.15e%24.15e%24.15e\n",
-                         system->my_atoms[i].orig_id,
-                         system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                         system->my_atoms[dbo_k->wrt].orig_id,
-                         dbo_k->dBO[0], dbo_k->dBO[1], dbo_k->dBO[2] );
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                        system->my_atoms[dbo_k->wrt].orig_id,
+                        dbo_k->dBO[0], dbo_k->dBO[1], dbo_k->dBO[2] );
 
                 fprintf( out_control->fdbo, "%6d%6d%6d%24.15e%24.15e%24.15e\n",
-                         system->my_atoms[i].orig_id,
-                         system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                         system->my_atoms[dbo_k->wrt].orig_id,
-                         dbo_k->dBOpi[0], dbo_k->dBOpi[1], dbo_k->dBOpi[2] );
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                        system->my_atoms[dbo_k->wrt].orig_id,
+                        dbo_k->dBOpi[0], dbo_k->dBOpi[1], dbo_k->dBOpi[2] );
 
                 fprintf( out_control->fdbo, "%6d%6d%6d%24.15e%24.15e%24.15e\n",
-                         system->my_atoms[i].orig_id,
-                         system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
-                         system->my_atoms[dbo_k->wrt].orig_id,
-                         dbo_k->dBOpi2[0], dbo_k->dBOpi2[1], dbo_k->dBOpi2[2] );
+                        system->my_atoms[i].orig_id,
+                        system->my_atoms[bonds->bond_list[pj].nbr].orig_id,
+                        system->my_atoms[dbo_k->wrt].orig_id,
+                        dbo_k->dBOpi2[0], dbo_k->dBOpi2[1], dbo_k->dBOpi2[2] );
             }
         }
 }
@@ -1442,15 +1467,15 @@ void Print_Forces( FILE *f, storage *workspace, int N, int step )
         //fprintf( f, "%6d %23.15e %23.15e %23.15e\n",
         //fprintf( f, "%6d%12.6f%12.6f%12.6f\n",
         fprintf( f, "%6d %19.9e %19.9e %19.9e\n",
-                 workspace->id_all[i], workspace->f_all[i][0],
-                 workspace->f_all[i][1], workspace->f_all[i][2] );
+                workspace->id_all[i], workspace->f_all[i][0],
+                workspace->f_all[i][1], workspace->f_all[i][2] );
 }
 
 
 void Print_Force_Files( reax_system *system, control_params *control,
-                        simulation_data *data, storage *workspace,
-                        reax_list **lists, output_controls *out_control,
-                        mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
 {
     int i, d;
 
@@ -1514,11 +1539,11 @@ void Print_Force_Files( reax_system *system, control_params *control,
     {
         for ( d = 0; d < 3; ++d )
             workspace->f_tot[i][d] = workspace->f_be[i][d] +
-                                     workspace->f_lp[i][d] + workspace->f_ov[i][d] + workspace->f_un[i][d] +
-                                     workspace->f_ang[i][d] + workspace->f_pen[i][d] + workspace->f_coa[i][d] +
-                                     workspace->f_tor[i][d] + workspace->f_con[i][d] +
-                                     workspace->f_vdw[i][d] + workspace->f_ele[i][d] +
-                                     workspace->f_hb[i][d];
+                workspace->f_lp[i][d] + workspace->f_ov[i][d] + workspace->f_un[i][d] +
+                workspace->f_ang[i][d] + workspace->f_pen[i][d] + workspace->f_coa[i][d] +
+                workspace->f_tor[i][d] + workspace->f_con[i][d] +
+                workspace->f_vdw[i][d] + workspace->f_ele[i][d] +
+                workspace->f_hb[i][d];
     }
 
     Coll_rvecs_at_Master( system, workspace, mpi_data, workspace->f_tot );
@@ -1531,8 +1556,8 @@ void Print_Force_Files( reax_system *system, control_params *control,
 #if defined(TEST_FORCES) || defined(TEST_ENERGY)
 
 void Print_Far_Neighbors_List( reax_system *system, reax_list **lists,
-                               control_params *control, simulation_data *data,
-                               output_controls *out_control )
+        control_params *control, simulation_data *data,
+        output_controls *out_control )
 {
     int   i, j, id_i, id_j, nbr, natoms;
     int num = 0;
@@ -1566,8 +1591,8 @@ void Print_Far_Neighbors_List( reax_system *system, reax_list **lists,
 }
 
 void Print_Bond_List( reax_system *system, control_params *control,
-                      simulation_data *data, reax_list **lists,
-                      output_controls *out_control)
+        simulation_data *data, reax_list **lists,
+        output_controls *out_control)
 {
     int i, j, id_i, id_j, nbr, pj;
     reax_list *bonds = lists[BONDS];
@@ -1609,283 +1634,283 @@ void Print_Init_Atoms( reax_system *system, storage *workspace )
     int i;
 
     fprintf( stderr, "p%d had %d atoms\n",
-             system->my_rank, workspace->init_cnt );
+            system->my_rank, workspace->init_cnt );
 
     for ( i = 0; i < workspace->init_cnt; ++i )
         fprintf( stderr, "p%d, atom%d: %d  %s  %8.3f %8.3f %8.3f\n",
-                 system->my_rank, i,
-                 workspace->init_atoms[i].type, workspace->init_atoms[i].name,
-                 workspace->init_atoms[i].x[0],
-                 workspace->init_atoms[i].x[1],
-                 workspace->init_atoms[i].x[2] );
+                system->my_rank, i,
+                workspace->init_atoms[i].type, workspace->init_atoms[i].name,
+                workspace->init_atoms[i].x[0],
+                workspace->init_atoms[i].x[1],
+                workspace->init_atoms[i].x[2] );
 }
 #endif //OLD_VERSION
 
 
 /*void Print_Bond_Forces( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace,
-            reax_list **lists, output_controls *out_control )
-{
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->fbond, "step: %d\n", data->step );
   fprintf( out_control->fbond, "%6s%24s%24s%24s\n",
-       "atom", "f_be[0]", "f_be[1]", "f_be[2]" );
+  "atom", "f_be[0]", "f_be[1]", "f_be[2]" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf(out_control->fbond, "%6d%24.15e%24.15e%24.15e\n",
-        system->my_atoms[i].orig_id,
-        workspace->f_all[i][0], workspace->f_all[i][1],
-        workspace->f_all[i][2]);
-}
+  fprintf(out_control->fbond, "%6d%24.15e%24.15e%24.15e\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2]);
+  }
 
-void Print_LonePair_Forces( reax_system *system, control_params *control,
-                simulation_data *data, storage *workspace,
-                reax_list **lists, output_controls *out_control )
-{
+  void Print_LonePair_Forces( reax_system *system, control_params *control,
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->flp, "step: %d\n", data->step );
   fprintf( out_control->flp, "%6s%24s\n", "atom", "f_lonepair" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf(out_control->flp, "%6d%24.15e%24.15e%24.15e\n",
-        system->my_atoms[i].orig_id,
-        workspace->f_all[i][0], workspace->f_all[i][1],
-        workspace->f_all[i][2]);
-}
+  fprintf(out_control->flp, "%6d%24.15e%24.15e%24.15e\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2]);
+  }
 
 
-void Print_OverCoor_Forces( reax_system *system, control_params *control,
-                simulation_data *data, storage *workspace,
-                reax_list **lists, output_controls *out_control )
-{
+  void Print_OverCoor_Forces( reax_system *system, control_params *control,
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->fov, "step: %d\n", data->step );
   fprintf( out_control->fov, "%6s%-38s%-38s%-38s\n",
-       "atom","f_over[0]", "f_over[1]", "f_over[2]" );
+  "atom","f_over[0]", "f_over[1]", "f_over[2]" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf( out_control->fov,
-         "%6d %24.15e%24.15e%24.15e 0 0 0\n",
-         system->my_atoms[i].orig_id,
-         workspace->f_all[i][0], workspace->f_all[i][1],
-         workspace->f_all[i][2] );
-}
+  fprintf( out_control->fov,
+  "%6d %24.15e%24.15e%24.15e 0 0 0\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2] );
+  }
 
 
-void Print_UnderCoor_Forces( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control )
-{
+  void Print_UnderCoor_Forces( reax_system *system, control_params *control,
+  simulation_data *data, storage *workspace,
+  reax_list **lists, output_controls *out_control )
+  {
   int i;
 
   fprintf( out_control->fun, "step: %d\n", data->step );
   fprintf( out_control->fun, "%6s%-38s%-38s%-38s\n",
-       "atom","f_under[0]", "f_under[1]", "f_under[2]" );
+  "atom","f_under[0]", "f_under[1]", "f_under[2]" );
 
   for( i = 0; i < system->bigN; ++i )
-    fprintf( out_control->fun,
-         "%6d %24.15e%24.15e%24.15e 0 0 0\n",
-         system->my_atoms[i].orig_id,
-         workspace->f_all[i][0], workspace->f_all[i][1],
-         workspace->f_all[i][2] );
-}
+  fprintf( out_control->fun,
+  "%6d %24.15e%24.15e%24.15e 0 0 0\n",
+  system->my_atoms[i].orig_id,
+  workspace->f_all[i][0], workspace->f_all[i][1],
+  workspace->f_all[i][2] );
+  }
 
 
 void Print_ValAngle_Forces( reax_system *system, control_params *control,
-                simulation_data *data, storage *workspace,
-                reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int j;
-
-  fprintf( out_control->f3body, "step: %d\n", data->step );
-  fprintf( out_control->f3body, "%6s%-37s%-37s%-37s%-38s\n",
-       "atom", "3-body total", "f_ang", "f_pen", "f_coa" );
-
-  for( j = 0; j < system->N; ++j ){
-    if( rvec_isZero(workspace->f_pen[j]) && rvec_isZero(workspace->f_coa[j]) )
-      fprintf( out_control->f3body,
-           "%6d %24.15e%24.15e%24.15e  0 0 0  0 0 0\n",
-           system->my_atoms[j].orig_id,
-           workspace->f_ang[j][0], workspace->f_ang[j][1],
-           workspace->f_ang[j][2] );
-    else if( rvec_isZero(workspace->f_coa[j]) )
-      fprintf( out_control->f3body,
-           "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "   \
-           "%24.15e%24.15e%24.15e\n",
-           system->my_atoms[j].orig_id,
-           workspace->f_ang[j][0] + workspace->f_pen[j][0],
-           workspace->f_ang[j][1] + workspace->f_pen[j][1],
-           workspace->f_ang[j][2] + workspace->f_pen[j][2],
-           workspace->f_ang[j][0], workspace->f_ang[j][1],
-           workspace->f_ang[j][2],
-           workspace->f_pen[j][0], workspace->f_pen[j][1],
-           workspace->f_pen[j][2] );
-    else{
-      fprintf( out_control->f3body, "%6d %24.15e%24.15e%24.15e ",
-         system->my_atoms[j].orig_id,
-           workspace->f_ang[j][0] + workspace->f_pen[j][0] +
-           workspace->f_coa[j][0],
-           workspace->f_ang[j][1] + workspace->f_pen[j][1] +
-           workspace->f_coa[j][1],
-           workspace->f_ang[j][2] + workspace->f_pen[j][2] +
-           workspace->f_coa[j][2] );
-
-      fprintf( out_control->f3body,
-           "%24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
-           "%24.15e%24.15e%24.15e\n",
-           workspace->f_ang[j][0], workspace->f_ang[j][1],
-           workspace->f_ang[j][2],
-           workspace->f_pen[j][0], workspace->f_pen[j][1],
-           workspace->f_pen[j][2],
-           workspace->f_coa[j][0], workspace->f_coa[j][1],
-           workspace->f_coa[j][2] );
+    int j;
+
+    fprintf( out_control->f3body, "step: %d\n", data->step );
+    fprintf( out_control->f3body, "%6s%-37s%-37s%-37s%-38s\n",
+            "atom", "3-body total", "f_ang", "f_pen", "f_coa" );
+
+    for( j = 0; j < system->N; ++j ){
+        if( rvec_isZero(workspace->f_pen[j]) && rvec_isZero(workspace->f_coa[j]) )
+            fprintf( out_control->f3body,
+                    "%6d %24.15e%24.15e%24.15e  0 0 0  0 0 0\n",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_ang[j][0], workspace->f_ang[j][1],
+                    workspace->f_ang[j][2] );
+        else if( rvec_isZero(workspace->f_coa[j]) )
+            fprintf( out_control->f3body,
+                    "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "   \
+                    "%24.15e%24.15e%24.15e\n",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_ang[j][0] + workspace->f_pen[j][0],
+                    workspace->f_ang[j][1] + workspace->f_pen[j][1],
+                    workspace->f_ang[j][2] + workspace->f_pen[j][2],
+                    workspace->f_ang[j][0], workspace->f_ang[j][1],
+                    workspace->f_ang[j][2],
+                    workspace->f_pen[j][0], workspace->f_pen[j][1],
+                    workspace->f_pen[j][2] );
+        else{
+            fprintf( out_control->f3body, "%6d %24.15e%24.15e%24.15e ",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_ang[j][0] + workspace->f_pen[j][0] +
+                    workspace->f_coa[j][0],
+                    workspace->f_ang[j][1] + workspace->f_pen[j][1] +
+                    workspace->f_coa[j][1],
+                    workspace->f_ang[j][2] + workspace->f_pen[j][2] +
+                    workspace->f_coa[j][2] );
+
+            fprintf( out_control->f3body,
+                    "%24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
+                    "%24.15e%24.15e%24.15e\n",
+                    workspace->f_ang[j][0], workspace->f_ang[j][1],
+                    workspace->f_ang[j][2],
+                    workspace->f_pen[j][0], workspace->f_pen[j][1],
+                    workspace->f_pen[j][2],
+                    workspace->f_coa[j][0], workspace->f_coa[j][1],
+                    workspace->f_coa[j][2] );
+        }
     }
-  }
 }
 
 
 void Print_Hydrogen_Bond_Forces( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control)
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control)
 {
-  int j;
+    int j;
 
-  fprintf( out_control->fhb, "step: %d\n", data->step );
-  fprintf( out_control->fhb, "%6s\t%-38s\n", "atom", "f_hb[0,1,2]" );
+    fprintf( out_control->fhb, "step: %d\n", data->step );
+    fprintf( out_control->fhb, "%6s\t%-38s\n", "atom", "f_hb[0,1,2]" );
 
-  for( j = 0; j < system->N; ++j )
-    fprintf(out_control->fhb, "%6d%24.15e%24.15e%24.15e\n",
-         system->my_atoms[j].orig_id,
-         workspace->f_hb[j][0],
-         workspace->f_hb[j][1],
-         workspace->f_hb[j][2] );
+    for( j = 0; j < system->N; ++j )
+        fprintf(out_control->fhb, "%6d%24.15e%24.15e%24.15e\n",
+                system->my_atoms[j].orig_id,
+                workspace->f_hb[j][0],
+                workspace->f_hb[j][1],
+                workspace->f_hb[j][2] );
 }
 
 
 void Print_Four_Body_Forces( reax_system *system, control_params *control,
-                 simulation_data *data, storage *workspace,
-                 reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int j;
-
-  fprintf( out_control->f4body, "step: %d\n", data->step );
-  fprintf( out_control->f4body, "%6s\t%-38s%-38s%-38s\n",
-       "atom", "4-body total", "f_tor", "f_con" );
-
-  for( j = 0; j < system->N; ++j ){
-    if( !rvec_isZero( workspace->f_con[j] ) )
-      fprintf( out_control->f4body,
-           "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
-           "%24.15e%24.15e%24.15e\n",
-         system->my_atoms[j].orig_id,
-           workspace->f_tor[j][0] + workspace->f_con[j][0],
-           workspace->f_tor[j][1] + workspace->f_con[j][1],
-           workspace->f_tor[j][2] + workspace->f_con[j][2],
-           workspace->f_tor[j][0], workspace->f_tor[j][1],
-           workspace->f_tor[j][2],
-           workspace->f_con[j][0], workspace->f_con[j][1],
-           workspace->f_con[j][2] );
-    else
-      fprintf( out_control->f4body,
-           "%6d %24.15e%24.15e%24.15e  0 0 0\n",
-           system->my_atoms[j].orig_id, workspace->f_tor[j][0],
-           workspace->f_tor[j][1], workspace->f_tor[j][2] );
-  }
+    int j;
+
+    fprintf( out_control->f4body, "step: %d\n", data->step );
+    fprintf( out_control->f4body, "%6s\t%-38s%-38s%-38s\n",
+            "atom", "4-body total", "f_tor", "f_con" );
+
+    for( j = 0; j < system->N; ++j ){
+        if( !rvec_isZero( workspace->f_con[j] ) )
+            fprintf( out_control->f4body,
+                    "%6d %24.15e%24.15e%24.15e %24.15e%24.15e%24.15e "\
+                    "%24.15e%24.15e%24.15e\n",
+                    system->my_atoms[j].orig_id,
+                    workspace->f_tor[j][0] + workspace->f_con[j][0],
+                    workspace->f_tor[j][1] + workspace->f_con[j][1],
+                    workspace->f_tor[j][2] + workspace->f_con[j][2],
+                    workspace->f_tor[j][0], workspace->f_tor[j][1],
+                    workspace->f_tor[j][2],
+                    workspace->f_con[j][0], workspace->f_con[j][1],
+                    workspace->f_con[j][2] );
+        else
+            fprintf( out_control->f4body,
+                    "%6d %24.15e%24.15e%24.15e  0 0 0\n",
+                    system->my_atoms[j].orig_id, workspace->f_tor[j][0],
+                    workspace->f_tor[j][1], workspace->f_tor[j][2] );
+    }
 
 }
 
 
 void Print_vdW_Coulomb_Forces( reax_system *system, control_params *control,
-                   simulation_data *data, storage *workspace,
-                   reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int  i;
-
-  return;
-
-  fprintf( out_control->fnonb, "step: %d\n", data->step );
-  fprintf( out_control->fnonb, "%6s\t%-38s%-38s%-38s\n",
-       "atom", "nonbonded_total[0,1,2]", "f_vdw[0,1,2]", "f_ele[0,1,2]" );
-
-  for( i = 0; i < system->N; ++i )
-    fprintf( out_control->fnonb,
-         "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-         system->my_atoms[i].orig_id,
-         workspace->f_vdw[i][0] + workspace->f_ele[i][0],
-         workspace->f_vdw[i][1] + workspace->f_ele[i][1],
-         workspace->f_vdw[i][2] + workspace->f_ele[i][2],
-         workspace->f_vdw[i][0],
-         workspace->f_vdw[i][1],
-         workspace->f_vdw[i][2],
-         workspace->f_ele[i][0],
-         workspace->f_ele[i][1],
-         workspace->f_ele[i][2] );
+    int  i;
+
+    return;
+
+    fprintf( out_control->fnonb, "step: %d\n", data->step );
+    fprintf( out_control->fnonb, "%6s\t%-38s%-38s%-38s\n",
+            "atom", "nonbonded_total[0,1,2]", "f_vdw[0,1,2]", "f_ele[0,1,2]" );
+
+    for( i = 0; i < system->N; ++i )
+        fprintf( out_control->fnonb,
+                "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                workspace->f_vdw[i][0] + workspace->f_ele[i][0],
+                workspace->f_vdw[i][1] + workspace->f_ele[i][1],
+                workspace->f_vdw[i][2] + workspace->f_ele[i][2],
+                workspace->f_vdw[i][0],
+                workspace->f_vdw[i][1],
+                workspace->f_vdw[i][2],
+                workspace->f_ele[i][0],
+                workspace->f_ele[i][1],
+                workspace->f_ele[i][2] );
 }
 
 
 void Print_Total_Force( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace,
-            reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int    i;
+    int    i;
 
-  return;
+    return;
 
-  fprintf( out_control->ftot, "step: %d\n", data->step );
-  fprintf( out_control->ftot, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]");
+    fprintf( out_control->ftot, "step: %d\n", data->step );
+    fprintf( out_control->ftot, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]");
 
-  for( i = 0; i < system->n; ++i )
-    fprintf( out_control->ftot, "%6d%24.15e%24.15e%24.15e\n",
-         system->my_atoms[i].orig_id,
-         system->my_atoms[i].f[0],
-         system->my_atoms[i].f[1],
-         system->my_atoms[i].f[2] );
+    for( i = 0; i < system->n; ++i )
+        fprintf( out_control->ftot, "%6d%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                system->my_atoms[i].f[0],
+                system->my_atoms[i].f[1],
+                system->my_atoms[i].f[2] );
 }
 
 
 void Compare_Total_Forces( reax_system *system, control_params *control,
-               simulation_data *data, storage *workspace,
-               reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
-  int i;
+    int i;
 
-  return;
-
-  fprintf( out_control->ftot2, "step: %d\n", data->step );
-  fprintf( out_control->ftot2, "%6s\t%-38s%-38s\n",
-       "atom", "f_total[0,1,2]", "test_force_total[0,1,2]" );
-
-  for( i = 0; i < system->N; ++i )
-    fprintf( out_control->ftot2, "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-         system->my_atoms[i].orig_id,
-         system->my_atoms[i].f[0],
-         system->my_atoms[i].f[1],
-         system->my_atoms[i].f[2],
-         workspace->f_be[i][0] + workspace->f_lp[i][0] +
-         workspace->f_ov[i][0] + workspace->f_un[i][0] +
-         workspace->f_ang[i][0]+ workspace->f_pen[i][0]+
-         workspace->f_coa[i][0]+ + workspace->f_hb[i][0] +
-         workspace->f_tor[i][0] + workspace->f_con[i][0] +
-         workspace->f_vdw[i][0] + workspace->f_ele[i][0],
-         workspace->f_be[i][1] + workspace->f_lp[i][1] +
-         workspace->f_ov[i][1] + workspace->f_un[i][1] +
-             workspace->f_ang[i][1]+ workspace->f_pen[i][1]+
-         workspace->f_coa[i][1]+ + workspace->f_hb[i][1] +
-         workspace->f_tor[i][1] + workspace->f_con[i][1] +
-         workspace->f_vdw[i][1] + workspace->f_ele[i][1],
-         workspace->f_be[i][2] + workspace->f_lp[i][2] +
-         workspace->f_ov[i][2] + workspace->f_un[i][2] +
-             workspace->f_ang[i][2]+ workspace->f_pen[i][2] +
-         workspace->f_coa[i][2]+ + workspace->f_hb[i][2] +
-         workspace->f_tor[i][2] + workspace->f_con[i][2] +
-         workspace->f_vdw[i][2] + workspace->f_ele[i][2] );
+    return;
+
+    fprintf( out_control->ftot2, "step: %d\n", data->step );
+    fprintf( out_control->ftot2, "%6s\t%-38s%-38s\n",
+            "atom", "f_total[0,1,2]", "test_force_total[0,1,2]" );
+
+    for( i = 0; i < system->N; ++i )
+        fprintf( out_control->ftot2, "%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                system->my_atoms[i].orig_id,
+                system->my_atoms[i].f[0],
+                system->my_atoms[i].f[1],
+                system->my_atoms[i].f[2],
+                workspace->f_be[i][0] + workspace->f_lp[i][0] +
+                workspace->f_ov[i][0] + workspace->f_un[i][0] +
+                workspace->f_ang[i][0]+ workspace->f_pen[i][0]+
+                workspace->f_coa[i][0]+ + workspace->f_hb[i][0] +
+                workspace->f_tor[i][0] + workspace->f_con[i][0] +
+                workspace->f_vdw[i][0] + workspace->f_ele[i][0],
+                workspace->f_be[i][1] + workspace->f_lp[i][1] +
+                workspace->f_ov[i][1] + workspace->f_un[i][1] +
+                workspace->f_ang[i][1]+ workspace->f_pen[i][1]+
+                workspace->f_coa[i][1]+ + workspace->f_hb[i][1] +
+                workspace->f_tor[i][1] + workspace->f_con[i][1] +
+                workspace->f_vdw[i][1] + workspace->f_ele[i][1],
+                workspace->f_be[i][2] + workspace->f_lp[i][2] +
+                workspace->f_ov[i][2] + workspace->f_un[i][2] +
+                workspace->f_ang[i][2]+ workspace->f_pen[i][2] +
+                workspace->f_coa[i][2]+ + workspace->f_hb[i][2] +
+                workspace->f_tor[i][2] + workspace->f_con[i][2] +
+                workspace->f_vdw[i][2] + workspace->f_ele[i][2] );
 }*/
 
 /*void Init_Force_Test_Functions( )
-{
+  {
   Print_Interactions[0] = Print_Bond_Orders;
   Print_Interactions[1] = Print_Bond_Forces;
   Print_Interactions[2] = Print_LonePair_Forces;
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 83ddaf8a..bcb04327 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -34,9 +34,9 @@
 #include "lapacke.h"
 #endif
 
-#if defined(CG_PERFORMANCE)
+/*#if defined(CG_PERFORMANCE)
 real t_start, t_elapsed, matvec_time, dot_time;
-#endif
+#endif*/
 
 int compare_dbls( const void* arg1, const void* arg2 )
 {   
@@ -102,7 +102,7 @@ int find_bucket( double *list, int len, double a )
     return s;
 }
 
-void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
+real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, storage *workspace,
         mpi_datatypes *mpi_data, sparse_matrix *A, sparse_matrix **A_spar_patt,
         int nprocs, real filter )
 {
@@ -126,6 +126,12 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
 
     MPI_Comm comm;
 
+    real start, t_start, t_comm;
+    real total_comm;
+
+    start = Get_Time();
+    t_comm = 0.0;
+
     srecv = NULL;
     sdispls = NULL;
     samplelist_local = NULL;
@@ -160,8 +166,10 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
     }
     s_local = (int) (12.0 * log2(n_local*nprocs));
     
+    t_start = Get_Time( );
     MPI_Allreduce(&n_local, &n, 1, MPI_INT, MPI_SUM, comm);
     MPI_Reduce(&s_local, &s, 1, MPI_INT, MPI_SUM, MASTER_NODE, comm);
+    t_comm += Get_Timing_Info( t_start );
 
     /* count num. bin elements for each processor, uniform bin sizes */
     input_array = malloc( sizeof(real) * n_local );
@@ -196,7 +204,9 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
     }
 
     /* gather samples at the root process */
+    t_start = Get_Time( );
     MPI_Gather( &s_local, 1, MPI_INT, srecv, 1, MPI_INT, MASTER_NODE, comm );
+    t_comm += Get_Timing_Info( t_start );
 
     if( system->my_rank == MASTER_NODE )
     {
@@ -207,8 +217,10 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
         }
     }
 
+    t_start = Get_Time( );
     MPI_Gatherv( samplelist_local, s_local, MPI_DOUBLE,
             samplelist, srecv, sdispls, MPI_DOUBLE, MASTER_NODE, comm);
+    t_comm += Get_Timing_Info( t_start );
 
     /* sort samples at the root process and select pivots */
     if ( system->my_rank == MASTER_NODE )
@@ -222,7 +234,9 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
     }
 
     /* broadcast pivots */
+    t_start = Get_Time( );
     MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, comm );
+    t_comm += Get_Timing_Info( t_start );
 
     for ( i = 0; i < nprocs; ++i )
     {
@@ -258,7 +272,9 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
     }
 
     /* determine counts for elements per process */
+    t_start = Get_Time( );
     MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, comm );
+    t_comm += Get_Timing_Info( t_start );
 
     /* find the target process */
     target_proc = 0;
@@ -283,7 +299,10 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
     }
 
     /* send local buckets to target processor for quickselect */
+    t_start = Get_Time( );
     MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts, 1, MPI_INT, target_proc, comm );
+    t_comm += Get_Timing_Info( t_start );
+
     if ( system->my_rank == target_proc )
     {
         dspls[0] = 0;
@@ -293,8 +312,10 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
         }
     }
 
+    t_start = Get_Time( );
     MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE,
             bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, comm);
+    t_comm += Get_Timing_Info( t_start );
 
     /* apply quick select algorithm at the target process */
     if( system->my_rank == target_proc)
@@ -362,7 +383,9 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
     }
 
     /*broadcast the filtering value*/
+    t_start = Get_Time( );
     MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
+    t_comm += Get_Timing_Info( t_start );
 
     // int nnz = 0; uncomment to check the nnz's in the sparsity pattern
 
@@ -391,11 +414,19 @@ void setup_sparse_approx_inverse( reax_system *system, storage *workspace,
         fprintf(stdout,"total nnz in all sparsity patterns = %d\nthreshold = %.15lf\n", nnz, threshold);
         fflush(stdout);
     }*/
+ 
+    MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_comm += total_comm / nprocs;
+    }
 
+    return Get_Timing_Info( start );
 }
 
-void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatypes *mpi_data, 
-        sparse_matrix *A, sparse_matrix *A_spar_patt, sparse_matrix **A_app_inv )
+real sparse_approx_inverse(reax_system *system, simulation_data *data, storage *workspace, mpi_datatypes *mpi_data, 
+        sparse_matrix *A, sparse_matrix *A_spar_patt, sparse_matrix **A_app_inv, int nprocs )
 {
     int N, M, d_i, d_j;
     int i, k, pj, j_temp;
@@ -420,9 +451,11 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
     int *j_send, *j_recv1, *j_recv2;
     real *val_send, *val_recv1, *val_recv2;
 
-    /*real start;
+    real start, t_start, t_comm;
+    real total_comm;
 
-    start = Get_Time( ); */
+    start = Get_Time( );
+    t_comm = 0.0;
 
     comm = mpi_data->world;
 
@@ -468,8 +501,10 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
     }
 
     /* Announce the nnz's in each row that will be communicated later */
+    t_start = Get_Time( );
     scale = sizeof(int) / sizeof(void);
     Dist( system, mpi_data, row_nnz, MPI_INT, scale, int_packer );
+    t_comm += Get_Timing_Info( t_start );
 
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
@@ -501,8 +536,11 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
                 flag1 = 1;
                 j_recv1 = (int *) malloc( sizeof(int) * cnt );
                 val_recv1 = (real *) malloc( sizeof(real) * cnt );
+
+                t_start = Get_Time( );
                 MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
                 MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
+                t_comm += Get_Timing_Info( t_start );
             }
         }
 
@@ -525,8 +563,11 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
                 flag2 = 1;
                 j_recv2 = (int *) malloc( sizeof(int) * cnt );
                 val_recv2 = (real *) malloc( sizeof(real) * cnt );
+
+                t_start = Get_Time( );
                 MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
                 MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
+                t_comm += Get_Timing_Info( t_start );
             }
         }
 
@@ -568,8 +609,10 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
                     }
                 }
 
+                t_start = Get_Time( );
                 MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
                 MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
+                t_comm += Get_Timing_Info( t_start );
             }
         }
 
@@ -610,16 +653,20 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
                     }
                 }
 
+                t_start = Get_Time( );
                 MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm );
                 MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm );
+                t_comm += Get_Timing_Info( t_start );
             }
 
         }
 
         if( flag1 )
         {
+            t_start = Get_Time( );
             MPI_Wait( &req1, &stat1 );
             MPI_Wait( &req2, &stat2 );
+            t_comm += Get_Timing_Info( t_start );
 
             cnt = 0;
             for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
@@ -641,9 +688,10 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
 
         if( flag2 )
         {
+            t_start = Get_Time( );
             MPI_Wait( &req3, &stat3 );
             MPI_Wait( &req4, &stat4 );
-
+            t_comm += Get_Timing_Info( t_start );
 
             cnt = 0;
             for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
@@ -826,7 +874,14 @@ void sparse_approx_inverse(reax_system *system, storage *workspace, mpi_datatype
     free( pos_x);
     free( X );
 
-    /* return Get_Timing_Info( start ); */
+    MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_comm += total_comm / nprocs;
+    }
+
+    return Get_Timing_Info( start );
 }
 
 void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
@@ -872,7 +927,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 }
 
 
-int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
+/*int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, n, N, matvecs, scale;
@@ -912,15 +967,15 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
     for ( j = 0; j < system->n; ++j )
     {
-        /* residual */
+        // residual
         workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
         workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
-        /* apply diagonal pre-conditioner */
+        // apply diagonal pre-conditioner
         workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
         workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
     }
 
-    /* norm of b */
+    // norm of b
     my_sum[0] = my_sum[1] = 0;
     for ( j = 0; j < n; ++j )
     {
@@ -932,7 +987,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     b_norm[1] = sqrt( norm_sqr[1] );
     //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
 
-    /* dot product: r.d */
+    // dot product: r.d
     my_dot[0] = my_dot[1] = 0;
     for ( j = 0; j < n; ++j )
     {
@@ -963,7 +1018,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         }
 #endif
 
-        /* dot product: d.q */
+        // dot product: d.q
         my_dot[0] = my_dot[1] = 0;
         for ( j = 0; j < n; ++j )
         {
@@ -978,16 +1033,16 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         my_dot[0] = my_dot[1] = 0;
         for ( j = 0; j < system->n; ++j )
         {
-            /* update x */
+            // update x
             x[j][0] += alpha[0] * workspace->d2[j][0];
             x[j][1] += alpha[1] * workspace->d2[j][1];
-            /* update residual */
+            // update residual
             workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
             workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-            /* apply diagonal pre-conditioner */
+            // apply diagonal pre-conditioner
             workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
             workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-            /* dot product: r.p */
+            // dot product: r.p
             my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
             my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
         }
@@ -1012,7 +1067,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         beta[1] = sig_new[1] / sig_old[1];
         for ( j = 0; j < system->n; ++j )
         {
-            /* d = p + beta * d */
+            // d = p + beta * d
             workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
             workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
         }
@@ -1059,7 +1114,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 #endif
 
     return (i + 1) + matvecs;
-}
+}*/
 
 
 void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
@@ -1126,124 +1181,146 @@ static void Sparse_MatVec_full( const sparse_matrix * const A,
 }*/
 
 
-int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
-        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout, int step )
+int CG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout, int nprocs )
 {
-    int  i, scale;
+    int  i, j, scale;
     real tmp, alpha, beta, b_norm;
     real sig_old, sig_new;
-#if defined(HALF_LIST)
-    int j;
-#endif
+    real t_start, t_pa, t_spmv, t_vops, t_comm;
+    real total_pa, total_spmv, total_vops, total_comm;
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        t_start = matvec_time = dot_time = 0;
-        t_start = Get_Time( );
-    }
-#endif
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
 
+    t_start = Get_Time( );
     scale = sizeof(real) / sizeof(void);
     Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
+    t_comm += Get_Timing_Info( t_start );
+
+    t_start = Get_Time( );
     Sparse_MatVec( H, x, workspace->q, system->N );
+    t_spmv += Get_Timing_Info( t_start );
+
 #if defined(HALF_LIST)
+    t_start = Get_Time( );
     Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
+    t_comm += Get_Timing_Info( t_start );
 #endif
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
+    t_start = Get_Time( );
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
+    t_vops += Get_Timing_Info( t_start );
 
     /* pre-conditioning */
-#if defined(SAI_PRECONDITIONER) && !defined(HALF_LIST)
-    Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
-    Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n );
-#else
-    for ( j = 0; j < system->n; ++j )
+    if( control->cm_solver_pre_comp_type == SAI_PC )
     {
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        t_start = Get_Time( );
+        Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
+        t_comm += Get_Timing_Info( t_start );
+        
+        t_start = Get_Time( );
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n );
+        t_pa += Get_Timing_Info( t_start );
     }
-#endif
 
-    b_norm = Parallel_Norm( b, system->n, mpi_data->world );
-    sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+    else if ( control->cm_solver_pre_comp_type == DIAG_PC)
     {
-        Update_Timing_Info( &t_start, &dot_time );
+        t_start = Get_Time( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
+        t_pa += Get_Timing_Info( t_start );
     }
-#endif
 
-    for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i )
+    t_start = Get_Time( );
+    b_norm = Parallel_Norm( b, system->n, mpi_data->world );
+    sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
+    t_vops += Get_Timing_Info( t_start );
+
+    for ( i = 0; i < control->cm_solver_max_iters && sqrt(sig_new) / b_norm > tol; ++i )
     {
+        t_start = Get_Time( );
         Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
+        t_comm += Get_Timing_Info( t_start );
+
+        t_start = Get_Time( );
         Sparse_MatVec( H, workspace->d, workspace->q, system->N );
+        t_spmv += Get_Timing_Info( t_start );
+
 #if defined(HALF_LIST)
+        t_start = Get_Time( );
         Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
+        t_comm += Get_Timing_Info( t_start );
 #endif
 
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
+        t_start = Get_Time( );
         tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
         alpha = sig_new / tmp;
         Vector_Add( x, alpha, workspace->d, system->n );
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
+        t_vops += Get_Timing_Info( t_start );
 
         /* pre-conditioning */
-#if defined(SAI_PRECONDITIONER) && !defined(HALF_LIST)
-        Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
-        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n );
-#else
-        for ( j = 0; j < system->n; ++j )
+        if( control->cm_solver_pre_comp_type == SAI_PC )
         {
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+            t_start = Get_Time( );
+            Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
+            t_comm += Get_Timing_Info( t_start );
+
+            t_start = Get_Time( );
+            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n );
+            t_pa += Get_Timing_Info( t_start );
         }
-#endif
 
+        else if ( control->cm_solver_pre_comp_type == DIAG_PC)
+        {
+            t_start = Get_Time( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+            }
+            t_pa += Get_Timing_Info( t_start );
+        }
+
+        t_start = Get_Time( );
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
         beta = sig_new / sig_old;
         Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
+        t_vops += Get_Timing_Info( t_start );
     }
 
-    if ( i >= 300 )
+    MPI_Reduce(&t_pa, &total_pa, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    MPI_Reduce(&t_spmv, &total_spmv, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    MPI_Reduce(&t_vops, &total_vops, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "CG convergence failed at step %d!\n", step );
-        return i;
+        data->timing.cm_solver_pre_app += total_pa / nprocs;
+        data->timing.cm_solver_spmv += total_spmv / nprocs;
+        data->timing.cm_solver_vector_ops += total_vops / nprocs;
+        data->timing.cm_solver_comm += total_comm / nprocs;
     }
 
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
+    MPI_Barrier(mpi_data->world);
+
+    if ( i >= control->cm_solver_max_iters )
     {
-        fprintf( fout, "QEq %d iters. matvecs: %f  dot: %f\n", i, matvec_time,
-                dot_time );
+        fprintf( stderr, "CG convergence failed!\n" );
+        return i;
     }
-#endif
 
     return i;
 }
 
 
-int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
+/*int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
         real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, scale;
@@ -1327,7 +1404,7 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
 
         Vector_Add( x, alpha, workspace->d, system->n );
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        /* pre-conditioning */
+        // pre-conditioning
         for ( j = 0; j < system->n; ++j )
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
 
@@ -1365,10 +1442,10 @@ int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 
     return i;
-}
+}*/
 
 
-void Forward_Subs( sparse_matrix *L, real *b, real *y )
+/*void Forward_Subs( sparse_matrix *L, real *b, real *y )
 {
     int i, pj, j, si, ei;
     real val;
@@ -1386,10 +1463,10 @@ void Forward_Subs( sparse_matrix *L, real *b, real *y )
         }
         y[i] /= L->entries[pj].val;
     }
-}
+}*/
 
 
-void Backward_Subs( sparse_matrix *U, real *y, real *x )
+/*void Backward_Subs( sparse_matrix *U, real *y, real *x )
 {
     int i, pj, j, si, ei;
     real val;
@@ -1407,10 +1484,10 @@ void Backward_Subs( sparse_matrix *U, real *y, real *x )
         }
         x[i] /= U->entries[si].val;
     }
-}
+}*/
 
 
-int PCG( reax_system *system, storage *workspace,
+/*int PCG( reax_system *system, storage *workspace,
         sparse_matrix *H, real *b, real tol,
         sparse_matrix *L, sparse_matrix *U, real *x,
         mpi_datatypes* mpi_data, FILE *fout )
@@ -1498,11 +1575,11 @@ int PCG( reax_system *system, storage *workspace,
         fprintf( stderr, "PCG convergence failed!\n" );
 
     return i;
-}
+}*/
 
 
 #if defined(OLD_STUFF)
-int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
+/*int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
         real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j;
@@ -1563,7 +1640,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
 
         Vector_Add( x, alpha, workspace->d, system->n );
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        /* pre-conditioning */
+        // pre-conditioning
         for ( j = 0; j < system->n; ++j )
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
 
@@ -1591,10 +1668,10 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 
     return i;
-}
+}*/
 
 
-int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
+/*int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
         real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
@@ -1603,14 +1680,14 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
     N = system->N;
     bnorm = Norm( b, N );
 
-    /* apply the diagonal pre-conditioner to rhs */
+    // apply the diagonal pre-conditioner to rhs
     for ( i = 0; i < N; ++i )
         workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
 
-    /* GMRES outer-loop */
+    // GMRES outer-loop
     for ( itr = 0; itr < MAX_ITR; ++itr )
     {
-        /* calculate r0 */
+        // calculate r0
         Sparse_MatVec( H, x, workspace->b_prm, N );
         for ( i = 0; i < N; ++i )
             workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner
@@ -1623,17 +1700,17 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
 
         // fprintf( stderr, "%10.6f\n", workspace->g[0] );
 
-        /* GMRES inner-loop */
+        // GMRES inner-loop
         for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
         {
-            /* matvec */
+            // matvec
             Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1], N );
 
             for ( k = 0; k < N; ++k )
                 workspace->v[j + 1][k] *= workspace->Hdia_inv[k]; // pre-conditioner
             // fprintf( stderr, "%d-%d: matvec done.\n", itr, j );
 
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            // apply modified Gram-Schmidt to orthogonalize the new residual
             for ( i = 0; i <= j; i++ )
             {
                 workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N);
@@ -1646,7 +1723,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
                     1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
             // fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j);
 
-            /* Givens rotations on the H matrix to make it U */
+            // Givens rotations on the H matrix to make it U
             for ( i = 0; i <= j; i++ )
             {
                 if ( i == j )
@@ -1665,7 +1742,7 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
                 workspace->h[i + 1][j] = tmp2;
             }
 
-            /* apply Givens rotations to the rhs as well */
+            // apply Givens rotations to the rhs as well
             tmp1 =  workspace->hc[j] * workspace->g[j];
             tmp2 = -workspace->hs[j] * workspace->g[j];
             workspace->g[j] = tmp1;
@@ -1674,8 +1751,8 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             // fprintf( stderr, "%10.6f\n", fabs(workspace->g[j+1]) );
         }
 
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
+        // solve Hy = g.
+        //   H is now upper-triangular, do back-substitution
         for ( i = j - 1; i >= 0; i-- )
         {
             temp = workspace->g[i];
@@ -1684,23 +1761,23 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
             workspace->y[i] = temp / workspace->h[i][i];
         }
 
-        /* update x = x_0 + Vy */
+        // update x = x_0 + Vy
         for ( i = 0; i < j; i++ )
             Vector_Add( x, workspace->y[i], workspace->v[i], N );
 
-        /* stopping condition */
+        // stopping condition
         if ( fabs(workspace->g[j]) / bnorm <= tol )
             break;
     }
 
-    /*Sparse_MatVec( system, H, x, workspace->b_prm, mpi_data );
-      for( i = 0; i < N; ++i )
-      workspace->b_prm[i] *= workspace->Hdia_inv[i];
+    //Sparse_MatVec( system, H, x, workspace->b_prm, mpi_data );
+    //  for( i = 0; i < N; ++i )
+    //  workspace->b_prm[i] *= workspace->Hdia_inv[i];
 
-      fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-      for( i = 0; i < N; ++i )
-      fprintf( fout, "%10.5f%15.12f%15.12f\n",
-      workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
+    //  fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+     // for( i = 0; i < N; ++i )
+      //fprintf( fout, "%10.5f%15.12f%15.12f\n",
+      //workspace->b_prc[i], workspace->b_prm[i], x[i] );
 
     fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n",
             itr, j, fabs( workspace->g[j] ) / bnorm );
@@ -1712,10 +1789,10 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 
     return SUCCESS;
-}
+}*/
 
 
-int GMRES_HouseHolder( reax_system *system, storage *workspace,
+/*int GMRES_HouseHolder( reax_system *system, storage *workspace,
         sparse_matrix *H, real *b, real tol, real *x,
         mpi_datatypes* mpi_data, FILE *fout )
 {
@@ -1727,18 +1804,18 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
     N = system->N;
     bnorm = Norm( b, N );
 
-    /* apply the diagonal pre-conditioner to rhs */
+    // apply the diagonal pre-conditioner to rhs
     for ( i = 0; i < N; ++i )
         workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
 
-    /* GMRES outer-loop */
+    // GMRES outer-loop
     for ( itr = 0; itr < MAX_ITR; ++itr )
     {
-        /* compute z = r0 */
+        // compute z = r0
         Sparse_MatVec( H, x, workspace->b_prm, N );
 
         for ( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
+            workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner
 
         Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
 
@@ -1752,28 +1829,28 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
         w[0]    *= ( u[0][0] < 0.0 ?  1 : -1 );
         // fprintf( stderr, "\n\n%12.6f\n", w[0] );
 
-        /* GMRES inner-loop */
+        // GMRES inner-loop
         for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
         {
-            /* compute v_j */
+            // compute v_j
             Vector_Scale( z[j], -2 * u[j][j], u[j], N );
-            z[j][j] += 1.; /* due to e_j */
+            z[j][j] += 1.; // due to e_j
 
             for ( i = j - 1; i >= 0; --i )
                 Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
 
-            /* matvec */
+            // matvec
             Sparse_MatVec( H, z[j], v, N );
 
             for ( k = 0; k < N; ++k )
-                v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
+                v[k] *= workspace->Hdia_inv[k]; // pre-conditioner
 
             for ( i = 0; i <= j; ++i )
                 Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
 
             if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
             {
-                /* compute the HouseHolder unit vector u_j+1 */
+                // compute the HouseHolder unit vector u_j+1
                 for ( i = 0; i <= j; ++i )
                     u[j + 1][i] = 0;
 
@@ -1784,14 +1861,14 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
 
                 Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
 
-                /* overwrite v with P_m+1 * v */
+                // overwrite v with P_m+1 * v
                 v[j + 1] -=
                     2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
                 Vector_MakeZero( v + (j + 2), N - (j + 2) );
             }
 
 
-            /* previous Givens rotations on H matrix to make it U */
+            // previous Givens rotations on H matrix to make it U
             for ( i = 0; i < j; i++ )
             {
                 tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
@@ -1801,7 +1878,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
                 v[i + 1] = tmp2;
             }
 
-            /* apply the new Givens rotation to H and right-hand side */
+            // apply the new Givens rotation to H and right-hand side
             if ( fabs(v[j + 1]) >= ALMOST_ZERO )
             {
                 cc = sqrt( SQR( v[j] ) + SQR( v[j + 1] ) );
@@ -1814,14 +1891,14 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
                 v[j]   = tmp1;
                 v[j + 1] = tmp2;
 
-                /* Givens rotations to rhs */
+                / Givens rotations to rhs
                 tmp1 =  workspace->hc[j] * w[j];
                 tmp2 = -workspace->hs[j] * w[j];
                 w[j]   = tmp1;
                 w[j + 1] = tmp2;
             }
 
-            /* extend R */
+            // extend R
             for ( i = 0; i <= j; ++i )
                 workspace->h[i][j] = v[i];
 
@@ -1834,8 +1911,8 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
         }
 
 
-        /* solve Hy = w.
-           H is now upper-triangular, do back-substitution */
+        // solve Hy = w.
+        //   H is now upper-triangular, do back-substitution 
         for ( i = j - 1; i >= 0; i-- )
         {
             temp = w[i];
@@ -1848,7 +1925,7 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
         for ( i = j - 1; i >= 0; i-- )
             Vector_Add( x, workspace->y[i], z[i], N );
 
-        /* stopping condition */
+        // stopping condition
         if ( fabs( w[j] ) / bnorm <= tol )
             break;
     }
@@ -1872,5 +1949,5 @@ int GMRES_HouseHolder( reax_system *system, storage *workspace,
     }
 
     return SUCCESS;
-}
+}*/
 #endif
diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h
index 3b6fc1a0..980a8b90 100644
--- a/PuReMD/src/linear_solvers.h
+++ b/PuReMD/src/linear_solvers.h
@@ -24,11 +24,11 @@
 
 #include "reax_types.h"
 
-void setup_sparse_approx_inverse( reax_system*, storage*, mpi_datatypes*, 
+real setup_sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, 
         sparse_matrix *, sparse_matrix **, int, double );
 
-void sparse_approx_inverse( reax_system*, storage*, mpi_datatypes*, 
-        sparse_matrix*, sparse_matrix*, sparse_matrix** );
+real sparse_approx_inverse( reax_system*, simulation_data*, storage*, mpi_datatypes*, 
+        sparse_matrix*, sparse_matrix*, sparse_matrix**, int );
 
 int GMRES( reax_system*, storage*, sparse_matrix*,
            real*, real, real*, mpi_datatypes*, FILE* );
@@ -36,7 +36,7 @@ int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
                        real*, real, real*, mpi_datatypes*, FILE* );
 int dual_CG( reax_system*, storage*, sparse_matrix*,
              rvec2*, real, rvec2*, mpi_datatypes*, FILE* );
-int CG( reax_system*, storage*, sparse_matrix*,
+int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes*, FILE*, int );
 int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
          sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c
index ea30ceed..894d7fde 100644
--- a/PuReMD/src/parallelreax.c
+++ b/PuReMD/src/parallelreax.c
@@ -211,7 +211,7 @@ int main( int argc, char* argv[] )
 #endif
 
     /* start the simulation */
-    int total_itr = data->timing.s_matvecs + data->timing.t_matvecs;
+    int total_itr = data->timing.cm_solver_iters;
     for ( ++data->step; data->step <= control->nsteps; data->step++ )
     {
         if ( control->T_mode )
@@ -221,7 +221,7 @@ int main( int argc, char* argv[] )
         Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
         if( system->my_rank == MASTER_NODE )
         {
-            total_itr += data->timing.s_matvecs + data->timing.t_matvecs;
+            total_itr += data->timing.cm_solver_iters;
         }
         Output_Results( system, control, data, lists, out_control, mpi_data );
         //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index 9c5650f3..96917198 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -368,20 +368,41 @@ void Calculate_Charges( reax_system *system, storage *workspace,
 static void Setup_Preconditioner_QEq( reax_system *system, control_params *control, 
         simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
 {
+    real time, t_sort, t_pc, total_sort, total_pc;
+
     /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    time = Get_Time( );
     Sort_Matrix_Rows( workspace->H );
+    t_sort = Get_Timing_Info( time );
 
-    //TODO: add sai filtering value, which will be passed as the last parameter
-    setup_sparse_approx_inverse( system, workspace, mpi_data, workspace->H, &workspace->H_spar_patt, 
+    t_pc = setup_sparse_approx_inverse( system, data, workspace, mpi_data, workspace->H, &workspace->H_spar_patt, 
             control->nprocs, control->sai_thres );
+
+
+    MPI_Reduce(&t_sort, &total_sort, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+    MPI_Reduce(&t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_sort_mat_rows += total_sort / control->nprocs;
+        data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
+    }
 }
 
 static void Compute_Preconditioner_QEq( reax_system *system, control_params *control, 
         simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
 {
+    real t_pc, total_pc;
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-    sparse_approx_inverse( system, workspace, mpi_data,
-            workspace->H, workspace->H_spar_patt, &workspace->H_app_inv );
+    t_pc = sparse_approx_inverse( system, data, workspace, mpi_data,
+            workspace->H, workspace->H_spar_patt, &workspace->H_app_inv, control->nprocs );
+
+    MPI_Reduce(&t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world);
+
+    if( system->my_rank == MASTER_NODE )
+    {
+        data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
+    }
 #else
     fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
     exit( INVALID_INPUT );
@@ -392,7 +413,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
         storage *workspace, output_controls *out_control,
         mpi_datatypes *mpi_data )
 {
-    int j, s_matvecs, t_matvecs;
+    int j, iters;
 
     Init_MatVec( system, data, control, workspace, mpi_data );
 
@@ -401,44 +422,35 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     //Print_Linear_System( system, control, workspace, data->step );
 #endif
 
-    //s_matvecs = dual_CG(system, workspace, workspace->H, workspace->b,
-    //          control->q_err, workspace->x, mpi_data, out_control->log);
-    //t_matvecs = 0;
-
-#if defined(SAI_PRECONDITIONER) && !defined(HALF_LIST)
-    if( control->refactor > 0 && ((data->step - data->prev_steps) % control->refactor == 0))
+    if( control->cm_solver_pre_comp_type == SAI_PC )
     {
-        Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+        if( control->refactor > 0 && ((data->step - data->prev_steps) % control->refactor == 0))
+        {
+            Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+            Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
+        }
     }
-#endif
 
 
     for ( j = 0; j < system->n; ++j )
         workspace->s[j] = workspace->x[j][0];
-    s_matvecs = CG(system, workspace, workspace->H, workspace->b_s,//newQEq sCG
-            control->q_err, workspace->s, mpi_data, out_control->log , data->step );
+    iters = CG(system, control, data, workspace, workspace->H, workspace->b_s,
+            control->q_err, workspace->s, mpi_data, out_control->log , control->nprocs );
     for ( j = 0; j < system->n; ++j )
         workspace->x[j][0] = workspace->s[j];
 
-    //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s,
-    //   control->q_err, workspace->L, workspace->U, workspace->s,
-    //   mpi_data, out_control->log );
 #if defined(DEBUG)
     fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
 #endif
 
     for ( j = 0; j < system->n; ++j )
         workspace->t[j] = workspace->x[j][1];
-    t_matvecs = CG(system, workspace, workspace->H, workspace->b_t,//newQEq sCG
-            control->q_err, workspace->t, mpi_data, out_control->log, data->step );
+    iters += CG(system, control, data, workspace, workspace->H, workspace->b_t,//newQEq sCG
+            control->q_err, workspace->t, mpi_data, out_control->log, control->nprocs );
     for ( j = 0; j < system->n; ++j )
         workspace->x[j][1] = workspace->t[j];
 
-    //t_matvecs = PCG( system, workspace, workspace->H, workspace->b_t,
-    //   control->q_err, workspace->L, workspace->U, workspace->t,
-    //   mpi_data, out_control->log );
 #if defined(DEBUG)
     fprintf( stderr, "p%d: second CG completed\n", system->my_rank );
 #endif
@@ -452,8 +464,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
-        data->timing.s_matvecs += s_matvecs;
-        data->timing.t_matvecs += t_matvecs;
+        data->timing.cm_solver_iters += iters;
     }
 #endif
 }
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 18cc8c49..4335d66c 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -41,18 +41,18 @@
 
 #define PURE_REAX
 //#define LAMMPS_REAX
-#define SAI_PRECONDITIONER
 //#define HALF_LIST
 //#define DEBUG
 //#define DEBUG_FOCUS
 //#define TEST_ENERGY
 //#define TEST_FORCES
-#define CG_PERFORMANCE
+//#define CG_PERFORMANCE
 #define LOG_PERFORMANCE
 #define STANDARD_BOUNDARIES
 //#define OLD_BOUNDARIES
 //#define MIDPOINT_BOUNDARIES
 
+#define ZERO                    (0.000000000000000e+00)
 #define REAX_MAX_STR            1024
 #define REAX_MAX_NBRS           6
 #define REAX_MAX_3BODY_PARAM    5
@@ -69,6 +69,20 @@
 #define NAN_REAL(a) (0)
 #endif
 
+
+/* preconditioner computation type for charge method linear solver */
+enum pre_comp
+{
+    NONE_PC = 0,
+    DIAG_PC = 1,
+    ICHOLT_PC = 2,
+    ILU_PAR_PC = 3,
+    ILUT_PAR_PC = 4,
+    ILU_SUPERLU_MT_PC = 5,
+    SAI_PC = 6,
+};
+
+
 /********************** TYPE DEFINITIONS ********************/
 typedef int  ivec[3];
 typedef double real;
@@ -534,7 +548,9 @@ typedef struct
     int tabulate;
 
     int qeq_freq;
+    int cm_solver_max_iters;
     real q_err;
+    int cm_solver_pre_comp_type;
     real sai_thres;
     int refactor;
     real droptol;
@@ -603,19 +619,44 @@ typedef struct
 
 typedef struct
 {
+    /* start time of event */
     real start;
+    /* end time of event */
     real end;
+    /* total elapsed time of event */
     real elapsed;
-
+    /* total simulation time */
     real total;
+    /* communication time */
     real comm;
+    /* neighbor list generation time */
     real nbrs;
+    /* force initialization time */
     real init_forces;
+    /* bonded force calculation time */
     real bonded;
+    /* non-bonded force calculation time */
     real nonb;
-    real qEq;
-    int  s_matvecs;
-    int  t_matvecs;
+    /* atomic charge distribution calculation time */
+    real cm;
+    /**/
+    real cm_sort_mat_rows;
+    /**/
+    real cm_solver_comm;
+    /**/
+    real cm_solver_pre_comp;
+    /**/
+    real cm_solver_pre_app; // update CG()
+    /* num. of steps in iterative linear solver for charge distribution */
+    int cm_solver_iters;
+    /**/
+    real cm_solver_spmv; // update CG()
+    /**/
+    real cm_solver_vector_ops; // update CG()
+    /**/
+    real cm_solver_orthog;
+    /**/
+    real cm_solver_tri_solve;
 } reax_timing;
 
 
diff --git a/PuReMD/src/reset_tools.c b/PuReMD/src/reset_tools.c
index eb7c6465..32811672 100644
--- a/PuReMD/src/reset_tools.c
+++ b/PuReMD/src/reset_tools.c
@@ -106,15 +106,14 @@ void Reset_Simulation_Data( simulation_data* data, int virial )
 
 void Reset_Timing( reax_timing *rt )
 {
-    rt->total = Get_Time();
+    /*rt->total = Get_Time();
     rt->comm = 0;
     rt->nbrs = 0;
     rt->init_forces = 0;
     rt->bonded = 0;
     rt->nonb = 0;
     rt->qEq = 0;
-    rt->s_matvecs = 0;
-    rt->t_matvecs = 0;
+    rt->cm_solver_iters = 0;*/
 }
 
 #ifdef TEST_FORCES
-- 
GitLab