diff --git a/PG-PuReMD/src/basic_comm.c b/PG-PuReMD/src/basic_comm.c
index 5d2afdeb38ec397ce2a82b1112ea5ca1c15dabce..b81180ec1626735aa8f141594b538ecd2741d83a 100644
--- a/PG-PuReMD/src/basic_comm.c
+++ b/PG-PuReMD/src/basic_comm.c
@@ -34,6 +34,19 @@ typedef void (*dist_packer)( void*, mpi_out_data* );
 typedef void (*coll_unpacker)( void*, void*, mpi_out_data* );
 
 
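+/* copy integer entries into the MPI outgoing buffer at the
+ * positions given by the out buffer's index list */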
+static void int_packer( void *dummy, mpi_out_data *out_buf )
+{
+    int i;
+    int *buf = (int*) dummy;
+    int *out = (int*) out_buf->out_atoms;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        out[i] = buf[ out_buf->index[i] ];
+    }
+}
+
+
 static void real_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
@@ -77,6 +90,21 @@ static void rvec2_packer( void *dummy, mpi_out_data *out_buf )
 }
 
 
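+/* accumulate received integer entries into the local buffer at the
+ * positions given by the out buffer's index list */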
+static void int_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+{
+    int i;
+    int *in, *buf;
+
+    in = (int*) dummy_in;
+    buf = (int*) dummy_buf;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        buf[ out_buf->index[i] ] += in[i];
+    }
+}
+
+
 static void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
 {
     int i;
@@ -135,6 +163,10 @@ static void * Get_Buffer_Offset( const void * const buffer,
 
     switch ( type )
     {
+        case INT_PTR_TYPE:
+            ptr = (int *) buffer + offset;
+            break;
+
         case REAL_PTR_TYPE:
             ptr = (real *) buffer + offset;
             break;
diff --git a/PG-PuReMD/src/basic_comm.h b/PG-PuReMD/src/basic_comm.h
index 5b12d681b243e01faa85398d733bf765ddd9f0b0..5f13d4eb0a2df8f5c25cf65ddf254cd1ca2a60c4 100644
--- a/PG-PuReMD/src/basic_comm.h
+++ b/PG-PuReMD/src/basic_comm.h
@@ -27,9 +27,10 @@
 
 enum pointer_type
 {
-    REAL_PTR_TYPE = 0,
-    RVEC_PTR_TYPE = 1,
-    RVEC2_PTR_TYPE = 2,
+    INT_PTR_TYPE = 0,
+    REAL_PTR_TYPE = 1,
+    RVEC_PTR_TYPE = 2,
+    RVEC2_PTR_TYPE = 3,
 };
 
 
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index a98abddeda8ed9757a631e140f406d6d2dd78ff2..364e0805844046fd09020a97b47ddc6ffdcdae3e 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -179,7 +179,7 @@ static void Extrapolate_Charges_QEq( const reax_system * const system,
         else if ( control->cm_init_guess_extrap1 == 3 )
         {
             s_tmp = 4.0 * (system->my_atoms[i].s[0] + system->my_atoms[i].s[2]) -
-                    (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
+                (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
         }
         else
         {
@@ -277,7 +277,7 @@ static void Extrapolate_Charges_EE( const reax_system * const system,
         else if ( control->cm_init_guess_extrap1 == 3 )
         {
             s_tmp = 4.0 * (system->my_atoms[i].s[0] + system->my_atoms[i].s[2]) -
-                    (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
+                (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
         }
         else
         {
@@ -309,10 +309,11 @@ static void Extrapolate_Charges_EE_Part2( const reax_system * const system,
 
 
 /* Compute preconditioner for QEq
  */
 static void Compute_Preconditioner_QEq( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     sparse_matrix *Hptr;
 
@@ -333,7 +334,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         case DIAG_PC:
             data->timing.cm_solver_pre_comp +=
                 diag_pre_comp( system, workspace->Hdia_inv );
-//                diag_pre_comp( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -347,9 +348,9 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-            //TODO: implement
-//            data->timing.cm_solver_pre_comp +=
-//                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
-//                        &workspace->H_app_inv );
+            data->timing.cm_solver_pre_comp +=
+                sparse_approx_inverse( system, workspace, mpi_data,
+                        workspace->H_full, workspace->H_spar_patt_full, &workspace->H_app_inv );
 #else
             fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -365,10 +366,11 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 
 
 /* Compute preconditioner for EE
  */
 static void Compute_Preconditioner_EE( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     sparse_matrix *Hptr;
 
@@ -387,7 +389,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
     {
         Hptr->entries[Hptr->start[system->N + 1] - 1].val = 1.0;
     }
-    
+
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -396,7 +398,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         case DIAG_PC:
             data->timing.cm_solver_pre_comp +=
                 diag_pre_comp( system, workspace->Hdia_inv );
-//                diag_pre_comp( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -410,9 +412,9 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-            //TODO: implement
-//            data->timing.cm_solver_pre_comp +=
-//                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
-//                        &workspace->H_app_inv );
+            data->timing.cm_solver_pre_comp +=
+                sparse_approx_inverse( system, workspace, mpi_data,
+                        workspace->H_full, workspace->H_spar_patt_full, &workspace->H_app_inv );
 #else
             fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -435,10 +437,11 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
 
 
 /* Compute preconditioner for ACKS2
  */
 static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     sparse_matrix *Hptr;
 
@@ -458,7 +461,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         Hptr->entries[Hptr->start[system->N + 1] - 1].val = 1.0;
         Hptr->entries[Hptr->start[system->n_cm] - 1].val = 1.0;
     }
-    
+
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -467,7 +470,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         case DIAG_PC:
             data->timing.cm_solver_pre_comp +=
                 diag_pre_comp( system, workspace->Hdia_inv );
-//                diag_pre_comp( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -481,9 +484,9 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-            //TODO: implement
-//            data->timing.cm_solver_pre_comp +=
-//                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
-//                        &workspace->H_app_inv );
+            data->timing.cm_solver_pre_comp +=
+                sparse_approx_inverse( system, workspace, mpi_data,
+                        workspace->H_full, workspace->H_spar_patt_full, &workspace->H_app_inv );
 #else
             fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -508,7 +511,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
 
 static void Setup_Preconditioner_QEq( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace, const mpi_datatypes * const mpi_data )
 {
     real time;
     sparse_matrix *Hptr;
@@ -539,8 +542,8 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-//                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
-//                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
+                //                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
+                //                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -554,9 +557,10 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
 
         case SAI_PC:
-            //TODO: implement
-//            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//                    control->cm_solver_pre_comp_sai_thres );
+            setup_sparse_approx_inverse( system, workspace, mpi_data, Hptr,
+                    &workspace->H_spar_patt, control->nprocs,
+                    control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -568,10 +572,11 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
 
 
 /* Setup routines before computing the preconditioner for EE
  */
 static void Setup_Preconditioner_EE( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     real time;
     sparse_matrix *Hptr;
@@ -609,8 +614,8 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-//                workspace->Hdia_inv = scalloc( system->n_cm, sizeof( real ),
-//                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
+                //                workspace->Hdia_inv = scalloc( system->n_cm, sizeof( real ),
+                //                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -624,9 +629,10 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
 
         case SAI_PC:
-            //TODO: implement
-//            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//                    control->cm_solver_pre_comp_sai_thres );
+            setup_sparse_approx_inverse( system, workspace, mpi_data, Hptr,
+                    &workspace->H_spar_patt, control->nprocs,
+                    control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -645,10 +651,11 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
 
 
 /* Setup routines before computing the preconditioner for ACKS2
  */
 static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     real time;
     sparse_matrix *Hptr;
@@ -687,8 +694,8 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-//                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
-//                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
+                //                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
+                //                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -702,9 +709,10 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
 
         case SAI_PC:
-            //TODO: implement
-//            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//                    control->cm_solver_pre_comp_sai_thres );
+            setup_sparse_approx_inverse( system, workspace, mpi_data, Hptr,
+                    &workspace->H_spar_patt, control->nprocs,
+                    control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -724,7 +732,7 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
 
 
-/* Combine ficticious charges s and t to get atomic charge q for QEq method
- */
+/* Combine fictitious charges s and t to get atomic charge q for QEq method
+ */
 static void Calculate_Charges_QEq( const reax_system * const system,
         storage * const workspace, mpi_datatypes * const mpi_data )
 {
@@ -764,7 +772,7 @@ static void Calculate_Charges_QEq( const reax_system * const system,
 
 
 /* Get atomic charge q for EE method
  */
 static void Calculate_Charges_EE( const reax_system * const system,
         storage * const workspace, mpi_datatypes * const mpi_data )
 {
@@ -795,33 +803,33 @@ static void QEq( reax_system * const system, control_params * const control,
 
     if ( control->cm_solver_pre_comp_refactor > 0 &&
             ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
-        
     {
-        Setup_Preconditioner_QEq( system, control, data, workspace );
+        Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_QEq( system, control, data, workspace );
+        Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
     }
 
     Extrapolate_Charges_QEq( system, control, data, workspace );
 
     switch ( control->cm_solver_type )
     {
-    case CG_S:
-        iters = dual_CG( system, control, workspace, data, mpi_data,
-                &workspace->H, workspace->b, control->cm_solver_q_err,
-                workspace->x, (control->cm_solver_pre_comp_refactor > 0
-                    && (data->step - data->prev_steps)
-                    % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE );
-        break;
+        case CG_S:
+            iters = dual_CG( system, control, workspace, data, mpi_data,
+                    &workspace->H, workspace->b, control->cm_solver_q_err,
+                    workspace->x, (control->cm_solver_pre_comp_refactor > 0
+                        && (data->step - data->prev_steps)
+                        % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE );
+            break;
 
-    case GMRES_S:
-    case GMRES_H_S:
-    case SDM_S:
-    case BiCGStab_S:
-    default:
-        fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
+        case GMRES_S:
+        case GMRES_H_S:
+        case SDM_S:
+        case BiCGStab_S:
+        default:
+            fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
 #if defined(LOG_PERFORMANCE)
@@ -856,24 +864,24 @@ static void EE( reax_system * const system, control_params * const control,
     if ( control->cm_solver_pre_comp_refactor > 0 &&
             ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
     {
-        Setup_Preconditioner_EE( system, control, data, workspace );
+        Setup_Preconditioner_EE( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_EE( system, control, data, workspace );
+        Compute_Preconditioner_EE( system, control, data, workspace, mpi_data );
     }
 
     Extrapolate_Charges_EE( system, control, data, workspace );
 
     switch ( control->cm_solver_type )
     {
-    case GMRES_S:
-    case GMRES_H_S:
-    case CG_S:
-    case SDM_S:
-    case BiCGStab_S:
-    default:
-        fprintf( stderr, "[ERROR] Unrecognized EE solver selection. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
+        case GMRES_S:
+        case GMRES_H_S:
+        case CG_S:
+        case SDM_S:
+        case BiCGStab_S:
+        default:
+            fprintf( stderr, "[ERROR] Unrecognized EE solver selection. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
     data->timing.cm_solver_iters += iters;
@@ -903,24 +911,24 @@ static void ACKS2( reax_system * const system, control_params * const control,
     if ( control->cm_solver_pre_comp_refactor > 0 &&
             ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
     {
-        Setup_Preconditioner_ACKS2( system, control, data, workspace );
+        Setup_Preconditioner_ACKS2( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_ACKS2( system, control, data, workspace );
+        Compute_Preconditioner_ACKS2( system, control, data, workspace, mpi_data );
     }
 
     Extrapolate_Charges_EE( system, control, data, workspace );
 
     switch ( control->cm_solver_type )
     {
-    case GMRES_S:
-    case GMRES_H_S:
-    case CG_S:
-    case SDM_S:
-    case BiCGStab_S:
-    default:
-        fprintf( stderr, "[ERROR] Unrecognized ACKS2 solver selection. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
+        case GMRES_S:
+        case GMRES_H_S:
+        case CG_S:
+        case SDM_S:
+        case BiCGStab_S:
+        default:
+            fprintf( stderr, "[ERROR] Unrecognized ACKS2 solver selection. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
     data->timing.cm_solver_iters += iters;
@@ -938,21 +946,21 @@ void Compute_Charges( reax_system * const system, control_params * const control
 {
     switch ( control->charge_method )
     {
-    case QEQ_CM:
-        QEq( system, control, data, workspace, out_control, mpi_data );
-        break;
+        case QEQ_CM:
+            QEq( system, control, data, workspace, out_control, mpi_data );
+            break;
 
-    case EE_CM:
-        EE( system, control, data, workspace, out_control, mpi_data );
-        break;
+        case EE_CM:
+            EE( system, control, data, workspace, out_control, mpi_data );
+            break;
 
-    case ACKS2_CM:
-        ACKS2( system, control, data, workspace, out_control, mpi_data );
-        break;
+        case ACKS2_CM:
+            ACKS2( system, control, data, workspace, out_control, mpi_data );
+            break;
 
-    default:
-        fprintf( stderr, "[ERROR] Invalid charge method. Terminating...\n" );
-        exit( UNKNOWN_OPTION );
-        break;
+        default:
+            fprintf( stderr, "[ERROR] Invalid charge method. Terminating...\n" );
+            exit( UNKNOWN_OPTION );
+            break;
     }
 }
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index d837d03bf7647ff5c75c5a05e55e2322f2abeb97..33c2c6a851b5ff67f12ae2527b8b29b488425393 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -29,13 +29,19 @@
 #include "vector.h"
 
 #if defined(HAVE_CUDA) && defined(DEBUG)
-  #include "cuda/cuda_validation.h"
+#include "cuda/cuda_validation.h"
 #endif
 
 #if defined(CG_PERFORMANCE)
 real t_start, t_elapsed, matvec_time, dot_time;
 #endif
 
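+/* which side of a (split) preconditioner is being applied */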
+enum preconditioner_type
+{
+    LEFT = 0,
+    RIGHT = 1,
+};
+
 
 static void dual_Sparse_MatVec( const sparse_matrix * const A,
         const rvec2 * const x, rvec2 * const b, const int N )
@@ -61,23 +67,23 @@ static void dual_Sparse_MatVec( const sparse_matrix * const A,
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
         for ( k = si; k < A->end[i]; ++k )
 #endif
         {
             j = A->entries[k].j;
             H = A->entries[k].val;

             b[i][0] += H * x[j][0];
             b[i][1] += H * x[j][1];

 #if defined(HALF_LIST)
             // comment out for tryQEq
             //if( j < A->n ) {
             b[j][0] += H * x[i][0];
             b[j][1] += H * x[i][1];
             //}
 #endif
         }
     }
 }
 
@@ -92,20 +98,1019 @@ real diag_pre_comp( const reax_system * const system, real * const Hdia_inv )
 
     for ( i = 0; i < system->n; ++i )
     {
-//        if ( H->entries[H->start[i + 1] - 1].val != 0.0 )
-//        {
-//            Hdia_inv[i] = 1.0 / H->entries[H->start[i + 1] - 1].val;
-            Hdia_inv[i] = 1.0 / system->reax_param.sbp[ system->my_atoms[i].type ].eta;
-//        }
-//        else
-//        {
-//            Hdia_inv[i] = 1.0;
-//        }
+        Hdia_inv[i] = 1.0 / system->reax_param.sbp[ system->my_atoms[i].type ].eta;
     }
 
     return Get_Timing_Info( start );
 }
 
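+/* comparator for qsort: sorts doubles in ascending order */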
+int compare_dbls( const void* arg1, const void* arg2 )
+{
+    int ret;
+    double a1, a2;
+
+    a1 = *(double *) arg1;
+    a2 = *(double *) arg2;
+
+    if ( a1 < a2 )
+    {
+        ret = -1;
+    }
+    else if (a1 == a2)
+    {
+        ret = 0;
+    }
+    else
+    {
+        ret = 1;
+    }
+
+    return ret;
+}
+
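+/* ascending sort of an array of doubles via the C library qsort */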
+void qsort_dbls( double *array, int array_len )
+{
+    qsort( array, (size_t)array_len, sizeof(double),
+            compare_dbls );
+}
+
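+/* binary search for the bucket that value a falls into,
+ * given a sorted list of bucket separators */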
+int find_bucket( double *list, int len, double a )
+{
+    int s, e, m;
+
+    if ( a > list[len - 1] )
+    {
+        return len;
+    }
+
+    s = 0;
+    e = len - 1;
+
+    while ( s < e )
+    {
+        m = (s + e) / 2;
+
+        if ( list[m] < a )
+        {
+            s = m + 1;
+        }
+        else
+        {
+            e = m;
+        }
+    }
+
+    return s;
+}
+
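+/* Select the filtering threshold for the sparsity pattern of the SAI
+ * preconditioner without gathering the full matrix: each process samples
+ * roughly 10% of its local nonzeros, the samples are partitioned into
+ * per-process buckets using pivots chosen at the master process, and the
+ * process owning the bucket containing the k-th largest sample runs
+ * quickselect to recover the threshold, which is then broadcast. Entries
+ * of A at or above the threshold (plus the diagonal) form A_spar_patt. */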
+void setup_sparse_approx_inverse( reax_system *system, storage *workspace, mpi_datatypes *mpi_data,
+        sparse_matrix *A, sparse_matrix **A_spar_patt, const int nprocs, const double filter )
+{
+    int i, bin, total, pos;
+    int n, m, s_local, s, n_local;
+    int target_proc;
+    int k;
+    int pj, size;
+    int left, right, p, turn;
+    real threshold, pivot, tmp;
+    real *input_array;
+    real *samplelist_local, *samplelist;
+    real *pivotlist;
+    real *bucketlist_local, *bucketlist;
+    real *local_entries;
+    int *scounts_local, *scounts;
+    int *dspls_local, *dspls;
+    int *bin_elements;
+    MPI_Comm comm;
+
+    comm = mpi_data->world;
+
+    /* guard against use of unset values on ranks that do not own them */
+    threshold = 0.0;
+    samplelist = NULL;
+    bucketlist = NULL;
+
+    if ( *A_spar_patt == NULL )
+    {
+        Allocate_Matrix( A_spar_patt, A->n, A->n, A->m );
+    }
+    else if ( ((*A_spar_patt)->m) < (A->m) )
+    {
+        Deallocate_Matrix( *A_spar_patt );
+        Allocate_Matrix( A_spar_patt, A->n, A->n, A->m );
+    }
+
+    m = 0;
+    for( i = 0; i < A->n; ++i )
+    {
+        m += A->end[i] - A->start[i];
+    }
+
+    local_entries = (real *) malloc ( sizeof(real) * m );
+    m = 0;
+    for( i = 0; i < A->n; ++i )
+    {
+        for( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            local_entries[m++] = A->entries[pj].val;
+        }
+    }
+
+    /* sample approximately 10% of the local nonzero entries */
+    n_local = m / 10;
+
+    input_array = (real *) malloc( sizeof(real) * n_local );
+
+    for ( i = 0; i < n_local ; i++ )
+    {
+        input_array[i] = local_entries[rand( ) % m];
+    }
+
+    s_local = (int) (12.0 * log2(n_local*nprocs));
+
+    /* every process needs the global sample count n to compute k below,
+     * and the buffers are int, so reduce with MPI_INT to all ranks */
+    MPI_Allreduce( &n_local, &n, 1, MPI_INT, MPI_SUM, comm );
+    MPI_Allreduce( &s_local, &s, 1, MPI_INT, MPI_SUM, comm );
+
+    samplelist_local = (real *) malloc( sizeof(real) * s_local );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        samplelist = (real *) malloc( sizeof(real) * s );
+    }
+
+    for ( i = 0; i < s_local; i++)
+    {
+        samplelist_local[i] = input_array[rand( ) % n_local];
+    }
+
+
+    /* gather samples at the root process */
+    MPI_Gather( samplelist_local, s_local, MPI_DOUBLE, samplelist, s_local, MPI_DOUBLE, MASTER_NODE, comm );
+
+
+    /* sort samples at the root process and select pivots */
+    pivotlist = (real *) malloc( sizeof(real) *  (nprocs - 1) );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        qsort_dbls( samplelist, s );
+
+        for ( i = 1; i < nprocs; i++)
+        {
+            pivotlist[i - 1] = samplelist[(i * s) / nprocs];
+        }
+    }
+
+
+    /* broadcast pivots */
+    MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, comm );
+
+
+    /* count num. bin elements for each processor, uniform bin sizes;
+     * counts must start from zero */
+    scounts_local = (int *) calloc( nprocs, sizeof(int) );
+    for ( i = 0; i < n_local; i++ )
+    {
+        pos = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        scounts_local[pos]++;
+    }
+
+    scounts = (int *) malloc( sizeof(int) * nprocs );
+    bin_elements = (int *) malloc( sizeof(int) * nprocs );
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        bin_elements[i] = scounts_local[i];
+        scounts[i] = scounts_local[i];
+    }
+
+    /* compute displacements for MPI comm */
+    dspls_local = (int *) malloc( sizeof(int) * nprocs );
+
+    dspls_local[0] = 0;
+    for ( i = 0; i < nprocs - 1; i++ )
+    {
+        dspls_local[i + 1] = dspls_local[i] + scounts_local[i];
+    }
+
+    /* bin elements */
+    bucketlist_local = (real *) malloc( sizeof(real) * n_local  );
+
+    for ( i = 0; i < n_local; ++i )
+    {
+        bin = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        pos = dspls_local[bin] + scounts_local[bin] - bin_elements[bin];
+        bucketlist_local[pos] = input_array[i];
+        bin_elements[bin]--;
+    }
+
+    /* determine counts for elements per process */
+    MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, comm );
+
+    /* find the target process */
+    target_proc = 0;
+    total = 0;
+    k = n * filter;
+
+    for ( i = nprocs - 1; i >= 0; --i )
+    {
+        if ( total + scounts[i] >= k )
+        {
+            /* global k becomes local k */
+            k -= total;
+            target_proc = i;
+            break;
+        }
+        total += scounts[i];
+    }
+
+    /* send local buckets to the target process for quickselect */
+    dspls = (int *) malloc( nprocs * sizeof(int) );
+
+    MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts, 1, MPI_INT, target_proc, comm );
+
+    if ( system->my_rank == target_proc )
+    {
+        dspls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            dspls[i + 1] = dspls[i] + scounts[i];
+        }
+    }
+
+    if ( system->my_rank == target_proc )
+    {
+        /* scounts now holds the per-process counts for the target bucket,
+         * so size the receive buffer with their total */
+        total = 0;
+        for ( i = 0; i < nprocs; ++i )
+        {
+            total += scounts[i];
+        }
+        bucketlist = (real *) malloc( sizeof(real) * total );
+    }
+
+    MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE,
+            bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, comm);
+
+    /* apply the quickselect algorithm at the target process */
+    if ( system->my_rank == target_proc )
+    {
+        left = 0;
+        right = total - 1;
+        turn = 0;
+
+        while ( k )
+        {
+            p = left;
+            turn = 1 - turn;
+
+            if ( turn == 1 )
+            {
+                pivot = bucketlist[right];
+            }
+            else
+            {
+                pivot = bucketlist[left];
+            }
+
+            for ( i = left + 1 - turn; i <= right - turn; ++i )
+            {
+                if ( bucketlist[i] > pivot )
+                {
+                    tmp = bucketlist[i];
+                    bucketlist[i] = bucketlist[p];
+                    bucketlist[p] = tmp;
+                    p++;
+                }
+            }
+
+            if ( turn == 1 )
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[right];
+                bucketlist[right] = tmp;
+            }
+            else
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[left];
+                bucketlist[left] = tmp;
+            }
+
+            /* the k-th largest element has been found */
+            if ( p == k - 1 )
+            {
+                threshold = bucketlist[p];
+                break;
+            }
+            else if ( p > k - 1 )
+            {
+                right = p - 1;
+            }
+            else
+            {
+                left = p + 1;
+            }
+        }
+
+        if ( threshold < 1.000000 )
+        {
+            threshold = 1.000001;
+        }
+    }
+
+    /* broadcast the filtering threshold */
+    MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
+
+    /* build the sparsity pattern: keep entries at or above the threshold,
+     * and always keep the diagonal */
+    for ( i = 0; i < A->n; ++i )
+    {
+        (*A_spar_patt)->start[i] = A->start[i];
+        size = A->start[i];
+
+        for ( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            if ( ( A->entries[pj].val >= threshold )  || ( A->entries[pj].j == i ) )
+            {
+                (*A_spar_patt)->entries[size].val = A->entries[pj].val;
+                (*A_spar_patt)->entries[size].j = A->entries[pj].j;
+                size++;
+            }
+        }
+        (*A_spar_patt)->end[i] = size;
+    }
+    (*A_spar_patt)->start[A->n] = A->start[A->n];
+    /*TODO: check if end[N] is set equal to NNZ as start[N]*/
+    (*A_spar_patt)->end[A->n] = A->end[A->n];
+
+    /* release temporary buffers */
+    free( local_entries );
+    free( input_array );
+    free( samplelist_local );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        free( samplelist );
+    }
+    free( pivotlist );
+    free( scounts_local );
+    free( scounts );
+    free( bin_elements );
+    free( dspls_local );
+    free( bucketlist_local );
+    free( dspls );
+    if ( system->my_rank == target_proc )
+    {
+        free( bucketlist );
+    }
+}
+
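+/* Compute the sparse approximate inverse of A restricted to the sparsity
+ * pattern A_spar_patt. Rows owned by other processes are exchanged along
+ * the 3D process mesh first (row nnz counts shared via Dist/Coll), then
+ * each row of the approximate inverse is computed independently by solving
+ * a small dense least-squares problem (LAPACKE_dgels) against the
+ * corresponding unit vector of the identity. */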
+void sparse_approx_inverse( reax_system *system, storage *workspace,
+        mpi_datatypes *mpi_data, const sparse_matrix * const A,
+        const sparse_matrix * const A_spar_patt, sparse_matrix **A_app_inv )
+{
+    int i, k, pj, j_temp, identity_pos;
+    int N, M, d_i, d_j;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *pos_x, *pos_y;
+    real *e_j, *dense_matrix;
+    char *X, *Y;
+
+    int cnt;
+    reax_atom *atom;
+    int *mark, *row_needed, *row_nnz;
+    int **j_list;
+    real **val_list;
+
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2, req3, req4;
+    MPI_Status stat1, stat2, stat3, stat4;
+    neighbor_proc *nbr1, *nbr2;
+    int *j_send, *j_recv1, *j_recv2;
+    real *val_send, *val_recv1, *val_recv2;
+
+    mark = (int *) malloc( sizeof(int) * system->N );
+    row_needed = (int *) malloc( sizeof(int) * system->N );
+    row_nnz = (int *) malloc( sizeof(int) * system->N );
+
+    j_list = (int **) malloc( sizeof(int *) * system->N );
+    val_list = (real **) malloc( sizeof(real *) * system->N );
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        mark[ i ] = -1;
+        row_needed[ i ] = -1;
+        /* rows never filled below would otherwise be summed uninitialized in Coll */
+        row_nnz[ i ] = 0;
+        j_list[ i ] = NULL;
+        val_list[ i ] = NULL;
+    }
+
+    /* mark the atoms that already have their row stored in the local matrix */
+    for ( i = 0; i < system->n; ++i )
+    {   
+        atom = &system->my_atoms[i];
+        mark[ atom->orig_id ] = i;
+    }
+
+    /* find the atoms that are not marked but needed,
+     * i.e., those whose rows must be communicated */
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            atom = &system->my_atoms[ A_spar_patt->entries[pj].j ];
+
+            if ( mark[ atom->orig_id ] == -1 )
+            {
+                row_needed[ A_spar_patt->entries[pj].j ] = atom->orig_id;
+            }
+        }
+    }
+
+    /* distribute the row numbers that are needed for the dense matrices */
+    Dist( system, mpi_data, row_needed, INT_PTR_TYPE, MPI_INT );
+
+    /* fill in the nnz of the lines that will be collected by other processes */
+    for ( i = 0; i < system->N; ++i )
+    {
+        if ( row_needed[i] != -1 && mark[ row_needed[i] ] != -1 )
+        {
+            row_nnz[i] = A->end[ mark[ row_needed[i] ] ] - A->start[ mark[ row_needed[i] ] ];
+        }
+    }
+
+    /* announce the nnz's in each row so that receivers can allocate space */
+    Coll( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT );
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+    for ( d = 2; d >= 0; --d )
+    {
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+
+        if ( out_bufs[2 * d].cnt )
+        {
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d].index[i] ];
+            }
+
+            j_recv1 = (int *) malloc( sizeof(int) * cnt );
+            val_recv1 = (real *) malloc( sizeof(real) * cnt );
+
+            MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
+            MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ];
+            }
+
+            j_recv2 = (int *) malloc( sizeof(int) * cnt );
+            val_recv2 = (real *) malloc( sizeof(real) * cnt );
+
+            MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
+            MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
+        }
+
+        /* send both messages in dimension d */
+        if ( nbr1->atoms_cnt )
+        {
+            cnt = 0;
+            for( i = nbr1->atoms_str; i < nbr1->atoms_str + nbr1->atoms_cnt; ++i)
+            {
+                atom = &system->my_atoms[i];
+
+                if(mark[ atom->orig_id ] != -1)
+                {
+                    cnt += A->end[ mark[ atom->orig_id ] ] - A->start[ mark[ atom->orig_id ] ];
+                }
+                else
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            j_send = (int *) malloc( sizeof(int) * cnt );
+            val_send = (real *) malloc( sizeof(real) * cnt );
+
+            cnt = 0;
+            for( i = nbr1->atoms_str; i < nbr1->atoms_str + nbr1->atoms_cnt; ++i)
+            {
+                atom = &system->my_atoms[i];
+
+                if(mark[ atom->orig_id ] != -1)
+                {
+                    for( pj = A->start[ mark[ atom->orig_id ] ]; pj < A->end[ mark[ atom->orig_id ] ]; ++pj)
+                    {
+                        j_send[cnt] = A->entries[pj].j;
+                        val_send[cnt] = A->entries[pj].val;
+                        cnt++;
+                    }
+                }
+                else
+                {
+                    for( pj = 0; pj < row_nnz[i]; ++pj)
+                    {
+                        j_send[cnt] = j_list[i][pj];
+                        val_send[cnt] = val_list[i][pj];
+                        cnt++;
+                    }
+                }
+
+            }
+
+            MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
+            MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
+
+            free( j_send );
+            free( val_send );
+        }
+
+        if ( nbr2->atoms_cnt )
+        {
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < nbr2->atoms_str + nbr2->atoms_cnt; ++i)
+            {
+                atom = &system->my_atoms[i];
+
+                if(mark[ atom->orig_id ] != -1)
+                {
+                    cnt += A->end[ mark[ atom->orig_id ] ] - A->start[ mark[ atom->orig_id ] ];
+                }
+                else
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            j_send = (int *) malloc( sizeof(int) * cnt );
+            val_send = (real *) malloc( sizeof(real) * cnt );
+
+            cnt = 0;
+            for( i = nbr2->atoms_str; i < nbr2->atoms_str + nbr2->atoms_cnt; ++i)
+            {
+                atom = &system->my_atoms[i];
+
+                if(mark[ atom->orig_id ] != -1)
+                {
+                    for( pj = A->start[ mark[ atom->orig_id ] ]; pj < A->end[ mark[ atom->orig_id ] ]; ++pj)
+                    {
+                        j_send[cnt] = A->entries[pj].j;
+                        val_send[cnt] = A->entries[pj].val;
+                        cnt++;
+                    }
+                }
+                else
+                {
+                    for( pj = 0; pj < row_nnz[i]; ++pj)
+                    {
+                        j_send[cnt] = j_list[i][pj];
+                        val_send[cnt] = val_list[i][pj];
+                        cnt++;
+                    }
+                }
+            }
+
+            MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm );
+            MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm );
+
+            free( j_send );
+            free( val_send );
+        }
+
+        if ( out_bufs[2 * d].cnt )
+        {
+            MPI_Wait( &req1, &stat1 );
+            MPI_Wait( &req2, &stat2 );
+
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                j_list[ out_bufs[2 * d].index[i] ] = (int *) malloc( sizeof(int) * row_nnz[ out_bufs[2 * d].index[i] ] );
+                val_list[ out_bufs[2 * d].index[i] ] = (real *) malloc( sizeof(real) * row_nnz[ out_bufs[2 * d].index[i] ] );
+
+                for ( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj )
+                {
+                    j_list[ out_bufs[2 * d].index[i] ][pj] = j_recv1[cnt];
+                    val_list[ out_bufs[2 * d].index[i] ][pj] = val_recv1[cnt];
+                    cnt++;
+                }
+            }
+
+            free( j_recv1 );
+            free( val_recv1 );
+        }
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            MPI_Wait( &req3, &stat3 );
+            MPI_Wait( &req4, &stat4 );
+
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                /* allocate space for the received row */
+                j_list[ out_bufs[2 * d + 1].index[i] ] = (int *) malloc( sizeof(int) * row_nnz[ out_bufs[2 * d + 1].index[i] ] );
+                val_list[ out_bufs[2 * d + 1].index[i] ] = (real *) malloc( sizeof(real) * row_nnz[ out_bufs[2 * d + 1].index[i] ] );
+
+                for ( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                {
+                    j_list[ out_bufs[2 * d + 1].index[i] ][pj] = j_recv2[cnt];
+                    val_list[ out_bufs[2 * d + 1].index[i] ][pj] = val_recv2[cnt];
+                    cnt++;
+                }
+            }
+
+            free( j_recv2 );
+            free( val_recv2 );
+        }
+    }
+
+    /* allocate the result matrix on first use; its pattern mirrors A_spar_patt */
+    if ( *A_app_inv == NULL )
+    {
+        Allocate_Matrix( A_app_inv, A_spar_patt->n, A_spar_patt->n, A_spar_patt->m );
+    }
+
+    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n];
+
+
+    X = (char *) malloc( sizeof(char) * A->n );
+    Y = (char *) malloc( sizeof(char) * A->n );
+    pos_x = (int *) malloc( sizeof(int) * A->n );
+    pos_y = (int *) malloc( sizeof(int) * A->n );
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        X[i] = 0;
+        Y[i] = 0;
+        pos_x[i] = 0;
+        pos_y[i] = 0;
+    }
+
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        N = 0;
+        M = 0;
+
+        /* find the column indices of nonzeros (which will be the column indices of the dense matrix) */
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+
+            j_temp = A_spar_patt->entries[pj].j;
+
+            Y[j_temp] = 1;
+            pos_y[j_temp] = N;
+            ++N;
+
+            /* for each of those indices,
+             * search through the row of the full matrix A at that index */
+
+            /* the case where the local matrix has that index's row */
+            if ( mark[j_temp] != -1 )
+            {
+                for ( k = A->start[ mark[j_temp] ]; k < A->end[ mark[j_temp] ]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    X[A->entries[k].j] = 1;
+                }
+            }
+
+            /* the case where we communicated that index's row */
+            else
+            {
+                for ( k = 0; k < row_nnz[j_temp]; ++k )
+                {
+                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                    X[ j_list[j_temp][k] ] = 1;
+                }
+            }
+        }
+
+        /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
+        identity_pos = M;
+        for ( k = 0; k < A->n; k++)
+        {
+            if ( X[k] != 0 )
+            {
+                pos_x[M] = k;
+                if ( k == i )
+                {
+                    identity_pos = M;
+                }
+                ++M;
+            }
+        }
+
+        /* allocate memory for the M x N dense matrix */
+        dense_matrix = (real *) malloc( sizeof(real) * N * M );
+
+        /* fill in the entries of dense matrix */
+        for ( d_i = 0; d_i < M; ++d_i)
+        {
+            /* all rows are initialized to zero */
+            for ( d_j = 0; d_j < N; ++d_j )
+            {
+                dense_matrix[d_i * N + d_j] = 0.0;
+            }
+            /* change the value if any of the column indices is seen */
+
+            /* it is in the original list */
+            if ( mark[ pos_x[d_i] ] != -1 )
+            {
+                for ( d_j = A->start[  mark[ pos_x[d_i] ] ]; d_j < A->end[ mark[ pos_x[d_i] ] ]; ++d_j )
+                {
+                    if ( Y[A->entries[d_j].j] == 1 )
+                    {
+                        dense_matrix[d_i * N + pos_y[A->entries[d_j].j]] = A->entries[d_j].val;
+                    }
+                }
+            }
+            /* communicated */
+            else
+            {
+                for ( d_j = 0; d_j < row_nnz[ pos_x[d_i] ]; ++d_j )
+                {
+                    if ( Y[ j_list[ pos_x[d_i] ][d_j] ] == 1 )
+                    {
+                        dense_matrix[d_i * N + pos_y[ j_list[ pos_x[d_i] ][d_j] ]] = val_list[ pos_x[d_i] ][d_j];
+                    }
+                }
+            }
+
+        }
+
+        /* create the right-hand side of the linear system,
+         * i.e., the full column of the identity matrix */
+        e_j = (real *) malloc( sizeof(real) * M );
+
+        for ( k = 0; k < M; ++k )
+        {
+            e_j[k] = 0.0;
+        }
+        e_j[identity_pos] = 1.0;
+
+        /* solve the overdetermined system AX = B via the least-squares problem
+         * min ||B - AX||_2 */
+        m = M;
+        n = N;
+        nrhs = 1;
+        lda = N;
+        ldb = nrhs;
+        info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                e_j, ldb );
+
+        /* check that A has full rank */
+        if ( info > 0 )
+        {
+            fprintf( stderr, "[ERROR] The diagonal element %i of the triangular factor of A is zero;\n", (int) info );
+            fprintf( stderr, "A does not have full rank, so the least-squares solution could not be computed. Terminating...\n" );
+            exit( INVALID_INPUT );
+        }
+
+        /* Print least squares solution */
+        /* print_matrix( "Least squares solution", n, nrhs, b, ldb ); */
+
+        /* accumulate the resulting vector to build A_app_inv */
+        (*A_app_inv)->start[i] = A_spar_patt->start[i];
+        (*A_app_inv)->end[i] = A_spar_patt->end[i];
+        for ( k = A_spar_patt->start[i]; k < A_spar_patt->end[i]; ++k)
+        {
+            (*A_app_inv)->entries[k].j = A_spar_patt->entries[k].j;
+            (*A_app_inv)->entries[k].val = e_j[k - A_spar_patt->start[i]];
+        }
+
+        /* reset scratch storage for the next row */
+        free( dense_matrix );
+        free( e_j );
+        for ( k = 0; k < A->n; ++k )
+        {
+            X[k] = 0;
+            Y[k] = 0;
+            pos_x[k] = 0;
+            pos_y[k] = 0;
+        }
+    }
+
+    free( pos_y );
+    free( pos_x );
+    free( Y );
+    free( X );
+
+    /* free( NULL ) is a no-op for rows that were never communicated */
+    for ( i = 0; i < system->N; ++i )
+    {
+        free( j_list[i] );
+        free( val_list[i] );
+    }
+    free( val_list );
+    free( j_list );
+    free( row_nnz );
+    free( row_needed );
+    free( mark );
+}
+
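+/* apply the diagonal (Jacobi) preconditioner: x[i] = Hdia_inv[i] * y[i] */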
+static void diag_pre_app( const real * const Hdia_inv, const real * const y,
+        real * const x, const int N )
+{
+    unsigned int i;
+
+#ifdef _OPENMP
+#pragma omp for schedule(static)
+#endif
+    for ( i = 0; i < N; ++i )
+    {
+        x[i] = y[i] * Hdia_inv[i];
+    }
+}
+
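+/* apply one side (LEFT or RIGHT) of the preconditioner to the vector y,
+ * storing the result in x; fresh_pre indicates whether the preconditioner
+ * was recomputed this step */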
+static void apply_preconditioner( const reax_system * const system, const storage * const workspace,
+        const control_params * const control, const real * const y, real * const x,
+        const int fresh_pre, const int side )
+{
+    int i, si;
+
+    /* no preconditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        if ( x != y )
+        {
+            Vector_Copy( x, y, system->n );
+        }
+    }
+    else
+    {
+        switch ( side )
+        {
+            case LEFT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                                diag_pre_app( workspace->Hdia_inv, y, x, system->n );
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                  tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
+                                  break;*/
+                            case SAI_PC:
+                                Sparse_MatVec( workspace->H_app_inv, y, x );
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                                diag_pre_app( workspace->Hdia_inv, y, x, system->n );
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                  tri_solve_level_sched( (static_storage *) workspace,
+                                  workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
+                                  break;*/
+                            case SAI_PC:
+                                Sparse_MatVec( workspace->H_app_inv, y, x );
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                  for ( i = 0; i < workspace->H->n; ++i )
+                                  {
+                                  workspace->y_p[i] = y[i];
+                                  }
+
+                                  permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+                                  tri_solve_level_sched( (static_storage *) workspace,
+                                  workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+                                  break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                // construct D^{-1}_L
+                                if ( fresh_pre == TRUE )
+                                {
+                                for ( i = 0; i < workspace->L->n; ++i )
+                                {
+                                si = workspace->L->start[i + 1] - 1;
+                                workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
+                                }
+                                }
+
+                                jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
+                                y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+
+                }
+                break;
+
+            case RIGHT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy( x, y, system->n );
+                                }
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                  tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
+                                  break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy( x, y, system->n );
+                                }
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                  tri_solve_level_sched( (static_storage *) workspace,
+                                  workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                                  break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                                /*case ICHOLT_PC:
+                                  case ILU_PAR_PC:
+                                  case ILUT_PAR_PC:
+                                  tri_solve_level_sched( (static_storage *) workspace,
+                                  workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                                  permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
+                                  break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                            /* case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                if ( fresh_pre == TRUE )
+                                {
+                                    for ( i = 0; i < workspace->U->n; ++i )
+                                    {
+                                        si = workspace->U->start[i];
+                                        workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
+                                    }
+                                }
+
+                                jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
+                                        y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+                                break; */
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+                }
+                break;
+        }
+    }
+}
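+
+
+/* Hedged sketch, not part of the original patch: dual_CG() below repeatedly
+ * copies one column of an rvec2 vector into scratch storage, applies the
+ * preconditioner, and copies the result back. A helper such as the following
+ * (the name apply_preconditioner_rvec2 is hypothetical) would factor out that
+ * pattern. The initial contents of the output column are copied in to mirror
+ * the original code, although most application paths overwrite them. */
+static void apply_preconditioner_rvec2( const reax_system * const system,
+        const storage * const workspace, const control_params * const control,
+        const rvec2 * const r2, rvec2 * const d2, const int idx, const int n,
+        const int fresh_pre )
+{
+    int j;
+    real *r, *d;
+
+    r = (real *) malloc( sizeof(real) * n );
+    d = (real *) malloc( sizeof(real) * n );
+
+    /* extract column idx of the two-vector system */
+    for ( j = 0; j < n; ++j )
+    {
+        r[j] = r2[j][idx];
+        d[j] = d2[j][idx];
+    }
+
+    apply_preconditioner( system, workspace, control, r, d, fresh_pre, LEFT );
+
+    /* write the preconditioned column back */
+    for ( j = 0; j < n; ++j )
+    {
+        d2[j][idx] = d[j];
+    }
+
+    free( d );
+    free( r );
+}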
 
 int dual_CG( const reax_system * const system, const control_params * const control,
         const storage * const workspace, const simulation_data * const data,
@@ -113,17 +1118,29 @@ int dual_CG( const reax_system * const system, const control_params * const cont
         const sparse_matrix * const H, const rvec2 * const b,
         const real tol, rvec2 * const x, const int fresh_pre )
 {
-    int i, j, n, N, iters;
+    int i, j, k, n, N, iters;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
     rvec2 sig_old, sig_new;
     MPI_Comm comm;
 
+    real *d, *r, *p, *z;
+
     n = system->n;
     N = system->N;
     comm = mpi_data->world;
     iters = 0;
 
+    /* scratch vectors used to apply the preconditioner to each of the two
+     * linear systems; NOTE: these must be freed on every return path from
+     * this function to avoid leaking memory each time the solver runs */
+    d = (real *) malloc( sizeof(real) * n );
+    r = (real *) malloc( sizeof(real) * n );
+    p = (real *) malloc( sizeof(real) * n );
+    z = (real *) malloc( sizeof(real) * n );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
@@ -145,7 +1162,7 @@ int dual_CG( const reax_system * const system, const control_params * const cont
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
 
-//  if (data->step > 0) return;
+    //  if (data->step > 0) return;
 
 #if defined(HALF_LIST)
     // tryQEq
@@ -165,8 +1182,36 @@ int dual_CG( const reax_system * const system, const control_params * const cont
         workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
 
-        /* apply diagonal pre-conditioner */
-        workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-        workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+        /* the diagonal preconditioner application formerly here is
+         * superseded by the general apply_preconditioner() calls below */
+    }
+
+    /* apply the preconditioner to both linear systems */
+    for ( j = 0; j < n; ++j )
+    {
+        r[j] = workspace->r2[j][0];
+        d[j] = workspace->d2[j][0];
+    }
+
+    apply_preconditioner( system, workspace, control, r, d, fresh_pre, LEFT );
+
+    for ( j = 0; j < n; ++j )
+    {
+        workspace->d2[j][0] = d[j];
+    }
+
+    for ( j = 0; j < n; ++j )
+    {
+        r[j] = workspace->r2[j][1];
+        d[j] = workspace->d2[j][1];
+    }
+
+    apply_preconditioner( system, workspace, control, r, d, fresh_pre, LEFT );
+
+    for ( j = 0; j < n; ++j )
+    {
+        workspace->d2[j][1] = d[j];
     }
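+
+    /* NOTE: unlike CG() below, only the LEFT preconditioner application is
+     * performed here (no RIGHT application). Each copy-in / apply /
+     * copy-back sequence above, and the analogous ones in the iteration
+     * loop below, matches the apply_preconditioner_rvec2() sketch given
+     * earlier. */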
 
     //print_host_rvec2 (workspace->r2, n);
@@ -244,9 +1289,41 @@ int dual_CG( const reax_system * const system, const control_params * const cont
             workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
 
-            /* apply diagonal pre-conditioner */
-            workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-            workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+            /* the diagonal preconditioner application formerly here is
+             * superseded by the general apply_preconditioner() calls below */
+        }
 
+        /* apply the preconditioner to both linear systems */
+        for ( j = 0; j < n; ++j )
+        {
+            r[j] = workspace->r2[j][0];
+            z[j] = workspace->p2[j][0];
+        }
+
+        apply_preconditioner( system, workspace, control, r, z, fresh_pre, LEFT );
+
+        for ( j = 0; j < n; ++j )
+        {
+            workspace->p2[j][0] = z[j];
+        }
+
+        for ( j = 0; j < n; ++j )
+        {
+            r[j] = workspace->r2[j][1];
+            z[j] = workspace->p2[j][1];
+        }
+
+        apply_preconditioner( system, workspace, control, r, z, fresh_pre, LEFT );
+
+        for ( j = 0; j < n; ++j )
+        {
+            workspace->p2[j][1] = z[j];
+        }
+
+        for ( j = 0; j < n; ++j )
+        {
             /* dot product: r.p */
             my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
             my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
@@ -338,18 +1415,18 @@ const void Sparse_MatVec( const sparse_matrix * const A, const real * const x,
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
         for ( k = si; k < A->end[i]; ++k )
 #endif
         {
             j = A->entries[k].j;
             val = A->entries[k].val;
 
             b[i] += val * x[j];
 #if defined(HALF_LIST)
             //if( j < A->n ) // comment out for tryQEq
             b[j] += val * x[i];
 #endif
         }
     }
 }
 
@@ -360,9 +1437,18 @@ int CG( const reax_system * const system, const control_params * const control,
         const sparse_matrix * const H, const real * const b,
         const real tol, real * const x, const int fresh_pre )
 {
     int i, j;
     real tmp, alpha, beta, b_norm;
     real sig_old, sig_new, sig0;
 
     Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE );
     Sparse_MatVec( H, x, workspace->q, system->N );
@@ -382,10 +1468,12 @@ int CG( const reax_system * const system, const control_params * const control,
     Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->q, system->n );
 
     // preconditioner
-    for ( j = 0; j < system->n; ++j )
-    {
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-    }
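+    /* NOTE: for a factored preconditioner M = M_L * M_R (e.g., an incomplete
+     * Cholesky factorization M = L * L^T), the LEFT call solves with M_L and
+     * the RIGHT call solves with M_R; judging by the DIAG_PC / SAI_PC cases
+     * in apply_preconditioner() above, which just copy the input vector,
+     * one-sided preconditioners make the RIGHT application a no-op copy */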
+    apply_preconditioner( system, workspace, control, workspace->r, workspace->d, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, workspace->d, workspace->p, fresh_pre, RIGHT );
 
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world );
@@ -421,10 +1509,12 @@ int CG( const reax_system * const system, const control_params * const control,
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
 
         /* preconditioner */
-        for ( j = 0; j < system->n; ++j )
-        {
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        }
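+        /* fresh_pre is FALSE here: any setup tied to a freshly computed
+         * preconditioner was already handled by the first application
+         * before the loop */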
+        apply_preconditioner( system, workspace, control, workspace->r, workspace->d, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, workspace->d, workspace->p, FALSE, RIGHT );
 
         sig_old = sig_new;
         sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world );