From a83c57528012a3199d2e8916f87899ceb1f2cfe5 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Thu, 12 Aug 2021 01:20:19 -0400
Subject: [PATCH] PG-PuReMD: port SAI preconditioner from MPI codebase. Rework
 charge solver preconditioner code. Other code clean-up.

---
 PG-PuReMD/src/cuda/cuda_allocate.cu     | 124 ++++
 PG-PuReMD/src/cuda/cuda_charges.cu      | 117 +++-
 PG-PuReMD/src/cuda/cuda_copy.cu         |  34 ++
 PG-PuReMD/src/cuda/cuda_copy.h          |   6 +
 PG-PuReMD/src/cuda/cuda_init_md.cu      |  13 +
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu | 723 +++++++++++++++++++++---
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.h  |  20 +-
 PG-PuReMD/src/lin_alg.c                 |  12 +-
 PG-PuReMD/src/reax_types.h              |   5 +-
 9 files changed, 940 insertions(+), 114 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 236bc228..619cee2d 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -538,14 +538,30 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
 
     case SDM_S:
         sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
 #if defined(DUAL_SOLVER)
         sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
 #endif
         break;
 
@@ -580,70 +596,178 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
 
     case BiCGStab_S:
         sCudaMalloc( (void **) &workspace->y, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->y, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->g, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->g, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->z, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->z, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r_hat, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r_hat, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q_hat, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q_hat, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
 #if defined(DUAL_SOLVER)
         sCudaMalloc( (void **) &workspace->y2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->y2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->g2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->g2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->z2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->z2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r_hat2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r_hat2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q_hat2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q_hat2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
 #endif
         break;
 
     case PIPECG_S:
         sCudaMalloc( (void **) &workspace->z, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->z, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->m, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->m, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->n, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->n, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->u, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->u, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->w, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->w, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
 #if defined(DUAL_SOLVER)
         sCudaMalloc( (void **) &workspace->z2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->z2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->m2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->m2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->n2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->n2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->u2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->u2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->w2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->w2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
 #endif
         break;
 
     case PIPECR_S:
         sCudaMalloc( (void **) &workspace->z, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->z, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->m, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->m, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->n, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->n, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->u, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->u, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->w, total_real, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->w, FALSE, total_real, 
+                control->streams[0], __FILE__, __LINE__ );
 #if defined(DUAL_SOLVER)
         sCudaMalloc( (void **) &workspace->z2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->z2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->m2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->m2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->n2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->n2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->u2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->u2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
         sCudaMalloc( (void **) &workspace->w2, total_rvec2, __FILE__, __LINE__ );
+        sCudaMemsetAsync( workspace->w2, FALSE, total_rvec2, 
+                control->streams[0], __FILE__, __LINE__ );
 #endif
         break;
 
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 06bb3d36..81c00c7f 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -21,12 +21,16 @@
 
 #include "cuda_charges.h"
 
+#include "cuda_allocate.h"
+#include "cuda_copy.h"
 #include "cuda_reduction.h"
 #include "cuda_spar_lin_alg.h"
 #include "cuda_utils.h"
 
+#include "../allocate.h"
 #include "../charges.h"
 #include "../comm_tools.h"
+#include "../lin_alg.h"
 #include "../tool_box.h"
 
 #if !defined(MPIX_CUDA_AWARE_SUPPORT) || !MPIX_CUDA_AWARE_SUPPORT
@@ -305,9 +309,27 @@ static void Setup_Preconditioner_QEq( reax_system const * const system,
             break;
 
         case SAI_PC:
-//            setup_sparse_approx_inverse( system, data, workspace, mpi_data,
-//                    &workspace->H, &workspace->H_spar_patt, 
-//                    control->nprocs, control->cm_solver_pre_comp_sai_thres );
+            if ( workspace->H.allocated == FALSE )
+            {
+                Allocate_Matrix( &workspace->H,
+                        workspace->d_workspace->H.n, workspace->d_workspace->H.n_max,
+                        workspace->d_workspace->H.m, workspace->d_workspace->H.format );
+            }
+            else if ( workspace->H.m < workspace->d_workspace->H.m
+                   || workspace->H.n_max < workspace->d_workspace->H.n_max )
+            {
+                Deallocate_Matrix( &workspace->H );
+                Allocate_Matrix( &workspace->H,
+                        workspace->d_workspace->H.n, workspace->d_workspace->H.n_max,
+                        workspace->d_workspace->H.m, workspace->d_workspace->H.format );
+            }
+
+            Cuda_Copy_Matrix_Device_to_Host( &workspace->H, &workspace->d_workspace->H,
+                   control->streams[4] );
+
+            setup_sparse_approx_inverse( system, data, workspace, mpi_data,
+                    &workspace->H, &workspace->H_spar_patt, 
+                    control->nprocs, control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -342,9 +364,10 @@ static void Setup_Preconditioner_ACKS2( reax_system const * const system,
 static void Compute_Preconditioner_QEq( reax_system const * const system,
         control_params const * const control,
         simulation_data * const data, storage * const workspace,
-        mpi_datatypes const * const mpi_data )
+        mpi_datatypes * const mpi_data )
 {
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
+    int ret;
     real t_pc, total_pc;
 #endif
 
@@ -355,16 +378,37 @@ static void Compute_Preconditioner_QEq( reax_system const * const system,
     else if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-//        t_pc = sparse_approx_inverse( system, data, workspace, mpi_data,
-//                &workspace->H, workspace->H_spar_patt, &workspace->H_app_inv, control->nprocs );
-//
-//        ret = MPI_Reduce( &t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD );
-//        Check_MPI_Error( ret, __FILE__, __LINE__ );
-//
-//        if( system->my_rank == MASTER_NODE )
-//        {
-//            data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
-//        }
+        t_pc = sparse_approx_inverse( system, data, workspace, mpi_data,
+                &workspace->H, &workspace->H_spar_patt,
+                &workspace->H_app_inv, control->nprocs );
+
+        ret = MPI_Reduce( &t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+        if ( workspace->d_workspace->H_app_inv.allocated == FALSE )
+        {
+            Cuda_Allocate_Matrix( &workspace->d_workspace->H_app_inv,
+                    workspace->H_app_inv.n, workspace->H_app_inv.n_max,
+                    workspace->H_app_inv.m, workspace->H_app_inv.format,
+                    control->streams[4] );
+        }
+        else if ( workspace->d_workspace->H_app_inv.m < workspace->H_app_inv.m
+               || workspace->d_workspace->H_app_inv.n_max < workspace->H_app_inv.n_max )
+        {
+            Cuda_Deallocate_Matrix( &workspace->d_workspace->H_app_inv );
+            Cuda_Allocate_Matrix( &workspace->d_workspace->H_app_inv,
+                    workspace->H_app_inv.n, workspace->H_app_inv.n_max,
+                    workspace->H_app_inv.m, workspace->H_app_inv.format,
+                    control->streams[4] );
+        }
+
+        Cuda_Copy_Matrix_Host_to_Device( &workspace->H_app_inv,
+                &workspace->d_workspace->H_app_inv, control->streams[4] );
+
+        if( system->my_rank == MASTER_NODE )
+        {
+            data->timing.cm_solver_pre_comp += total_pc / control->nprocs;
+        }
 #else
         fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
         exit( INVALID_INPUT );
@@ -638,11 +682,12 @@ void QEq( reax_system const * const system, control_params const * const control
         output_controls const * const out_control,
         mpi_datatypes * const mpi_data )
 {
-    int iters;
+    int iters, refactor;
 
     iters = 0;
+    refactor = is_refactoring_step( control, data );
 
-//    if ( is_refactoring_step( control, data ) == TRUE )
+    if ( refactor == TRUE )
     {
         Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
 
@@ -690,12 +735,14 @@ void QEq( reax_system const * const system, control_params const * const control
 #if defined(DUAL_SOLVER)
         iters = Cuda_dual_CG( system, control, data, workspace,
                 &workspace->d_workspace->H, workspace->d_workspace->b,
-                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor );
 #else
         iters = Cuda_CG( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s,
+                mpi_data, refactor );
         iters += Cuda_CG( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t,
+                mpi_data, FALSE );
 #endif
         break;
 
@@ -703,12 +750,14 @@ void QEq( reax_system const * const system, control_params const * const control
 #if defined(DUAL_SOLVER)
         iters = Cuda_dual_SDM( system, control, data, workspace,
                 &workspace->d_workspace->H, workspace->d_workspace->b,
-                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor );
 #else
         iters = Cuda_SDM( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s,
+                mpi_data, refactor );
         iters += Cuda_SDM( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t,
+                mpi_data, FALSE );
 #endif
         break;
 
@@ -716,12 +765,14 @@ void QEq( reax_system const * const system, control_params const * const control
 #if defined(DUAL_SOLVER)
         iters = Cuda_dual_BiCGStab( system, control, data, workspace,
                 &workspace->d_workspace->H, workspace->d_workspace->b,
-                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor );
 #else
         iters = Cuda_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s,
+                mpi_data, refactor );
         iters += Cuda_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t,
+                mpi_data, FALSE );
 #endif
         break;
 
@@ -729,12 +780,14 @@ void QEq( reax_system const * const system, control_params const * const control
 #if defined(DUAL_SOLVER)
         iters = Cuda_dual_PIPECG( system, control, data, workspace,
                 &workspace->d_workspace->H, workspace->d_workspace->b,
-                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor );
 #else
         iters = Cuda_PIPECG( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s,
+                mpi_data, refactor );
         iters += Cuda_PIPECG( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t,
+                mpi_data, FALSE );
 #endif
         break;
 
@@ -742,12 +795,14 @@ void QEq( reax_system const * const system, control_params const * const control
 #if defined(DUAL_SOLVER)
         iters = Cuda_dual_PIPECR( system, control, data, workspace,
                 &workspace->d_workspace->H, workspace->d_workspace->b,
-                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor );
 #else
         iters = Cuda_PIPECR( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s,
+                mpi_data, refactor );
         iters += Cuda_PIPECR( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t,
+                mpi_data, FALSE );
 #endif
         break;
 
diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
index 1163b0e9..73d36a17 100644
--- a/PG-PuReMD/src/cuda/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -63,6 +63,23 @@ extern "C" void Cuda_Copy_Atoms_Host_to_Device( reax_system *system, control_par
 }
 
 
+/* Copy sparse matrix from host to device */
+extern "C" void Cuda_Copy_Matrix_Host_to_Device( sparse_matrix const * const A,
+        sparse_matrix * const d_A, cudaStream_t s )
+{
+    sCudaMemcpyAsync( d_A->start, A->start, sizeof(int) * A->n,
+            cudaMemcpyHostToDevice, s, __FILE__, __LINE__ );
+    sCudaMemcpyAsync( d_A->end, A->end, sizeof(int) * A->n,
+            cudaMemcpyHostToDevice, s, __FILE__, __LINE__ );
+    sCudaMemcpyAsync( d_A->j, A->j, sizeof(int) * A->m,
+            cudaMemcpyHostToDevice, s, __FILE__, __LINE__ );
+    sCudaMemcpyAsync( d_A->val, A->val, sizeof(real) * A->m,
+            cudaMemcpyHostToDevice, s, __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( s );
+}
+
+
 /* Copy atomic system info from host to device */
 extern "C" void Cuda_Copy_System_Host_to_Device( reax_system *system,
         control_params *control )
@@ -115,6 +132,23 @@ extern "C" void Cuda_Copy_Atoms_Device_to_Host( reax_system *system, control_par
 }
 
 
+/* Copy sparse matrix from device to host */
+extern "C" void Cuda_Copy_Matrix_Device_to_Host( sparse_matrix * const A,
+        sparse_matrix const * const d_A, cudaStream_t s )
+{
+    sCudaMemcpyAsync( A->start, d_A->start, sizeof(int) * d_A->n,
+            cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+    sCudaMemcpyAsync( A->end, d_A->end, sizeof(int) * d_A->n,
+            cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+    sCudaMemcpyAsync( A->j, d_A->j, sizeof(int) * d_A->m,
+            cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+    sCudaMemcpyAsync( A->val, d_A->val, sizeof(real) * d_A->m,
+            cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( s );
+}
+
+
 /* Copy simulation data from device to host */
 extern "C" void Cuda_Copy_Simulation_Data_Device_to_Host( control_params const * const control,
         simulation_data * const data, simulation_data * const d_data )
diff --git a/PG-PuReMD/src/cuda/cuda_copy.h b/PG-PuReMD/src/cuda/cuda_copy.h
index a5c4eb30..1ab0a0a5 100644
--- a/PG-PuReMD/src/cuda/cuda_copy.h
+++ b/PG-PuReMD/src/cuda/cuda_copy.h
@@ -10,6 +10,9 @@ extern "C"  {
 
 void Cuda_Copy_Atoms_Host_to_Device( reax_system *, control_params * );
 
+void Cuda_Copy_Matrix_Host_to_Device( sparse_matrix const * const,
+        sparse_matrix * const, cudaStream_t );
+
 void Cuda_Copy_Grid_Host_to_Device( control_params *, grid *, grid * );
 
 void Cuda_Copy_System_Host_to_Device( reax_system *, control_params * );
@@ -18,6 +21,9 @@ void Cuda_Copy_List_Device_to_Host( control_params *, reax_list *, reax_list *,
 
 void Cuda_Copy_Atoms_Device_to_Host( reax_system *, control_params * );
 
+void Cuda_Copy_Matrix_Device_to_Host( sparse_matrix * const,
+        sparse_matrix const * const, cudaStream_t );
+
 void Cuda_Copy_Simulation_Data_Device_to_Host( control_params const * const,
         simulation_data * const, simulation_data * const );
 
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index bfe5ab44..92ca9043 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -180,6 +180,19 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control,
     workspace->d_workspace->realloc.thbody = FALSE;
     workspace->d_workspace->realloc.gcell_atoms = 0;
 
+    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        workspace->H.allocated = FALSE;
+        workspace->H_full.allocated = FALSE;
+        workspace->H_spar_patt.allocated = FALSE;
+        workspace->H_spar_patt_full.allocated = FALSE;
+        workspace->H_app_inv.allocated = FALSE;
+        workspace->d_workspace->H_full.allocated = FALSE;
+        workspace->d_workspace->H_spar_patt.allocated = FALSE;
+        workspace->d_workspace->H_spar_patt_full.allocated = FALSE;
+        workspace->d_workspace->H_app_inv.allocated = FALSE;
+    }
+
     Cuda_Reset_Workspace( system, control, workspace );
 
     Init_Taper( control, workspace->d_workspace, mpi_data );
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 822d8bfb..1ad6fdc9 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -40,6 +40,13 @@
 #define FULL_MASK (0xFFFFFFFF)
 
 
+enum preconditioner_type
+{
+    LEFT = 0,
+    RIGHT = 1,
+};
+
+
 /* Jacobi preconditioner computation */
 CUDA_GLOBAL void k_jacobi_cm_half( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
@@ -727,7 +734,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
 static void Dual_Sparse_MatVec( const reax_system * const system,
         control_params const * const control, simulation_data * const data,
         storage * const workspace, mpi_datatypes * const mpi_data,
-        sparse_matrix const * const A, rvec2 * const x,
+        sparse_matrix const * const A, rvec2 const * const x,
         int n, rvec2 * const b )
 {
 #if defined(LOG_PERFORMANCE)
@@ -939,11 +946,549 @@ static void Sparse_MatVec( reax_system const * const system,
 }
 
 
+/* Apply left-sided preconditioning while solving M^{-1}AX = M^{-1}B
+ *
+ * system:
+ * workspace: data struct containing matrices and vectors, stored in CSR
+ * control: data struct containing parameters
+ * data: struct containing timing simulation data (including performance data)
+ * y: vector to which to apply preconditioning,
+ *  specific to internals of iterative solver being used
+ * x (output): preconditioned vector
+ * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
+ * side: used in determining how to apply preconditioner if the preconditioner is
+ *  factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU)
+ *
+ * Assumptions:
+ *   Matrices have non-zero diagonals
+ *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
+static void dual_apply_preconditioner( reax_system const * const system,
+        storage * const workspace, control_params const * const control,
+        simulation_data * const data, mpi_datatypes * const  mpi_data,
+        rvec2 const * const y, rvec2 * const x, int fresh_pre, int side )
+{
+//    int i, si;
+
+    /* no preconditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        if ( x != y )
+        {
+            Vector_Copy_rvec2( x, y, system->n, control->streams[4] );
+        }
+    }
+    else
+    {
+        switch ( side )
+        {
+            case LEFT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                                dual_jacobi_apply( workspace->d_workspace->Hdia_inv,
+                                        y, x, system->n, control->streams[4] );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
+//                                  break;
+                            case SAI_PC:
+#if defined(NEUTRAL_TERRITORY)
+                                Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, H->NT, x );
+#else
+                                Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, system->n, x );
+#endif
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                                dual_jacobi_apply( workspace->d_workspace->Hdia_inv,
+                                        y, x, system->n, control->streams[4] );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                          workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
+//                                  break;
+                            case SAI_PC:
+#if defined(NEUTRAL_TERRITORY)
+                                Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, H->NT, x );
+#else
+                                Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, system->n, x );
+#endif
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  for ( i = 0; i < workspace->H->n; ++i )
+//                                  {
+//                                      workspace->y_p[i] = y[i];
+//                                  }
+//
+//                                  permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                  workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                // construct D^{-1}_L
+//                                if ( fresh_pre == TRUE )
+//                                {
+//                                    for ( i = 0; i < workspace->L->n; ++i )
+//                                    {
+//                                        si = workspace->L->start[i + 1] - 1;
+//                                        workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
+//                                    }
+//                                }
+//
+//                                jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
+//                                        y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+//                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+
+                }
+                break;
+
+            case RIGHT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy_rvec2( x, y, system->n, control->streams[4] );
+                                }
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy_rvec2( x, y, system->n, control->streams[4] );
+                                }
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                          workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                  workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+//                                  permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  if ( fresh_pre == TRUE )
+//                                  {
+//                                      for ( i = 0; i < workspace->U->n; ++i )
+//                                      {
+//                                          si = workspace->U->start[i];
+//                                          workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
+//                                      }
+//                                  }
+//
+//                                  jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
+//                                          y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+
+                }
+                break;
+        }
+    }
+}
+
+
+/* Apply left-sided preconditioning while solving M^{-1}Ax = M^{-1}b
+ *
+ * system:
+ * workspace: data struct containing matrices and vectors, stored in CSR
+ * control: data struct containing parameters
+ * data: struct containing timing simulation data (including performance data)
+ * y: vector to which to apply preconditioning,
+ *  specific to internals of iterative solver being used
+ * x (output): preconditioned vector
+ * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner
+ * side: used in determining how to apply preconditioner if the preconditioner is
+ *  factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU)
+ *
+ * Assumptions:
+ *   Matrices have non-zero diagonals
+ *   Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */
+static void apply_preconditioner( reax_system const * const system,
+        storage * const workspace, control_params const * const control,
+        simulation_data * const data, mpi_datatypes * const  mpi_data,
+        real const * const y, real * const x, int fresh_pre, int side )
+{
+//    int i, si;
+
+    /* no preconditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        if ( x != y )
+        {
+            Vector_Copy( x, y, system->n, control->streams[4] );
+        }
+    }
+    else
+    {
+        switch ( side )
+        {
+            case LEFT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                                jacobi_apply( workspace->d_workspace->Hdia_inv,
+                                        y, x, system->n, control->streams[4] );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
+//                                  break;
+                            case SAI_PC:
+#if defined(NEUTRAL_TERRITORY)
+                                Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, H->NT, x );
+#else
+                                Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, system->n, x );
+#endif
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                                jacobi_apply( workspace->d_workspace->Hdia_inv,
+                                        y, x, system->n, control->streams[4] );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                          workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
+//                                  break;
+                            case SAI_PC:
+#if defined(NEUTRAL_TERRITORY)
+                                Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, H->NT, x );
+#else
+                                Sparse_MatVec( system, control, data, workspace, mpi_data,
+                                        &workspace->d_workspace->H_app_inv,
+                                        y, system->n, x );
+#endif
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  for ( i = 0; i < workspace->H->n; ++i )
+//                                  {
+//                                      workspace->y_p[i] = y[i];
+//                                  }
+//
+//                                  permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                  workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                // construct D^{-1}_L
+//                                if ( fresh_pre == TRUE )
+//                                {
+//                                    for ( i = 0; i < workspace->L->n; ++i )
+//                                    {
+//                                        si = workspace->L->start[i + 1] - 1;
+//                                        workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
+//                                    }
+//                                }
+//
+//                                jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
+//                                        y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+//                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+
+                }
+                break;
+
+            case RIGHT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy( x, y, system->n, control->streams[4] );
+                                }
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy( x, y, system->n, control->streams[4] );
+                                }
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                          workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  tri_solve_level_sched( (static_storage *) workspace,
+//                                  workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+//                                  permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case JACOBI_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+//                            case ICHOLT_PC:
+//                            case ILUT_PC:
+//                            case ILUTP_PC:
+//                                  if ( fresh_pre == TRUE )
+//                                  {
+//                                      for ( i = 0; i < workspace->U->n; ++i )
+//                                      {
+//                                          si = workspace->U->start[i];
+//                                          workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
+//                                      }
+//                                  }
+//
+//                                  jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
+//                                          y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+//                                  break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+
+                }
+                break;
+        }
+    }
+}
+
+
 int Cuda_dual_SDM( reax_system const * const system,
         control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data )
+        rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i, matvecs;
     int ret;
@@ -968,8 +1513,10 @@ int Cuda_dual_SDM( reax_system const * const system,
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, control->streams[4] );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r2, workspace->d_workspace->q2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->q2, workspace->d_workspace->d2, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1040,8 +1587,10 @@ int Cuda_dual_SDM( reax_system const * const system,
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-                workspace->d_workspace->d2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->r2, workspace->d_workspace->q2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->q2, workspace->d_workspace->d2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1056,7 +1605,7 @@ int Cuda_dual_SDM( reax_system const * const system,
 
         matvecs = Cuda_SDM( system, control, data, workspace, H,
                 workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->t, 1, system->n, control->streams[4] );
@@ -1069,7 +1618,7 @@ int Cuda_dual_SDM( reax_system const * const system,
 
         matvecs = Cuda_SDM( system, control, data, workspace, H,
                 workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->s, 0, system->n, control->streams[4] );
@@ -1095,7 +1644,7 @@ int Cuda_dual_SDM( reax_system const * const system,
 int Cuda_SDM( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -1119,8 +1668,10 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r, workspace->d_workspace->q, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->q, workspace->d_workspace->d, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1181,8 +1732,10 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-                workspace->d_workspace->d, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->r, workspace->d_workspace->q, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->q, workspace->d_workspace->d, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1206,7 +1759,7 @@ int Cuda_dual_CG( reax_system const * const system,
         control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data )
+        rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i, matvecs;
     int ret;
@@ -1230,8 +1783,10 @@ int Cuda_dual_CG( reax_system const * const system,
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, control->streams[4] );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r2, workspace->d_workspace->q2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->q2, workspace->d_workspace->d2, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1301,8 +1856,10 @@ int Cuda_dual_CG( reax_system const * const system,
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-                workspace->d_workspace->p2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->r2, workspace->d_workspace->q2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->q2, workspace->d_workspace->p2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1350,7 +1907,7 @@ int Cuda_dual_CG( reax_system const * const system,
 
         matvecs = Cuda_CG( system, control, data, workspace, H,
                 workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->t, 1, system->n, control->streams[4] );
@@ -1363,7 +1920,7 @@ int Cuda_dual_CG( reax_system const * const system,
 
         matvecs = Cuda_CG( system, control, data, workspace, H,
                 workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->s, 0, system->n, control->streams[4] );
@@ -1389,7 +1946,7 @@ int Cuda_dual_CG( reax_system const * const system,
 int Cuda_CG( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -1414,8 +1971,10 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r, workspace->d_workspace->q, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->q, workspace->d_workspace->d, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1467,8 +2026,10 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-                workspace->d_workspace->p, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->r, workspace->d_workspace->q, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->q, workspace->d_workspace->p, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1534,7 +2095,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data )
+        rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i, matvecs;
     int ret;
@@ -1642,8 +2203,12 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p2,
-                workspace->d_workspace->d2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->p2, workspace->d_workspace->y2,
+                i == 0 ? fresh_pre : FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->y2, workspace->d_workspace->d2,
+                i == 0 ? fresh_pre : FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1706,8 +2271,10 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q2,
-                workspace->d_workspace->q_hat2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->q2, workspace->d_workspace->y2, fresh_pre, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->y2, workspace->d_workspace->q_hat2, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1787,7 +2354,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 
         matvecs = Cuda_BiCGStab( system, control, data, workspace, H,
                 workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->t, 1, system->n, control->streams[4] );
@@ -1800,7 +2367,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 
         matvecs = Cuda_BiCGStab( system, control, data, workspace, H,
                 workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->s, 0, system->n, control->streams[4] );
@@ -1841,7 +2408,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 int Cuda_BiCGStab( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -1933,8 +2500,12 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p,
-                workspace->d_workspace->d, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->p, workspace->d_workspace->y,
+                i == 0 ? fresh_pre : FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->y, workspace->d_workspace->d,
+                i == 0 ? fresh_pre : FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1994,8 +2565,10 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q,
-                workspace->d_workspace->q_hat, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->q, workspace->d_workspace->y, fresh_pre, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->y, workspace->d_workspace->q_hat, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2086,7 +2659,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 int Cuda_dual_PIPECG( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data )
+        rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i, matvecs;
     int ret;
@@ -2111,8 +2684,10 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, control->streams[4] );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r2, workspace->d_workspace->m2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->m2, workspace->d_workspace->u2, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2141,8 +2716,10 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
             MPI_COMM_WORLD, &req );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-    dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-            workspace->d_workspace->m2, system->n, control->streams[4] );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->w2, workspace->d_workspace->n2, FALSE, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n2, workspace->d_workspace->m2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2223,8 +2800,10 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
                 MPI_COMM_WORLD, &req );
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->w2, workspace->d_workspace->n2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->n2, workspace->d_workspace->m2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2262,7 +2841,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECG( system, control, data, workspace, H,
                 workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->t, 1, system->n, control->streams[4] );
@@ -2275,7 +2854,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECG( system, control, data, workspace, H,
                 workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->s, 0, system->n, control->streams[4] );
@@ -2309,7 +2888,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 int Cuda_PIPECG( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -2334,8 +2913,10 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r, workspace->d_workspace->m, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->m, workspace->d_workspace->u, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2364,8 +2945,10 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
             MPI_COMM_WORLD, &req );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-            workspace->d_workspace->m, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->w, workspace->d_workspace->n, FALSE, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n, workspace->d_workspace->m, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2433,8 +3016,10 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
                 MPI_COMM_WORLD, &req );
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->w, workspace->d_workspace->n, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->n, workspace->d_workspace->m, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2481,7 +3066,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
 int Cuda_dual_PIPECR( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data )
+        rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i, matvecs;
     int ret;
@@ -2506,8 +3091,10 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, control->streams[4] );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r2, workspace->d_workspace->n2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n2, workspace->d_workspace->u2, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2550,8 +3137,10 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
             break;
         }
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->w2, workspace->d_workspace->n2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->n2, workspace->d_workspace->m2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2640,7 +3229,7 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECR( system, control, data, workspace, H,
                 workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->t, 1, system->n, control->streams[4] );
@@ -2653,7 +3242,7 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECR( system, control, data, workspace, H,
                 workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->s, 0, system->n, control->streams[4] );
@@ -2684,7 +3273,7 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 int Cuda_PIPECR( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -2709,8 +3298,10 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r, workspace->d_workspace->n, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n, workspace->d_workspace->u, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2746,8 +3337,10 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
 
     for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
     {
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->w, workspace->d_workspace->n, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->n, workspace->d_workspace->m, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
index 281bbc18..c82634a9 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
@@ -28,52 +28,52 @@
 int Cuda_dual_SDM( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_SDM( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_CG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_CG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_BiCGStab( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_BiCGStab( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_PIPECG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_PIPECG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_PIPECR( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_PIPECR( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 
 #endif
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 35b3ef00..e2ef1026 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -3703,9 +3703,9 @@ int dual_PIPECG( reax_system const * const system, control_params const * const
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
-            workspace->n2, fresh_pre, LEFT );
+            workspace->n2, FALSE, LEFT );
     dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
-            workspace->m2, fresh_pre, RIGHT );
+            workspace->m2, FALSE, RIGHT );
 
 #if defined(NEUTRAL_TERRITORY)
     Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
@@ -3789,9 +3789,9 @@ int dual_PIPECG( reax_system const * const system, control_params const * const
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
         dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
-                workspace->n2, fresh_pre, LEFT );
+                workspace->n2, FALSE, LEFT );
         dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
-                workspace->m2, fresh_pre, RIGHT );
+                workspace->m2, FALSE, RIGHT );
 
 #if defined(NEUTRAL_TERRITORY)
         Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
@@ -4112,9 +4112,9 @@ int dual_PIPECR( reax_system const * const system, control_params const * const
         }
 
         dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
-                workspace->n2, fresh_pre, LEFT );
+                workspace->n2, FALSE, LEFT );
         dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
-                workspace->m2, fresh_pre, RIGHT );
+                workspace->m2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         time = Get_Time( );
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index dfa540b4..e1e175a2 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -360,10 +360,11 @@
   /* max. num. of active CUDA streams */
   #define MAX_CUDA_STREAMS (5)
 
-  /* max. num. of CUDA events used for synchronizing streams */
-  #define MAX_CUDA_STREAM_EVENTS (4)
 #endif
 
+/* max. num. of CUDA events used for synchronizing streams */
+#define MAX_CUDA_STREAM_EVENTS (4)
+
 
 /* ensemble type */
 enum ensemble
-- 
GitLab