diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 9a02f53853d6d3bc149f47df5625b7abb6c595ab..74e6b9394c949ee1e5d4cb54e74c480b5abe1993 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -206,10 +206,12 @@ static void Deallocate_Workspace_Part2( control_params * const control,
             sfree( workspace->d, "Deallocate_Workspace_Part2::workspace->d" );
             sfree( workspace->q, "Deallocate_Workspace_Part2::workspace->q" );
             sfree( workspace->p, "Deallocate_Workspace_Part2::workspace->p" );
+#if defined(DUAL_SOLVER)
             sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" );
             sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" );
             sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" );
             sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" );
+#endif
             break;
 
         case SDM_S:
@@ -217,10 +219,12 @@ static void Deallocate_Workspace_Part2( control_params * const control,
             sfree( workspace->d, "Deallocate_Workspace_Part2::workspace->d" );
             sfree( workspace->q, "Deallocate_Workspace_Part2::workspace->q" );
             sfree( workspace->p, "Deallocate_Workspace_Part2::workspace->p" );
+#if defined(DUAL_SOLVER)
             sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" );
             sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" );
             sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" );
             sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" );
+#endif
             break;
 
         case BiCGStab_S:
@@ -233,6 +237,17 @@ static void Deallocate_Workspace_Part2( control_params * const control,
             sfree( workspace->p, "Deallocate_Workspace_Part2::workspace->p" );
             sfree( workspace->r_hat, "Deallocate_Workspace_Part2::workspace->r_hat" );
             sfree( workspace->q_hat, "Deallocate_Workspace_Part2::workspace->q_hat" );
+#if defined(DUAL_SOLVER)
+            sfree( workspace->y2, "Deallocate_Workspace_Part2::workspace->y2" );
+            sfree( workspace->g2, "Deallocate_Workspace_Part2::workspace->g2" );
+            sfree( workspace->z2, "Deallocate_Workspace_Part2::workspace->z2" );
+            sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" );
+            sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" );
+            sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" );
+            sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" );
+            sfree( workspace->r_hat2, "Deallocate_Workspace_Part2::workspace->r_hat2" );
+            sfree( workspace->q_hat2, "Deallocate_Workspace_Part2::workspace->q_hat2" );
+#endif
             break;
 
         case PIPECG_S:
@@ -245,6 +260,7 @@ static void Deallocate_Workspace_Part2( control_params * const control,
             sfree( workspace->n, "Deallocate_Workspace_Part2::workspace->n" );
             sfree( workspace->u, "Deallocate_Workspace_Part2::workspace->u" );
             sfree( workspace->w, "Deallocate_Workspace_Part2::workspace->w" );
+#if defined(DUAL_SOLVER)
             sfree( workspace->z2, "Deallocate_Workspace_Part2::workspace->z2" );
             sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" );
             sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" );
@@ -254,6 +270,7 @@ static void Deallocate_Workspace_Part2( control_params * const control,
             sfree( workspace->n2, "Deallocate_Workspace_Part2::workspace->n2" );
             sfree( workspace->u2, "Deallocate_Workspace_Part2::workspace->u2" );
             sfree( workspace->w2, "Deallocate_Workspace_Part2::workspace->w2" );
+#endif
             break;
 
         case PIPECR_S:
@@ -266,6 +283,17 @@ static void Deallocate_Workspace_Part2( control_params * const control,
             sfree( workspace->n, "Deallocate_Workspace_Part2::workspace->n" );
             sfree( workspace->u, "Deallocate_Workspace_Part2::workspace->u" );
             sfree( workspace->w, "Deallocate_Workspace_Part2::workspace->w" );
+#if defined(DUAL_SOLVER)
+            sfree( workspace->z2, "Deallocate_Workspace_Part2::workspace->z2" );
+            sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" );
+            sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" );
+            sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" );
+            sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" );
+            sfree( workspace->m2, "Deallocate_Workspace_Part2::workspace->m2" );
+            sfree( workspace->n2, "Deallocate_Workspace_Part2::workspace->n2" );
+            sfree( workspace->u2, "Deallocate_Workspace_Part2::workspace->u2" );
+            sfree( workspace->w2, "Deallocate_Workspace_Part2::workspace->w2" );
+#endif
             break;
 
         default:
@@ -433,10 +461,12 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons
             workspace->d = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::d" );
             workspace->q = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::q" );
             workspace->p = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::p" );
+#if defined(DUAL_SOLVER)
             workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" );
             workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" );
             workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" );
             workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" );
+#endif
             break;
 
         case SDM_S:
@@ -444,10 +474,12 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons
             workspace->d = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::d" );
             workspace->q = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::q" );
             workspace->p = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::p" );
+#if defined(DUAL_SOLVER)
             workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" );
             workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" );
             workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" );
             workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" );
+#endif
             break;
 
         case BiCGStab_S:
@@ -460,6 +492,17 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons
             workspace->p = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::p" );
             workspace->r_hat = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::r_hat" );
             workspace->q_hat = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::q_hat" );
+#if defined(DUAL_SOLVER)
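+            /* rvec2 counterparts of the BiCGStab work vectors, used only by the dual (2 RHS) solver */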
+            workspace->y2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::y2" );
+            workspace->g2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::g2" );
+            workspace->z2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::z2" );
+            workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" );
+            workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" );
+            workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" );
+            workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" );
+            workspace->r_hat2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r_hat2" );
+            workspace->q_hat2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q_hat2" );
+#endif
             break;
 
         case PIPECG_S:
@@ -472,6 +515,7 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons
             workspace->n = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::n" );
             workspace->u = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::u" );
             workspace->w = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::w" );
+#if defined(DUAL_SOLVER)
             workspace->z2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::z2" );
             workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" );
             workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" );
@@ -481,6 +525,7 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons
             workspace->n2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::n2" );
             workspace->u2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::u2" );
             workspace->w2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::w2" );
+#endif
             break;
 
         case PIPECR_S:
@@ -493,6 +538,17 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons
             workspace->n = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::n" );
             workspace->u = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::u" );
             workspace->w = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::w" );
+#if defined(DUAL_SOLVER)
+            workspace->z2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::z2" );
+            workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" );
+            workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" );
+            workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" );
+            workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" );
+            workspace->m2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::m2" );
+            workspace->n2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::n2" );
+            workspace->u2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::u2" );
+            workspace->w2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::w2" );
+#endif
             break;
 
         default:
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index ed164cab6bc1c7ea42048b4b64200328af180429..a0b1c0252619195ec94c28fcd314671ca9b0c1c4 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -468,10 +468,8 @@ static void QEq( reax_system const * const system, control_params const * const
 
     case SDM_S:
 #if defined(DUAL_SOLVER)
-        fprintf( stderr, "[ERROR] Dual SDM solver for QEq not yet implemented. Terminating...\n" );
-        exit( INVALID_INPUT );
-//        iters = dual_SDM( system, control, data, workspace, &workspace->H, workspace->b,
-//                control->cm_solver_q_err, workspace->x, mpi_data, refactor );
+        iters = dual_SDM( system, control, data, workspace, &workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data, refactor );
 #else
         iters = SDM( system, control, data, workspace, &workspace->H, workspace->b_s,
                 control->cm_solver_q_err, workspace->s, mpi_data, refactor );
@@ -483,10 +481,8 @@ static void QEq( reax_system const * const system, control_params const * const
 
     case BiCGStab_S:
 #if defined(DUAL_SOLVER)
-        fprintf( stderr, "[ERROR] Dual BiCGStab solver for QEq not yet implemented. Terminating...\n" );
-        exit( INVALID_INPUT );
-//        iters = dual_BiCGStab( system, control, data, workspace, &workspace->H, workspace->b,
-//                control->cm_solver_q_err, workspace->x, mpi_data, refactor );
+        iters = dual_BiCGStab( system, control, data, workspace, &workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data, refactor );
 #else
         iters = BiCGStab( system, control, data, workspace, &workspace->H, workspace->b_s,
                 control->cm_solver_q_err, workspace->s, mpi_data, refactor );
@@ -511,10 +507,8 @@ static void QEq( reax_system const * const system, control_params const * const
 
     case PIPECR_S:
 #if defined(DUAL_SOLVER)
-        fprintf( stderr, "[ERROR] Dual PIPECR solver for QEq not yet implemented. Terminating...\n" );
-        exit( INVALID_INPUT );
-//        iters = dual_PIPECR( system, control, data, workspace, &workspace->H, workspace->b,
-//                control->cm_solver_q_err, workspace->x, mpi_data, refactor );
+        iters = dual_PIPECR( system, control, data, workspace, &workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data, refactor );
 #else
         iters = PIPECR( system, control, data, workspace, &workspace->H, workspace->b_s,
                 control->cm_solver_q_err, workspace->s, mpi_data, refactor );
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 1fa0671107bbc426c692778b66c1f7e8aa3dbe70..fd7bd3d88e7fa0dc363e62bb0dc9cf830eb4f82f 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -881,10 +881,12 @@ static void Finalize_Workspace( reax_system * const system, control_params * con
             sfree( workspace->d, "Finalize_Workspace::workspace->d" );
             sfree( workspace->q, "Finalize_Workspace::workspace->q" );
             sfree( workspace->p, "Finalize_Workspace::workspace->p" );
+#if defined(DUAL_SOLVER)
             sfree( workspace->r2, "Finalize_Workspace::workspace->r2" );
             sfree( workspace->d2, "Finalize_Workspace::workspace->d2" );
             sfree( workspace->q2, "Finalize_Workspace::workspace->q2" );
             sfree( workspace->p2, "Finalize_Workspace::workspace->p2" );
+#endif
             break;
 
         case SDM_S:
@@ -892,10 +894,12 @@ static void Finalize_Workspace( reax_system * const system, control_params * con
             sfree( workspace->d, "Finalize_Workspace::workspace->d" );
             sfree( workspace->q, "Finalize_Workspace::workspace->q" );
             sfree( workspace->p, "Finalize_Workspace::workspace->p" );
+#if defined(DUAL_SOLVER)
             sfree( workspace->r2, "Finalize_Workspace::workspace->r2" );
             sfree( workspace->d2, "Finalize_Workspace::workspace->d2" );
             sfree( workspace->q2, "Finalize_Workspace::workspace->q2" );
             sfree( workspace->p2, "Finalize_Workspace::workspace->p2" );
+#endif
             break;
 
         case BiCGStab_S:
@@ -908,6 +912,17 @@ static void Finalize_Workspace( reax_system * const system, control_params * con
             sfree( workspace->p, "Finalize_Workspace::workspace->p" );
             sfree( workspace->r_hat, "Finalize_Workspace::workspace->r_hat" );
             sfree( workspace->q_hat, "Finalize_Workspace::workspace->q_hat" );
+#if defined(DUAL_SOLVER)
+            sfree( workspace->y2, "Finalize_Workspace::workspace->y2" );
+            sfree( workspace->g2, "Finalize_Workspace::workspace->g2" );
+            sfree( workspace->z2, "Finalize_Workspace::workspace->z2" );
+            sfree( workspace->r2, "Finalize_Workspace::workspace->r2" );
+            sfree( workspace->d2, "Finalize_Workspace::workspace->d2" );
+            sfree( workspace->q2, "Finalize_Workspace::workspace->q2" );
+            sfree( workspace->p2, "Finalize_Workspace::workspace->p2" );
+            sfree( workspace->r_hat2, "Finalize_Workspace::workspace->r_hat2" );
+            sfree( workspace->q_hat2, "Finalize_Workspace::workspace->q_hat2" );
+#endif
             break;
 
         case PIPECG_S:
@@ -920,6 +935,7 @@ static void Finalize_Workspace( reax_system * const system, control_params * con
             sfree( workspace->n, "Finalize_Workspace::workspace->n" );
             sfree( workspace->u, "Finalize_Workspace::workspace->u" );
             sfree( workspace->w, "Finalize_Workspace::workspace->w" );
+#if defined(DUAL_SOLVER)
             sfree( workspace->z2, "Finalize_Workspace::workspace->z2" );
             sfree( workspace->r2, "Finalize_Workspace::workspace->r2" );
             sfree( workspace->d2, "Finalize_Workspace::workspace->d2" );
@@ -929,6 +945,7 @@ static void Finalize_Workspace( reax_system * const system, control_params * con
             sfree( workspace->n2, "Finalize_Workspace::workspace->n2" );
             sfree( workspace->u2, "Finalize_Workspace::workspace->u2" );
             sfree( workspace->w2, "Finalize_Workspace::workspace->w2" );
+#endif
             break;
 
         case PIPECR_S:
@@ -941,6 +958,17 @@ static void Finalize_Workspace( reax_system * const system, control_params * con
             sfree( workspace->n, "Finalize_Workspace::workspace->n" );
             sfree( workspace->u, "Finalize_Workspace::workspace->u" );
             sfree( workspace->w, "Finalize_Workspace::workspace->w" );
+#if defined(DUAL_SOLVER)
+            sfree( workspace->z2, "Finalize_Workspace::workspace->z2" );
+            sfree( workspace->r2, "Finalize_Workspace::workspace->r2" );
+            sfree( workspace->d2, "Finalize_Workspace::workspace->d2" );
+            sfree( workspace->q2, "Finalize_Workspace::workspace->q2" );
+            sfree( workspace->p2, "Finalize_Workspace::workspace->p2" );
+            sfree( workspace->m2, "Finalize_Workspace::workspace->m2" );
+            sfree( workspace->n2, "Finalize_Workspace::workspace->n2" );
+            sfree( workspace->u2, "Finalize_Workspace::workspace->u2" );
+            sfree( workspace->w2, "Finalize_Workspace::workspace->w2" );
+#endif
             break;
 
         default:
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 2825724cd7f74ba57df5e874958ca0ab29df3548..35b3ef00f4c80c54df81397bf3f61989b1fd83f4 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -2491,6 +2491,154 @@ static void apply_preconditioner( reax_system const * const system,
 }
 
 
+/* Dual iteration of the Steepest Descent Method
+ * for QEq (2 simultaneous solves) */
+int dual_SDM( reax_system const * const system, control_params const * const control,
+        simulation_data * const data, storage * const workspace,
+        sparse_matrix * const H, rvec2 * const b, real tol,
+        rvec2 * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
+{
+    int i, ret;
+    rvec2 tmp, alpha, bnorm, sig;
+    real redux[4];
+#if defined(LOG_PERFORMANCE)
+    real time;
+#endif
+
+#if defined(NEUTRAL_TERRITORY)
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
+            H->NT, workspace->q2 );
+#else
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
+            system->N, workspace->q2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    time = Get_Time( );
+#endif
+
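+    /* compute initial residuals r = b - H * x for both QEq systems */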
+    Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b,
+            -1.0, -1.0, workspace->q2, system->n );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
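+    /* d = M^{-1} * r: apply the left and right halves of the preconditioner to the residual */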
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2,
+            workspace->q2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2,
+            workspace->d2, fresh_pre, RIGHT );
+
+#if defined(LOG_PERFORMANCE)
+    time = Get_Time( );
+#endif
+
+    Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] );
+    Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[2], &redux[3] );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+    bnorm[0] = SQRT( redux[0] );
+    bnorm[1] = SQRT( redux[1] );
+    sig[0] = redux[2];
+    sig[1] = redux[3];
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
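+    /* iterate until one of the two solves converges or the iteration limit is reached;
+     * the remaining unconverged system is finished with the single-vector solver below */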
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
+        if ( SQRT(sig[0]) / bnorm[0] <= tol || SQRT(sig[1]) / bnorm[1] <= tol )
+        {
+            break;
+        }
+
+#if defined(NEUTRAL_TERRITORY)
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2,
+                H->NT, workspace->q2 );
+#else
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2,
+                system->N, workspace->q2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+        time = Get_Time( );
+#endif
+
+        Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace->d2, workspace->q2, system->n, &redux[2], &redux[3] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
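+        /* alpha = (r, d) / (d, H * d); step along the preconditioned residual direction */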
+        sig[0] = redux[0];
+        sig[1] = redux[1];
+        tmp[0] = redux[2];
+        tmp[1] = redux[3];
+        alpha[0] = sig[0] / tmp[0];
+        alpha[1] = sig[1] / tmp[1];
+        Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n );
+        Vector_Add_rvec2( workspace->r2,
+                -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2,
+                workspace->q2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2,
+                workspace->d2, FALSE, RIGHT );
+    }
+
+    /* continue to solve the system that has not converged yet */
+    if ( SQRT(sig[0]) / bnorm[0] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->s, x, 0, system->n );
+
+        i += SDM( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data, FALSE );
+
+        Vector_Copy_To_rvec2( x, workspace->s, 0, system->n );
+    }
+    else if ( SQRT(sig[1]) / bnorm[1] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->t, x, 1, system->n );
+
+        i += SDM( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data, FALSE );
+
+        Vector_Copy_To_rvec2( x, workspace->t, 1, system->n );
+    }
+
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", i );
+        fprintf( stderr, "  [INFO] Rel. residual error (s solve): %f\n", SQRT(sig[0]) / bnorm[0] );
+        fprintf( stderr, "  [INFO] Rel. residual error (t solve): %f\n", SQRT(sig[1]) / bnorm[1] );
+        return i;
+    }
+
+    return i;
+}
+
+
 /* Steepest Descent */
 int SDM( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
@@ -2605,7 +2753,7 @@ int SDM( reax_system const * const system, control_params const * const control,
 
 
 /* Dual iteration of the Preconditioned Conjugate Gradient Method
- * for QEq (2 simaltaneous solves) */
+ * for QEq (2 simultaneous solves) */
 int dual_CG( reax_system const * const system, control_params const * const control,
         simulation_data * const data,
         storage * const workspace, sparse_matrix * const H, rvec2 * const b,
@@ -2854,45 +3002,312 @@ int CG( reax_system const * const system, control_params const * const control,
     Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
 #endif
 
-    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
-    {
+    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
+    {
+#if defined(NEUTRAL_TERRITORY)
+        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, 
+                H->NT, workspace->q );
+#else
+        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, 
+                system->N, workspace->q );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+        time = Get_Time( );
+#endif
+
+        tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        alpha = sig_new / tmp;
+        Vector_Add( x, alpha, workspace->d, system->n );
+        Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r,
+                workspace->q, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q,
+                workspace->p, FALSE, RIGHT );
+
+#if defined(LOG_PERFORMANCE)
+        time = Get_Time( );
+#endif
+
+        redux[0] = Dot_local( workspace->r, workspace->p, system->n );
+        redux[1] = Dot_local( workspace->p, workspace->p, system->n );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        sig_old = sig_new;
+        sig_new = redux[0];
+        r_norm = SQRT( redux[1] );
+        beta = sig_new / sig_old;
+        Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+    }
+
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i );
+        fprintf( stderr, "  [INFO] Rel. residual error: %e\n", r_norm / b_norm );
+        return i;
+    }
+
+    return i;
+}
+
+
+/* Bi-conjugate gradient stabilized method with left preconditioning for
+ * solving nonsymmetric linear systems.
+ * This function performs dual iteration for QEq (2 simultaneous solves)
+ *
+ * system: 
+ * workspace: struct containing storage for workspace for the linear solver
+ * control: struct containing parameters governing the simulation and numeric methods
+ * data: struct containing simulation data (e.g., atom info)
+ * H: sparse, symmetric matrix, lower half stored in CSR format
+ * b: right-hand side of the linear system
+ * tol: tolerance compared against the relative residual for determining convergence
+ * x: inital guess
+ * mpi_data: 
+ *
+ * Reference: Netlib (in MATLAB)
+ *  http://www.netlib.org/templates/matlab/bicgstab.m
+ * */
+int dual_BiCGStab( reax_system const * const system, control_params const * const control,
+        simulation_data * const data,
+        storage * const workspace, sparse_matrix * const H, rvec2 * const b,
+        real tol, rvec2 * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
+{
+    int i, ret;
+    rvec2 tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm;
+    real redux[4];
+#if defined(LOG_PERFORMANCE)
+    real time;
+#endif
+
+#if defined(NEUTRAL_TERRITORY)
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
+            H->NT, workspace->d2 );
+#else
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
+            system->N, workspace->d2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    time = Get_Time( );
+#endif
+
+    Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->d2, system->n );
+    Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] );
+    Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[2], &redux[3] );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+    b_norm[0] = SQRT( redux[0] );
+    b_norm[1] = SQRT( redux[1] );
+    r_norm[0] = SQRT( redux[2] );
+    r_norm[1] = SQRT( redux[3] );
+    if ( b_norm[0] == 0.0 )
+    {
+        b_norm[0] = 1.0;
+    }
+    if ( b_norm[1] == 0.0 )
+    {
+        b_norm[1] = 1.0;
+    }
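+    /* r_hat = r_0: fixed shadow residual used in the BiCGStab recurrences */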
+    Vector_Copy_rvec2( workspace->r_hat2, workspace->r2, system->n );
+    omega[0] = 1.0;
+    omega[1] = 1.0;
+    rho[0] = 1.0;
+    rho[1] = 1.0;
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
+        if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        Dot_local_rvec2( workspace->r_hat2, workspace->r2, system->n,
+                &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        rho[0] = redux[0];
+        rho[1] = redux[1];
+        if ( rho[0] == 0.0 || rho[1] == 0.0 )
+        {
+            break;
+        }
+        if ( i > 0 )
+        {
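+            /* beta = (rho_i / rho_{i-1}) * (alpha / omega); p = r + beta * (p - omega * v) */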
+            beta[0] = (rho[0] / rho_old[0]) * (alpha[0] / omega[0]);
+            beta[1] = (rho[1] / rho_old[1]) * (alpha[1] / omega[1]);
+            Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->p2,
+                    -1.0 * omega[0], -1.0 * omega[1], workspace->z2, system->n );
+            Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->r2,
+                    beta[0], beta[1], workspace->q2, system->n );
+        }
+        else
+        {
+            Vector_Copy_rvec2( workspace->p2, workspace->r2, system->n );
+        }
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->p2,
+                workspace->y2, i == 0 ? fresh_pre : FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2,
+                workspace->d2, i == 0 ? fresh_pre : FALSE, RIGHT );
+
+#if defined(NEUTRAL_TERRITORY)
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2,
+                H->NT, workspace->z2 );
+#else
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2,
+                system->N, workspace->z2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+        time = Get_Time( );
+#endif
+
+        Dot_local_rvec2( workspace->r_hat2, workspace->z2, system->n,
+                &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
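+        /* alpha = rho / (r_hat, H * p_hat); s = r - alpha * H * p_hat (stored in q2) */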
+        tmp[0] = redux[0];
+        tmp[1] = redux[1];
+        alpha[0] = rho[0] / tmp[0];
+        alpha[1] = rho[1] / tmp[1];
+        Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->r2,
+                -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n );
+        Dot_local_rvec2( workspace->q2, workspace->q2, system->n,
+                &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        tmp[0] = redux[0];
+        tmp[1] = redux[1];
+        /* early convergence check */
+        if ( tmp[0] < tol || tmp[1] < tol )
+        {
+            Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n );
+            break;
+        }
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2,
+                workspace->y2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2,
+                workspace->q_hat2, FALSE, RIGHT );
+
 #if defined(NEUTRAL_TERRITORY)
-        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, 
-                H->NT, workspace->q );
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2,
+                H->NT, workspace->y2 );
 #else
-        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, 
-                system->N, workspace->q );
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2,
+                system->N, workspace->y2 );
 #endif
 
 #if defined(LOG_PERFORMANCE)
         time = Get_Time( );
 #endif
 
-        tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD );
-
-#if defined(LOG_PERFORMANCE)
-        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
-#endif
-
-        alpha = sig_new / tmp;
-        Vector_Add( x, alpha, workspace->d, system->n );
-        Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n );
+        Dot_local_rvec2( workspace->y2, workspace->q2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace->y2, workspace->y2, system->n, &redux[2], &redux[3] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r,
-                workspace->q, FALSE, LEFT );
-        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q,
-                workspace->p, FALSE, RIGHT );
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
 
 #if defined(LOG_PERFORMANCE)
-        time = Get_Time( );
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
 #endif
 
-        redux[0] = Dot_local( workspace->r, workspace->p, system->n );
-        redux[1] = Dot_local( workspace->p, workspace->p, system->n );
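+        /* omega = (t, s) / (t, t), where t = H * s_hat is stored in y2 */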
+        sigma[0] = redux[0];
+        sigma[1] = redux[1];
+        tmp[0] = redux[2];
+        tmp[1] = redux[3];
+        omega[0] = sigma[0] / tmp[0];
+        omega[1] = sigma[1] / tmp[1];
+        Vector_Sum_rvec2( workspace->g2, alpha[0], alpha[1], workspace->d2,
+                omega[0], omega[1], workspace->q_hat2, system->n );
+        Vector_Add_rvec2( x, 1.0, 1.0, workspace->g2, system->n );
+        Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->q2,
+                -1.0 * omega[0], -1.0 * omega[1], workspace->y2, system->n );
+        Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[0], &redux[1] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2906,22 +3321,58 @@ int CG( reax_system const * const system, control_params const * const control,
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
 #endif
 
-        sig_old = sig_new;
-        sig_new = redux[0];
-        r_norm = SQRT( redux[1] );
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n );
+        r_norm[0] = SQRT( redux[0] );
+        r_norm[1] = SQRT( redux[1] );
+        if ( omega[0] == 0.0 || omega[1] == 0.0 )
+        {
+            break;
+        }
+        rho_old[0] = rho[0];
+        rho_old[1] = rho[1];
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
     }
 
-    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    if ( (omega[0] == 0.0 || omega[1] == 0.0) && system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i );
-        fprintf( stderr, "  [INFO] Rel. residual error: %e\n", r_norm / b_norm );
-        return i;
+        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
+        fprintf( stderr, "  [INFO] omega = %e\n", omega );
+    }
+    else if ( (rho[0] == 0.0 || rho[1] == 0.0) && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
+        fprintf( stderr, "  [INFO] rho = %e\n", rho );
+    }
+
+    /* continue to solve the system that has not converged yet */
+    if ( r_norm[0] / b_norm[0] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->s, x, 0, system->n );
+
+        i += BiCGStab( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data, FALSE );
+
+        Vector_Copy_To_rvec2( x, workspace->s, 0, system->n );
+    }
+    else if ( r_norm[1] / b_norm[1] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->t, x, 1, system->n );
+
+        i += BiCGStab( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data, FALSE );
+
+        Vector_Copy_To_rvec2( x, workspace->t, 1, system->n );
+    }
+
+    if ( i >= control->cm_solver_max_iters
+            && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i );
+        fprintf( stderr, "  [INFO] Rel. residual error (s solve): %e\n", r_norm[0] / b_norm[0] );
+        fprintf( stderr, "  [INFO] Rel. residual error (t solve): %e\n", r_norm[1] / b_norm[1] );
     }
 
     return i;
@@ -3572,6 +4023,209 @@ int PIPECG( reax_system const * const system, control_params const * const contr
 }
 
 
+/* Pipelined Preconditioned Conjugate Residual Method.
+ * This function performs dual iteration for QEq (2 simultaneous solves)
+ *
+ * References:
+ * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
+ *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
+ *  */
+int dual_PIPECR( reax_system const * const system, control_params const * const control,
+        simulation_data * const data,
+        storage * const workspace, sparse_matrix * const H, rvec2 * const b,
+        real tol, rvec2 * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
+{
+    int i, ret;
+    rvec2 alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm;
+    real redux[6];
+    MPI_Request req;
+#if defined(LOG_PERFORMANCE)
+    real time;
+#endif
+
+#if defined(NEUTRAL_TERRITORY)
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
+            H->NT, workspace->u2 );
+#else
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
+            system->N, workspace->u2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    time = Get_Time( );
+#endif
+
+    Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->u2, system->n );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2,
+            workspace->n2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
+            workspace->u2, fresh_pre, RIGHT );
+
+#if defined(LOG_PERFORMANCE)
+    time = Get_Time( );
+#endif
+
+    Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] );
+    Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[2], &redux[3] );
+
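+    /* post the reductions for the norms and overlap them with the next matrix-vector product */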
+    ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM,
+            MPI_COMM_WORLD, &req );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+#if defined(NEUTRAL_TERRITORY)
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2,
+            H->NT, workspace->w2 );
+#else
+    Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2,
+            system->N, workspace->w2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    time = Get_Time( );
+#endif
+
+    ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+    b_norm[0] = SQRT( redux[0] );
+    b_norm[1] = SQRT( redux[1] );
+    r_norm[0] = SQRT( redux[2] );
+    r_norm[1] = SQRT( redux[3] );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
+        if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
+                workspace->n2, fresh_pre, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
+                workspace->m2, fresh_pre, RIGHT );
+
+#if defined(LOG_PERFORMANCE)
+        time = Get_Time( );
+#endif
+
+        Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( workspace->m2, workspace->w2, system->n, &redux[2], &redux[3] );
+        Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
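+        /* overlap the reduction for gamma, delta, and the residual norm with the product H * m */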
+        ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM,
+                MPI_COMM_WORLD, &req );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(NEUTRAL_TERRITORY)
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
+                H->NT, workspace->n2 );
+#else
+        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
+                system->N, workspace->n2 );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+        time = Get_Time( );
+#endif
+
+        ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+        gamma_new[0] = redux[0];
+        gamma_new[1] = redux[1];
+        delta[0] = redux[2];
+        delta[1] = redux[3];
+        r_norm[0] = SQRT( redux[4] );
+        r_norm[1] = SQRT( redux[5] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        if ( i > 0 )
+        {
+            beta[0] = gamma_new[0] / gamma_old[0];
+            beta[1] = gamma_new[1] / gamma_old[1];
+            alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
+            alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
+        }
+        else
+        {
+            beta[0] = 0.0;
+            beta[1] = 0.0;
+            alpha[0] = gamma_new[0] / delta[0];
+            alpha[1] = gamma_new[1] / delta[1];
+        }
+
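+        /* update the search directions, then the solution, residual, and auxiliary vectors for both systems */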
+        Vector_Sum_rvec2( workspace->z2, 1.0, 1.0, workspace->n2,
+                beta[0], beta[1], workspace->z2, system->n );
+        Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->m2,
+                beta[0], beta[1], workspace->q2, system->n );
+        Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->u2,
+                beta[0], beta[1], workspace->p2, system->n );
+        Vector_Sum_rvec2( workspace->d2, 1.0, 1.0, workspace->w2,
+                beta[0], beta[1], workspace->d2, system->n );
+        Vector_Sum_rvec2( x, 1.0, 1.0, x, alpha[0], alpha[1], workspace->p2, system->n );
+        Vector_Sum_rvec2( workspace->u2, 1.0, 1.0, workspace->u2,
+                -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n );
+        Vector_Sum_rvec2( workspace->w2, 1.0, 1.0, workspace->w2,
+                -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n );
+        Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->r2,
+                -1.0 * alpha[0], -1.0 * alpha[1], workspace->d2, system->n );
+
+        gamma_old[0] = gamma_new[0];
+        gamma_old[1] = gamma_new[1];
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+    }
+
+    /* continue to solve the system that has not converged yet */
+    if ( r_norm[0] / b_norm[0] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->s, x, 0, system->n );
+
+        i += PIPECR( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data, FALSE );
+
+        Vector_Copy_To_rvec2( x, workspace->s, 0, system->n );
+    }
+    else if ( r_norm[1] / b_norm[1] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->t, x, 1, system->n );
+
+        i += PIPECR( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data, FALSE );
+
+        Vector_Copy_To_rvec2( x, workspace->t, 1, system->n );
+    }
+
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" );
+        return i;
+    }
+
+    return i;
+}
+
+
 /* Pipelined Preconditioned Conjugate Residual Method
  *
  * References:
diff --git a/PG-PuReMD/src/lin_alg.h b/PG-PuReMD/src/lin_alg.h
index e20a97fac2bf2f6e6b01d65961c05bd01bb3e79d..242a01ada847d0690bb47beba9e4563be9c72923 100644
--- a/PG-PuReMD/src/lin_alg.h
+++ b/PG-PuReMD/src/lin_alg.h
@@ -44,6 +44,11 @@ real sparse_approx_inverse( reax_system const * const,
         storage * const, mpi_datatypes * const, 
         sparse_matrix * const, sparse_matrix * const, sparse_matrix * const, int );
 
+int dual_SDM( reax_system const * const, control_params const * const,
+        simulation_data * const,
+        storage * const, sparse_matrix * const, rvec2 * const,
+        real, rvec2 * const, mpi_datatypes * const, int );
+
 int SDM( reax_system const * const, control_params const * const,
         simulation_data * const,
         storage * const, sparse_matrix * const, real * const,
@@ -59,6 +64,11 @@ int CG( reax_system const * const, control_params const * const,
         storage * const, sparse_matrix * const, real * const,
         real, real * const, mpi_datatypes * const, int );
 
+int dual_BiCGStab( reax_system const * const, control_params const * const,
+        simulation_data * const,
+        storage * const, sparse_matrix * const, rvec2 * const,
+        real, rvec2 * const, mpi_datatypes * const, int );
+
 int BiCGStab( reax_system const * const, control_params const * const,
         simulation_data * const,
         storage * const, sparse_matrix * const, real * const,
@@ -74,6 +84,11 @@ int PIPECG( reax_system const * const, control_params const * const,
         storage * const, sparse_matrix * const, real * const,
         real, real * const, mpi_datatypes * const, int );
 
+int dual_PIPECR( reax_system const * const, control_params const * const,
+        simulation_data * const,
+        storage * const, sparse_matrix * const, rvec2 * const,
+        real, rvec2 * const, mpi_datatypes * const, int );
+
 int PIPECR( reax_system const * const, control_params const * const,
         simulation_data * const,
         storage * const, sparse_matrix * const, real * const,