diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c
index 98fb650206fbc64bbf8399d0e0606dacdd6222be..65474f1ea197475f32b86bacf2f58f9ebd96dc33 100644
--- a/PuReMD/src/allocate.c
+++ b/PuReMD/src/allocate.c
@@ -178,13 +178,15 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
 
     if ( control->cm_solver_type == GMRES_S
             || control->cm_solver_type == GMRES_H_S
-            || control->cm_solver_type == PIPECG_S )
+            || control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
     {
         sfree( workspace->z, "z" );
     }
 
     if ( control->cm_solver_type == CG_S
-            || control->cm_solver_type == PIPECG_S )
+            || control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
     {
         sfree( workspace->d, "d" );
         sfree( workspace->p, "p" );
@@ -192,7 +194,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
         sfree( workspace->r, "r" );
     }
 
-    if ( control->cm_solver_type == PIPECG_S )
+    if ( control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
     {
         sfree( workspace->m, "m" );
         sfree( workspace->n, "n" );
@@ -327,24 +330,25 @@ int Allocate_Workspace( reax_system *system, control_params *control,
 
         for ( i = 0; i < RESTART + 1; ++i )
         {
-            workspace->h[i] = (real*) scalloc( RESTART + 1, sizeof(real), "h[i]", comm );
-            workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]", comm );
+            workspace->h[i] = scalloc( RESTART + 1, sizeof(real), "h[i]", comm );
+            workspace->v[i] = scalloc( total_cap, sizeof(real), "v[i]", comm );
         }
     }
 
     if ( control->cm_solver_type == GMRES_S
-            || control->cm_solver_type == GMRES_H_S
-            || control->cm_solver_type == PIPECG_S )
+            || control->cm_solver_type == GMRES_H_S )
     {
         workspace->z = scalloc( RESTART + 1, sizeof(real), "z", comm );
     }
-    else if ( control->cm_solver_type == PIPECG_S )
+    else if ( control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
     {
         workspace->z = scalloc( total_cap, sizeof(real), "z", comm );
     }
 
     if ( control->cm_solver_type == CG_S
-            || control->cm_solver_type == PIPECG_S )
+            || control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
     {
         workspace->d = scalloc( total_cap, sizeof(real), "d", comm );
         workspace->p = scalloc( total_cap, sizeof(real), "p", comm );
@@ -352,7 +356,8 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->r = scalloc( total_cap, sizeof(real), "r", comm );
     }
 
-    if ( control->cm_solver_type == PIPECG_S )
+    if ( control->cm_solver_type == PIPECG_S
+            || control->cm_solver_type == PIPECR_S )
     {
         workspace->m = scalloc( total_cap, sizeof(real), "m", comm );
         workspace->n = scalloc( total_cap, sizeof(real), "n", comm );
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index f29eab07b907d38528f75fde169990b99bed61f4..276daed272277b413e4bcb715da1bb2fd2c52cd6 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -1963,8 +1963,6 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
         MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
     }
 
-    MPI_Barrier(mpi_data->world);
-
     if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
         fprintf( stderr, "[WARNING] CG convergence failed!\n" );
@@ -1989,11 +1987,9 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
         real tol, real *x, mpi_datatypes* mpi_data )
 {
     int i, j;
-    real alpha, beta, b_norm, norm;
-    real delta;
-    real gamma_old, gamma_new;
+    real alpha, beta, delta, gamma_old, gamma_new, norm;
     real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
-    real timings[5], redux[4];
+    real timings[5], redux[3];
     MPI_Request req;
 
     t_pa = 0.0;
@@ -2008,33 +2004,46 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
 
     t_start = MPI_Wtime( );
 #if defined(NEUTRAL_TERRITORY)
-    Sparse_MatVec( H, x, workspace->q, H->NT );
+    Sparse_MatVec( H, x, workspace->u, H->NT );
 #else
-    Sparse_MatVec( H, x, workspace->q, system->N );
+    Sparse_MatVec( H, x, workspace->u, system->N );
 #endif
     t_spmv += MPI_Wtime( ) - t_start;
 
     if ( H->format == SYM_HALF_MATRIX )
     {
         t_start = MPI_Wtime( );
-        Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
         t_comm += MPI_Wtime( ) - t_start;
     }
 #if defined(NEUTRAL_TERRITORY)
     else
     {
         t_start = MPI_Wtime( );
-        Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
         t_comm += MPI_Wtime( ) - t_start;
     }
 #endif
 
     t_start = MPI_Wtime( );
-    Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->q, system->n );
+    Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );
     t_vops += MPI_Wtime( ) - t_start;
 
     /* pre-conditioning */
-    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        Vector_Copy( workspace->u, workspace->r, system->n );
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
         t_start = MPI_Wtime( );
         Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2048,15 +2057,6 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
 #endif
         t_pa += MPI_Wtime( ) - t_start;
     }
-    else if ( control->cm_solver_pre_comp_type == JACOBI_PC)
-    {
-        t_start = MPI_Wtime( );
-        for ( j = 0; j < system->n; ++j )
-        {
-            workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        }
-        t_pa += MPI_Wtime( ) - t_start;
-    }
 
     t_start = MPI_Wtime( );
     Dist( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2089,14 +2089,25 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
     redux[0] = Dot_local( workspace->w, workspace->u, system->n );
     redux[1] = Dot_local( workspace->r, workspace->u, system->n );
     redux[2] = Dot_local( workspace->u, workspace->u, system->n );
-    redux[3] = Dot_local( b, b, system->n );
     t_vops += MPI_Wtime( ) - t_start;
 
-    t_start = MPI_Wtime( );
-    MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+    MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
 
     /* pre-conditioning */
-    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        Vector_Copy( workspace->m, workspace->w, system->n );
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
         t_start = MPI_Wtime( );
         Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2110,15 +2121,6 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
 #endif
         t_pa += MPI_Wtime( ) - t_start;
     }
-    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
-    {
-        t_start = MPI_Wtime( );
-        for ( j = 0; j < system->n; ++j )
-        {
-            workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
-        }
-        t_pa += MPI_Wtime( ) - t_start;
-    }
 
     t_start = MPI_Wtime( );
     Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2147,14 +2149,14 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
     }
 #endif
 
+    t_start = MPI_Wtime( );
     MPI_Wait( &req, MPI_STATUS_IGNORE );
     t_allreduce += MPI_Wtime( ) - t_start;
     delta = redux[0];
     gamma_new = redux[1];
     norm = sqrt( redux[2] );
-    b_norm = sqrt( redux[3] );
 
-    for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
+    for ( i = 0; i < control->cm_solver_max_iters && norm > tol; ++i )
     {
         if ( i > 0 )
         {
@@ -2168,24 +2170,36 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
         }
 
         t_start = MPI_Wtime( );
-        Vector_Sum( workspace->u , 1.0,  workspace->u, -alpha, workspace->q, system->n );
-        Vector_Sum( workspace->w , 1.0,  workspace->w, -alpha, workspace->z, system->n );
-        Vector_Sum( workspace->r , 1.0,  workspace->r, -alpha, workspace->d, system->n );
-        Vector_Sum( x, 1.0,  x, alpha, workspace->p, system->n );
-        Vector_Sum( workspace->z , 1.0,  workspace->n, beta, workspace->z, system->n );
-        Vector_Sum( workspace->q , 1.0,  workspace->m, beta, workspace->q, system->n );
-        Vector_Sum( workspace->p , 1.0,  workspace->u, beta, workspace->p, system->n );
-        Vector_Sum( workspace->d , 1.0,  workspace->w, beta, workspace->d, system->n );
+        Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
         redux[0] = Dot_local( workspace->w, workspace->u, system->n );
         redux[1] = Dot_local( workspace->r, workspace->u, system->n );
         redux[2] = Dot_local( workspace->u, workspace->u, system->n );
         t_vops += MPI_Wtime( ) - t_start;
 
-        t_start = MPI_Wtime( );
         MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
 
         /* pre-conditioning */
-        if ( control->cm_solver_pre_comp_type == SAI_PC )
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            Vector_Copy( workspace->m, workspace->w, system->n );
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
         {
             t_start = MPI_Wtime( );
             Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2199,15 +2213,6 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
 #endif
             t_pa += MPI_Wtime( ) - t_start;
         }
-        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
-        {
-            t_start = MPI_Wtime( );
-            for ( j = 0; j < system->n; ++j )
-            {
-                workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
-            }
-            t_pa += MPI_Wtime( ) - t_start;
-        }
 
         t_start = MPI_Wtime( );
         Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2238,6 +2243,7 @@ int PIPECG( reax_system *system, control_params *control, simulation_data *data,
 
         gamma_old = gamma_new;
 
+        t_start = MPI_Wtime( );
         MPI_Wait( &req, MPI_STATUS_IGNORE );
         t_allreduce += MPI_Wtime( ) - t_start;
         delta = redux[0];
@@ -2286,11 +2292,11 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data,
         storage *workspace, sparse_matrix *H, real *b,
         real tol, real *x, mpi_datatypes* mpi_data )
 {
-    int  i, j;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new;
+    int i, j;
+    real alpha, beta, delta, gamma_old, gamma_new, norm;
     real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
-    real timings[5];
+    real timings[5], redux[3];
+    MPI_Request req;
 
     t_pa = 0.0;
     t_spmv = 0.0;
@@ -2304,33 +2310,46 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data,
 
     t_start = MPI_Wtime( );
 #if defined(NEUTRAL_TERRITORY)
-    Sparse_MatVec( H, x, workspace->q, H->NT );
+    Sparse_MatVec( H, x, workspace->u, H->NT );
 #else
-    Sparse_MatVec( H, x, workspace->q, system->N );
+    Sparse_MatVec( H, x, workspace->u, system->N );
 #endif
     t_spmv += MPI_Wtime( ) - t_start;
 
     if ( H->format == SYM_HALF_MATRIX )
     {
         t_start = MPI_Wtime( );
-        Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
         t_comm += MPI_Wtime( ) - t_start;
     }
 #if defined(NEUTRAL_TERRITORY)
     else
     {
         t_start = MPI_Wtime( );
-        Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+        Coll( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
         t_comm += MPI_Wtime( ) - t_start;
     }
 #endif
 
     t_start = MPI_Wtime( );
-    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
+    Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );
     t_vops += MPI_Wtime( ) - t_start;
 
     /* pre-conditioning */
-    if ( control->cm_solver_pre_comp_type == SAI_PC )
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        Vector_Copy( workspace->u, workspace->r, system->n );
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
         t_start = MPI_Wtime( );
         Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
@@ -2338,102 +2357,139 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data,
         
         t_start = MPI_Wtime( );
 #if defined(NEUTRAL_TERRITORY)
-        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT );
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, H->NT );
 #else
-        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n );
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->u, system->n );
 #endif
         t_pa += MPI_Wtime( ) - t_start;
     }
 
-    else if ( control->cm_solver_pre_comp_type == JACOBI_PC)
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    Sparse_MatVec( H, workspace->u, workspace->w, H->NT );
+#else
+    Sparse_MatVec( H, workspace->u, workspace->w, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
     {
         t_start = MPI_Wtime( );
-        for ( j = 0; j < system->n; ++j )
-        {
-            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        }
-        t_pa += MPI_Wtime( ) - t_start;
+        Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+        t_comm += MPI_Wtime( ) - t_start;
     }
+#endif
 
-    t_start = MPI_Wtime( );
-    b_norm = Parallel_Norm( b, system->n, mpi_data->world );
-    sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world );
-    t_allreduce += MPI_Wtime( ) - t_start;
+    //TODO: better loop unrolling and termination condition check
+    norm = tol + 1.0;
 
-    for ( i = 0; i < control->cm_solver_max_iters && sqrt(sig_new) / b_norm > tol; ++i )
+    for ( i = 0; i < control->cm_solver_max_iters && norm > tol; ++i )
     {
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            Vector_Copy( workspace->m, workspace->w, system->n );
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
+            t_comm += MPI_Wtime( ) - t_start;
+            
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, H->NT );
+#else
+            Sparse_MatVec( workspace->H_app_inv, workspace->w, workspace->m, system->n );
+#endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+
         t_start = MPI_Wtime( );
-        Dist( system, mpi_data, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE );
+        redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+        redux[1] = Dot_local( workspace->m, workspace->w, system->n );
+        redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+        t_vops += MPI_Wtime( ) - t_start;
+
+        MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
         t_comm += MPI_Wtime( ) - t_start;
 
         t_start = MPI_Wtime( );
 #if defined(NEUTRAL_TERRITORY)
-        Sparse_MatVec( H, workspace->d, workspace->q, H->NT );
+        Sparse_MatVec( H, workspace->m, workspace->n, H->NT );
 #else
-        Sparse_MatVec( H, workspace->d, workspace->q, system->N );
+        Sparse_MatVec( H, workspace->m, workspace->n, system->N );
 #endif
         t_spmv += MPI_Wtime( ) - t_start;
 
         if ( H->format == SYM_HALF_MATRIX )
         {
             t_start = MPI_Wtime( );
-            Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+            Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
             t_comm += MPI_Wtime( ) - t_start;
         }
 #if defined(NEUTRAL_TERRITORY)
         else
         {
             t_start = MPI_Wtime( );
-            Coll( system, mpi_data, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );
+            Coll( system, mpi_data, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
             t_comm += MPI_Wtime( ) - t_start;
         }
 #endif
 
-        t_start =  MPI_Wtime( );
-        tmp = Parallel_Dot( workspace->d, workspace->q, system->n, mpi_data->world );
-        t_allreduce += MPI_Wtime( ) - t_start;
-
         t_start = MPI_Wtime( );
-        alpha = sig_new / tmp;
-        Vector_Add( x, alpha, workspace->d, system->n );
-        Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        t_vops += MPI_Wtime( ) - t_start;
+        MPI_Wait( &req, MPI_STATUS_IGNORE );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        gamma_new = redux[0];
+        delta = redux[1];
+        norm = sqrt( redux[2] );
 
-        /* pre-conditioning */
-        if ( control->cm_solver_pre_comp_type == SAI_PC )
+        if ( i > 0 )
         {
-            t_start = MPI_Wtime( );
-            Dist( system, mpi_data, workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
-            t_comm += MPI_Wtime( ) - t_start;
-
-            t_start = MPI_Wtime( );
-#if defined(NEUTRAL_TERRITORY)
-            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT );
-#else
-            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n );
-#endif
-            t_pa += MPI_Wtime( ) - t_start;
+            beta = gamma_new / gamma_old;
+            alpha = gamma_new / (delta - beta / alpha * gamma_new);
         }
-
-        else if ( control->cm_solver_pre_comp_type == JACOBI_PC)
+        else
         {
-            t_start = MPI_Wtime( );
-            for ( j = 0; j < system->n; ++j )
-            {
-                workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-            }
-            t_pa += MPI_Wtime( ) - t_start;
+            beta = 0.0;
+            alpha = gamma_new / delta;
         }
 
         t_start = MPI_Wtime( );
-        sig_old = sig_new;
-        sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world );
-        t_allreduce += MPI_Wtime() - t_start;
-
-        t_start = MPI_Wtime( );
-        beta = sig_new / sig_old;
-        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
+        Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
         t_vops += MPI_Wtime( ) - t_start;
+
+        gamma_old = gamma_new;
     }
 
     timings[0] = t_pa;
@@ -2457,11 +2513,9 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data,
         MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
     }
 
-    MPI_Barrier(mpi_data->world);
-
     if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
     {
-        fprintf( stderr, "[WARNING] CG convergence failed!\n" );
+        fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" );
         return i;
     }
 
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index b936c46ea3f8acd6c57cfe21a1a7980e9cf56752..4599b9868482c56dbd79e719f4c946e4dba4604d 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -502,6 +502,34 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
         }
         break;
 
+    case PIPECR_S:
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
+
+        iters = PIPECR( system, control, data, workspace, workspace->H, workspace->b_s,
+                control->cm_solver_q_err, workspace->s, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->t[j] = workspace->x[j][1];
+        }
+
+        iters += PIPECR( system, control, data, workspace, workspace->H, workspace->b_t,
+                control->cm_solver_q_err, workspace->t, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][1] = workspace->t[j];
+        }
+        break;
+
     case GMRES_S:
     case GMRES_H_S:
     case SDM_S:
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 244fdc0f9a6d3fc21d8fb8c8a29f2bc1c0675d46..8a94c0c995c8c77ff15cc907cbed90f67b57816f 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -88,6 +88,7 @@ enum solver
     SDM_S = 3,
     BiCGStab_S = 4,
     PIPECG_S = 5,
+    PIPECR_S = 6,
 };
 
 /* preconditioner computation type for charge method linear solver */
@@ -199,8 +200,8 @@ typedef struct
 
 typedef struct
 {
-    MPI_Comm     world;
-    MPI_Comm     comm_mesh3D;
+    MPI_Comm world;
+    MPI_Comm comm_mesh3D;
 
     MPI_Datatype sys_info;
     MPI_Datatype mpi_atom_type;
diff --git a/PuReMD/src/vector.c b/PuReMD/src/vector.c
index 78ed514f4e00971dfaea8c6f4e61dc557fe576b2..ee4cf8ee96595150683526f34c26f1b1ed01251f 100644
--- a/PuReMD/src/vector.c
+++ b/PuReMD/src/vector.c
@@ -27,57 +27,70 @@
 #include "reax_vector.h"
 #endif
 
+
 int Vector_isZero( real* v, int k )
 {
-    for ( --k; k >= 0; --k )
-        if ( fabs( v[k] ) > ALMOST_ZERO )
-            return 0;
+    int i, ret;
 
-    return 1;
+    ret = 1;
+
+    for ( i = 0; i < k; ++i )
+    {
+        if ( fabs( v[i] ) > ALMOST_ZERO )
+        {
+            ret = 0;
+            break;
+        }
+    }
+
+    return ret;
 }
 
 
 void Vector_MakeZero( real *v, int k )
 {
-    for ( --k; k >= 0; --k )
-        v[k] = 0;
+    int i;
+
+    for ( i = 0; i < k; ++i )
+    {
+        v[i] = 0;
+    }
 }
 
 
 void Vector_Copy( real* dest, real* v, int k )
 {
-    for ( --k; k >= 0; --k )
-        dest[k] = v[k];
-}
-
+    int i;
 
-void Vector_Scale( real* dest, real c, real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] = c * v[k];
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = v[i];
+    }
 }
 
 
-/*void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+void Vector_Scale( real* dest, real c, real* v, int k )
 {
-    for ( --k; k >= 0; --k )
-        dest[k] = c * v[k] + d * y[k];
-}*/
-
+    int i;
 
-/*void Vector_Add( real* dest, real c, real* v, int k )
-{
-    for ( --k; k >= 0; --k )
-        dest[k] += c * v[k];
-}*/
+    for ( i = 0; i < k; ++i )
+    {
+        dest[i] = c * v[i];
+    }
+}
 
 
 real Dot( real* v1, real* v2, int k )
 {
-    real ret = 0;
+    int i;
+    real ret;
 
-    for ( --k; k >= 0; --k )
-        ret +=  v1[k] * v2[k];
+    ret = 0.0;
+
+    for ( i = 0; i < k; ++i )
+    {
+        ret += v1[i] * v2[i];
+    }
 
     return ret;
 }
@@ -101,10 +114,15 @@ real Dot_local( real *v1, real *v2, int k )
 
 real Norm( real* v1, int k )
 {
-    real ret = 0;
+    int i;
+    real ret;
 
-    for ( --k; k >= 0; --k )
-        ret +=  SQR( v1[k] );
+    ret = 0.0;
+
+    for ( i = 0; i < k; ++i )
+    {
+        ret +=  SQR( v1[i] );
+    }
 
     return sqrt( ret );
 }
diff --git a/PuReMD/src/vector.h b/PuReMD/src/vector.h
index e133d8aef762291a5bf9861c3696bf47fd85c1d4..1b12342307e180eac1ecdfb96d894ce9f6be41bb 100644
--- a/PuReMD/src/vector.h
+++ b/PuReMD/src/vector.h
@@ -56,6 +56,7 @@ static inline void Vector_Add( real* dest, real c, real* v, int k )
     }
 }
 
+
 real Dot( real*, real*, int );
 
 real Dot_local( real*, real*, int );