From a82576a0ee82debe2675894613b1af23b5044981 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Thu, 6 Aug 2020 16:47:24 -0400
Subject: [PATCH] PG-PuReMD: add dual QEq BiCGStab solver.
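
Implement Cuda_dual_BiCGStab, which solves the two QEq linear systems
(s and t) simultaneously using rvec2 storage, mirroring the existing
dual CG code path. Supporting changes: allocate and deallocate the
dual solver workspace vectors, add an rvec2 vector copy routine,
enable the optimized 32-threads-per-row half-format SpMV kernels for
both the single and dual solvers, and report the MPI rank in solver
convergence warnings.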

---
 PG-PuReMD/src/cuda/cuda_allocate.cu      |  22 +
 PG-PuReMD/src/cuda/cuda_charges.cu       |  14 +-
 PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu |  48 +++
 PG-PuReMD/src/cuda/cuda_dense_lin_alg.h  |   3 +
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu  | 487 +++++++++++++++++++++--
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.h   |   7 +-
 PG-PuReMD/src/reax_types.h               |  20 +-
 7 files changed, 555 insertions(+), 46 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 601cb2ad..dd791df7 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -503,6 +503,17 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
         cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
         cuda_malloc( (void **) &workspace->r_hat, sizeof(real) * total_cap, TRUE, "r_hat" );
         cuda_malloc( (void **) &workspace->q_hat, sizeof(real) * total_cap, TRUE, "q_hat" );
+#if defined(DUAL_SOLVER)
+        cuda_malloc( (void **) &workspace->y2, sizeof(rvec2) * total_cap, TRUE, "y2" );
+        cuda_malloc( (void **) &workspace->g2, sizeof(rvec2) * total_cap, TRUE, "g2" );
+        cuda_malloc( (void **) &workspace->z2, sizeof(rvec2) * total_cap, TRUE, "z2" );
+        cuda_malloc( (void **) &workspace->r2, sizeof(rvec2) * total_cap, TRUE, "r2" );
+        cuda_malloc( (void **) &workspace->d2, sizeof(rvec2) * total_cap, TRUE, "d2" );
+        cuda_malloc( (void **) &workspace->q2, sizeof(rvec2) * total_cap, TRUE, "q2" );
+        cuda_malloc( (void **) &workspace->p2, sizeof(rvec2) * total_cap, TRUE, "p2" );
+        cuda_malloc( (void **) &workspace->r_hat2, sizeof(rvec2) * total_cap, TRUE, "r_hat2" );
+        cuda_malloc( (void **) &workspace->q_hat2, sizeof(rvec2) * total_cap, TRUE, "q_hat2" );
+#endif
         break;
 
     case PIPECG_S:
@@ -640,6 +651,17 @@ void Cuda_Deallocate_Workspace_Part2( control_params *control, storage *workspac
             cuda_free( workspace->p, "p" );
             cuda_free( workspace->r_hat, "r_hat" );
             cuda_free( workspace->q_hat, "q_hat" );
+#if defined(DUAL_SOLVER)
+            cuda_free( workspace->y2, "y2" );
+            cuda_free( workspace->g2, "g2" );
+            cuda_free( workspace->z2, "z2" );
+            cuda_free( workspace->r2, "r2" );
+            cuda_free( workspace->d2, "d2" );
+            cuda_free( workspace->q2, "q2" );
+            cuda_free( workspace->p2, "p2" );
+            cuda_free( workspace->r_hat2, "r_hat2" );
+            cuda_free( workspace->q_hat2, "q_hat2" );
+#endif
             break;
 
         case PIPECG_S:
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 01d811f0..a8d4bc86 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -623,9 +623,9 @@ void QEq( reax_system const * const system, control_params const * const control
 
     case CG_S:
 #if defined(DUAL_SOLVER)
-        iters = Cuda_dual_CG( system, control, data, workspace, &workspace->d_workspace->H,
-                workspace->d_workspace->b, control->cm_solver_q_err, workspace->d_workspace->x, mpi_data,
-                out_control->log );
+        iters = Cuda_dual_CG( system, control, data, workspace,
+                &workspace->d_workspace->H, workspace->d_workspace->b,
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
 #else
         iters = Cuda_CG( system, control, data, workspace, &workspace->d_workspace->H,
                 workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
@@ -639,11 +639,9 @@ void QEq( reax_system const * const system, control_params const * const control
 
     case BiCGStab_S:
 #if defined(DUAL_SOLVER)
-        fprintf( stderr, "[ERROR] Dual BiCGStab solver for QEq not yet implemented. Terminating...\n" );
-        exit( INVALID_INPUT );
-//        iters = Cuda_dual_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H,
-//                workspace->d_workspace->b, control->cm_solver_q_err, workspace->d_workspace->x, mpi_data,
-//                out_control->log );
+        iters = Cuda_dual_BiCGStab( system, control, data, workspace,
+                &workspace->d_workspace->H, workspace->d_workspace->b,
+                control->cm_solver_q_err, workspace->d_workspace->x, mpi_data );
 #else
         iters = Cuda_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H,
                 workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data );
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index 7a0354c8..01d8b831 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -52,6 +52,31 @@ CUDA_GLOBAL void k_vector_copy( real * const dest, real const * const v,
 }
 
 
+/* copy the entries from one vector to another
+ *
+ * inputs:
+ *  v: dense vector to copy
+ *  k: number of entries in v
+ * output:
+ *  dest: vector copied into
+ */
+CUDA_GLOBAL void k_vector_copy_rvec2( rvec2 * const dest, rvec2 const * const v,
+        unsigned int k )
+{
+    unsigned int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= k )
+    {
+        return;
+    }
+
+    dest[i][0] = v[i][0];
+    dest[i][1] = v[i][1];
+}
+
+
 CUDA_GLOBAL void k_vector_copy_from_rvec2( real * const dst, rvec2 const * const src,
         int index, int n )
 {
@@ -323,6 +348,29 @@ void Vector_Copy( real * const dest, real const * const v,
 }
 
 
+/* copy the entries from one vector to another
+ *
+ * inputs:
+ *  v: dense vector to copy
+ *  k: number of entries in v
+ * output:
+ *  dest: vector copied into
+ */
+void Vector_Copy_rvec2( rvec2 * const dest, rvec2 const * const v,
+        unsigned int k )
+{
+    int blocks;
+
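+    /* one thread per vector entry, rounded up to a whole number of blocks */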
+    blocks = (k / DEF_BLOCK_SIZE)
+        + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_vector_copy_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dest, v, k );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+}
+
+
 void Vector_Copy_From_rvec2( real * const dst, rvec2 const * const src,
         int index, int k )
 {
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h
index 422dcded..61846fb6 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.h
@@ -11,6 +11,9 @@ void Vector_MakeZero( real * const, unsigned int );
 void Vector_Copy( real * const, real const * const,
         unsigned int );
 
+void Vector_Copy_rvec2( rvec2 * const, rvec2 const * const,
+        unsigned int );
+
 void Vector_Copy_From_rvec2( real * const, rvec2 const * const,
         int, int );
 
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 48b2b7bc..39edee8b 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -175,6 +175,67 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start,
 }
 
 
+/* sparse matrix, dense vector multiplication Ax = b,
+ * where warps of 32 threads collaborate to multiply each row
+ *
+ * A: symmetric (upper triangular portion only stored) matrix,
+ *    stored in CSR format
+ * x: dense vector, size equal to num. columns in A
+ * b (output): dense vector, size equal to num. columns in A;
+ *    assumed zero-initialized, as entries are accumulated via atomic adds
+ * N: number of rows in A */
+CUDA_GLOBAL void k_sparse_matvec_half_opt_csr( int *row_ptr_start,
+        int *row_ptr_end, int *col_ind, real *vals,
+        const real * const x, real * const b, int N )
+{
+    int pj, si, ei, thread_id, warp_id, lane_id, offset;
+    real sum;
+
+    thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    warp_id = thread_id >> 5;
+
+    if ( warp_id >= N )
+    {
+        return;
+    }
+
+    lane_id = thread_id & 0x0000001F;
+    si = row_ptr_start[warp_id];
+    ei = row_ptr_end[warp_id];
+
+    /* A symmetric, upper triangular portion stored
+     * => diagonal only contributes once */
+    if ( lane_id == 0 )
+    {
+        sum = vals[si] * x[warp_id];
+    }
+    else
+    {
+        sum = 0.0;
+    }
+
+    /* partial sums per thread (off-diagonal entries begin at si + 1) */
+    for ( pj = si + lane_id + 1; pj < ei; pj += warpSize )
+    {
+        sum += vals[pj] * x[col_ind[pj]];
+        /* symmetric contribution to row j */
+        atomicAdd( (double *) &b[col_ind[pj]], (double) (vals[pj] * x[warp_id]) );
+    }
+
+    /* warp-level reduction of partial sums
+     * using registers within a warp */
+    for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
+    {
+        sum += __shfl_down_sync( FULL_MASK, sum, offset );
+    }
+
+    /* local contribution to row i for this warp */
+    if ( lane_id == 0 )
+    {
+        atomicAdd( (double *) &b[warp_id], (double) sum );
+    }
+}
+
+
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
@@ -213,8 +274,7 @@ CUDA_GLOBAL void k_sparse_matvec_full_csr( int *row_ptr_start,
 
 
 /* sparse matrix, dense vector multiplication Ax = b,
- * where warps of 32 threads
- * collaborate to multiply each row
+ * where warps of 32 threads collaborate to multiply each row
  *
  * A: symmetric matrix,
  *    stored in CSR format
@@ -225,8 +285,7 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         const real * const x, real * const b, int n )
 {
-    int pj, thread_id, warp_id, lane_id, offset;
-    int si, ei;
+    int pj, si, ei, thread_id, warp_id, lane_id, offset;
     real sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
@@ -310,6 +369,73 @@ CUDA_GLOBAL void k_dual_sparse_matvec_half_csr( int *row_ptr_start,
 }
 
 
+/* sparse matrix, dense vector multiplication Ax = b,
+ * where warps of 32 threads collaborate to multiply each row
+ *
+ * A: symmetric (upper triangular portion only stored) matrix,
+ *    stored in CSR format
+ * x: 2 dense vectors, size equal to num. columns in A
+ * b (output): 2 dense vectors, size equal to num. columns in A;
+ *    assumed zero-initialized, as entries are accumulated via atomic adds
+ * N: number of rows in A */
+CUDA_GLOBAL void k_dual_sparse_matvec_half_opt_csr( int *row_ptr_start,
+        int *row_ptr_end, int *col_ind, real *vals,
+        const rvec2 * const x, rvec2 * const b, int N )
+{
+    int pj, si, ei, thread_id, warp_id, lane_id, offset;
+    rvec2 sum;
+
+    thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    warp_id = thread_id >> 5;
+
+    if ( warp_id >= N )
+    {
+        return;
+    }
+
+    lane_id = thread_id & 0x0000001F;
+    si = row_ptr_start[warp_id];
+    ei = row_ptr_end[warp_id];
+
+    /* A symmetric, upper triangular portion stored
+     * => diagonal only contributes once */
+    if ( lane_id == 0 )
+    {
+        sum[0] = vals[si] * x[warp_id][0];
+        sum[1] = vals[si] * x[warp_id][1];
+    }
+    else
+    {
+        sum[0] = 0.0;
+        sum[1] = 0.0;
+    }
+
+    /* partial sums per thread (off-diagonal entries begin at si + 1) */
+    for ( pj = si + lane_id + 1; pj < ei; pj += warpSize )
+    {
+        sum[0] += vals[pj] * x[col_ind[pj]][0];
+        sum[1] += vals[pj] * x[col_ind[pj]][1];
+        /* symmetric contribution to row j */
+        atomicAdd( (double *) &b[col_ind[pj]][0], (double) (vals[pj] * x[warp_id][0]) );
+        atomicAdd( (double *) &b[col_ind[pj]][1], (double) (vals[pj] * x[warp_id][1]) );
+    }
+
+    /* warp-level reduction of partial sums
+     * using registers within a warp */
+    for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
+    {
+        sum[0] += __shfl_down_sync( FULL_MASK, sum[0], offset );
+        sum[1] += __shfl_down_sync( FULL_MASK, sum[1], offset );
+    }
+
+    /* local contribution to row i for this warp */
+    if ( lane_id == 0 )
+    {
+        atomicAdd( (double *) &b[warp_id][0], (double) sum[0] );
+        atomicAdd( (double *) &b[warp_id][1], (double) sum[1] );
+    }
+}
+
+
 /* 1 thread per row implementation */
 CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
         rvec2 const * const x, rvec2 * const b, int n )
@@ -487,19 +613,16 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         cuda_memset( b, 0, sizeof(rvec2) * n, "Dual_Sparse_MatVec_local::b" );
 
         /* 1 thread per row implementation */
-        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
-            ( A->start, A->end, A->j, A->val, x, b, A->n );
+//        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
+//            ( A->start, A->end, A->j, A->val, x, b, A->n );
 
-        //TODO: implement half-format dual SpMV
-//        blocks = (A->n * 32) / DEF_BLOCK_SIZE
-//            + ((A->n * 32) % DEF_BLOCK_SIZE == 0 ? 0 : 1);
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + (A->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
         
-        /* multiple threads per row implementation,
-         * with shared memory to accumulate partial row sums */
-//        k_dual_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
-//             ( A->start, A->end, A->j, A->val, x, b, A->n );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
+        /* 32 threads per row implementation
+         * using registers to accumulate partial row sums */
+        k_dual_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
@@ -514,9 +637,9 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
          * using registers to accumulate partial row sums */
         k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
                 ( A->start, A->end, A->j, A->val, x, b, A->n );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
     }
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -670,16 +793,16 @@ static void Sparse_MatVec_local( control_params const * const control,
         cuda_memset( b, 0, sizeof(real) * n, "Sparse_MatVec_local::b" );
 
         /* 1 thread per row implementation */
-        k_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
-            ( A->start, A->end, A->j, A->val, x, b, A->n );
+//        k_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
+//            ( A->start, A->end, A->j, A->val, x, b, A->n );
 
-//        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-//            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + (A->n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-        /* 32 threads per row implementation,
-         * with shared memory to accumulate partial row sums */
-//        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
-//             ( A->start, A->end, A->j, A->val, x, b, A->n );
+        /* 32 threads per row implementation
+         * using registers to accumulate partial row sums */
+        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
@@ -790,11 +913,13 @@ static void Sparse_MatVec( reax_system const * const system,
 }
 
 
+/* Preconditioned Conjugate Gradient Method
+ * Note: this version is for the dual QEq solver */
 int Cuda_dual_CG( reax_system const * const system,
         control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data, FILE *fout )
+        rvec2 * const x, mpi_datatypes * const mpi_data )
 {
     unsigned int i, matvecs;
     int ret;
@@ -1079,10 +1204,10 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
     }
 
-    if ( i >= control->cm_solver_max_iters
-            && system->my_rank == MASTER_NODE )
+    if ( i >= control->cm_solver_max_iters )
     {
-        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i );
+        fprintf( stderr, "[WARNING] p%d: CG convergence failed (%d iters)\n",
+                system->my_rank, i );
         fprintf( stderr, "    [INFO] Rel. residual error: %e\n", r_norm / b_norm );
     }
 
@@ -1090,6 +1215,304 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 }
 
 
+/* Bi-conjugate gradient stabilized method with left preconditioning for
+ * solving nonsymmetric linear systems
+ * Note: this version is for the dual QEq solver
+ *
+ * system: struct containing info on the simulated system (e.g., atom counts)
+ * control: struct containing parameters governing the simulation and numeric methods
+ * data: struct containing simulation data (e.g., atom info)
+ * workspace: struct containing storage for the linear solver
+ * H: sparse, symmetric matrix in CSR format
+ * b: right-hand side of the linear system
+ * tol: tolerance compared against the relative residual for determining convergence
+ * x: initial guess, overwritten with the solution
+ * mpi_data: struct containing MPI datatypes and buffers for communication
+ *
+ * Reference: Netlib (in MATLAB)
+ *  http://www.netlib.org/templates/matlab/bicgstab.m
+ */
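+/* Outline of the preconditioned BiCGStab iteration implemented below
+ * (notation: M = Jacobi preconditioner, (u, v) = dot product; both QEq
+ * systems advance in lockstep):
+ *
+ *   r = b - A * x;  r_hat = r;  rho = omega = 1
+ *   while ||r|| / ||b|| > tol:
+ *     rho = (r_hat, r);  fail if rho == 0
+ *     p = r + beta * (p - omega * z), beta = (rho / rho_old) * (alpha / omega)
+ *       (first iteration: p = r)
+ *     d = M^{-1} * p;  z = A * d;  alpha = rho / (r_hat, z)
+ *     q = r - alpha * z
+ *     q_hat = M^{-1} * q;  y = A * q_hat
+ *     omega = (y, q) / (y, y)
+ *     x = x + alpha * d + omega * q_hat
+ *     r = q - omega * y;  rho_old = rho
+ */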
+int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * const control,
+        simulation_data * const data, storage * const workspace,
+        sparse_matrix const * const H, rvec2 const * const b, real tol,
+        rvec2 * const x, mpi_datatypes * const mpi_data )
+{
+    unsigned int i, matvecs;
+    int ret;
+    rvec2 tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm;
+    real redux[4];
+#if defined(LOG_PERFORMANCE)
+    real time;
+
+    time = Get_Time( );
+#endif
+
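+    /* compute the initial residual: d = A * x, r = b - d;
+     * also begin local reductions for ||b||^2 and ||r||^2 */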
+    Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+            H, x, system->N, workspace->d_workspace->d2 );
+
+    Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, b,
+            -1.0, -1.0, workspace->d_workspace->d2, system->n );
+    Dot_local_rvec2( control, workspace, b,
+            b, system->n, &redux[0], &redux[1] );
+    Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
+            workspace->d_workspace->r2, system->n, &redux[2], &redux[3] );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+    b_norm[0] = SQRT( redux[0] );
+    b_norm[1] = SQRT( redux[1] );
+    r_norm[0] = SQRT( redux[2] );
+    r_norm[1] = SQRT( redux[3] );
+    if ( b_norm[0] == 0.0 )
+    {
+        b_norm[0] = 1.0;
+    }
+    if ( b_norm[1] == 0.0 )
+    {
+        b_norm[1] = 1.0;
+    }
+    Vector_Copy_rvec2( workspace->d_workspace->r_hat2,
+            workspace->d_workspace->r2, system->n );
+    omega[0] = 1.0;
+    omega[1] = 1.0;
+    rho[0] = 1.0;
+    rho[1] = 1.0;
+
+#if defined(LOG_PERFORMANCE)
+    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
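+        /* stop if either system has converged; if only one did, the
+         * other is finished below via the single (non-dual) solver */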
+        if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        Dot_local_rvec2( control, workspace, workspace->d_workspace->r_hat2,
+                workspace->d_workspace->r2, system->n, &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        rho[0] = redux[0];
+        rho[1] = redux[1];
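+        /* BiCGStab breakdown check: (r_hat, r) must remain nonzero */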
+        if ( rho[0] == 0.0 || rho[1] == 0.0 )
+        {
+            break;
+        }
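+        /* update the search direction: p = r + beta * (p - omega * z),
+         * where z = A * M^{-1} * p from the previous iteration */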
+        if ( i > 0 )
+        {
+            beta[0] = (rho[0] / rho_old[0]) * (alpha[0] / omega[0]);
+            beta[1] = (rho[1] / rho_old[1]) * (alpha[1] / omega[1]);
+            Vector_Sum_rvec2( workspace->d_workspace->q2,
+                    1.0, 1.0, workspace->d_workspace->p2,
+                    -1.0 * omega[0], -1.0 * omega[1], workspace->d_workspace->z2, system->n );
+            Vector_Sum_rvec2( workspace->d_workspace->p2,
+                    1.0, 1.0, workspace->d_workspace->r2,
+                    beta[0], beta[1], workspace->d_workspace->q2, system->n );
+        }
+        else
+        {
+            Vector_Copy_rvec2( workspace->d_workspace->p2,
+                    workspace->d_workspace->r2, system->n );
+        }
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
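+        /* apply the Jacobi (diagonal) preconditioner: d = M^{-1} * p */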
+        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p2,
+                workspace->d_workspace->d2, system->n );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
+#endif
+
+        Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+                H, workspace->d_workspace->d2, system->N, workspace->d_workspace->z2 );
+
+        Dot_local_rvec2( control, workspace, workspace->d_workspace->r_hat2,
+                workspace->d_workspace->z2, system->n, &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
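+        /* alpha = (r_hat, r) / (r_hat, A * M^{-1} * p) */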
+        tmp[0] = redux[0];
+        tmp[1] = redux[1];
+        alpha[0] = rho[0] / tmp[0];
+        alpha[1] = rho[1] / tmp[1];
+        Vector_Sum_rvec2( workspace->d_workspace->q2,
+                1.0, 1.0, workspace->d_workspace->r2,
+                -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->z2, system->n );
+        Dot_local_rvec2( control, workspace, workspace->d_workspace->q2,
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        tmp[0] = redux[0];
+        tmp[1] = redux[1];
+        /* early convergence check on the squared norm of q */
+        if ( tmp[0] < tol || tmp[1] < tol )
+        {
+            Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d_workspace->d2, system->n );
+            break;
+        }
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q2,
+                workspace->d_workspace->q_hat2, system->n );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
+#endif
+
+        Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
+                H, workspace->d_workspace->q_hat2, system->N, workspace->d_workspace->y2 );
+
+        Dot_local_rvec2( control, workspace, workspace->d_workspace->y2,
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1] );
+        Dot_local_rvec2( control, workspace, workspace->d_workspace->y2,
+                workspace->d_workspace->y2, system->n, &redux[2], &redux[3] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
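+        /* omega = (y, q) / (y, y), where y = A * M^{-1} * q */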
+        sigma[0] = redux[0];
+        sigma[1] = redux[1];
+        tmp[0] = redux[2];
+        tmp[1] = redux[3];
+        omega[0] = sigma[0] / tmp[0];
+        omega[1] = sigma[1] / tmp[1];
+        Vector_Sum_rvec2( workspace->d_workspace->g2,
+                alpha[0], alpha[1], workspace->d_workspace->d2,
+                omega[0], omega[1], workspace->d_workspace->q_hat2, system->n );
+        Vector_Add_rvec2( x, 1.0, 1.0, workspace->d_workspace->g2, system->n );
+        Vector_Sum_rvec2( workspace->d_workspace->r2,
+                1.0, 1.0, workspace->d_workspace->q2,
+                -1.0 * omega[0], -1.0 * omega[1], workspace->d_workspace->y2, system->n );
+        Dot_local_rvec2( control, workspace, workspace->d_workspace->r2,
+                workspace->d_workspace->r2, system->n, &redux[0], &redux[1] );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+
+        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
+                MPI_SUM, MPI_COMM_WORLD );
+        Check_MPI_Error( ret, __FILE__, __LINE__ );
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
+#endif
+
+        r_norm[0] = SQRT( redux[0] );
+        r_norm[1] = SQRT( redux[1] );
+        if ( omega[0] == 0.0 || omega[1] == 0.0 )
+        {
+            break;
+        }
+        rho_old[0] = rho[0];
+        rho_old[1] = rho[1];
+
+#if defined(LOG_PERFORMANCE)
+        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
+#endif
+    }
+
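+    /* if exactly one of the two systems converged, finish the other
+     * with the single (non-dual) BiCGStab solver */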
+    if ( r_norm[0] / b_norm[0] <= tol
+            && r_norm[1] / b_norm[1] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->d_workspace->t,
+                workspace->d_workspace->x, 1, system->n );
+
+        matvecs = Cuda_BiCGStab( system, control, data, workspace, H,
+                workspace->d_workspace->b_t, tol,
+                workspace->d_workspace->t, mpi_data );
+
+        Vector_Copy_To_rvec2( workspace->d_workspace->x,
+                workspace->d_workspace->t, 1, system->n );
+    }
+    else if ( r_norm[1] / b_norm[1] <= tol
+            && r_norm[0] / b_norm[0] > tol )
+    {
+        Vector_Copy_From_rvec2( workspace->d_workspace->s,
+                workspace->d_workspace->x, 0, system->n );
+
+        matvecs = Cuda_BiCGStab( system, control, data, workspace, H,
+                workspace->d_workspace->b_s, tol,
+                workspace->d_workspace->s, mpi_data );
+
+        Vector_Copy_To_rvec2( workspace->d_workspace->x,
+                workspace->d_workspace->s, 0, system->n );
+    }
+    else
+    {
+        matvecs = 0;
+    }
+
+    if ( i >= control->cm_solver_max_iters )
+    {
+        fprintf( stderr, "[WARNING] p%d: dual BiCGStab convergence failed (%d iters)\n",
+                system->my_rank, i );
+        fprintf( stderr, "    [INFO] Rel. residual error for s solve: %e\n", r_norm[0] / b_norm[0] );
+        fprintf( stderr, "    [INFO] Rel. residual error for t solve: %e\n", r_norm[1] / b_norm[1] );
+    }
+
+    return (i + 1) + matvecs;
+}
+
+
 /* Bi-conjugate gradient stabalized method with left preconditioning for
  * solving nonsymmetric linear systems
  *
@@ -1320,10 +1743,10 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
     }
 
-    if ( i >= control->cm_solver_max_iters
-            && system->my_rank == MASTER_NODE )
+    if ( i >= control->cm_solver_max_iters )
     {
-        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i );
+        fprintf( stderr, "[WARNING] p%d: BiCGStab convergence failed (%d iters)\n",
+                system->my_rank, i );
         fprintf( stderr, "    [INFO] Rel. residual error: %e\n", r_norm / b_norm );
     }
 
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
index 3a04c889..0bdfda5c 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
@@ -28,13 +28,18 @@
 int Cuda_dual_CG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const, FILE * );
+        rvec2 * const, mpi_datatypes * const );
 
 int Cuda_CG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
         real * const, mpi_datatypes * const );
 
+int Cuda_dual_BiCGStab( reax_system const * const, control_params const * const,
+        simulation_data * const, storage * const,
+        sparse_matrix const * const, rvec2 const * const, real,
+        rvec2 * const, mpi_datatypes * const );
+
 int Cuda_BiCGStab( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index e4e4b5a0..af915bcb 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -2127,18 +2127,28 @@ struct storage
     real *u;
     real *w;
 
-    /* dual-CG storage */
+    /* dual GMRES, dual BiCGStab storage */
+    rvec2 *g2;
+    rvec2 *y2;
+
+    /* dual GMRES, dual BiCGStab, dual PIPECG, dual PIPECR storage */
+    rvec2 *z2;
+
+    /* dual CG, dual BiCGStab, dual PIPECG, dual PIPECR storage */
+    rvec2 *r2;
     rvec2 *d2;
-    rvec2 *p2;
     rvec2 *q2;
-    rvec2 *r2;
+    rvec2 *p2;
 
-    /* dual-PIPECG storage */
+    /* dual BiCGStab storage */
+    rvec2 *r_hat2;
+    rvec2 *q_hat2;
+
+    /* dual PIPECG, dual PIPECR storage */
     rvec2 *m2;
     rvec2 *n2;
     rvec2 *u2;
     rvec2 *w2;
-    rvec2 *z2;
 
     /* Taper */
     real Tap[8];
-- 
GitLab