From d56ca2ee1a04243d81d4a9a2e6317a6dccb34eef Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Mon, 6 Jul 2020 00:09:33 -0400
Subject: [PATCH] PG-PuReMD: fix compilation error (remove variable). Fix issue
 with dual CG solver for QEq (local arithmetic in SpMV was incorrect). Revert
 CG solver convergence criterion to use norm of the preconditioned residual
 vector.

---
 PG-PuReMD/src/charges.c                 |  5 ++-
 PG-PuReMD/src/cuda/cuda_allocate.cu     |  9 ----
 PG-PuReMD/src/cuda/cuda_charges.cu      | 14 ++++---
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu | 55 ++++++++++++++++---------
 4 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index baec5a9f..7379596f 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -335,10 +335,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 
         /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
-        q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
+        atom->q = workspace->x[i][0] - u * workspace->x[i][1];
 #else
-        q[i] = atom->q = workspace->s[i] - u * workspace->t[i];
+        atom->q = workspace->s[i] - u * workspace->t[i];
 #endif
+        q[i] = atom->q;
 
         /* update previous solutions in s & t */
         atom->s[3] = atom->s[2];
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index eeee26df..41285852 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -364,8 +364,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 {
     int total_real, total_rvec, local_rvec;
 
-    workspace->allocated = TRUE;
-
     total_real = sizeof(real) * total_cap;
     total_rvec = sizeof(rvec) * total_cap;
     local_rvec = sizeof(rvec) * local_cap;
@@ -515,13 +513,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 
 void Cuda_Deallocate_Workspace( control_params *control, storage *workspace )
 {
-    if ( workspace->allocated == FALSE )
-    {
-        return;
-    }
-
-    workspace->allocated = FALSE;
-
     /* bond order related storage  */
     cuda_free( workspace->total_bond_order, "total_bo" );
     cuda_free( workspace->Deltap, "Deltap" );
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 93884ab4..f9c34d87 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -58,13 +58,13 @@ static void jacobi( reax_system const * const system,
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
-        + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_jacobi <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -245,8 +245,8 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           (control_params *)control->d_control_params,
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -376,6 +376,7 @@ CUDA_GLOBAL void k_extrapolate_charges_qeq_part2( reax_atom *my_atoms,
         return;
     }
 
+    /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
     my_atoms[i].q = workspace.x[i][0] - u * workspace.x[i][1];
 #else
@@ -525,7 +526,8 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 #endif
 
     /* global reduction on pseudo-charges for s and t */
-    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     u = all_sum[0] / all_sum[1];
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index b0e7f70c..fdb63082 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -294,7 +294,17 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
 }
 
 
-CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A,
+/* sparse matrix, dense vector multiplication AX = B,
+ * where warps of 32 threads
+ * collaborate to multiply each row
+ *
+ * A: symmetric, full, square matrix,
+ *    stored in CSR format
+ * X: 2 dense vectors, size equal to num. columns in A
+ * B (output): 2 dense vectors, size equal to num. columns in A
+ * N: number of rows in A */
+CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start,
+        int *row_ptr_end, int *col_ind, real *vals,
         rvec2 const * const x, rvec2 * const b, int n )
 {
     int i, pj, thread_id, warp_id, lane_id, offset;
@@ -310,23 +320,27 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A,
     if ( warp_id < n )
     {
         i = warp_id;
+        si = row_ptr_start[i];
+        ei = row_ptr_end[i];
         sum[0] = 0.0;
         sum[1] = 0.0;
-        si = A.start[i];
-        ei = A.end[i];
 
+        /* partial sums per thread */
         for ( pj = si + lane_id; pj < ei; pj += warpSize )
         {
-            sum[0] += A.val[pj] * x[ A.j[pj] ][0];
-            sum[1] += A.val[pj] * x[ A.j[pj] ][1];
+            sum[0] += vals[pj] * x[ col_ind[pj] ][0];
+            sum[1] += vals[pj] * x[ col_ind[pj] ][1];
         }
 
+        /* warp-level reduction of partial sums
+         * using registers within a warp */
         for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
         {
             sum[0] += __shfl_down_sync( mask, sum[0], offset );
             sum[1] += __shfl_down_sync( mask, sum[1], offset );
         }
 
+        /* first thread within a warp writes sum to global memory */
         if ( lane_id == 0 )
         {
             b[i][0] = sum[0];
@@ -418,7 +432,7 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         sparse_matrix const * const A, rvec2 const * const x,
         rvec2 * const b, int n )
 {
-//    int blocks;
+    int blocks;
 
     if ( A->format == SYM_HALF_MATRIX )
     {
@@ -445,11 +459,14 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         /* 1 thread per row implementation */
 //        k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>>
 //             ( *A, x, b, n );
+
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
         
-        /* one warp per row implementation,
-         * with shared memory to accumulate partial row sums */
-        k_dual_sparse_matvec_full_opt_csr <<< control->blocks_n, control->block_size_n >>>
-                ( *A, x, b, n );
+        /* multiple threads per row implementation
+         * using registers to accumulate partial row sums */
+        k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+                ( A->start, A->end, A->j, A->val, x, b, n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -621,13 +638,13 @@ static void Sparse_MatVec_local( control_params const * const control,
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
-        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
         /* 1 thread per row implementation */
 //        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>>
 //             ( A->start, A->end, A->j, A->val, x, b, n );
 
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
         /* multiple threads per row implementation
          * using registers to accumulate partial row sums */
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
@@ -672,12 +689,12 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
                 "Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (real *) workspace->host_scratch;
         copy_host_device( spad, b, sizeof(real) * n1,
-                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::b" );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
         copy_host_device( spad, b, sizeof(real) * n2,
-                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::b" );
 //#endif
     }
 }
@@ -939,8 +956,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 
     redux[0] = Dot_local( workspace, workspace->d_workspace->r,
             workspace->d_workspace->d, system->n );
-    redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->r, system->n );
+    redux[1] = Dot_local( workspace, workspace->d_workspace->d,
+            workspace->d_workspace->d, system->n );
     redux[2] = Dot_local( workspace, b, b, system->n );
 
 #if defined(LOG_PERFORMANCE)
@@ -988,8 +1005,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
                 workspace->d_workspace->p, system->n );
-        redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->r, system->n );
+        redux[1] = Dot_local( workspace, workspace->d_workspace->p,
+                workspace->d_workspace->p, system->n );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
-- 
GitLab