PG-PuReMD: fix compilation error (remove variable). Fix issue with dual CG...

PG-PuReMD: fix compilation error (remove variable). Fix issue with dual CG solver for QEq (local arithmetic in SpMV was incorrect). Revert CG solver convergence criterion to use norm of the preconditioned residual vector.

PG-PuReMD: fix compilation error (remove variable). Fix issue with dual CG...
PG-PuReMD: fix compilation error (remove variable). Fix issue with dual CG solver for QEq (local arithmetic in SpMV was incorrect). Revert CG solver convergence criterion to use norm of the preconditioned residual vector.
d56ca2ee · Kurt A. O'Hearn · e42123c7 · d56ca2ee · d56ca2ee · d56ca2ee
Commit d56ca2ee authored 4 years ago by Kurt A. O'Hearn
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -335,10 +335,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,

        /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
-        q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
+        atom->q = workspace->x[i][0] - u * workspace->x[i][1];
 #else
-        q[i] = atom->q = workspace->s[i] - u * workspace->t[i];
+        atom->q = workspace->s[i] - u * workspace->t[i];
 #endif
+        q[i] = atom->q;

        /* update previous solutions in s & t */
        atom->s[3] = atom->s[2];

--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -364,8 +364,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 {
    int total_real, total_rvec, local_rvec;

-    workspace->allocated = TRUE;
-
    total_real = sizeof(real) * total_cap;
    total_rvec = sizeof(rvec) * total_cap;
    local_rvec = sizeof(rvec) * local_cap;
@@ -515,13 +513,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,

 void Cuda_Deallocate_Workspace( control_params *control, storage *workspace )
 {
-    if ( workspace->allocated == FALSE )
-    {
-        return;
-    }
-
-    workspace->allocated = FALSE;
-
    /* bond order related storage  */
    cuda_free( workspace->total_bond_order, "total_bo" );
    cuda_free( workspace->Deltap, "Deltap" );

--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -58,13 +58,13 @@ static void jacobi( reax_system const * const system,
    int blocks;

    blocks = system->n / DEF_BLOCK_SIZE
-        + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_jacobi <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, system->reax_param.d_sbp, 
          *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }


@@ -245,8 +245,8 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
        ( system->d_my_atoms, system->reax_param.d_sbp, 
          (control_params *)control->d_control_params,
          *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }


@@ -376,6 +376,7 @@ CUDA_GLOBAL void k_extrapolate_charges_qeq_part2( reax_atom *my_atoms,
        return;
    }

+    /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
    my_atoms[i].q = workspace.x[i][0] - u * workspace.x[i][1];
 #else
@@ -525,7 +526,8 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 #endif

    /* global reduction on pseudo-charges for s and t */
-    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );

    u = all_sum[0] / all_sum[1];

--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -294,7 +294,17 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
 }


-CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A,
+/* sparse matrix, dense vector multiplication AX = B,
+ * where warps of 32 threads
+ * collaborate to multiply each row
+ *
+ * A: symmetric, full, square matrix,
+ *    stored in CSR format
+ * X: 2 dense vectors, size equal to num. columns in A
+ * B (output): 2 dense vectors, size equal to num. columns in A
+ * N: number of rows in A */
+CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start,
+        int *row_ptr_end, int *col_ind, real *vals,
        rvec2 const * const x, rvec2 * const b, int n )
 {
    int i, pj, thread_id, warp_id, lane_id, offset;
@@ -310,23 +320,27 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A,
    if ( warp_id < n )
    {
        i = warp_id;
+        si = row_ptr_start[i];
+        ei = row_ptr_end[i];
        sum[0] = 0.0;
        sum[1] = 0.0;
-        si = A.start[i];
-        ei = A.end[i];

+        /* partial sums per thread */
        for ( pj = si + lane_id; pj < ei; pj += warpSize )
        {
-            sum[0] += A.val[pj] * x[ A.j[pj] ][0];
-            sum[1] += A.val[pj] * x[ A.j[pj] ][1];
+            sum[0] += vals[pj] * x[ col_ind[pj] ][0];
+            sum[1] += vals[pj] * x[ col_ind[pj] ][1];
        }

+        /* warp-level reduction of partial sums
+         * using registers within a warp */
        for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
        {
            sum[0] += __shfl_down_sync( mask, sum[0], offset );
            sum[1] += __shfl_down_sync( mask, sum[1], offset );
        }

+        /* first thread within a warp writes sum to global memory */
        if ( lane_id == 0 )
        {
            b[i][0] = sum[0];
@@ -418,7 +432,7 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
        sparse_matrix const * const A, rvec2 const * const x,
        rvec2 * const b, int n )
 {
-//    int blocks;
+    int blocks;

    if ( A->format == SYM_HALF_MATRIX )
    {
@@ -445,11 +459,14 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
        /* 1 thread per row implementation */
 //        k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>>
 //             ( *A, x, b, n );
+
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
        
-        /* one warp per row implementation,
-         * with shared memory to accumulate partial row sums */
-        k_dual_sparse_matvec_full_opt_csr <<< control->blocks_n, control->block_size_n >>>
-                ( *A, x, b, n );
+        /* multiple threads per row implementation
+         * using registers to accumulate partial row sums */
+        k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+                ( A->start, A->end, A->j, A->val, x, b, n );
        cudaDeviceSynchronize( );
        cudaCheckError( );
    }
@@ -621,13 +638,13 @@ static void Sparse_MatVec_local( control_params const * const control,
    }
    else if ( A->format == SYM_FULL_MATRIX )
    {
-        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
        /* 1 thread per row implementation */
 //        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>>
 //             ( A->start, A->end, A->j, A->val, x, b, n );

+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
        /* multiple threads per row implementation
         * using registers to accumulate partial row sums */
        k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
@@ -672,12 +689,12 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
                "Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
        spad = (real *) workspace->host_scratch;
        copy_host_device( spad, b, sizeof(real) * n1,
-                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::b" );

        Coll( system, mpi_data, spad, buf_type, mpi_type );

        copy_host_device( spad, b, sizeof(real) * n2,
-                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::b" );
 //#endif
    }
 }
@@ -939,8 +956,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont

    redux[0] = Dot_local( workspace, workspace->d_workspace->r,
            workspace->d_workspace->d, system->n );
-    redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->r, system->n );
+    redux[1] = Dot_local( workspace, workspace->d_workspace->d,
+            workspace->d_workspace->d, system->n );
    redux[2] = Dot_local( workspace, b, b, system->n );

 #if defined(LOG_PERFORMANCE)
@@ -988,8 +1005,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont

        redux[0] = Dot_local( workspace, workspace->d_workspace->r,
                workspace->d_workspace->p, system->n );
-        redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->r, system->n );
+        redux[1] = Dot_local( workspace, workspace->d_workspace->p,
+                workspace->d_workspace->p, system->n );

 #if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );