From c9eed2322f748169420d867b976b3ffbc4c34d8f Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Thu, 6 Aug 2020 01:04:36 -0400
Subject: [PATCH] PG-PuReMD: add support for symmetric, half stored format
 (SYM_HALF_MATRIX) of the sparse matrix for the charge model (add
 initialization routines and fix-up solver).

---
 PG-PuReMD/src/cuda/cuda_charges.cu      |   1 -
 PG-PuReMD/src/cuda/cuda_forces.cu       | 244 ++++++++++++++++++++++--
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu |  65 +++----
 PG-PuReMD/src/forces.c                  |   6 +-
 4 files changed, 260 insertions(+), 56 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index ce45043e..01d811f0 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -622,7 +622,6 @@ void QEq( reax_system const * const system, control_params const * const control
         break;
 
     case CG_S:
-        workspace->d_workspace->H.format = SYM_FULL_MATRIX;
 #if defined(DUAL_SOLVER)
         iters = Cuda_dual_CG( system, control, data, workspace, &workspace->d_workspace->H,
                 workspace->d_workspace->b, control->cm_solver_q_err, workspace->d_workspace->x, mpi_data,
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index bba38c8b..1da8cb67 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -259,6 +259,163 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i
 }
 
 
+/* Compute the charge matrix entries and store the matrix in half format
+ * using the far neighbors list (stored in full format) and according to
+ * the full shell communication method */
+CUDA_GLOBAL void k_init_cm_half_fs( reax_atom *my_atoms, single_body_parameters *sbp, 
+        two_body_parameters *tbp, storage workspace, control_params *control, 
+        reax_list far_nbr_list, int num_atom_types,
+        int *max_cm_entries, int *realloc_cm_entries )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int cm_top;
+    int num_cm_entries;
+    real r_ij;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= workspace.H.n_max )
+    {
+        return;
+    }
+
+    H = &workspace.H;
+    cm_top = H->start[i];
+
+    if ( i < H->n )
+    {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+
+        /* diagonal entry in the matrix */
+        H->j[cm_top] = i;
+        H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
+                i, i, 0.0, 0.0, DIAGONAL );
+        ++cm_top;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                j = far_nbr_list.far_nbr_list.nbr[pj];
+                atom_j = &my_atoms[j];
+                type_j = atom_j->type;
+
+                /* if j is a local OR ghost atom in the upper triangular region of the matrix */
+                if ( atom_i->orig_id < atom_j->orig_id )
+                {
+                    twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
+                    r_ij = far_nbr_list.far_nbr_list.d[pj];
+
+                    H->j[cm_top] = j;
+                    H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
+                            control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL );
+                    ++cm_top;
+                }
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    H->end[i] = cm_top;
+    num_cm_entries = cm_top - H->start[i];
+
+    /* reallocation checks */
+    if ( num_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
+}
+
+
+/* Compute the tabulated charge matrix entries and store the matrix in half format
+ * using the far neighbors list (stored in full format) and according to
+ * the full shell communication method */
+CUDA_GLOBAL void k_init_cm_half_fs_tab( reax_atom *my_atoms, single_body_parameters *sbp, 
+        storage workspace, control_params *control, 
+        reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types,
+        int *max_cm_entries, int *realloc_cm_entries )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int cm_top;
+    int num_cm_entries;
+    real r_ij;
+    single_body_parameters *sbp_i;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= workspace.H.n_max )
+    {
+        return;
+    }
+
+    H = &workspace.H;
+    cm_top = H->start[i];
+
+    if ( i < H->n )
+    {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+
+        /* diagonal entry in the matrix */
+        H->j[cm_top] = i;
+        H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
+                i, i, 0.0, 0.0, DIAGONAL );
+        ++cm_top;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                atom_j = &my_atoms[j];
+                type_j = atom_j->type;
+
+                /* if j is a local OR ghost atom in the upper triangular region of the matrix */
+                if ( atom_i->orig_id < atom_j->orig_id )
+                {
+                    r_ij = far_nbr_list.far_nbr_list.d[pj];
+
+                    H->j[cm_top] = j;
+                    H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij,
+                            type_i, type_j, num_atom_types );
+                    ++cm_top;
+                }
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    H->end[i] = cm_top;
+    num_cm_entries = cm_top - H->start[i];
+
+    /* reallocation checks */
+    if ( num_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
+}
+
+
 /* Compute the charge matrix entries and store the matrix in full format
  * using the far neighbors list (stored in full format) and according to
  * the full shell communication method */
@@ -302,7 +459,6 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
                 i, i, 0.0, 0.0, DIAGONAL );
         ++cm_top;
 
-        /* update i-j distance - check if j is within cutoff */
         for ( pj = start_i; pj < end_i; ++pj )
         {
             j = far_nbr_list.far_nbr_list.nbr[pj];
@@ -340,7 +496,7 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
  * using the far neighbors list (stored in full format) and according to
  * the full shell communication method */
 CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_parameters *sbp, 
-        two_body_parameters *tbp, storage workspace, control_params *control, 
+        storage workspace, control_params *control, 
         reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types,
         int *max_cm_entries, int *realloc_cm_entries )
 {
@@ -378,7 +534,6 @@ CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_paramet
                 i, i, 0.0, 0.0, DIAGONAL );
         ++cm_top;
 
-        /* update i-j distance - check if j is within cutoff */
         for ( pj = start_i; pj < end_i; ++pj )
         {
             j = far_nbr_list.far_nbr_list.nbr[pj];
@@ -608,6 +763,50 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
 }
 
 
+CUDA_GLOBAL void k_estimate_storages_cm_half( reax_atom *my_atoms,
+        control_params *control, reax_list far_nbr_list, int cm_n, int cm_n_max,
+        int *cm_entries, int *max_cm_entries )
+{
+    int i, j, pj; 
+    int start_i, end_i;
+    int num_cm_entries;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= cm_n_max )
+    {
+        return;
+    }
+
+    num_cm_entries = 0;
+
+    if ( i < cm_n )
+    {
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+
+        /* diagonal entry */
+        ++num_cm_entries;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        { 
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut
+                    && (j < cm_n || my_atoms[i].orig_id < my_atoms[j].orig_id) )
+            {
+                ++num_cm_entries;
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    cm_entries[i] = num_cm_entries;
+    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+}
+
+
 CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control,
         reax_list far_nbr_list, int cm_n, int cm_n_max,
         int *cm_entries, int *max_cm_entries )
@@ -1198,11 +1397,11 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 
         if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
         {
-//            k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>>
-//                ( (control_params *) control->d_control_params,
-//                  *(lists[FAR_NBRS]), workspace->d_workspace->H.n,
-//                  workspace->d_workspace->H.n_max,
-//                  system->d_cm_entries, system->d_max_cm_entries );
+            k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>>
+                ( system->d_my_atoms, (control_params *) control->d_control_params,
+                  *(lists[FAR_NBRS]), workspace->d_workspace->H.n,
+                  workspace->d_workspace->H.n_max,
+                  system->d_cm_entries, system->d_max_cm_entries );
         }
         else
         {
@@ -1328,13 +1527,22 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     {
         if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
         {
-//            k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE >>>
-//                ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
-//                  *(workspace->d_workspace), (control_params *) control->d_control_params,
-//                  *(lists[FAR_NBRS]), workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
-//                  system->d_max_cm_entries, system->d_realloc_cm_entries );
-//            cudaDeviceSynchronize( );
-//            cudaCheckError( );
+            if ( control->tabulate <= 0 )
+            {
+                k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
+                      *(workspace->d_workspace), (control_params *) control->d_control_params,
+                      *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
+                      system->d_max_cm_entries, system->d_realloc_cm_entries );
+            }
+            else
+            {
+                k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp,
+                      *(workspace->d_workspace), (control_params *) control->d_control_params,
+                      *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
+                      system->d_max_cm_entries, system->d_realloc_cm_entries );
+            }
         }
         else
         {
@@ -1349,14 +1557,14 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             else
             {
                 k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE >>>
-                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
+                    ( system->d_my_atoms, system->reax_param.d_sbp,
                       *(workspace->d_workspace), (control_params *) control->d_control_params,
                       *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
                       system->d_max_cm_entries, system->d_realloc_cm_entries );
             }
-            cudaDeviceSynchronize( );
-            cudaCheckError( );
         }
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
     }
 
 #if defined(LOG_PERFORMANCE)
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 385a163d..48b2b7bc 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -137,7 +137,7 @@ CUDA_GLOBAL void k_jacobi_apply( real const * const Hdia_inv, real const * const
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric (upper triangular portion only stored) matrix,
  *    stored in CSR format
  * x: dense vector, size equal to num. columns in A
  * b (output): dense vector, size equal to num. columns in A
@@ -158,27 +158,27 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start,
 
     si = row_ptr_start[i];
     ei = row_ptr_end[i];
-    sum = 0.0;
 
-    for ( pj = si; pj < ei; ++pj )
+    /* A symmetric, upper triangular portion stored
+     * => diagonal only contributes once */
+    sum = vals[si] * x[i];
+
+    for ( pj = si + 1; pj < ei; ++pj )
     {
         sum += vals[pj] * x[col_ind[pj]];
-        /* symmetric contribution (A is symmetric, lower half stored) */
-        atomicAdd( (double*) &b[col_ind[pj]], (double) (vals[pj] * x[i]) );
+        /* symmetric contribution to row j */
+        atomicAdd( (double *) &b[col_ind[pj]], (double) (vals[pj] * x[i]) );
     }
 
-    /* diagonal entry */
-    sum += vals[ei] * x[i];
-
     /* local contribution to row i for this thread */
-    atomicAdd( (double*) &b[i], (double) sum );
+    atomicAdd( (double *) &b[i], (double) sum );
 }
 
 
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * x: dense vector, size equal to num. columns in A
  * b (output): dense vector, size equal to num. columns in A
@@ -216,7 +216,7 @@ CUDA_GLOBAL void k_sparse_matvec_full_csr( int *row_ptr_start,
  * where warps of 32 threads
  * collaborate to multiply each row
  *
- * A: symmetric, full, square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * x: dense vector, size equal to num. columns in A
  * b (output): dense vector, size equal to num. columns in A
@@ -268,7 +268,7 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric (upper triangular portion only stored) matrix,
  *    stored in CSR format
  * X: 2 dense vectors, size equal to num. columns in A
  * B (output): 2 dense vectors, size equal to num. columns in A
@@ -289,25 +289,24 @@ CUDA_GLOBAL void k_dual_sparse_matvec_half_csr( int *row_ptr_start,
 
     si = row_ptr_start[i];
     ei = row_ptr_end[i];
-    sum[0] = 0.0;
-    sum[1] = 0.0;
 
-    for ( pj = si; pj < ei; ++pj )
+    /* A symmetric, upper triangular portion stored
+     * => diagonal only contributes once */
+    sum[0] = vals[si] * x[i][0];
+    sum[1] = vals[si] * x[i][1];
+
+    for ( pj = si + 1; pj < ei; ++pj )
     {
         sum[0] += vals[pj] * x[col_ind[pj]][0];
         sum[1] += vals[pj] * x[col_ind[pj]][1];
-        /* symmetric contribution (A is symmetric, lower half stored) */
-        atomicAdd( (double*) &b[col_ind[pj]][0], (double) (vals[pj] * x[i][0]) );
-        atomicAdd( (double*) &b[col_ind[pj]][1], (double) (vals[pj] * x[i][1]) );
+        /* symmetric contribution to row j */
+        atomicAdd( (double *) &b[col_ind[pj]][0], (double) (vals[pj] * x[i][0]) );
+        atomicAdd( (double *) &b[col_ind[pj]][1], (double) (vals[pj] * x[i][1]) );
     }
 
-    /* diagonal entry */
-    sum[0] += vals[ei] * x[i][0];
-    sum[1] += vals[ei] * x[i][1];
-
     /* local contribution to row i for this thread */
-    atomicAdd( (double*) &b[i][0], (double) sum[0] );
-    atomicAdd( (double*) &b[i][1], (double) sum[1] );
+    atomicAdd( (double *) &b[i][0], (double) sum[0] );
+    atomicAdd( (double *) &b[i][1], (double) sum[1] );
 }
 
 
@@ -345,7 +344,7 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
  * where warps of 32 threads
  * collaborate to multiply each row
  *
- * A: symmetric, full, square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * X: 2 dense vectors, size equal to num. columns in A
  * B (output): 2 dense vectors, size equal to num. columns in A
@@ -574,7 +573,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
  * control:
  * data:
  * workspace: storage container for workspace structures
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * X: dense vector, size equal to num. columns in A
  * n: number of rows in X
@@ -677,12 +676,10 @@ static void Sparse_MatVec_local( control_params const * const control,
 //        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
 //            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-        /* multiple threads per row implementation,
+        /* 32 threads per row implementation,
          * with shared memory to accumulate partial row sums */
 //        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
 //             ( A->start, A->end, A->j, A->val, x, b, A->n );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
@@ -693,13 +690,13 @@ static void Sparse_MatVec_local( control_params const * const control,
         blocks = ((A->n * 32) / DEF_BLOCK_SIZE)
             + (((A->n * 32) % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-        /* multiple threads per row implementation
+        /* 32 threads per row implementation
          * using registers to accumulate partial row sums */
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
              ( A->start, A->end, A->j, A->val, x, b, A->n );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
     }
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -754,7 +751,7 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
  * control:
  * data:
  * workspace: storage container for workspace structures
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * x: dense vector
  * n: number of entries in x
@@ -1100,7 +1097,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
  * workspace: struct containing storage for workspace for the linear solver
  * control: struct containing parameters governing the simulation and numeric methods
  * data: struct containing simulation data (e.g., atom info)
- * H: sparse, symmetric matrix, lower half stored in CSR format
+ * H: sparse, symmetric matrix in CSR format
  * b: right-hand side of the linear system
  * tol: tolerence compared against the relative residual for determining convergence
  * x: inital guess
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index e8121f98..29e220c7 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -188,16 +188,16 @@ static inline real Init_Charge_Matrix_Entry( const reax_system * const system,
                 /* i == j: periodic self-interaction term
                  * i != j: general interaction term */
                 ret = ((i == j) ? 0.5 : 1.0) * Tap * EV_to_KCALpMOL / dr3gamij_3;
-            break;
+                break;
 
             case DIAGONAL:
                 ret = system->reax_param.sbp[system->my_atoms[i].type].eta;
-            break;
+                break;
 
             default:
                 fprintf( stderr, "[ERROR] Invalid matrix position. Terminating...\n" );
                 exit( INVALID_INPUT );
-            break;
+                break;
         }
         break;
 
-- 
GitLab