From c9eed2322f748169420d867b976b3ffbc4c34d8f Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Thu, 6 Aug 2020 01:04:36 -0400
Subject: [PATCH] PG-PuReMD: add support for symmetric, half stored format
 (SYM_HALF_MATRIX) of the sparse matrix for the charge model (add
 initialization routines and fix-up solver).

---
 PG-PuReMD/src/cuda/cuda_charges.cu      |   1 -
 PG-PuReMD/src/cuda/cuda_forces.cu       | 244 ++++++++++++++++++++++--
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu |  65 +++----
 PG-PuReMD/src/forces.c                  |   6 +-
 4 files changed, 260 insertions(+), 56 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index ce45043e..01d811f0 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -622,7 +622,6 @@ void QEq( reax_system const * const system, control_params const * const control
             break;
 
         case CG_S:
-            workspace->d_workspace->H.format = SYM_FULL_MATRIX;
 #if defined(DUAL_SOLVER)
             iters = Cuda_dual_CG( system, control, data, workspace, &workspace->d_workspace->H,
                     workspace->d_workspace->b, control->cm_solver_q_err, workspace->d_workspace->x, mpi_data,
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index bba38c8b..1da8cb67 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -259,6 +259,163 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i
 }
 
 
+/* Compute the charge matrix entries and store the matrix in half format
+ * using the far neighbors list (stored in full format) and according to
+ * the full shell communication method */
+CUDA_GLOBAL void k_init_cm_half_fs( reax_atom *my_atoms, single_body_parameters *sbp,
+        two_body_parameters *tbp, storage workspace, control_params *control,
+        reax_list far_nbr_list, int num_atom_types,
+        int *max_cm_entries, int *realloc_cm_entries )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int cm_top;
+    int num_cm_entries;
+    real r_ij;
+    single_body_parameters *sbp_i;
+    two_body_parameters *twbp;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= workspace.H.n_max )
+    {
+        return;
+    }
+
+    H = &workspace.H;
+    cm_top = H->start[i];
+
+    if ( i < H->n )
+    {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+
+        /* diagonal entry in the matrix */
+        H->j[cm_top] = i;
+        H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
+                i, i, 0.0, 0.0, DIAGONAL );
+        ++cm_top;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                j = far_nbr_list.far_nbr_list.nbr[pj];
+                atom_j = &my_atoms[j];
+                type_j = atom_j->type;
+
+                /* if j is a local OR ghost atom in the upper triangular region of the matrix */
+                if ( atom_i->orig_id < atom_j->orig_id )
+                {
+                    twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
+                    r_ij = far_nbr_list.far_nbr_list.d[pj];
+
+                    H->j[cm_top] = j;
+                    H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
+                            control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL );
+                    ++cm_top;
+                }
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    H->end[i] = cm_top;
+    num_cm_entries = cm_top - H->start[i];
+
+    /* reallocation checks */
+    if ( num_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
+}
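A note on the ownership rule used above: the test atom_i->orig_id < atom_j->orig_id is what makes the half format well defined. For every symmetric pair exactly one of the two atoms passes the test, so each off-diagonal entry is stored once, in the row of the atom with the smaller global ID, and the diagonal is written separately at the head of each row. A minimal host-side sketch of the same row-filling rule, using toy types rather than the patch's sparse_matrix (all names below are hypothetical):

    /* toy CSR container: start/end bound each row's entries */
    typedef struct
    {
        int n, *start, *end, *j;
        double *val;
    } toy_csr;

    /* fill row i of a half-stored symmetric matrix: diagonal first, then
     * only those neighbors whose global ID is larger than atom i's */
    static void toy_fill_row_half( toy_csr *H, int i, const int *nbr,
            const double *dist, int nbr_start, int nbr_end,
            const int *orig_id, double nonb_cut )
    {
        int pj, top = H->start[i];

        H->j[top] = i;
        H->val[top] = 0.0;   /* stand-in for the DIAGONAL tap term */
        ++top;

        for ( pj = nbr_start; pj < nbr_end; ++pj )
        {
            /* within cutoff AND row i owns the pair */
            if ( dist[pj] <= nonb_cut && orig_id[i] < orig_id[nbr[pj]] )
            {
                H->j[top] = nbr[pj];
                H->val[top] = 0.0;   /* stand-in for the OFF_DIAGONAL term */
                ++top;
            }
        }

        H->end[i] = top;
    }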
+
+
+/* Compute the tabulated charge matrix entries and store the matrix in half format
+ * using the far neighbors list (stored in full format) and according to
+ * the full shell communication method */
+CUDA_GLOBAL void k_init_cm_half_fs_tab( reax_atom *my_atoms, single_body_parameters *sbp,
+        storage workspace, control_params *control,
+        reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types,
+        int *max_cm_entries, int *realloc_cm_entries )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int cm_top;
+    int num_cm_entries;
+    real r_ij;
+    single_body_parameters *sbp_i;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= workspace.H.n_max )
+    {
+        return;
+    }
+
+    H = &workspace.H;
+    cm_top = H->start[i];
+
+    if ( i < H->n )
+    {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+
+        /* diagonal entry in the matrix */
+        H->j[cm_top] = i;
+        H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
+                i, i, 0.0, 0.0, DIAGONAL );
+        ++cm_top;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                atom_j = &my_atoms[j];
+                type_j = atom_j->type;
+
+                /* if j is a local OR ghost atom in the upper triangular region of the matrix */
+                if ( atom_i->orig_id < atom_j->orig_id )
+                {
+                    r_ij = far_nbr_list.far_nbr_list.d[pj];
+
+                    H->j[cm_top] = j;
+                    H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij,
+                            type_i, type_j, num_atom_types );
+                    ++cm_top;
+                }
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    H->end[i] = cm_top;
+    num_cm_entries = cm_top - H->start[i];
+
+    /* reallocation checks */
+    if ( num_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
+}
+
+
 /* Compute the charge matrix entries and store the matrix in full format
  * using the far neighbors list (stored in full format) and according to
  * the full shell communication method */
@@ -302,7 +459,6 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
                 i, i, 0.0, 0.0, DIAGONAL );
         ++cm_top;
 
-        /* update i-j distance - check if j is within cutoff */
         for ( pj = start_i; pj < end_i; ++pj )
         {
             j = far_nbr_list.far_nbr_list.nbr[pj];
@@ -340,7 +496,7 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
  * using the far neighbors list (stored in full format) and according to
  * the full shell communication method */
 CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_parameters *sbp,
-        two_body_parameters *tbp, storage workspace, control_params *control,
+        storage workspace, control_params *control,
         reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types,
         int *max_cm_entries, int *realloc_cm_entries )
 {
@@ -378,7 +534,6 @@ CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_paramet
                 i, i, 0.0, 0.0, DIAGONAL );
         ++cm_top;
 
-        /* update i-j distance - check if j is within cutoff */
         for ( pj = start_i; pj < end_i; ++pj )
         {
             j = far_nbr_list.far_nbr_list.nbr[pj];
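The _tab kernels above replace the direct Init_Charge_Matrix_Entry evaluation with a lookup into precomputed LR_lookup_table data, trading a little accuracy for much cheaper per-entry work. As a rough, generic illustration of a distance-indexed table lookup only — the codebase's actual tables and interpolation scheme differ, and every name here is hypothetical:

    /* toy table: f(r) sampled at uniform spacing dr starting at r_min */
    typedef struct
    {
        double r_min, dr;
        int n;
        const double *f;
    } toy_table;

    static double toy_table_eval( const toy_table *t, double r )
    {
        double s = (r - t->r_min) / t->dr;
        int k = (int) s;

        /* clamp to the sampled range */
        if ( k < 0 )
        {
            k = 0;
        }
        else if ( k >= t->n - 1 )
        {
            k = t->n - 2;
        }

        /* linear interpolation between the bracketing samples */
        return t->f[k] + (s - k) * (t->f[k + 1] - t->f[k]);
    }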
@@ -608,6 +763,50 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
 }
 
 
+CUDA_GLOBAL void k_estimate_storages_cm_half( reax_atom *my_atoms,
+        control_params *control, reax_list far_nbr_list, int cm_n, int cm_n_max,
+        int *cm_entries, int *max_cm_entries )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int num_cm_entries;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= cm_n_max )
+    {
+        return;
+    }
+
+    num_cm_entries = 0;
+
+    if ( i < cm_n )
+    {
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+
+        /* diagonal entry */
+        ++num_cm_entries;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut
+                    && (j < cm_n || my_atoms[i].orig_id < my_atoms[j].orig_id) )
+            {
+                ++num_cm_entries;
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    cm_entries[i] = num_cm_entries;
+    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+}
+
+
 CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control,
         reax_list far_nbr_list, int cm_n, int cm_n_max,
         int *cm_entries, int *max_cm_entries )
@@ -1198,11 +1397,11 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 
     if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
     {
-//        k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>>
-//            ( (control_params *) control->d_control_params,
-//              *(lists[FAR_NBRS]), workspace->d_workspace->H.n,
-//              workspace->d_workspace->H.n_max,
-//              system->d_cm_entries, system->d_max_cm_entries );
+        k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, (control_params *) control->d_control_params,
+              *(lists[FAR_NBRS]), workspace->d_workspace->H.n,
+              workspace->d_workspace->H.n_max,
+              system->d_cm_entries, system->d_max_cm_entries );
     }
     else
     {
@@ -1328,13 +1527,22 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     {
         if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
         {
-//            k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE >>>
-//                ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
-//                  *(workspace->d_workspace), (control_params *) control->d_control_params,
-//                  *(lists[FAR_NBRS]), workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
-//                  system->d_max_cm_entries, system->d_realloc_cm_entries );
-//            cudaDeviceSynchronize( );
-//            cudaCheckError( );
+            if ( control->tabulate <= 0 )
+            {
+                k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
+                      *(workspace->d_workspace), (control_params *) control->d_control_params,
+                      *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
+                      system->d_max_cm_entries, system->d_realloc_cm_entries );
+            }
+            else
+            {
+                k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp,
+                      *(workspace->d_workspace), (control_params *) control->d_control_params,
+                      *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
+                      system->d_max_cm_entries, system->d_realloc_cm_entries );
+            }
         }
         else
         {
@@ -1349,14 +1557,14 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             else
             {
                 k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE >>>
-                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
+                    ( system->d_my_atoms, system->reax_param.d_sbp,
                       *(workspace->d_workspace), (control_params *) control->d_control_params,
                       *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
                       system->d_max_cm_entries, system->d_realloc_cm_entries );
             }
-
-            cudaDeviceSynchronize( );
-            cudaCheckError( );
         }
+
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
     }
 
 #if defined(LOG_PERFORMANCE)
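Worth noting from the estimation kernel above: counted row sizes are padded by SAFE_ZONE and floored at MIN_CM_ENTRIES before allocation, so small neighbor-count fluctuations between reallocation checks do not immediately set realloc_cm_entries. A condensed sketch of that padding rule, with illustrative stand-in constants (the real values live in the codebase's headers):

    #include <math.h>

    #define TOY_SAFE_ZONE       1.2     /* assumed over-allocation factor */
    #define TOY_MIN_CM_ENTRIES  100     /* assumed per-row minimum */

    static int toy_pad_row( int counted )
    {
        int padded = (int) ceil( counted * TOY_SAFE_ZONE );

        return padded > TOY_MIN_CM_ENTRIES ? padded : TOY_MIN_CM_ENTRIES;
    }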
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 385a163d..48b2b7bc 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -137,7 +137,7 @@ CUDA_GLOBAL void k_jacobi_apply( real const * const Hdia_inv, real const * const
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric (upper triangular portion only stored) matrix,
  *    stored in CSR format
  * x: dense vector, size equal to num. columns in A
  * b (output): dense vector, size equal to num. columns in A
@@ -158,27 +158,27 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start,
     si = row_ptr_start[i];
     ei = row_ptr_end[i];
 
-    sum = 0.0;
-    for ( pj = si; pj < ei; ++pj )
+    /* A symmetric, upper triangular portion stored
+     * => diagonal only contributes once */
+    sum = vals[si] * x[i];
+
+    for ( pj = si + 1; pj < ei; ++pj )
     {
         sum += vals[pj] * x[col_ind[pj]];
 
-        /* symmetric contribution (A is symmetric, lower half stored) */
-        atomicAdd( (double*) &b[col_ind[pj]], (double) (vals[pj] * x[i]) );
+        /* symmetric contribution to row j */
+        atomicAdd( (double *) &b[col_ind[pj]], (double) (vals[pj] * x[i]) );
     }
 
-    /* diagonal entry */
-    sum += vals[ei] * x[i];
-
     /* local contribution to row i for this thread */
-    atomicAdd( (double*) &b[i], (double) sum );
+    atomicAdd( (double *) &b[i], (double) sum );
 }
 
 
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * x: dense vector, size equal to num. columns in A
 * b (output): dense vector, size equal to num. columns in A
@@ -216,7 +216,7 @@ CUDA_GLOBAL void k_sparse_matvec_full_csr( int *row_ptr_start,
  * where warps of 32 threads
  * collaborate to multiply each row
  *
- * A: symmetric, full, square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * x: dense vector, size equal to num. columns in A
 * b (output): dense vector, size equal to num. columns in A
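The half-format multiply fixed up above hinges on scattering the transpose term: each stored entry (i, j) contributes vals[pj] * x[j] to row i through the thread-local sum and vals[pj] * x[i] to row j through an atomicAdd, while the diagonal — now stored at the head of each row — contributes exactly once. A self-contained toy kernel showing the same invariant (not the patch's code; b must be zeroed before launch, and native double-precision atomicAdd needs compute capability 6.0+):

    __global__ void toy_spmv_half( const int *start, const int *end,
            const int *col, const double *val, const double *x,
            double *b, int n )
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;

        if ( i >= n )
        {
            return;
        }

        /* diagonal entry, stored first in each row, applied once */
        double sum = val[start[i]] * x[i];

        for ( int pj = start[i] + 1; pj < end[i]; ++pj )
        {
            sum += val[pj] * x[col[pj]];                /* row i term */
            atomicAdd( &b[col[pj]], val[pj] * x[i] );   /* mirrored row j term */
        }

        /* other rows may scatter into b[i] concurrently, hence the atomic */
        atomicAdd( &b[i], sum );
    }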
@@ -268,7 +268,7 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
 /* sparse matrix, dense vector multiplication Ax = b,
  * where one GPU thread multiplies a row
  *
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric (upper triangular portion only stored) matrix,
  *    stored in CSR format
  * X: 2 dense vectors, size equal to num. columns in A
 * B (output): 2 dense vectors, size equal to num. columns in A
@@ -289,25 +289,24 @@ CUDA_GLOBAL void k_dual_sparse_matvec_half_csr( int *row_ptr_start,
     si = row_ptr_start[i];
     ei = row_ptr_end[i];
 
-    sum[0] = 0.0;
-    sum[1] = 0.0;
-    for ( pj = si; pj < ei; ++pj )
+    /* A symmetric, upper triangular portion stored
+     * => diagonal only contributes once */
+    sum[0] = vals[si] * x[i][0];
+    sum[1] = vals[si] * x[i][1];
+
+    for ( pj = si + 1; pj < ei; ++pj )
     {
         sum[0] += vals[pj] * x[col_ind[pj]][0];
         sum[1] += vals[pj] * x[col_ind[pj]][1];
 
-        /* symmetric contribution (A is symmetric, lower half stored) */
-        atomicAdd( (double*) &b[col_ind[pj]][0], (double) (vals[pj] * x[i][0]) );
-        atomicAdd( (double*) &b[col_ind[pj]][1], (double) (vals[pj] * x[i][1]) );
+        /* symmetric contribution to row j */
+        atomicAdd( (double *) &b[col_ind[pj]][0], (double) (vals[pj] * x[i][0]) );
+        atomicAdd( (double *) &b[col_ind[pj]][1], (double) (vals[pj] * x[i][1]) );
     }
 
-    /* diagonal entry */
-    sum[0] += vals[ei] * x[i][0];
-    sum[1] += vals[ei] * x[i][1];
-
     /* local contribution to row i for this thread */
-    atomicAdd( (double*) &b[i][0], (double) sum[0] );
-    atomicAdd( (double*) &b[i][1], (double) sum[1] );
+    atomicAdd( (double *) &b[i][0], (double) sum[0] );
+    atomicAdd( (double *) &b[i][1], (double) sum[1] );
 }
 
 
@@ -345,7 +344,7 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
 * where warps of 32 threads
  * collaborate to multiply each row
  *
- * A: symmetric, full, square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * X: 2 dense vectors, size equal to num. columns in A
 * B (output): 2 dense vectors, size equal to num. columns in A
@@ -574,7 +573,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
  * control:
  * data:
  * workspace: storage container for workspace structures
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * X: dense vector, size equal to num. columns in A
 * n: number of rows in X
@@ -677,12 +676,10 @@ static void Sparse_MatVec_local( control_params const * const control,
 //        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
 //            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-        /* multiple threads per row implementation,
+        /* 32 threads per row implementation,
          * with shared memory to accumulate partial row sums */
 //        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
 //            ( A->start, A->end, A->j, A->val, x, b, A->n );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
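For the full-format branch below, the renamed comment's "32 threads per row ... using registers" is the familiar vector-CSR pattern: one warp strides across a row and reduces its partial sums with warp shuffles, never touching shared memory. A generic sketch of that pattern (a stand-in, not the patch's k_sparse_matvec_full_opt_csr; block size must be a multiple of 32):

    __global__ void toy_spmv_warp_per_row( const int *start, const int *end,
            const int *col, const double *val, const double *x,
            double *b, int n )
    {
        int lane = threadIdx.x % warpSize;
        int row = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;

        if ( row >= n )
        {
            return;
        }

        double sum = 0.0;

        for ( int pj = start[row] + lane; pj < end[row]; pj += warpSize )
        {
            sum += val[pj] * x[col[pj]];
        }

        /* register-only reduction across the warp */
        for ( int offset = warpSize / 2; offset > 0; offset /= 2 )
        {
            sum += __shfl_down_sync( 0xffffffff, sum, offset );
        }

        if ( lane == 0 )
        {
            b[row] = sum;
        }
    }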
@@ -693,13 +690,13 @@
         blocks = ((A->n * 32) / DEF_BLOCK_SIZE)
             + (((A->n * 32) % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-        /* multiple threads per row implementation
+        /* 32 threads per row implementation
          * using registers to accumulate partial row sums */
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
                 ( A->start, A->end, A->j, A->val, x, b, A->n );
-
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
     }
+
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
@@ -754,7 +751,7 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
  * control:
  * data:
  * workspace: storage container for workspace structures
- * A: symmetric (lower triangular portion only stored), square matrix,
+ * A: symmetric matrix,
  *    stored in CSR format
  * x: dense vector
 * n: number of entries in x
@@ -1100,7 +1097,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
  * workspace: struct containing storage for workspace for the linear solver
  * control: struct containing parameters governing the simulation and numeric methods
  * data: struct containing simulation data (e.g., atom info)
- * H: sparse, symmetric matrix, lower half stored in CSR format
+ * H: sparse, symmetric matrix in CSR format
  * b: right-hand side of the linear system
  * tol: tolerance compared against the relative residual for determining convergence
 * x: initial guess
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index e8121f98..29e220c7 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -188,16 +188,16 @@ static inline real Init_Charge_Matrix_Entry( const reax_system * const system,
 
                 /* i == j: periodic self-interaction term
                  * i != j: general interaction term */
                 ret = ((i == j) ? 0.5 : 1.0) * Tap * EV_to_KCALpMOL / dr3gamij_3;
-            break;
+                break;
 
             case DIAGONAL:
                 ret = system->reax_param.sbp[system->my_atoms[i].type].eta;
-            break;
+                break;
 
             default:
                 fprintf( stderr, "[ERROR] Invalid matrix position. Terminating...\n" );
                 exit( INVALID_INPUT );
-            break;
+                break;
             }
 
             break;
-- 
GitLab