diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 6d6f5aec22d3fe984d80863de630b4cf796f1eab..43ea11bf25b6a09a9da27b9c3c4dea991b18c0aa 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -803,7 +803,10 @@ CUDA_GLOBAL void k_estimate_storages_cm_half( reax_atom *my_atoms,
 
     __syncthreads( );
 
     cm_entries[i] = num_cm_entries;
-    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+    /* round up to the nearest multiple of 32 to ensure that reads along
+     * rows can be coalesced for 1 warp per row SpMV implementation */
+    max_cm_entries[i] = MAX( ((int) CEIL( num_cm_entries * SAFE_ZONE )
+                + 0x0000001F) & 0xFFFFFFE0, MIN_CM_ENTRIES );
 }
 
@@ -844,7 +847,10 @@ CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control,
 
     __syncthreads( );
 
     cm_entries[i] = num_cm_entries;
-    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+    /* round up to the nearest multiple of 32 to ensure that reads along
+     * rows can be coalesced for 1 warp per row SpMV implementation */
+    max_cm_entries[i] = MAX( ((int) CEIL( num_cm_entries * SAFE_ZONE )
+                + 0x0000001F) & 0xFFFFFFE0, MIN_CM_ENTRIES );
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 3a6547ffba774f70455aa254cef232957bbe238c..f7ed486b5b9ce16d337db474033abbc290bd43ff 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -187,8 +187,8 @@ CUDA_GLOBAL void k_sparse_matvec_half_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         const real * const x, real * const b, int N )
 {
-    int pj, si, ei, thread_id, warp_id, lane_id, offset;
-    real sum;
+    int pj, si, ei, thread_id, warp_id, lane_id, offset, itr, col_ind_l;
+    real vals_l, sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -201,24 +201,31 @@ CUDA_GLOBAL void k_sparse_matvec_half_opt_csr( int *row_ptr_start,
     lane_id = thread_id & 0x0000001F;
     si = row_ptr_start[warp_id];
     ei = row_ptr_end[warp_id];
-
-    /* A symmetric, upper triangular portion stored
-     * => diagonal only contributes once */
-    if ( lane_id == 0 )
-    {
-        sum = vals[si] * x[warp_id];
-    }
-    else
-    {
-        sum = 0.0;
-    }
+    sum = 0.0;
 
     /* partial sums per thread */
-    for ( pj = si + lane_id + 1; pj < ei; pj += warpSize )
+    for ( itr = 0, pj = si + lane_id; itr < (ei - si + 0x0000001F) >> 5; ++itr )
     {
-        sum += vals[pj] * x[col_ind[pj]];
-        /* symmetric contribution to row j */
-        atomicAdd( (double *) &b[col_ind[pj]], (double) (vals[pj] * x[warp_id]) );
+        /* coalesced 128-bit aligned reads from global memory */
+        vals_l = vals[pj];
+        col_ind_l = col_ind[pj];
+
+        /* only threads with valid non-zero positions accumulate the result */
+        if ( pj < ei )
+        {
+            /* gather on x from global memory and compute partial sum for this non-zero entry */
+            sum += vals_l * x[col_ind_l];
+
+            /* A symmetric, upper triangular portion stored
+             * => diagonal only contributes once */
+            if ( pj > si )
+            {
+                /* symmetric contribution to row j */
+                atomicAdd( (double *) &b[col_ind_l], (double) (vals_l * x[warp_id]) );
+            }
+        }
+
+        pj += warpSize;
     }
 
     /* warp-level reduction of partial sums
@@ -285,8 +292,8 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         const real * const x, real * const b, int n )
 {
-    int pj, si, ei, thread_id, warp_id, lane_id, offset;
-    real sum;
+    int pj, si, ei, thread_id, warp_id, lane_id, offset, itr, col_ind_l;
+    real vals_l, sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -302,9 +309,20 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
     sum = 0.0;
 
     /* partial sums per thread */
-    for ( pj = si + lane_id; pj < ei; pj += warpSize )
+    for ( itr = 0, pj = si + lane_id; itr < (ei - si + 0x0000001F) >> 5; ++itr )
     {
-        sum += vals[pj] * x[col_ind[pj]];
+        /* coalesced 128-bit aligned reads from global memory */
+        vals_l = vals[pj];
+        col_ind_l = col_ind[pj];
+
+        /* only threads with valid non-zero positions accumulate the result */
+        if ( pj < ei )
+        {
+            /* gather on x from global memory and compute partial sum for this non-zero entry */
+            sum += vals_l * x[col_ind_l];
+        }
+
+        pj += warpSize;
     }
 
     /* warp-level reduction of partial sums
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index a0c1c88e245d4dc11bf0660c9630d6d474ecc6df..194970ac95f2ee18781c628fa6307075f3d26d20 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -176,19 +176,19 @@
 #define HB_THRESHOLD (1.0e-2)
 
 /* minimum capacity (entries) in various interaction lists */
-#define MIN_CAP (50)
+#define MIN_CAP (64)
 /* minimum number of interactions per entry in the far neighbor list */
-#define MIN_NBRS (100)
+#define MIN_NBRS (96)
 /* minimum number of non-zero entries per row in the charge matrix */
-#define MIN_CM_ENTRIES (100)
+#define MIN_CM_ENTRIES (96)
 /* minimum number of interactions per entry in the bond list */
-#define MIN_BONDS (15)
+#define MIN_BONDS (32)
 /* minimum number of interactions per entry in the hydrogen bond list */
-#define MIN_HBONDS (25)
+#define MIN_HBONDS (32)
 /* minimum capacity (entries) in 3-body interaction list */
-#define MIN_3BODIES (1000)
+#define MIN_3BODIES (1024)
 /* minimum number of atoms per grid cell */
-#define MIN_GCELL_POPL (50)
+#define MIN_GCELL_POPL (64)
 /* ??? */
 #define MIN_SEND (100)
 /* over-allocation factor for various data structures */