diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index baec5a9f8ca3570709cb9362c4e7f9cdb1e601bb..7379596ffeaa83061704997a0136e746b75aff0d 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -335,10 +335,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 
         /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
-        q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
+        atom->q = workspace->x[i][0] - u * workspace->x[i][1];
 #else
-        q[i] = atom->q = workspace->s[i] - u * workspace->t[i];
+        atom->q = workspace->s[i] - u * workspace->t[i];
 #endif
+        q[i] = atom->q;
 
         /* update previous solutions in s & t */
         atom->s[3] = atom->s[2];
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index eeee26df7024798abd7d0a8d6355021d88f541da..4128585242230a9b46c9ce65f919bbe45f4bec1a 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -364,8 +364,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 {
     int total_real, total_rvec, local_rvec;
 
-    workspace->allocated = TRUE;
-
     total_real = sizeof(real) * total_cap;
     total_rvec = sizeof(rvec) * total_cap;
     local_rvec = sizeof(rvec) * local_cap;
@@ -515,13 +513,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 
 void Cuda_Deallocate_Workspace( control_params *control, storage *workspace )
 {
-    if ( workspace->allocated == FALSE )
-    {
-        return;
-    }
-
-    workspace->allocated = FALSE;
-
     /* bond order related storage */
     cuda_free( workspace->total_bond_order, "total_bo" );
     cuda_free( workspace->Deltap, "Deltap" );
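Note: the removed `allocated` flag previously made `Cuda_Deallocate_Workspace` idempotent (a second call returned early). With the guard gone, the caller must pair each allocate with exactly one deallocate. A minimal sketch of the contract change; `ws_alloc`/`ws_free` and the `ws` struct are illustrative names, not from the codebase:

    #include <stdlib.h>

    struct ws { double *buf; };

    static void ws_alloc( struct ws *w, size_t n )
    {
        w->buf = (double *) malloc( n * sizeof(double) );
    }

    static void ws_free( struct ws *w )
    {
        /* no allocated-flag guard anymore: a second ws_free without an
         * intervening ws_alloc would be a double free, so NULL the
         * pointer defensively (free(NULL) is a no-op) */
        free( w->buf );
        w->buf = NULL;
    }
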
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 93884ab48c9b427572e1b2ec0a717840b1d0e4d3..f9c34d87c950fa83de880cc674d8025ba7551c69 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -58,13 +58,13 @@ static void jacobi( reax_system const * const system,
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
-        + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_jacobi <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, system->reax_param.d_sbp,
          *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -245,8 +245,8 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
        ( system->d_my_atoms, system->reax_param.d_sbp,
          (control_params *)control->d_control_params,
          *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -376,6 +376,7 @@ CUDA_GLOBAL void k_extrapolate_charges_qeq_part2( reax_atom *my_atoms,
         return;
     }
 
+    /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
     my_atoms[i].q = workspace.x[i][0] - u * workspace.x[i][1];
 #else
@@ -525,7 +526,8 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 #endif
 
     /* global reduction on pseudo-charges for s and t */
-    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     u = all_sum[0] / all_sum[1];
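Note: the reflowed `MPI_Allreduce` above is the entire cross-rank step of `Calculate_Charges_QEq`: both partial sums are packed into one length-2 buffer so a single collective yields both global sums, from which the charge shift `u = all_sum[0] / all_sum[1]` follows. A standalone sketch of the same idiom; the helper name `global_charge_shift` is hypothetical:

    #include <mpi.h>

    /* sum_s and sum_t are this rank's partial sums over local atoms;
     * returns the global ratio u used to shift the charges */
    static double global_charge_shift( double sum_s, double sum_t )
    {
        double my_sum[2], all_sum[2];

        my_sum[0] = sum_s;
        my_sum[1] = sum_t;

        /* one collective instead of two: both sums reduced together */
        MPI_Allreduce( my_sum, all_sum, 2, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD );

        return all_sum[0] / all_sum[1];
    }
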
columns in A + * N: number of rows in A */ +CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start, + int *row_ptr_end, int *col_ind, real *vals, rvec2 const * const x, rvec2 * const b, int n ) { int i, pj, thread_id, warp_id, lane_id, offset; @@ -310,23 +320,27 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A, if ( warp_id < n ) { i = warp_id; + si = row_ptr_start[i]; + ei = row_ptr_end[i]; sum[0] = 0.0; sum[1] = 0.0; - si = A.start[i]; - ei = A.end[i]; + /* partial sums per thread */ for ( pj = si + lane_id; pj < ei; pj += warpSize ) { - sum[0] += A.val[pj] * x[ A.j[pj] ][0]; - sum[1] += A.val[pj] * x[ A.j[pj] ][1]; + sum[0] += vals[pj] * x[ col_ind[pj] ][0]; + sum[1] += vals[pj] * x[ col_ind[pj] ][1]; } + /* warp-level reduction of partial sums + * using registers within a warp */ for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) { sum[0] += __shfl_down_sync( mask, sum[0], offset ); sum[1] += __shfl_down_sync( mask, sum[1], offset ); } + /* first thread within a warp writes sum to global memory */ if ( lane_id == 0 ) { b[i][0] = sum[0]; @@ -418,7 +432,7 @@ static void Dual_Sparse_MatVec_local( control_params const * const control, sparse_matrix const * const A, rvec2 const * const x, rvec2 * const b, int n ) { -// int blocks; + int blocks; if ( A->format == SYM_HALF_MATRIX ) { @@ -445,11 +459,14 @@ static void Dual_Sparse_MatVec_local( control_params const * const control, /* 1 thread per row implementation */ // k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>> // ( *A, x, b, n ); + + blocks = (A->n * 32 / DEF_BLOCK_SIZE) + + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - /* one warp per row implementation, - * with shared memory to accumulate partial row sums */ - k_dual_sparse_matvec_full_opt_csr <<< control->blocks_n, control->block_size_n >>> - ( *A, x, b, n ); + /* multiple threads per row implementation + * using registers to accumulate partial row sums */ + k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>> + ( A->start, A->end, A->j, A->val, x, b, n ); cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -621,13 +638,13 @@ static void Sparse_MatVec_local( control_params const * const control, } else if ( A->format == SYM_FULL_MATRIX ) { - blocks = (A->n * 32 / DEF_BLOCK_SIZE) - + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - /* 1 thread per row implementation */ // k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>> // ( A->start, A->end, A->j, A->val, x, b, n ); + blocks = (A->n * 32 / DEF_BLOCK_SIZE) + + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 
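Note: the launch configuration above assigns one 32-thread warp per matrix row, so the grid must cover `A->n * 32` threads; hence `blocks = ceil(A->n * 32 / DEF_BLOCK_SIZE)`. The stale comment ("shared memory") is also corrected: partial row sums stay in registers and are combined with warp shuffles, which needs no shared memory or `__syncthreads` within the warp. A condensed single-vector sketch of the same pattern; the kernel name and signature here are illustrative, not the codebase's:

    __global__ void spmv_warp_per_row( const int *row_start,
            const int *row_end, const int *col_ind, const double *vals,
            const double *x, double *b, int n )
    {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
        int warp_id = thread_id / warpSize;   /* one warp per matrix row */
        int lane_id = thread_id % warpSize;
        double sum = 0.0;

        /* all 32 lanes of a warp share warp_id, so whole warps exit
         * together and the shuffles below never see a partial warp */
        if ( warp_id >= n )
        {
            return;
        }

        /* each lane accumulates a stride-32 slice of the row in a register */
        for ( int pj = row_start[warp_id] + lane_id; pj < row_end[warp_id];
                pj += warpSize )
        {
            sum += vals[pj] * x[ col_ind[pj] ];
        }

        /* tree reduction across the warp via register shuffles */
        for ( int offset = warpSize >> 1; offset > 0; offset >>= 1 )
        {
            sum += __shfl_down_sync( 0xffffffff, sum, offset );
        }

        if ( lane_id == 0 )   /* lane 0 now holds the full row sum */
        {
            b[warp_id] = sum;
        }
    }
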
@@ -621,13 +638,13 @@ static void Sparse_MatVec_local( control_params const * const control,
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
-        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
         /* 1 thread per row implementation */
//        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>>
//            ( A->start, A->end, A->j, A->val, x, b, n );
 
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
         /* multiple threads per row implementation
          * using registers to accumulate partial row sums */
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
@@ -672,12 +689,12 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
                "Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (real *) workspace->host_scratch;
         copy_host_device( spad, b, sizeof(real) * n1,
-                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::b" );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
         copy_host_device( spad, b, sizeof(real) * n2,
-                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::b" );
//#endif
     }
 }
@@ -939,8 +956,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 
     redux[0] = Dot_local( workspace, workspace->d_workspace->r,
             workspace->d_workspace->d, system->n );
-    redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->r, system->n );
+    redux[1] = Dot_local( workspace, workspace->d_workspace->d,
+            workspace->d_workspace->d, system->n );
     redux[2] = Dot_local( workspace, b, b, system->n );
 
 #if defined(LOG_PERFORMANCE)
@@ -988,8 +1005,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
                 workspace->d_workspace->p, system->n );
-        redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->r, system->n );
+        redux[1] = Dot_local( workspace, workspace->d_workspace->p,
+                workspace->d_workspace->p, system->n );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
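Note on the Cuda_CG changes: `redux[1]` previously recomputed `r . r` even though the slot is consumed as the norm of the search-direction vector; the corrected lines take `d . d` before the loop and `p . p` inside it. The surrounding idiom batches several local dot products so a single reduction covers all of them. A hedged sketch of that idiom under the assumption that the three results feed one `MPI_Allreduce`; `dot_local` and `cg_setup_redux` are stand-ins for the codebase's `Dot_local` and MPI plumbing:

    #include <mpi.h>

    /* hypothetical local dot product over this rank's n entries */
    static double dot_local( const double *u, const double *v, int n )
    {
        double s = 0.0;

        for ( int i = 0; i < n; ++i )
        {
            s += u[i] * v[i];
        }

        return s;
    }

    /* batch the three partial dot products, then reduce once */
    static void cg_setup_redux( const double *r, const double *d,
            const double *b, int n, double redux_global[3] )
    {
        double redux[3];

        redux[0] = dot_local( r, d, n );   /* r . d */
        redux[1] = dot_local( d, d, n );   /* ||d||^2 (was ||r||^2) */
        redux[2] = dot_local( b, b, n );   /* ||b||^2, for a relative tolerance */

        MPI_Allreduce( redux, redux_global, 3, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD );
    }
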