diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu index 64c944172372c4074c2928e8fc9973ae9b94d5ca..0c11e58fbaf40d8237c16e28d5aea1b966a14919 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu @@ -773,12 +773,12 @@ void Cuda_Total_Forces( reax_system *system, control_params *control, if ( control->virial != 0 ) { /* reduction for ext_press */ - k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * DEF_BLOCK_SIZE >>> + k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * (DEF_BLOCK_SIZE / 32) >>> ( spad_rvec, &spad_rvec[system->N], system->N ); cudaDeviceSynchronize( ); cudaCheckError( ); - k_reduction_rvec <<< 1, control->blocks_pow_2_n, sizeof(rvec) * control->blocks_pow_2_n>>> + k_reduction_rvec <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec) * ((blocks + 31) / 32) >>> ( &spad_rvec[system->N], &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu index bae45dbc54388452697d0be0bd45a793e3297e1f..9cd4e0a43321a8c393e4d9f6b447d2fa5bc32c3f 100644 --- a/PG-PuReMD/src/cuda/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_bonds.cu @@ -85,7 +85,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, ebond = -twbp->De_s * bo_ij->BO_s * exp_be12 - twbp->De_p * bo_ij->BO_pi - twbp->De_pp * bo_ij->BO_pi2; - e_bond[ i ] += ebond; + e_bond[i] += ebond; /* calculate derivatives of bond orders */ bo_ij->Cdbo += CEbo; @@ -111,8 +111,8 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, /* Stabilisation terminal triple bond */ if ( bo_ij->BO >= 1.00 ) { - if ( gp37 == 2 || - ( (Cuda_strncmp( sbp_i->name, "C", sizeof(sbp_i->name) ) == 0 + if ( gp37 == 2 + || ( (Cuda_strncmp( sbp_i->name, "C", sizeof(sbp_i->name) ) == 0 && Cuda_strncmp( sbp_j->name, "O", sizeof(sbp_j->name) ) == 0) || (Cuda_strncmp( sbp_i->name, "O", sizeof(sbp_i->name) ) == 0 && Cuda_strncmp( sbp_j->name, "C", sizeof(sbp_j->name) ) == 0) ) ) diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index e063ed9ad2ec8715e0f294e831b66d584ec3a894..ce45043e501c7ce958eb18d1b67ca5f82f2a385f 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -492,24 +492,26 @@ static void Calculate_Charges_QEq( reax_system const * const system, + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, - sizeof(rvec2) * 2 * system->n, + sizeof(rvec2) * (blocks + 1), "Calculate_Charges_QEq::workspace->scratch" ); spad_rvec2 = (rvec2 *) workspace->scratch; - cuda_memset( spad_rvec2, 0, sizeof(rvec2) * 2 * system->n, - "Calculate_Charges_QEq::spad_rvec2," ); + cuda_memset( spad_rvec2, 0, sizeof(rvec2) * (blocks + 1), + "Calculate_Charges_QEq::spad_rvec2" ); /* compute local sums of pseudo-charges in s and t on device */ - k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> + k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, + sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>> ( workspace->d_workspace->x, spad_rvec2, system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); - k_reduction_rvec2 <<< 1, control->blocks_pow_2, sizeof(rvec2) * control->blocks_pow_2 >>> - ( spad_rvec2, &spad_rvec2[system->n], blocks ); + k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32, + sizeof(rvec2) * ((blocks + 31) / 32) >>> + ( spad_rvec2, &spad_rvec2[blocks], blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); - copy_host_device( &my_sum, &spad_rvec2[system->n], + copy_host_device( &my_sum, &spad_rvec2[blocks], sizeof(rvec2), cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," ); #else cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu index a957f939ef518ff663bd7105e262fc9dbb809820..7a0354c868ccc7db0a57caeec47d43948257818e 100644 --- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu @@ -69,7 +69,7 @@ CUDA_GLOBAL void k_vector_copy_from_rvec2( real * const dst, rvec2 const * const CUDA_GLOBAL void k_vector_copy_to_rvec2( rvec2 * const dst, real const * const src, - int index, int n) + int index, int n ) { int i; @@ -631,17 +631,19 @@ void Dot_local_rvec2( control_params const * const control, sizeof(rvec2) * (k + blocks + 1), "Dot_local_rvec2::workspace->scratch" ); spad = (rvec2 *) workspace->scratch; - Vector_Mult_rvec2( &spad[0], v1, v2, k ); + Vector_Mult_rvec2( spad, v1, v2, k ); /* local reduction (sum) on device */ -// Cuda_Reduction_Sum( &spad[0], &spad[k], k ); +// Cuda_Reduction_Sum( spad, &spad[k], k ); - k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> - ( &spad[0], &spad[k], k ); + k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, + sizeof(rvec2) * DEF_BLOCK_SIZE >>> + ( spad, &spad[k], k ); cudaDeviceSynchronize( ); cudaCheckError( ); - k_reduction_rvec2 <<< 1, control->blocks_pow_2, sizeof(rvec2) * control->blocks_pow_2 >>> + k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32, + sizeof(rvec2) * ((blocks + 31) / 32) >>> ( &spad[k], &spad[k + blocks], blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu index 98da574e41fea06abfc1bd4381888d377f1bae4f..e2045a0e941aaf446887167fcbff324966e04481 100644 --- a/PG-PuReMD/src/cuda/cuda_environment.cu +++ b/PG-PuReMD/src/cuda/cuda_environment.cu @@ -3,16 +3,16 @@ #include "cuda_utils.h" -static void compute_blocks( int *blocks, int *block_size, int count ) +static void compute_blocks( int *blocks, int *block_size, int threads ) { *block_size = DEF_BLOCK_SIZE; // threads per block - *blocks = (int) CEIL( (double) count / DEF_BLOCK_SIZE ); // blocks per grid + *blocks = (threads + (DEF_BLOCK_SIZE - 1)) / DEF_BLOCK_SIZE; // blocks per grid } -static void compute_nearest_pow_2( int blocks, int *result ) +static void compute_nearest_multiple_32( int blocks, int *result ) { - *result = (int) EXP2( CEIL( LOG2((double) blocks) ) ); + *result = ((blocks + 31) / 32) * 32; } @@ -50,10 +50,10 @@ extern "C" void Cuda_Init_Block_Sizes( reax_system *system, control_params *control ) { compute_blocks( &control->blocks, &control->block_size, system->n ); - compute_nearest_pow_2( control->blocks, &control->blocks_pow_2 ); + compute_nearest_multiple_32( control->blocks, &control->blocks_pow_2 ); compute_blocks( &control->blocks_n, &control->block_size_n, system->N ); - compute_nearest_pow_2( control->blocks_n, &control->blocks_pow_2_n ); + compute_nearest_multiple_32( control->blocks_n, &control->blocks_pow_2_n ); } diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index aba8c39f498f662768c88313779556eb0b6f527a..bba38c8b3a42b3d1d03618c98767549899bc1b6b 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -264,7 +264,7 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i * the full shell communication method */ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters *sbp, two_body_parameters *tbp, storage workspace, control_params *control, - reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types, + reax_list far_nbr_list, int num_atom_types, int *max_cm_entries, int *realloc_cm_entries ) { int i, j, pj; @@ -288,14 +288,14 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters H = &workspace.H; cm_top = H->start[i]; - atom_i = &my_atoms[i]; - type_i = atom_i->type; - start_i = Start_Index( i, &far_nbr_list ); - end_i = End_Index( i, &far_nbr_list ); - sbp_i = &sbp[type_i]; - if ( i < H->n ) { + atom_i = &my_atoms[i]; + type_i = atom_i->type; + start_i = Start_Index( i, &far_nbr_list ); + end_i = End_Index( i, &far_nbr_list ); + sbp_i = &sbp[type_i]; + /* diagonal entry in the matrix */ H->j[cm_top] = i; H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control, @@ -316,21 +316,90 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters r_ij = far_nbr_list.far_nbr_list.d[pj]; H->j[cm_top] = j; + H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, + control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL ); + ++cm_top; + } + } + } - if ( control->tabulate == 0 ) - { - H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, - control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL ); - } - else - { - H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij, type_i, type_j,num_atom_types ); - } + __syncthreads( ); + + H->end[i] = cm_top; + num_cm_entries = cm_top - H->start[i]; + + /* reallocation checks */ + if ( num_cm_entries > max_cm_entries[i] ) + { + *realloc_cm_entries = TRUE; + } +} + + +/* Compute the tabulated charge matrix entries and store the matrix in full format + * using the far neighbors list (stored in full format) and according to + * the full shell communication method */ +CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_parameters *sbp, + two_body_parameters *tbp, storage workspace, control_params *control, + reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types, + int *max_cm_entries, int *realloc_cm_entries ) +{ + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int cm_top; + int num_cm_entries; + real r_ij; + single_body_parameters *sbp_i; + reax_atom *atom_i, *atom_j; + sparse_matrix *H; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= workspace.H.n_max ) + { + return; + } + + H = &workspace.H; + cm_top = H->start[i]; + + if ( i < H->n ) + { + atom_i = &my_atoms[i]; + type_i = atom_i->type; + start_i = Start_Index( i, &far_nbr_list ); + end_i = End_Index( i, &far_nbr_list ); + sbp_i = &sbp[type_i]; + + /* diagonal entry in the matrix */ + H->j[cm_top] = i; + H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control, + i, i, 0.0, 0.0, DIAGONAL ); + ++cm_top; + + /* update i-j distance - check if j is within cutoff */ + for ( pj = start_i; pj < end_i; ++pj ) + { + j = far_nbr_list.far_nbr_list.nbr[pj]; + + if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut ) + { + atom_j = &my_atoms[j]; + type_j = atom_j->type; + + r_ij = far_nbr_list.far_nbr_list.d[pj]; + + H->j[cm_top] = j; + H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij, + type_i, type_j, num_atom_types ); ++cm_top; } } } + __syncthreads( ); + H->end[i] = cm_top; num_cm_entries = cm_top - H->start[i]; @@ -539,26 +608,66 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp, } -CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, +CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control, + reax_list far_nbr_list, int cm_n, int cm_n_max, + int *cm_entries, int *max_cm_entries ) +{ + int i, pj; + int start_i, end_i; + int num_cm_entries; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= cm_n_max ) + { + return; + } + + num_cm_entries = 0; + + if ( i < cm_n ) + { + start_i = Start_Index( i, &far_nbr_list ); + end_i = End_Index( i, &far_nbr_list ); + + /* diagonal entry */ + ++num_cm_entries; + + for ( pj = start_i; pj < end_i; ++pj ) + { + if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut ) + { + ++num_cm_entries; + } + } + } + + __syncthreads( ); + + cm_entries[i] = num_cm_entries; + max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES ); +} + + +CUDA_GLOBAL void k_estimate_storages_bonds( reax_atom *my_atoms, single_body_parameters *sbp, two_body_parameters *tbp, control_params *control, reax_list far_nbr_list, - int num_atom_types, int n, int N, int total_cap, int cm_n_max, + int num_atom_types, int n, int N, int total_cap, int *bonds, int *max_bonds, - int *hbonds, int *max_hbonds, - int *cm_entries, int *max_cm_entries ) + int *hbonds, int *max_hbonds ) { int i, j, pj; int start_i, end_i; int type_i, type_j; int ihb, jhb; - int num_bonds, num_hbonds, num_cm_entries; + int num_bonds, num_hbonds; real cutoff; real r_ij; real C12, C34, C56; real BO, BO_s, BO_pi, BO_pi2; single_body_parameters *sbp_i, *sbp_j; two_body_parameters *twbp; - reax_atom *atom_i, *atom_j; + reax_atom *atom_i; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -569,7 +678,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, num_bonds = 0; num_hbonds = 0; - num_cm_entries = 0; if ( i < N ) { @@ -583,8 +691,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, if ( i < n ) { cutoff = control->nonb_cut; - /* diagonal entry */ - ++num_cm_entries; } else { @@ -594,7 +700,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, for ( pj = start_i; pj < end_i; ++pj ) { j = far_nbr_list.far_nbr_list.nbr[pj]; - atom_j = &my_atoms[j]; if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut ) { @@ -603,12 +708,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, ihb = sbp_i->p_hbond; jhb = sbp_j->p_hbond; - //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST - if ( i < n ) - { - ++num_cm_entries; - } - /* atom i: H bonding, ghost * atom j: H atom, native */ if ( control->hbond_cut > 0.0 && i >= n && j < n @@ -705,12 +804,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, hbonds[i] = num_hbonds; max_hbonds[i] = MAX( (int) CEIL(num_hbonds * SAFE_ZONE), MIN_HBONDS ); - - if ( i < cm_n_max ) - { - cm_entries[i] = num_cm_entries; - max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES ); - } } @@ -1098,20 +1191,51 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, { int blocks; - blocks = system->total_cap / DEF_BLOCK_SIZE - + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1); + if ( realloc_cm == TRUE ) + { + blocks = workspace->d_workspace->H.n_max / DEF_BLOCK_SIZE + + (workspace->d_workspace->H.n_max % DEF_BLOCK_SIZE == 0 ? 0 : 1); - k_estimate_storages <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, - (control_params *) control->d_control_params, - *(lists[FAR_NBRS]), system->reax_param.num_atom_types, - system->n, system->N, system->total_cap, - workspace->d_workspace->H.n_max, - system->d_bonds, system->d_max_bonds, - system->d_hbonds, system->d_max_hbonds, - system->d_cm_entries, system->d_max_cm_entries ); - cudaDeviceSynchronize( ); - cudaCheckError( ); + if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX ) + { +// k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>> +// ( (control_params *) control->d_control_params, +// *(lists[FAR_NBRS]), workspace->d_workspace->H.n, +// workspace->d_workspace->H.n_max, +// system->d_cm_entries, system->d_max_cm_entries ); + } + else + { + k_estimate_storages_cm_full <<< blocks, DEF_BLOCK_SIZE >>> + ( (control_params *) control->d_control_params, + *(lists[FAR_NBRS]), workspace->d_workspace->H.n, + workspace->d_workspace->H.n_max, + system->d_cm_entries, system->d_max_cm_entries ); + } + cudaDeviceSynchronize( ); + cudaCheckError( ); + + Cuda_Reduction_Sum( system->d_max_cm_entries, + system->d_total_cm_entries, workspace->d_workspace->H.n_max ); + copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" ); + } + + if ( realloc_bonds == TRUE || realloc_hbonds == TRUE ) + { + blocks = system->total_cap / DEF_BLOCK_SIZE + + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1); + + k_estimate_storages_bonds <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, + (control_params *) control->d_control_params, + *(lists[FAR_NBRS]), system->reax_param.num_atom_types, + system->n, system->N, system->total_cap, + system->d_bonds, system->d_max_bonds, + system->d_hbonds, system->d_max_hbonds ); + cudaDeviceSynchronize( ); + cudaCheckError( ); + } if ( realloc_bonds == TRUE ) { @@ -1143,15 +1267,7 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, #endif control->hbond_cut = 0.0; - k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params ); - } - - if ( realloc_cm == TRUE ) - { - Cuda_Reduction_Sum( system->d_max_cm_entries, - system->d_total_cm_entries, workspace->d_workspace->H.n_max ); - copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" ); + k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *) control->d_control_params ); } } @@ -1222,11 +1338,22 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, } else { - k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, - *(workspace->d_workspace), (control_params *) control->d_control_params, - *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types, - system->d_max_cm_entries, system->d_realloc_cm_entries ); + if ( control->tabulate <= 0 ) + { + k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, + *(workspace->d_workspace), (control_params *) control->d_control_params, + *(lists[FAR_NBRS]), system->reax_param.num_atom_types, + system->d_max_cm_entries, system->d_realloc_cm_entries ); + } + else + { + k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, + *(workspace->d_workspace), (control_params *) control->d_control_params, + *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types, + system->d_max_cm_entries, system->d_realloc_cm_entries ); + } cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -1340,11 +1467,11 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, #endif cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, - MAX( sizeof(real) * 2 * system->N, + MAX( sizeof(real) * system->n, MAX( sizeof(real) * 3 * system->n, - MAX( (sizeof(real) * 6 + sizeof(rvec) * 2) * system->N + sizeof(rvec) * control->blocks_n, - MAX( (sizeof(real) * 4 + sizeof(rvec) * 2) * system->n + sizeof(rvec) * control->blocks, - (sizeof(real) + sizeof(rvec)) * 2 * system->n + sizeof(rvec) * control->blocks )))), + MAX( (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n, + MAX( (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, + (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks )))), "Cuda_Compute_Bonded_Forces::workspace->scratch" ); spad = (real *) workspace->scratch; update_energy = (out_control->energy_update_freq > 0 @@ -1380,7 +1507,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaCheckError( ); /* 2. Bond Energy Interactions */ - cuda_memset( spad, 0, sizeof(real) * 2 * system->N, + cuda_memset( spad, 0, sizeof(real) * system->n, "Compute_Bonded_Forces::spad" ); Cuda_Bonds <<< control->blocks, control->block_size, sizeof(real) * control->block_size >>> @@ -1438,7 +1565,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, sizeof(int) * system->total_bonds, "Cuda_Compute_Bonded_Forces::workspace->scratch" ); + thbody = (int *) workspace->scratch; + spad = (real *) workspace->scratch; /* in case scratch gets reallocated above, changing the pointer */ + ret = Cuda_Estimate_Storage_Three_Body( system, control, data, workspace, lists, thbody ); @@ -1446,63 +1576,69 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, { Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices, lists ); - cuda_memset( spad, 0, (sizeof(real) * 6 + sizeof(rvec) * 2) * system->N, + cuda_memset( spad, 0, + (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n, "Cuda_Compute_Bonded_Forces::spad" ); Cuda_Valence_Angles_Part1 <<< control->blocks_n, control->block_size_n >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_thbp, - (control_params *)control->d_control_params, + (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]), system->n, system->N, system->reax_param.num_atom_types, - spad, &spad[2 * system->N], &spad[4 * system->N], (rvec *)(&spad[6 * system->N]) ); + spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) ); cudaDeviceSynchronize( ); cudaCheckError( ); - /* reduction for E_Ang */ if ( update_energy == TRUE ) { - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, + /* reduction for E_Ang */ + Cuda_Reduction_Sum( spad, + &((simulation_data *)data->d_simulation_data)->my_en.e_ang, system->N ); - } - if ( update_energy == TRUE ) - { /* reduction for E_Pen */ - Cuda_Reduction_Sum( &spad[2 * system->N], + Cuda_Reduction_Sum( &spad[system->N], &((simulation_data *)data->d_simulation_data)->my_en.e_pen, system->N ); /* reduction for E_Coa */ - Cuda_Reduction_Sum( &spad[4 * system->N], + Cuda_Reduction_Sum( &spad[2 * system->N], &((simulation_data *)data->d_simulation_data)->my_en.e_coa, system->N ); } - /* reduction for ext_pres */ - rvec_spad = (rvec *) (&spad[6 * system->N]); - k_reduction_rvec <<< control->blocks_n, control->block_size_n, - sizeof(rvec) * control->block_size_n >>> - ( rvec_spad, &rvec_spad[system->N], system->N ); - cudaDeviceSynchronize( ); - cudaCheckError( ); + if ( control->virial == 1 ) + { + rvec_spad = (rvec *) (&spad[3 * system->N]); - k_reduction_rvec <<< 1, control->blocks_pow_2_n, sizeof(rvec) * control->blocks_pow_2_n >>> - ( &rvec_spad[system->N], &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks_n ); - cudaDeviceSynchronize (); - cudaCheckError( ); -// Cuda_Reduction_Sum( rvec_spad, -// &((simulation_data *)data->d_simulation_data)->my_ext_press, -// system->N ); + /* reduction for ext_pres */ + k_reduction_rvec <<< control->blocks_n, control->block_size_n, + sizeof(rvec) * (control->block_size_n / 32) >>> + ( rvec_spad, &rvec_spad[system->N], system->N ); + cudaDeviceSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<< 1, control->blocks_pow_2_n, + sizeof(rvec) * (control->blocks_pow_2_n / 32) >>> + ( &rvec_spad[system->N], + &((simulation_data *)data->d_simulation_data)->my_ext_press, + control->blocks_n ); + cudaDeviceSynchronize (); + cudaCheckError( ); +// Cuda_Reduction_Sum( rvec_spad, +// &((simulation_data *)data->d_simulation_data)->my_ext_press, +// system->N ); + } Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size_n >>> - ( system->d_my_atoms, (control_params *)control->d_control_params, + ( system->d_my_atoms, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[BONDS]), system->N ); cudaDeviceSynchronize( ); cudaCheckError( ); /* 5. Torsion Angles Interactions */ - cuda_memset( spad, 0, (sizeof(real) * 4 + sizeof(rvec) * 2) * system->n, + cuda_memset( spad, 0, (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, "Cuda_Compute_Bonded_Forces::spad" ); Cuda_Torsion_Angles_Part1 <<< control->blocks, control->block_size >>> @@ -1510,40 +1646,47 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, (control_params *) control->d_control_params, *(lists[BONDS]), *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n, system->reax_param.num_atom_types, - spad, &spad[2 * system->n], (rvec *) (&spad[4 * system->n]) ); + spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) ); cudaDeviceSynchronize( ); cudaCheckError( ); if ( update_energy == TRUE ) { /* reduction for E_Tor */ - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, + Cuda_Reduction_Sum( spad, + &((simulation_data *)data->d_simulation_data)->my_en.e_tor, system->n ); /* reduction for E_Con */ - Cuda_Reduction_Sum( &spad[2 * system->n], + Cuda_Reduction_Sum( &spad[system->n], &((simulation_data *)data->d_simulation_data)->my_en.e_con, system->n ); } - /* reduction for ext_pres */ - rvec_spad = (rvec *) (&spad[4 * system->n]); - k_reduction_rvec <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> - ( rvec_spad, &rvec_spad[system->n], system->n ); - cudaDeviceSynchronize( ); - cudaCheckError( ); + if ( control->virial == 1 ) + { + rvec_spad = (rvec *) (&spad[2 * system->n]); - k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>> - ( &rvec_spad[system->n], - &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks ); - cudaDeviceSynchronize( ); - cudaCheckError( ); -// Cuda_Reduction_Sum( rvec_spad, -// &((simulation_data *)data->d_simulation_data)->my_ext_press, -// system->n ); + /* reduction for ext_pres */ + k_reduction_rvec <<< control->blocks, control->block_size, + sizeof(rvec) * (control->block_size / 32) >>> + ( rvec_spad, &rvec_spad[system->n], system->n ); + cudaDeviceSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<< 1, control->blocks_pow_2, + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + ( &rvec_spad[system->n], + &((simulation_data *)data->d_simulation_data)->my_ext_press, + control->blocks ); + cudaDeviceSynchronize( ); + cudaCheckError( ); +// Cuda_Reduction_Sum( rvec_spad, +// &((simulation_data *)data->d_simulation_data)->my_ext_press, +// system->n ); + } - Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size >>> + Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size_n >>> ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]), system->N ); cudaDeviceSynchronize( ); @@ -1552,7 +1695,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, /* 6. Hydrogen Bonds Interactions */ if ( control->hbond_cut > 0.0 && system->numH > 0 ) { - cuda_memset( spad, 0, (sizeof(real) + sizeof(rvec)) * 2 * system->n, + cuda_memset( spad, 0, + (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, "Cuda_Compute_Bonded_Forces::spad" ); // hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + @@ -1567,7 +1711,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, *(workspace->d_workspace), *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]), system->n, system->reax_param.num_atom_types, - spad, (rvec *) (&spad[2 * system->n]), system->my_rank ); + spad, (rvec *) (&spad[system->n]), system->my_rank ); cudaDeviceSynchronize( ); cudaCheckError( ); @@ -1579,23 +1723,28 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, system->n ); } - /* reduction for ext_pres */ - rvec_spad = (rvec *) (&spad[2 * system->n]); - k_reduction_rvec <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> - ( rvec_spad, &rvec_spad[system->n], system->n ); - cudaDeviceSynchronize( ); - cudaCheckError( ); - - k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>> - ( &rvec_spad[system->n], - &((simulation_data *)data->d_simulation_data)->my_ext_press, - control->blocks ); - cudaDeviceSynchronize( ); - cudaCheckError( ); -// Cuda_Reduction_Sum( rvec_spad, -// &((simulation_data *)data->d_simulation_data)->my_ext_press, -// system->n ); + if ( control->virial == 1 ) + { + rvec_spad = (rvec *) (&spad[system->n]); + + /* reduction for ext_pres */ + k_reduction_rvec <<< control->blocks, control->block_size, + sizeof(rvec) * (control->block_size / 32) >>> + ( rvec_spad, &rvec_spad[system->n], system->n ); + cudaDeviceSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<< 1, control->blocks_pow_2, + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + ( &rvec_spad[system->n], + &((simulation_data *)data->d_simulation_data)->my_ext_press, + control->blocks ); + cudaDeviceSynchronize( ); + cudaCheckError( ); +// Cuda_Reduction_Sum( rvec_spad, +// &((simulation_data *)data->d_simulation_data)->my_ext_press, +// system->n ); + } Cuda_Hydrogen_Bonds_Part2 <<< control->blocks, control->block_size >>> ( system->d_my_atoms, *(workspace->d_workspace), diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index 0cacf504639e0723f65f2690f7d9ffdd81ad4b14..b46989af98b629952a402dbb21ca23a6dcef817b 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -198,8 +198,12 @@ void Cuda_Init_Lists( reax_system *system, control_params *control, Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + /* first call to Cuda_Estimate_Storages requires setting these manually before allocation */ + workspace->d_workspace->H.n = system->n; + workspace->d_workspace->H.n_max = system->local_cap; + workspace->d_workspace->H.format = SYM_FULL_MATRIX; + /* estimate storage for bonds, hbonds, and sparse matrix */ - workspace->d_workspace->H.n_max = system->local_cap; // first call requires setting this manually before allocation Cuda_Estimate_Storages( system, control, workspace, lists, TRUE, TRUE, TRUE, data->step - data->prev_steps ); diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu index 02cad3548cb0ca5deca8137d9693f89e82862610..37e0e28b1912c155efcf7df1482f28c3aa016868 100644 --- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu @@ -525,7 +525,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms, /* one thread per atom implementation */ -CUDA_GLOBAL void k_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, +CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, global_parameters gp, control_params *control, storage workspace, reax_list far_nbr_list, LR_lookup_table *t_LR, int n, int num_atom_types, @@ -693,53 +693,47 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, storage *workspace, simulation_data *data, reax_list **lists, output_controls *out_control ) { -// int blocks; int update_energy; real *spad; rvec *spad_rvec; -// blocks = (system->n * VDW_KER_THREADS_PER_ATOM / DEF_BLOCK_SIZE) -// + ((system->n * VDW_KER_THREADS_PER_ATOM % DEF_BLOCK_SIZE == 0) ? 0 : 1); update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks, "Cuda_NonBonded_Energy::workspace->scratch" ); -// cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, -// (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * blocks, -// "Cuda_NonBonded_Energy::workspace->scratch" ); spad = (real *) workspace->scratch; if ( control->tabulate == 0 ) { k_vdW_coulomb_energy <<< control->blocks, control->block_size >>> ( system->d_my_atoms, system->reax_param.d_tbp, - system->reax_param.d_gp, (control_params *)control->d_control_params, + system->reax_param.d_gp, (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[FAR_NBRS]), system->n, system->reax_param.num_atom_types, - spad, &spad[system->n], (rvec *)(&spad[2 * system->n]) ); + spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) ); // k_vdW_coulomb_energy_opt <<< blocks, DEF_BLOCK_SIZE, // (2 * sizeof(real) + sizeof(rvec)) * DEF_BLOCK_SIZE >>> // ( system->d_my_atoms, system->reax_param.d_tbp, -// system->reax_param.d_gp, (control_params *)control->d_control_params, +// system->reax_param.d_gp, (control_params *) control->d_control_params, // *(workspace->d_workspace), *(lists[FAR_NBRS]), // system->n, system->reax_param.num_atom_types, -// spad, &spad[system->n], (rvec *)(&spad[2 * system->n]) ); +// spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) ); cudaDeviceSynchronize( ); cudaCheckError( ); } else { - k_tabulated_vdW_coulomb_energy <<< control->blocks, control->block_size >>> + k_vdW_coulomb_energy_tab <<< control->blocks, control->block_size >>> ( system->d_my_atoms, system->reax_param.d_gp, - (control_params *)control->d_control_params, + (control_params *) control->d_control_params, *(workspace->d_workspace), *(lists[FAR_NBRS]), workspace->d_LR, system->n, system->reax_param.num_atom_types, data->step, data->prev_steps, out_control->energy_update_freq, - spad, &spad[system->n], (rvec *)(&spad[2 * system->n])); + spad, &spad[system->n], (rvec *) (&spad[2 * system->n])); cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -747,28 +741,35 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, if ( update_energy == TRUE ) { /* reduction for vdw */ - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, + Cuda_Reduction_Sum( spad, + &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, system->n ); /* reduction for ele */ - Cuda_Reduction_Sum( &spad[system->n], &((simulation_data *)data->d_simulation_data)->my_en.e_ele, + Cuda_Reduction_Sum( &spad[system->n], + &((simulation_data *)data->d_simulation_data)->my_en.e_ele, system->n ); } - /* reduction for ext_press */ - spad_rvec = (rvec *) (&spad[2 * system->n]); - k_reduction_rvec <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> - ( spad_rvec, &spad_rvec[system->n], system->n ); - cudaDeviceSynchronize( ); - cudaCheckError( ); + if ( control->virial == 1 ) + { + spad_rvec = (rvec *) (&spad[2 * system->n]); - k_reduction_rvec <<< 1, control->blocks_pow_2, - sizeof(rvec) * control->blocks_pow_2 >>> - ( &spad_rvec[system->n], - &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks ); - cudaDeviceSynchronize( ); - cudaCheckError( ); + /* reduction for ext_press */ + k_reduction_rvec <<< control->blocks, control->block_size, + sizeof(rvec) * (control->block_size / 32) >>> + ( spad_rvec, &spad_rvec[system->n], system->n ); + cudaDeviceSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<< 1, control->blocks_pow_2, + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + ( &spad_rvec[system->n], + &((simulation_data *)data->d_simulation_data)->my_ext_press, + control->blocks ); + cudaDeviceSynchronize( ); + cudaCheckError( ); + } if ( update_energy == TRUE ) { diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu index 3e44b8428a716d750af9ce4eddb25cfabc4f4b26..71dce725d14ae8576bad491a8b75e66a801e1d8c 100644 --- a/PG-PuReMD/src/cuda/cuda_reduction.cu +++ b/PG-PuReMD/src/cuda/cuda_reduction.cu @@ -198,7 +198,7 @@ CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n ) rvec_Copy( data, input[i] ); /* warp-level sum using registers within a warp */ - for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) + for ( offset = warpSize >> 1; offset > 0; offset >>= 1 ) { data[0] += __shfl_down_sync( mask, data[0], offset ); data[1] += __shfl_down_sync( mask, data[1], offset ); @@ -249,7 +249,7 @@ CUDA_GLOBAL void k_reduction_rvec2( rvec2 *input, rvec2 *results, size_t n ) data[1] = input[i][1]; /* warp-level sum using registers within a warp */ - for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) + for ( offset = warpSize >> 1; offset > 0; offset >>= 1 ) { data[0] += __shfl_down_sync( mask, data[0], offset ); data[1] += __shfl_down_sync( mask, data[1], offset ); diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu index fdb63082f0b21cd7fec7942581d50f2f3c45d899..385a163d1a4d14d309d440cb876044800c1c800c 100644 --- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu @@ -147,7 +147,7 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start, const real * const x, real * const b, int N ) { int i, pj, si, ei; - real b_local; + real sum; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -158,20 +158,20 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start, si = row_ptr_start[i]; ei = row_ptr_end[i]; - b_local = 0.0; + sum = 0.0; for ( pj = si; pj < ei; ++pj ) { - b_local += vals[pj] * x[col_ind[pj]]; + sum += vals[pj] * x[col_ind[pj]]; /* symmetric contribution (A is symmetric, lower half stored) */ atomicAdd( (double*) &b[col_ind[pj]], (double) (vals[pj] * x[i]) ); } /* diagonal entry */ - b_local += vals[ei] * x[i]; + sum += vals[ei] * x[i]; /* local contribution to row i for this thread */ - atomicAdd( (double*) &b[i], (double) b_local ); + atomicAdd( (double*) &b[i], (double) sum ); } @@ -225,42 +225,89 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start, int *row_ptr_end, int *col_ind, real *vals, const real * const x, real * const b, int n ) { - int i, pj, thread_id, warp_id, lane_id, offset; + int pj, thread_id, warp_id, lane_id, offset; int si, ei; - unsigned int mask; real sum; thread_id = blockDim.x * blockIdx.x + threadIdx.x; warp_id = thread_id >> 5; + + if ( warp_id >= n ) + { + return; + } + lane_id = thread_id & 0x0000001F; - mask = __ballot_sync( FULL_MASK, warp_id < n ); + si = row_ptr_start[warp_id]; + ei = row_ptr_end[warp_id]; + sum = 0.0; - if ( warp_id < n ) + /* partial sums per thread */ + for ( pj = si + lane_id; pj < ei; pj += warpSize ) { - i = warp_id; - si = row_ptr_start[i]; - ei = row_ptr_end[i]; - sum = 0.0; + sum += vals[pj] * x[col_ind[pj]]; + } - /* partial sums per thread */ - for ( pj = si + lane_id; pj < ei; pj += warpSize ) - { - sum += vals[pj] * x[ col_ind[pj] ]; - } + /* warp-level reduction of partial sums + * using registers within a warp */ + for ( offset = warpSize >> 1; offset > 0; offset >>= 1 ) + { + sum += __shfl_down_sync( FULL_MASK, sum, offset ); + } - /* warp-level reduction of partial sums - * using registers within a warp */ - for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) - { - sum += __shfl_down_sync( mask, sum, offset ); - } + __syncthreads( ); - /* first thread within a warp writes sum to global memory */ - if ( lane_id == 0 ) - { - b[i] = sum; - } + /* first thread within a warp writes sum to global memory */ + if ( lane_id == 0 ) + { + b[warp_id] = sum; + } +} + + +/* sparse matrix, dense vector multiplication Ax = b, + * where one GPU thread multiplies a row + * + * A: symmetric (lower triangular portion only stored), square matrix, + * stored in CSR format + * X: 2 dense vectors, size equal to num. columns in A + * B (output): 2 dense vectors, size equal to num. columns in A + * N: number of rows in A */ +CUDA_GLOBAL void k_dual_sparse_matvec_half_csr( int *row_ptr_start, + int *row_ptr_end, int *col_ind, real *vals, + const rvec2 * const x, rvec2 * const b, int N ) +{ + int i, pj, si, ei; + rvec2 sum; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= N ) + { + return; + } + + si = row_ptr_start[i]; + ei = row_ptr_end[i]; + sum[0] = 0.0; + sum[1] = 0.0; + + for ( pj = si; pj < ei; ++pj ) + { + sum[0] += vals[pj] * x[col_ind[pj]][0]; + sum[1] += vals[pj] * x[col_ind[pj]][1]; + /* symmetric contribution (A is symmetric, lower half stored) */ + atomicAdd( (double*) &b[col_ind[pj]][0], (double) (vals[pj] * x[i][0]) ); + atomicAdd( (double*) &b[col_ind[pj]][1], (double) (vals[pj] * x[i][1]) ); } + + /* diagonal entry */ + sum[0] += vals[ei] * x[i][0]; + sum[1] += vals[ei] * x[i][1]; + + /* local contribution to row i for this thread */ + atomicAdd( (double*) &b[i][0], (double) sum[0] ); + atomicAdd( (double*) &b[i][1], (double) sum[1] ); } @@ -285,8 +332,8 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A, for ( pj = si; pj < ei; ++pj ) { - sum[0] += A.val[pj] * x[ A.j[pj] ][0]; - sum[1] += A.val[pj] * x[ A.j[pj] ][1]; + sum[0] += A.val[pj] * x[A.j[pj]][0]; + sum[1] += A.val[pj] * x[A.j[pj]][1]; } b[i][0] = sum[0]; @@ -302,50 +349,50 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A, * stored in CSR format * X: 2 dense vectors, size equal to num. columns in A * B (output): 2 dense vectors, size equal to num. columns in A - * N: number of rows in A */ + * n: number of rows in A */ CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start, int *row_ptr_end, int *col_ind, real *vals, rvec2 const * const x, rvec2 * const b, int n ) { - int i, pj, thread_id, warp_id, lane_id, offset; - int si, ei; - unsigned int mask; + int pj, si, ei, thread_id, warp_id, lane_id, offset; rvec2 sum; thread_id = blockDim.x * blockIdx.x + threadIdx.x; warp_id = thread_id >> 5; + + if ( warp_id >= n ) + { + return; + } + lane_id = thread_id & 0x0000001F; - mask = __ballot_sync( FULL_MASK, warp_id < n ); + si = row_ptr_start[warp_id]; + ei = row_ptr_end[warp_id]; + sum[0] = 0.0; + sum[1] = 0.0; - if ( warp_id < n ) + /* partial sums per thread */ + for ( pj = si + lane_id; pj < ei; pj += warpSize ) { - i = warp_id; - si = row_ptr_start[i]; - ei = row_ptr_end[i]; - sum[0] = 0.0; - sum[1] = 0.0; - - /* partial sums per thread */ - for ( pj = si + lane_id; pj < ei; pj += warpSize ) - { - sum[0] += vals[pj] * x[ col_ind[pj] ][0]; - sum[1] += vals[pj] * x[ col_ind[pj] ][1]; - } + sum[0] += vals[pj] * x[col_ind[pj]][0]; + sum[1] += vals[pj] * x[col_ind[pj]][1]; + } - /* warp-level reduction of partial sums - * using registers within a warp */ - for ( offset = warpSize >> 1; offset > 0; offset /= 2 ) - { - sum[0] += __shfl_down_sync( mask, sum[0], offset ); - sum[1] += __shfl_down_sync( mask, sum[1], offset ); - } + /* warp-level reduction of partial sums + * using registers within a warp */ + for ( offset = warpSize >> 1; offset > 0; offset >>= 1 ) + { + sum[0] += __shfl_down_sync( FULL_MASK, sum[0], offset ); + sum[1] += __shfl_down_sync( FULL_MASK, sum[1], offset ); + } - /* first thread within a warp writes sum to global memory */ - if ( lane_id == 0 ) - { - b[i][0] = sum[0]; - b[i][1] = sum[1]; - } + __syncthreads( ); + + /* first thread within a warp writes sum to global memory */ + if ( lane_id == 0 ) + { + b[warp_id][0] = sum[0]; + b[warp_id][1] = sum[1]; } } @@ -408,6 +455,7 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system, sizeof(rvec2) * n, TRUE, SAFE_ZONE, "Dual_Sparse_MatVec_Comm_Part1::workspace->host_scratch" ); spad = (rvec2 *) workspace->host_scratch; + copy_host_device( spad, (void *) x, sizeof(rvec2) * n, cudaMemcpyDeviceToHost, "Dual_Sparse_MatVec_Comm_Part1::x" ); @@ -439,18 +487,18 @@ static void Dual_Sparse_MatVec_local( control_params const * const control, /* half-format requires entries of b be initialized to zero */ cuda_memset( b, 0, sizeof(rvec2) * n, "Dual_Sparse_MatVec_local::b" ); - //TODO: implement half-format dual SpMV -// blocks = n * 32 / DEF_BLOCK_SIZE + -// (n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1); - /* 1 thread per row implementation */ -// k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>> -// ( A->start, A->end, A->j, A->val, x, b, n ); + k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>> + ( A->start, A->end, A->j, A->val, x, b, A->n ); + //TODO: implement half-format dual SpMV +// blocks = (A->n * 32) / DEF_BLOCK_SIZE +// + ((A->n * 32) % DEF_BLOCK_SIZE == 0 ? 0 : 1); + /* multiple threads per row implementation, * with shared memory to accumulate partial row sums */ // k_dual_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>> -// ( A->start, A->end, A->j, A->val, x, b, n ); +// ( A->start, A->end, A->j, A->val, x, b, A->n ); cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -458,15 +506,15 @@ static void Dual_Sparse_MatVec_local( control_params const * const control, { /* 1 thread per row implementation */ // k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>> -// ( *A, x, b, n ); +// ( *A, x, b, A->n ); - blocks = (A->n * 32 / DEF_BLOCK_SIZE) - + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1); + blocks = ((A->n * 32) / DEF_BLOCK_SIZE) + + (((A->n * 32) % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - /* multiple threads per row implementation + /* 32 threads per row implementation * using registers to accumulate partial row sums */ k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>> - ( A->start, A->end, A->j, A->val, x, b, n ); + ( A->start, A->end, A->j, A->val, x, b, A->n ); cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -498,7 +546,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system, rvec2 *spad; /* reduction required for symmetric half matrix */ -// if ( mat_format == SYM_HALF_MATRIX ) + if ( mat_format == SYM_HALF_MATRIX ) { //#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT // Coll( system, mpi_data, b, buf_type, mpi_type ); @@ -619,20 +667,20 @@ static void Sparse_MatVec_local( control_params const * const control, if ( A->format == SYM_HALF_MATRIX ) { -// blocks = (A->n * 32 / DEF_BLOCK_SIZE) -// + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - /* half-format requires entries of b be initialized to zero */ cuda_memset( b, 0, sizeof(real) * n, "Sparse_MatVec_local::b" ); /* 1 thread per row implementation */ k_sparse_matvec_half_csr <<< control->blocks, control->block_size >>> - ( A->start, A->end, A->j, A->val, x, b, n ); + ( A->start, A->end, A->j, A->val, x, b, A->n ); + +// blocks = (A->n * 32 / DEF_BLOCK_SIZE) +// + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1); /* multiple threads per row implementation, * with shared memory to accumulate partial row sums */ // k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>> -// ( A->start, A->end, A->j, A->val, x, b, n ); +// ( A->start, A->end, A->j, A->val, x, b, A->n ); cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -640,15 +688,15 @@ static void Sparse_MatVec_local( control_params const * const control, { /* 1 thread per row implementation */ // k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>> -// ( A->start, A->end, A->j, A->val, x, b, n ); +// ( A->start, A->end, A->j, A->val, x, b, A->n ); - blocks = (A->n * 32 / DEF_BLOCK_SIZE) - + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1); + blocks = ((A->n * 32) / DEF_BLOCK_SIZE) + + (((A->n * 32) % DEF_BLOCK_SIZE) == 0 ? 0 : 1); /* multiple threads per row implementation * using registers to accumulate partial row sums */ k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>> - ( A->start, A->end, A->j, A->val, x, b, n ); + ( A->start, A->end, A->j, A->val, x, b, A->n ); cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -678,7 +726,7 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system, real *spad; /* reduction required for symmetric half matrix */ -// if ( mat_format == SYM_HALF_MATRIX ) + if ( mat_format == SYM_HALF_MATRIX ) { //#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT // Coll( system, mpi_data, b, buf_type, mpi_type ); @@ -761,8 +809,6 @@ int Cuda_dual_CG( reax_system const * const system, time = Get_Time( ); #endif - matvecs = 0; - Dual_Sparse_MatVec( system, control, data, workspace, mpi_data, H, x, system->N, workspace->d_workspace->q2 ); @@ -907,6 +953,10 @@ int Cuda_dual_CG( reax_system const * const system, Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->s, 0, system->n ); } + else + { + matvecs = 0; + } if ( i >= control->cm_solver_max_iters ) { diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu index cf3337faada05b255aa119b23677e53947146516..e13bb8b9c135ea93d4404db0d76c9caf5f570738 100644 --- a/PG-PuReMD/src/cuda/cuda_system_props.cu +++ b/PG-PuReMD/src/cuda/cuda_system_props.cu @@ -176,39 +176,29 @@ CUDA_GLOBAL void k_center_of_mass_blocks_amcm( single_body_parameters *sbp, CUDA_GLOBAL void k_compute_inertial_tensor_blocks( real *input, real *output, size_t n ) { - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - unsigned int i, index, xx_i, xy_i, xz_i, yy_i, yz_i, zz_i; + extern __shared__ real t_s[]; + unsigned int i, index; int offset; i = blockIdx.x * blockDim.x + threadIdx.x; - xx_i = threadIdx.x; - xy_i = blockDim.x; - xz_i = 2 * blockDim.x; - yy_i = 3 * blockDim.x; - yz_i = 4 * blockDim.x; - zz_i = 5 * blockDim.x; - - xx[xx_i] = 0.0; - xy[xy_i + threadIdx.x] = 0.0; - xz[xz_i + threadIdx.x] = 0.0; - yy[yy_i + threadIdx.x] = 0.0; - yz[yz_i + threadIdx.x] = 0.0; - zz[zz_i + threadIdx.x] = 0.0; - if ( i < n ) { - xx[ xx_i ] = input[ threadIdx.x * 6 ]; - xy[ xy_i + threadIdx.x ] = input[ threadIdx.x * 6 + 1 ]; - xz[ xz_i + threadIdx.x ] = input[ threadIdx.x * 6 + 2 ]; - yy[ yy_i + threadIdx.x ] = input[ threadIdx.x * 6 + 3 ]; - yz[ yz_i + threadIdx.x ] = input[ threadIdx.x * 6 + 4 ]; - zz[ zz_i + threadIdx.x ] = input[ threadIdx.x * 6 + 5 ]; + t_s[ 6 * i ] = input[ i * 6 ]; + t_s[ 6 * i + 1 ] = input[ i * 6 + 1 ]; + t_s[ 6 * i + 2 ] = input[ i * 6 + 2 ]; + t_s[ 6 * i + 3 ] = input[ i * 6 + 3 ]; + t_s[ 6 * i + 4 ] = input[ i * 6 + 4 ]; + t_s[ 6 * i + 5 ] = input[ i * 6 + 5 ]; + } + else + { + t_s[ 6 * i ] = 0.0; + t_s[ 6 * i + 1 ] = 0.0; + t_s[ 6 * i + 2 ] = 0.0; + t_s[ 6 * i + 3 ] = 0.0; + t_s[ 6 * i + 4 ] = 0.0; + t_s[ 6 * i + 5 ] = 0.0; } __syncthreads( ); @@ -216,25 +206,25 @@ CUDA_GLOBAL void k_compute_inertial_tensor_blocks( real *input, real *output, si { if ( threadIdx.x < offset ) { - index = threadIdx.x + offset; - xx[ threadIdx.x ] += xx[ index ]; - xy[ xy_i + threadIdx.x ] += xy[ xy_i + index ]; - xz[ xz_i + threadIdx.x ] += xz[ xz_i + index ]; - yy[ yy_i + threadIdx.x ] += yy[ yy_i + index ]; - yz[ yz_i + threadIdx.x ] += yz[ yz_i + index ]; - zz[ zz_i + threadIdx.x ] += zz[ zz_i + index ]; + index = 6 * (threadIdx.x + offset); + t_s[ 6 * threadIdx.x ] += t_s[ index ]; + t_s[ 6 * threadIdx.x + 1 ] += t_s[ index + 1 ]; + t_s[ 6 * threadIdx.x + 2 ] += t_s[ index + 2 ]; + t_s[ 6 * threadIdx.x + 3 ] += t_s[ index + 3 ]; + t_s[ 6 * threadIdx.x + 4 ] += t_s[ index + 4 ]; + t_s[ 6 * threadIdx.x + 5 ] += t_s[ index + 5 ]; } __syncthreads( ); } if ( threadIdx.x == 0 ) { - output[0] = xx[0]; - output[1] = xy[xy_i]; - output[2] = xz[xz_i]; - output[3] = xz[yy_i]; - output[4] = xz[yz_i]; - output[5] = xz[zz_i]; + output[0] = t_s[0]; + output[1] = t_s[1]; + output[2] = t_s[2]; + output[3] = t_s[3]; + output[4] = t_s[4]; + output[5] = t_s[5]; } } @@ -242,16 +232,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_blocks( real *input, real *output, si CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp, reax_atom *atoms, real *t_g, real xcm0, real xcm1, real xcm2, size_t n ) { - extern __shared__ real xx_s[]; - extern __shared__ real xy_s[]; - unsigned int xy_i, i, index, mask; + extern __shared__ real xx_xy_s[]; + unsigned int i, index, mask; int offset; real xx, xy, m; rvec diff, xcm; i = blockIdx.x * blockDim.x + threadIdx.x; mask = __ballot_sync( FULL_MASK, i < n ); - xy_i = blockDim.x; xcm[0] = xcm0; xcm[1] = xcm1; xcm[2] = xcm2; @@ -273,8 +261,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp, /* first thread within a warp writes warp-level sum to shared memory */ if ( threadIdx.x % warpSize == 0 ) { - xx_s[threadIdx.x >> 5] = xx; - xy_s[threadIdx.x >> 5] = xy; + xx_xy_s[2 * (threadIdx.x >> 5)] = xx; + xx_xy_s[2 * (threadIdx.x >> 5) + 1] = xy; } } __syncthreads( ); @@ -284,9 +272,9 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp, { if ( threadIdx.x < offset ) { - index = threadIdx.x + offset; - xx_s[ threadIdx.x ] += xx_s[ index ]; - xy_s[ xy_i + threadIdx.x ] += xy_s[ xy_i + index ]; + index = 2 * (threadIdx.x + offset); + xx_xy_s[ 2 * threadIdx.x ] += xx_xy_s[ index ]; + xx_xy_s[ 2 * threadIdx.x + 1 ] += xx_xy_s[ index + 1 ]; } __syncthreads( ); } @@ -295,8 +283,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp, * of the reduction back to global memory */ if ( threadIdx.x == 0 ) { - t_g[ blockIdx.x * 6 ] = xx_s[ 0 ]; - t_g[ blockIdx.x * 6 + 1 ] = xy_s[ xy_i ]; + t_g[ blockIdx.x * 6 ] = xx_xy_s[ 0 ]; + t_g[ blockIdx.x * 6 + 1 ] = xx_xy_s[ 1 ]; } } @@ -304,16 +292,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp, CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp, reax_atom *atoms, real *t_g, real xcm0, real xcm1, real xcm2, size_t n ) { - extern __shared__ real xz_s[]; - extern __shared__ real yy_s[]; - unsigned int yy_i, i, index, mask; + extern __shared__ real xz_yy_s[]; + unsigned int i, index, mask; int offset; real xz, yy, m; rvec diff, xcm; i = blockIdx.x * blockDim.x + threadIdx.x; mask = __ballot_sync( FULL_MASK, i < n ); - yy_i = blockDim.x; xcm[0] = xcm0; xcm[1] = xcm1; xcm[2] = xcm2; @@ -335,8 +321,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp, /* first thread within a warp writes warp-level sum to shared memory */ if ( threadIdx.x % warpSize == 0 ) { - xz_s[threadIdx.x >> 5] = xz; - yy_s[threadIdx.x >> 5] = yy; + xz_yy_s[2 * (threadIdx.x >> 5)] = xz; + xz_yy_s[2 * (threadIdx.x >> 5) + 1] = yy; } } __syncthreads( ); @@ -346,9 +332,9 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp, { if ( threadIdx.x < offset ) { - index = threadIdx.x + offset; - xz_s[ threadIdx.x ] += xz_s[ index ]; - yy_s[ yy_i + threadIdx.x ] += yy_s[ yy_i + index ]; + index = 2 * (threadIdx.x + offset); + xz_yy_s[ 2 * threadIdx.x ] += xz_yy_s[ index ]; + xz_yy_s[ 2 * threadIdx.x + 1 ] += xz_yy_s[ index + 1 ]; } __syncthreads( ); } @@ -357,8 +343,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp, * of the reduction back to global memory */ if ( threadIdx.x == 0 ) { - t_g[ blockIdx.x * 6 + 2 ] = xz_s[ 0 ]; - t_g[ blockIdx.x * 6 + 3 ] = yy_s[ yy_i ]; + t_g[ blockIdx.x * 6 + 2 ] = xz_yy_s[ 0 ]; + t_g[ blockIdx.x * 6 + 3 ] = xz_yy_s[ 1 ]; } } @@ -366,16 +352,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp, CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp, reax_atom *atoms, real *t_g, real xcm0, real xcm1, real xcm2, size_t n ) { - extern __shared__ real yz_s[]; - extern __shared__ real zz_s[]; - unsigned int i, zz_i, index, mask; + extern __shared__ real yz_zz_s[]; + unsigned int i, index, mask; int offset; real yz, zz, m; rvec diff, xcm; i = blockIdx.x * blockDim.x + threadIdx.x; mask = __ballot_sync( FULL_MASK, i < n ); - zz_i = blockDim.x; xcm[0] = xcm0; xcm[1] = xcm1; xcm[2] = xcm2; @@ -397,8 +381,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp, /* first thread within a warp writes warp-level sum to shared memory */ if ( threadIdx.x % warpSize == 0 ) { - yz_s[threadIdx.x >> 5] = yz; - zz_s[threadIdx.x >> 5] = zz; + yz_zz_s[2 * (threadIdx.x >> 5)] = yz; + yz_zz_s[2 * (threadIdx.x >> 5)] = zz; } } __syncthreads( ); @@ -408,9 +392,9 @@ CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp, { if ( threadIdx.x < offset ) { - index = threadIdx.x + offset; - yz_s[ threadIdx.x ] += yz_s[ index ]; - zz_s[ zz_i + threadIdx.x ] += zz_s[ zz_i + index ]; + index = 2 * (threadIdx.x + offset); + yz_zz_s[ 2 * threadIdx.x ] += yz_zz_s[ index ]; + yz_zz_s[ 2 * threadIdx.x + 1 ] += yz_zz_s[ index + 1 ]; } __syncthreads( ); } @@ -419,14 +403,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp, * of the reduction back to global memory */ if ( threadIdx.x == 0 ) { - t_g[ blockIdx.x * 6 + 4 ] = yz_s[ 0 ]; - t_g[ blockIdx.x * 6 + 5 ] = zz_s[ zz_i ]; + t_g[ blockIdx.x * 6 + 4 ] = yz_zz_s[ 0 ]; + t_g[ blockIdx.x * 6 + 5 ] = yz_zz_s[ 1 ]; } } CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, - real *block_results, int n ) + real *results, int n ) { extern __shared__ real M_s[]; unsigned int i, mask; @@ -464,13 +448,13 @@ CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *m if ( threadIdx.x == 0 ) { - block_results[blockIdx.x] = M_s[0]; + results[blockIdx.x] = M_s[0]; } } CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, - real *block_results, int n ) + real *results, int n ) { extern __shared__ real e_kin_s[]; unsigned int i, mask; @@ -515,7 +499,7 @@ CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_ato * of the reduction back to global memory */ if ( threadIdx.x == 0 ) { - block_results[blockIdx.x] = e_kin_s[0]; + results[blockIdx.x] = e_kin_s[0]; } } @@ -581,63 +565,66 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control, rvec *spad; cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, - sizeof(rvec) * (control->blocks_pow_2 + 1), + sizeof(rvec) * (control->blocks + 1), "Cuda_Compute_Momentum::workspace->scratch" ); spad = (rvec *) workspace->scratch; // xcm - cuda_memset( spad, 0, sizeof(rvec) * (control->blocks_pow_2 + 1), - "Cuda_Compute_Momentum::tmp" ); + cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1), + "Cuda_Compute_Momentum::spad" ); k_center_of_mass_blocks_xcm <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> + sizeof(rvec) * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); - k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>> - ( spad, &spad[control->blocks_pow_2], control->blocks ); + k_reduction_rvec <<< 1, control->blocks_pow_2, + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + ( spad, &spad[control->blocks], control->blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); - copy_host_device( xcm, &spad[control->blocks_pow_2], - sizeof(rvec), cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::xcm" ); + copy_host_device( xcm, &spad[control->blocks], sizeof(rvec), + cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::xcm" ); // vcm - cuda_memset( spad, 0, sizeof(rvec) * (control->blocks_pow_2 + 1), - "Cuda_Compute_Momentum::tmp" ); + cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1), + "Cuda_Compute_Momentum::spad" ); k_center_of_mass_blocks_vcm <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> + sizeof(rvec) * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); - k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>> - ( spad, &spad[control->blocks_pow_2], control->blocks ); + k_reduction_rvec <<< 1, control->blocks_pow_2, + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + ( spad, &spad[control->blocks], control->blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); - copy_host_device( vcm, &spad[control->blocks_pow_2], sizeof(rvec), + copy_host_device( vcm, &spad[control->blocks], sizeof(rvec), cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::vcm" ); // amcm - cuda_memset( spad, 0, sizeof (rvec) * (control->blocks_pow_2 + 1), - "Cuda_Compute_Momentum::tmp"); + cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1), + "Cuda_Compute_Momentum::spad"); k_center_of_mass_blocks_amcm <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> + sizeof(rvec) * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); - k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>> - ( spad, &spad[control->blocks_pow_2], control->blocks ); + k_reduction_rvec <<< 1, control->blocks_pow_2, + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> + ( spad, &spad[control->blocks], control->blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); - copy_host_device( amcm, &spad[control->blocks_pow_2], sizeof(rvec), - cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::amcm" ); + copy_host_device( amcm, &spad[control->blocks], sizeof(rvec), + cudaMemcpyDeviceToHost,"Cuda_Compute_Momentum::amcm" ); } @@ -647,28 +634,28 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c real *spad; cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, - sizeof(real) * 6 * (control->blocks_pow_2 + 1), + sizeof(real) * 6 * (control->blocks + 1), "Cuda_Compute_Inertial_Tensor::workspace->scratch" ); spad = (real *) workspace->scratch; - cuda_memset( spad, 0, sizeof(real) * 6 * (control->blocks_pow_2 + 1), + cuda_memset( spad, 0, sizeof(real) * 6 * (control->blocks + 1), "Cuda_Compute_Intertial_Tensor::tmp" ); k_compute_inertial_tensor_xx_xy <<< control->blocks, control->block_size, - sizeof(real) * 2 * control->block_size >>> + sizeof(real) * 2 * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, my_xcm[0], my_xcm[1], my_xcm[2], system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); k_compute_inertial_tensor_xz_yy <<< control->blocks, control->block_size, - sizeof(real) * 2 * control->block_size >>> + sizeof(real) * 2 * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, my_xcm[0], my_xcm[1], my_xcm[2], system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); k_compute_inertial_tensor_yz_zz <<< control->blocks, control->block_size, - sizeof(real) * 2 * control->block_size >>> + sizeof(real) * 2 * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad, my_xcm[0], my_xcm[1], my_xcm[2], system->n ); cudaDeviceSynchronize( ); @@ -677,11 +664,11 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c /* reduction of block-level partial sums for inertial tensor */ k_compute_inertial_tensor_blocks <<< 1, control->blocks_pow_2, sizeof(real) * 6 * control->blocks_pow_2 >>> - ( spad, &spad[control->blocks_pow_2 * 6], control->blocks ); + ( spad, &spad[6 * control->blocks], control->blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); - copy_host_device( t, &spad[6 * control->blocks_pow_2], + copy_host_device( t, &spad[6 * control->blocks], sizeof(real) * 6, cudaMemcpyDeviceToHost, "Cuda_Compute_Intertial_Tensor::t" ); } @@ -712,26 +699,27 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system, real *block_energy; cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, - sizeof(real) * (control->blocks_pow_2 + 1), + sizeof(real) * (control->blocks + 1), "Cuda_Compute_Kinetic_Energy::workspace->scratch" ); block_energy = (real *) workspace->scratch; - cuda_memset( block_energy, 0, sizeof(real) * (control->blocks_pow_2 + 1), + cuda_memset( block_energy, 0, sizeof(real) * (control->blocks + 1), "Cuda_Compute_Kinetic_Energy::tmp" ); data->my_en.e_kin = 0.0; k_compute_kinetic_energy <<< control->blocks, control->block_size, - sizeof(real) * control->block_size >>> + sizeof(real) * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); /* note: above kernel sums the kinetic energy contribution within blocks, * and this call finishes the global reduction across all blocks */ - Cuda_Reduction_Sum( block_energy, &block_energy[control->blocks_pow_2], control->blocks_pow_2 ); + Cuda_Reduction_Sum( block_energy, &block_energy[control->blocks], control->blocks ); - copy_host_device( &data->my_en.e_kin, &block_energy[control->blocks_pow_2], - sizeof(real), cudaMemcpyDeviceToHost, "Cuda_Compute_Kinetic_Energy::tmp" ); + copy_host_device( &data->my_en.e_kin, &block_energy[control->blocks], + sizeof(real), cudaMemcpyDeviceToHost, + "Cuda_Compute_Kinetic_Energy::tmp" ); ret = MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -754,24 +742,23 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control, real M_l, *spad_real; cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, - sizeof(real) * (control->blocks_pow_2 + 1), + sizeof(real) * (control->blocks + 1), "Cuda_Compute_Total_Mass::workspace->scratch" ); spad_real = (real *) workspace->scratch; - cuda_memset( spad_real, 0, sizeof(real) * (control->blocks_pow_2 + 1), + cuda_memset( spad_real, 0, sizeof(real) * (control->blocks + 1), "Cuda_Compute_Total_Mass::spad_real" ); k_compute_total_mass <<< control->blocks, control->block_size, - sizeof(real) * control->block_size >>> + sizeof(real) * (control->block_size / 32) >>> ( system->reax_param.d_sbp, system->d_my_atoms, spad_real, system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); /* note: above kernel sums the mass contribution within blocks, * and this call finishes the global reduction across all blocks */ - Cuda_Reduction_Sum( spad_real, &spad_real[control->blocks_pow_2], - control->blocks_pow_2 ); + Cuda_Reduction_Sum( spad_real, &spad_real[control->blocks], control->blocks ); - copy_host_device( &M_l, &spad_real[control->blocks_pow_2], sizeof(real), + copy_host_device( &M_l, &spad_real[control->blocks], sizeof(real), cudaMemcpyDeviceToHost, "total_mass:M_l" ); ret = MPI_Allreduce( &M_l, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -916,20 +903,21 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control, system->n ); k_reduction_rvec <<< control->blocks, control->block_size, - sizeof(rvec) * control->block_size >>> + sizeof(rvec) * (control->block_size / 32) >>> ( rvec_spad, &rvec_spad[system->n], system->n ); cudaDeviceSynchronize( ); cudaCheckError( ); k_reduction_rvec <<< 1, control->blocks_pow_2, - sizeof(rvec) * control->blocks_pow_2 >>> + sizeof(rvec) * (control->blocks_pow_2 / 32) >>> ( &rvec_spad[system->n], &rvec_spad[system->n + control->blocks], control->blocks ); cudaDeviceSynchronize( ); cudaCheckError( ); - copy_host_device( &int_press, &rvec_spad[system->n + control->blocks], sizeof(rvec), - cudaMemcpyDeviceToHost, "Cuda_Compute_Pressure::d_int_press" ); + copy_host_device( &int_press, &rvec_spad[system->n + control->blocks], + sizeof(rvec), cudaMemcpyDeviceToHost, + "Cuda_Compute_Pressure::int_press" ); } /* sum up internal and external pressure */ diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu index f7fdca090bd0b3e6bb30dadf7a3a2031dc009d25..e791194513d25293059f6c2769021734d3001de8 100644 --- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu @@ -454,7 +454,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); atomic_rvecAdd( pbond_ij->ta_f, force ); rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); + rvec_Add( data_ext_press[j], ext_press ); rvec_ScaledAdd( workspace.f[j], CEtors7 + CEconj4, p_ijk->dcos_dj ); @@ -471,18 +471,18 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); atomic_rvecAdd( pbond_jk->ta_f, force ); rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); + rvec_Add( data_ext_press[j], ext_press ); rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); rvec_Add( pbond_kl->ta_f, force ); rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press [j], ext_press ); + rvec_Add( data_ext_press[j], ext_press ); /* dcos_omega */ rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); atomic_rvecAdd( pbond_ij->ta_f, force ); rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); + rvec_Add( data_ext_press[j], ext_press ); rvec_ScaledAdd( workspace.f[j], CEtors9 + CEconj6, dcos_omega_dj ); @@ -490,12 +490,12 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); rvec_Add( pbond_jk->ta_f, force ); rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); + rvec_Add( data_ext_press[j], ext_press ); rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); rvec_Add( pbond_kl->ta_f, force ); rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press [j], ext_press ); + rvec_Add( data_ext_press[j], ext_press ); } #if defined(TEST_ENERGY)