diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index e85b4eaee10deabf763901c6b67358cde9a8eaf2..c81938c5e34f6a80096c32d42c33db38866dc0e4 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -577,13 +577,8 @@ void Cuda_Reallocate_Bonds_List( reax_list *bonds, size_t n, size_t num_intrs ) void Cuda_Reallocate_Thbodies_List( reax_list *thbodies, size_t n, size_t num_intrs ) { - /* delete three-body list */ Dev_Delete_List( thbodies ); -// Delete_List( thbodies ); - - /* recreate Three-body list */ Dev_Make_List( n, num_intrs, TYP_THREE_BODY, thbodies ); -// Make_List( n, num_intrs, TYP_THREE_BODY, thbodies ); } diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu index 1098d197ffdce8c39a51fe4ff5033eacd13beeb4..4912c42a410af43e364ccec4e746f6ce375ef27b 100644 --- a/PG-PuReMD/src/cuda/cuda_copy.cu +++ b/PG-PuReMD/src/cuda/cuda_copy.cu @@ -3,6 +3,7 @@ #include "cuda_utils.h" +#include "../list.h" #include "../vector.h" @@ -133,11 +134,11 @@ void Output_Sync_Lists( reax_list *host, reax_list *device, int type ) fprintf( stderr, " Trying to copy *%d* list from device to host \n", type ); #endif -// if ( host->allocated == TRUE ) -// { -// Delete_List( host ); -// } -// Make_List( device->n, device->num_intrs, type, host ); + if ( host->allocated == TRUE ) + { + Delete_List( host ); + } + Make_List( device->n, device->num_intrs, type, host ); copy_host_device( host->index, device->index, sizeof(int) * device->n, cudaMemcpyDeviceToHost, "Output_Sync_Lists::list->index" ); diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 553acbd06d26052379bd6a35817bd51683c4a1b9..71224dd05ced22112cd504fcd0d299bb67762612 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -432,8 +432,8 @@ CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj, real val, dif, base; LR_lookup_table *t; - tmin = MIN( ti, tj ); - tmax = MAX( ti, tj ); + tmin = MIN( ti, tj ); + tmax = MAX( ti, tj ); t = &( t_LR[ index_lr(tmin,tmax, num_atom_types) ] ); /* cubic spline interpolation */ @@ -452,149 +452,6 @@ CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj, } -CUDA_GLOBAL void k_estimate_sparse_matrix( reax_atom *my_atoms, - control_params *control, reax_list p_far_nbrs, int n, - int N, int renbr, int *indices ) -{ - int i, j, pj; - int start_i, end_i; - int flag; - real cutoff; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - reax_list *far_nbrs; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N ) - { - return; - } - - far_nbrs = &( p_far_nbrs ); - atom_i = &(my_atoms[i]); - start_i = Dev_Start_Index( i, far_nbrs ); - end_i = Dev_End_Index( i, far_nbrs ); - cutoff = control->nonb_cut; - - if ( i < n ) - { - indices[i]++; - } - - /* update i-j distance - check if j is within cutoff */ - for ( pj = start_i; pj < end_i; ++pj ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(my_atoms[j]); - - if ( renbr ) - { - if ( nbr_pj->d <= cutoff ) - { - flag = TRUE; - } - else - { - flag = FALSE; - } - } - else - { - nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0]; - nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1]; - nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2]; - - nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec ); - - if ( nbr_pj->d <= SQR(cutoff) ) - { - nbr_pj->d = SQRT(nbr_pj->d); - flag = TRUE; - } - else - { - flag = FALSE; - } - } - - if ( flag == TRUE ) - { - /* H matrix entry */ - if ( i < j && i < n && (j < n || atom_i->orig_id < atom_j->orig_id) ) - { - indices[i]++; - } - else if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id ) - { - indices[i]++; - } - else if ( i > j && i < n && (j < n || atom_j->orig_id < atom_i->orig_id) ) - { - indices[i]++; - } - } - } -} - - -int Cuda_Estimate_Storage_Sparse_Matrix( reax_system *system, control_params *control, - simulation_data *data, reax_list **lists ) -{ - int blocks, max_sp_entries, total_sp_entries; - int *indices = (int *) scratch; -// int *h_indices = (int *) host_scratch; - - cuda_memset( indices, 0, sizeof(int) * system->N, "scratch::sp_matrix::indices" ); - - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - //TODO - k_estimate_sparse_matrix <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, (control_params *)control->d_control_params, - *(*dev_lists + FAR_NBRS), system->n, system->N, - (data->step-data->prev_steps % control->reneighbor) == 0, indices ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - Cuda_Reduction_Max( indices, system->d_max_cm_entries, system->N ); - - copy_host_device( &max_sp_entries, system->d_max_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "d_max_sparse_entries" ); - - Cuda_Reduction_Sum( indices, system->d_cm_entries, system->N ); - - copy_host_device( &total_sp_entries, system->d_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "d_cm_sparse_entries" ); - -// copy_host_device( h_indices, indices, sizeof(int) * system->N, -// cudaMemcpyDeviceToHost, "sp_matrix:indices" ); -// max_sp_entries = 0; -// total_sp_entries = 0; -// for (int i = 0; i < system->N; i++) -// { -// total_sp_entries += h_indices[i]; -// if (max_sp_entries < h_indices[i]) -// { -// max_sp_entries = h_indices[i]; -// } -// } - -#if defined(DBEUG) - fprintf( stderr, " TOTAL DEVICE SPARSE ENTRIES: %d \n", - total_sp_entries ); - fprintf( stderr, "p%d: Max sparse entries -> %d \n", - system->my_rank, max_sp_entries ); -#endif - - system->total_cm_entries = max_sp_entries * SAFE_ZONE; - - return SUCCESS; -} - - CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters *sbp, control_params *control, reax_list hbonds, int N ) { @@ -915,7 +772,7 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp //num_bonds += 2; ++btop_i; - /* Need to do later... since i and j are parallel */ + /* TODO: Need to do later... since i and j are parallel */ // if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 ) // { // workspace->bond_mark[j] = workspace->bond_mark[i] + 1; @@ -1043,8 +900,6 @@ CUDA_GLOBAL void New_fix_sym_hbond_indices( reax_atom *my_atoms, reax_list hbond } -//////////////////////// -// HBOND ISSUE CUDA_GLOBAL void k_update_bonds( reax_atom *my_atoms, reax_list bonds, int n ) { int i; @@ -1056,8 +911,6 @@ CUDA_GLOBAL void k_update_bonds( reax_atom *my_atoms, reax_list bonds, int n ) return; } -// my_atoms[i].num_bonds = -// MAX( Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS ); my_atoms[i].num_bonds = Dev_Num_Entries( i, &bonds ); } @@ -1075,13 +928,8 @@ CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n ) } Hindex = my_atoms[i].Hindex; -// my_atoms[i].num_hbonds = -// MAX( Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS ); my_atoms[i].num_hbonds = Dev_Num_Entries( Hindex, &hbonds ); } -//////////////////////// -//////////////////////// -//////////////////////// int Cuda_Validate_Lists( reax_system *system, storage *workspace, @@ -1089,15 +937,8 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace, int step, int numH ) { int blocks; - int i, comp, Hindex, ret; - int *index, *end_index; - int *thbody; - reax_list *bonds, *hbonds; - reallocate_data *realloc; - int max_sp_entries; - int total_sp_entries; + int ret; - realloc = &( dev_workspace->realloc ); blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); ret = SUCCESS; @@ -1107,9 +948,6 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace, cudaThreadSynchronize( ); cudaCheckError( ); - //////////////////////// - // HBOND ISSUE - //FIX - 4 - Added this check for hydrogen bond issue if ( control->hbond_cut > 0.0 && system->numH > 0 ) { k_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>> @@ -1118,68 +956,6 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace, cudaCheckError( ); } - /* validate charge matrix */ - memset( host_scratch, 0, 2 * system->N * sizeof(int) ); - index = (int *) host_scratch; - end_index = index + system->N; - copy_host_device( index, dev_workspace->H.start, system->N * sizeof(int), - cudaMemcpyDeviceToHost, "sparse_matrix:start" ); - copy_host_device( end_index, dev_workspace->H.end, system->N * sizeof(int), - cudaMemcpyDeviceToHost, "sparse_matrix:end" ); - - max_sp_entries = 0; - total_sp_entries = 0; - -// for (i = 0; i < system->N; i++ ) -// { -// //if (i < system->N-1) -// // comp = index [i+1]; -// //else -// // comp = dev_workspace->H.m; -// -// total_sp_entries += end_index[i] - index[i]; -// if ( end_index[i] - index[i] > system->max_sparse_entries ) -// { -// fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n", -// step, i, index[i], end_index[i] ); -// ret = FAILURE; -// } -// else if ( end_index[i] >= dev_workspace->H.m ) -// { -// //SUDHIR_FIX_SPARSE_MATRIX -// //TODO move this carver -// fprintf( stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", -// system->my_rank, step, i, index[i], end_index[i] ); -// //TODO move this carver -// ret = FAILURE; -// } -// else -// { -// if ( max_sp_entries <= end_index[i] - index[i] ) -// { -// max_sp_entries = end_index[i] - index[i]; -// } -// } -// } - //if (max_sp_entries <= end_index[i] - index[i]) - // max_sp_entries = end_index[i] - index[i]; - - //update the current step max_sp_entries; - realloc->Htop = max_sp_entries; - -#if defined(DEBUG) - fprintf( stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", - system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries ); -#endif - - if ( total_sp_entries >= dev_workspace->H.m ) - { - fprintf( stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", - system->my_rank, step, dev_workspace->H.m, total_sp_entries ); - - ret = FAILURE; - } - /* 3bodies list: since a more accurate estimate of the num. * of three body interactions requires that bond orders have * been computed, delay validation until for computation */ @@ -1207,7 +983,7 @@ CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n ) } -void Print_Forces( reax_system *system ) +static void Print_Forces( reax_system *system ) { int blocks; @@ -1343,9 +1119,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); - /////////////////////// - /////////////////////// - // FIX - 4 - HBOND ISSUE if ( control->hbond_cut > 0 && system->numH > 0 ) { /* make hbond_list symmetric */ @@ -1358,7 +1131,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, cudaCheckError( ); } -// /* update bond_mark */ + /* update bond_mark */ // k_bond_mark <<< blocks, DEF_BLOCK_SIZE >>> // k_bond_mark <<< 1, 1 >>> // ( *(*dev_lists + BONDS), *dev_workspace, system->N ); @@ -1378,7 +1151,7 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { - //TODO: Implement later when you figure out the bond_mark usage. + //TODO: implement later when figure out bond_mark usage return FAILURE; } @@ -1387,13 +1160,15 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { - int i, hbs, hnbrs_blocks, ret; + int i, hbs, hnbrs_blocks, update_energy, ret; int *thbody; static int compute_bonded_part1 = FALSE; real t_start, t_elapsed; real *spad = (real *) scratch; rvec *rvec_spad; + update_energy = (out_control->energy_update_freq > 0 + && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; ret = SUCCESS; if ( compute_bonded_part1 == FALSE ) @@ -1406,7 +1181,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, BLOCKS_N, BLOCK_SIZE ); #endif - Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>> + Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_sbp, *dev_workspace, system->N ); cudaThreadSynchronize( ); @@ -1420,14 +1195,14 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); - Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>> - (*dev_workspace, *(*dev_lists + BONDS), system->N); + Cuda_Update_Uncorrected_BO <<< BLOCKS_N, BLOCK_SIZE >>> + ( *dev_workspace, *(*dev_lists + BONDS), system->N ); cudaThreadSynchronize( ); cudaCheckError( ); - Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>> - (system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, - *dev_workspace, system->N); + Cuda_Update_Workspace_After_BO <<< BLOCKS_N, BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, + *dev_workspace, system->N ); cudaThreadSynchronize( ); cudaCheckError( ); @@ -1451,8 +1226,11 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaCheckError( ); /* reduction for E_BE */ - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, + system->n ); + } t_elapsed = Get_Timing_Info( t_start ); @@ -1466,36 +1244,43 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, t_start = Get_Time( ); cuda_memset( spad, 0, ( 6 * sizeof(real) * system->n ), "scratch" ); - Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, - system->reax_param.d_sbp, system->reax_param.d_tbp, - *dev_workspace, - *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, - spad, spad + 2 * system->n, spad + 4 * system->n); + Cuda_Atom_Energy <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_gp, + system->reax_param.d_sbp, system->reax_param.d_tbp, *dev_workspace, + *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, + spad, spad + 2 * system->n, spad + 4 * system->n); cudaThreadSynchronize( ); cudaCheckError( ); - //CHANGE ORIGINAL -// Cuda_Atom_Energy_PostProcess <<<BLOCKS, BLOCK_SIZE >>> +// Cuda_Atom_Energy_PostProcess <<< BLOCKS, BLOCK_SIZE >>> // ( *(*dev_lists + BONDS), *dev_workspace, system->n ); - Cuda_Atom_Energy_PostProcess <<<BLOCKS_N, BLOCK_SIZE >>> + Cuda_Atom_Energy_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>> ( *(*dev_lists + BONDS), *dev_workspace, system->N ); - //CHANGE ORIGINAL cudaThreadSynchronize( ); cudaCheckError( ); /* reduction for E_Lp */ - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, + system->n ); + } /* reduction for E_Ov */ - Cuda_Reduction_Sum( spad + 2 * system->n, - &((simulation_data *)data->d_simulation_data)->my_en.e_ov, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad + 2 * system->n, + &((simulation_data *)data->d_simulation_data)->my_en.e_ov, + system->n ); + } /* reduction for E_Un */ - Cuda_Reduction_Sum( spad + 4 * system->n, - &((simulation_data *)data->d_simulation_data)->my_en.e_ov, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad + 4 * system->n, + &((simulation_data *)data->d_simulation_data)->my_en.e_ov, + system->n ); + } t_elapsed = Get_Timing_Info( t_start ); @@ -1534,23 +1319,32 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, (control_params *)control->d_control_params, *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES), system->n, system->N, system->reax_param.num_atom_types, - spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N)); + spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N) ); cudaThreadSynchronize( ); cudaCheckError( ); /* reduction for E_Ang */ - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, - system->N ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, + system->N ); + } /* reduction for E_Pen */ - Cuda_Reduction_Sum( spad + 2 * system->N, - &((simulation_data *)data->d_simulation_data)->my_en.e_pen, - system->N ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad + 2 * system->N, + &((simulation_data *)data->d_simulation_data)->my_en.e_pen, + system->N ); + } /* reduction for E_Coa */ - Cuda_Reduction_Sum( spad + 4 * system->N, - &((simulation_data *)data->d_simulation_data)->my_en.e_coa, - system->N ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad + 4 * system->N, + &((simulation_data *)data->d_simulation_data)->my_en.e_coa, + system->N ); + } /* reduction for ext_pres */ rvec_spad = (rvec *) (spad + 6 * system->N); @@ -1596,24 +1390,30 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaCheckError( ); /* reduction for E_Tor */ - Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, + system->n ); + } /* reduction for E_Con */ - Cuda_Reduction_Sum( spad + 2 * system->n, - &((simulation_data *)data->d_simulation_data)->my_en.e_con, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad + 2 * system->n, + &((simulation_data *)data->d_simulation_data)->my_en.e_con, + system->n ); + } /* reduction for ext_pres */ rvec_spad = (rvec *) (spad + 4 * system->n); - k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->n, system->n); + k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>> + ( rvec_spad, rvec_spad + system->n, system->n ); cudaThreadSynchronize( ); cudaCheckError( ); - k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>> + k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>> ( rvec_spad + system->n, - &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS ); + &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS ); cudaThreadSynchronize( ); cudaCheckError( ); // Cuda_Reduction_Sum( rvec_spad, @@ -1635,7 +1435,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, #endif /* 6. Hydrogen Bonds Interactions */ - // FIX - 4 - Added additional check here if ( control->hbond_cut > 0.0 && system->numH > 0 ) { t_start = Get_Time( ); @@ -1658,18 +1457,21 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaCheckError( ); /* reduction for E_HB */ - Cuda_Reduction_Sum( spad, - &((simulation_data *)data->d_simulation_data)->my_en.e_hb, - system->n ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad, + &((simulation_data *)data->d_simulation_data)->my_en.e_hb, + system->n ); + } /* reduction for ext_pres */ rvec_spad = (rvec *) (spad + 2 * system->n); - k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>> + k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>> (rvec_spad, rvec_spad + system->n, system->n); cudaThreadSynchronize( ); cudaCheckError( ); - k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>> + k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>> (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS); cudaThreadSynchronize( ); cudaCheckError( ); @@ -1679,8 +1481,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, /* post process step1 */ Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof(rvec) >>> - ( system->d_my_atoms, *dev_workspace, - *(*dev_lists + BONDS), system->N ); + ( system->d_my_atoms, *dev_workspace, + *(*dev_lists + BONDS), system->N ); cudaThreadSynchronize( ); cudaCheckError( ); @@ -1788,39 +1590,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, if ( charge_flag == TRUE ) { retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control ); - -// int i; -// static reax_list **temp_lists; -// -// if ( data->step == 0 ) -// { -// temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" ); -// for ( i = 0; i < LIST_N; ++i ) -// { -// temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" ); -// temp_lists[i]->allocated = FALSE; -// } -// Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs, -// TYP_BOND, *temp_lists + BONDS ); -// Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs, -// TYP_HBOND, *temp_lists + HBONDS ); -// } -// else -// { -// Delete_List( *temp_lists + BONDS ); -// Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs, -// TYP_BOND, *temp_lists + BONDS ); -// Delete_List( *temp_lists + HBONDS ); -// Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs, -// TYP_HBOND, *temp_lists + HBONDS ); -// -// } -// Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND ); -// Print_Bonds( system, temp_lists, control ); -// Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND ); -// Print_HBonds( system, temp_lists, control, data->step ); -// Print_HBond_Indices( system, temp_lists, control, data->step ); -// exit( 0 ); } else { @@ -1870,7 +1639,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, //MPI_Barrier( MPI_COMM_WORLD ); if ( system->my_rank == MASTER_NODE ) { - Update_Timing_Info( &t_start, &(data->timing.qEq) ); + Update_Timing_Info( &t_start, &(data->timing.cm) ); } #endif @@ -1914,8 +1683,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, MPI_Barrier( MPI_COMM_WORLD ); #endif - -// Print_Forces( system ); } return retVal; diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index 2bb2f89e7b76315462927b659c00efe8102b56c0..3162e7326b03fa9165772300bb4d926c900016f4 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -252,14 +252,10 @@ int Cuda_Init_Lists( reax_system *system, control_params *control, Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists ); - /* estimate storage for bonds and hbonds */ + /* estimate storage for bonds, hbonds, and sparse matrix */ Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step ); - /* estimate storage for charge sparse matrix */ -// Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists ); - dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries ); - Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) ); //MATRIX CHANGES @@ -277,7 +273,6 @@ int Cuda_Init_Lists( reax_system *system, control_params *control, if ( control->hbond_cut > 0.0 && system->numH > 0 ) { Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS ); -// Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS ); Cuda_Init_HBond_Indices( system ); @@ -290,7 +285,6 @@ int Cuda_Init_Lists( reax_system *system, control_params *control, /* bonds list */ Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS ); -// Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS ); Cuda_Init_Bond_Indices( system ); diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu index 25c0b17df74448775e5e37a61db25c151169e056..473ef77e6c1259eed6e7fd204fc79dcb36784b6d 100644 --- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu @@ -30,13 +30,12 @@ #include "../vector.h" -//CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy( -CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, +//CUDA_GLOBAL void __launch_bounds__ (960) k_vdW_coulomb_energy( +CUDA_GLOBAL void k_vdW_coulomb_energy( reax_atom *my_atoms, two_body_parameters *tbp, global_parameters gp, control_params *control, storage p_workspace, reax_list p_far_nbrs, int n, int N, int num_atom_types, real *data_e_vdW, real *data_e_ele, rvec *data_ext_press ) { - #if defined(__SM_35__) real sh_vdw; real sh_ele; @@ -44,13 +43,11 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, #else extern __shared__ real _vdw[]; extern __shared__ real _ele[]; - extern __shared__ rvec _force []; - + extern __shared__ rvec _force[]; real *sh_vdw; real *sh_ele; rvec *sh_force; #endif - int i, j, pj, natoms; int start_i, end_i, orig_i, orig_j; real p_vdW1, p_vdW1i; @@ -64,12 +61,13 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, far_neighbor_data *nbr_pj; reax_list *far_nbrs; storage *workspace = &( p_workspace ); - // rtensor temp_rtensor, total_rtensor; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warpid = thread_id / VDW_KER_THREADS_PER_ATOM; - int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); + int thread_id; + int warpid; + int laneid; + thread_id = blockIdx.x * blockDim.x + threadIdx.x; + warpid = thread_id / VDW_KER_THREADS_PER_ATOM; + laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); #if defined(__SM_35__) sh_vdw = 0.0; sh_ele = 0.0; @@ -83,12 +81,11 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, sh_ele[threadIdx.x] = 0.0; rvec_MakeZero ( sh_force [threadIdx.x] ); #endif - //i = blockIdx.x * blockDim.x + threadIdx.x; //if (i >= N) return; i = warpid; - if (i < N) + if ( i < N ) { natoms = n; far_nbrs = &( p_far_nbrs ); @@ -97,18 +94,18 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, e_core = 0; e_vdW = 0; - data_e_vdW [i] = 0; - data_e_ele [i] = 0; + data_e_vdW[i] = 0; + data_e_ele[i] = 0; //for( i = 0; i < natoms; ++i ) { start_i = Dev_Start_Index(i, far_nbrs); - end_i = Dev_End_Index(i, far_nbrs); - orig_i = my_atoms[i].orig_id; + end_i = Dev_End_Index(i, far_nbrs); + orig_i = my_atoms[i].orig_id; //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i ); //for( pj = start_i; pj < end_i; ++pj ) pj = start_i + laneid; - while (pj < end_i) + while ( pj < end_i ) { nbr_pj = &(far_nbrs->select.far_nbr_list[pj]); @@ -133,18 +130,18 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, Tap = Tap * r_ij + workspace->Tap[1]; Tap = Tap * r_ij + workspace->Tap[0]; - dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6]; + dTap = 7 * workspace->Tap[7] * r_ij + 6 * workspace->Tap[6]; dTap = dTap * r_ij + 5*workspace->Tap[5]; dTap = dTap * r_ij + 4*workspace->Tap[4]; dTap = dTap * r_ij + 3*workspace->Tap[3]; dTap = dTap * r_ij + 2*workspace->Tap[2]; - dTap += workspace->Tap[1]/r_ij; + dTap += workspace->Tap[1] / r_ij; - /*vdWaals Calculations*/ - if(gp.vdw_type==1 || gp.vdw_type==3) - { // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + /* shielding vdWaals Calculations */ + if ( gp.vdw_type == 1 || gp.vdw_type == 3 ) + { + powr_vdW1 = POW( r_ij, p_vdW1 ); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1 ); fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); @@ -152,44 +149,47 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, e_vdW = twbp->D * (exp1 - 2.0 * exp2); - //data_e_vdW [i] += Tap * e_vdW; - // data_e_vdW [i] += Tap * e_vdW / 2.0; + //data_e_vdW[i] += Tap * e_vdW; + //data_e_vdW[i] += Tap * e_vdW / 2.0; #if defined(__SM_35__) - sh_vdw += Tap * e_vdW / 2.0; + sh_vdw += Tap * e_vdW / 2.0; #else - sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0; + sh_vdw[threadIdx.x] += Tap * e_vdW / 2.0; #endif - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW( r_ij, p_vdW1 - 2.0 ); CEvd = dTap * e_vdW - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; } - else{ // no shielding + /* no shielding */ + else + { exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); e_vdW = twbp->D * (exp1 - 2.0 * exp2); - //data_e_vdW [i] += Tap * e_vdW; - //data_e_vdW [i] += Tap * e_vdW / 2.0; + //data_e_vdW[i] += Tap * e_vdW; + //data_e_vdW[i] += Tap * e_vdW / 2.0; #if defined(__SM_35__) sh_vdw += Tap * e_vdW / 2.0; #else - sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0; + sh_vdw[threadIdx.x] += Tap * e_vdW / 2.0; #endif CEvd = dTap * e_vdW - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); } - if(gp.vdw_type==2 || gp.vdw_type==3) - { // innner wall + /* inner wall */ + if ( gp.vdw_type == 2 || gp.vdw_type == 3 ) + { e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - //data_e_vdW [i] += Tap * e_core; - //data_e_vdW [i] += Tap * e_core / 2.0; + //data_e_vdW[i] += Tap * e_core; + //data_e_vdW[i] += Tap * e_core / 2.0; #if defined(__SM_35__) sh_vdw += Tap * e_core / 2.0; #else @@ -201,46 +201,53 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, } /*Coulomb Calculations*/ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 ); + dr3gamij_1 = r_ij * r_ij * r_ij + twbp->gamma; + dr3gamij_3 = POW( dr3gamij_1, 1.0 / 3.0 ); tmp = Tap / dr3gamij_3; - //data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; + //data_e_ele[i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; - //data_e_ele [i] += e_ele; - //data_e_ele [i] += e_ele / 2.0; + //data_e_ele[i] += e_ele; + //data_e_ele[i] += e_ele / 2.0; #if defined(__SM_35__) sh_ele += e_ele / 2.0; #else - sh_ele [ threadIdx.x ] += e_ele / 2.0; + sh_ele[ threadIdx.x ] += e_ele / 2.0; #endif - CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; + // fprintf( fout, "%5d %5d %10.6f %10.6f\n", // MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), // MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), // CEvd, CEclmb ); - if( control->virial == 0 ) { + if ( control->virial == 0 ) + { if ( i < j ) + { //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); #if defined (__SM_35__) rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec ); #else rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec ); #endif + } else + { //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec ); #if defined (__SM_35__) rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec ); #else - rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force[ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec ); #endif //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec ); + } } - else { /* NPT, iNPT or sNPT */ + /* NPT, iNPT or sNPT */ + else + { /* for pressure coupling, terms not related to bond order derivatives are added directly into pressure vector/tensor */ rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); @@ -274,6 +281,7 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, r_ij, system->my_atoms[i].q, system->my_atoms[j].q, e_ele, data->my_en.e_ele ); #endif + #ifdef TEST_FORCES rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); @@ -289,7 +297,8 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, } // if i < N #if defined( __SM_35__) - for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){ + for ( int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2 ) + { sh_vdw += shfl( sh_vdw, x); sh_ele += shfl( sh_ele, x ); sh_force[0] += shfl( sh_force[0], x ); @@ -297,57 +306,64 @@ CUDA_GLOBAL void ker_vdW_coulomb_energy( reax_atom *my_atoms, sh_force[2] += shfl( sh_force[2], x ); } - if (laneid == 0) { + if ( laneid == 0 ) + { data_e_vdW[i] += sh_vdw; data_e_ele[i] += sh_ele; - rvec_Add (workspace->f[i], sh_force ); + rvec_Add( workspace->f[i], sh_force ); } #else - __syncthreads (); + __syncthreads( ); - if (laneid < 16) { + if (laneid < 16) + { sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 16] ); } - __syncthreads (); - if (laneid < 8) { + __syncthreads( ); + if (laneid < 8) + { sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 8] ); } - __syncthreads (); - if (laneid < 4) { + __syncthreads( ); + if (laneid < 4) + { sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 4] ); } - __syncthreads (); - if (laneid < 2) { + __syncthreads( ); + if (laneid < 2) + { sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 2] ); } - __syncthreads (); - if (laneid < 1) { + __syncthreads( ); + if (laneid < 1) + { sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + rvec_Add( sh_force[threadIdx.x], sh_force[threadIdx.x + 1] ); } - __syncthreads (); - if (laneid == 0) { + __syncthreads( ); + if (laneid == 0) + { data_e_vdW[i] += sh_vdw[threadIdx.x]; data_e_ele[i] += sh_ele[threadIdx.x]; - rvec_Add (workspace->f[i], sh_force [ threadIdx.x ]); + rvec_Add( workspace->f[i], sh_force[ threadIdx.x ] ); } #endif } -CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, +CUDA_GLOBAL void k_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, global_parameters gp, control_params *control, storage p_workspace, reax_list p_far_nbrs, LR_lookup_table *t_LR, int n, int N, int num_atom_types, @@ -364,27 +380,30 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, far_neighbor_data *nbr_pj; reax_list *far_nbrs; LR_lookup_table *t; + storage *workspace; - storage *workspace = &( p_workspace ); + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) + { + return; + } + + workspace = &( p_workspace ); natoms = n; far_nbrs = &( p_far_nbrs ); steps = step - prev_steps; update_freq = energy_update_freq; update_energies = update_freq > 0 && steps % update_freq == 0; e_ele = e_vdW = 0; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - data_e_vdW [i] = 0; - data_e_ele [i] = 0; + data_e_vdW[i] = 0; + data_e_ele[i] = 0; //for( i = 0; i < natoms; ++i ) { - type_i = my_atoms[i].type; + type_i = my_atoms[i].type; start_i = Dev_Start_Index(i,far_nbrs); - end_i = Dev_End_Index(i,far_nbrs); - orig_i = my_atoms[i].orig_id; + end_i = Dev_End_Index(i,far_nbrs); + orig_i = my_atoms[i].orig_id; for( pj = start_i; pj < end_i; ++pj ) { @@ -404,18 +423,22 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, tmin = MIN( type_i, type_j ); tmax = MAX( type_i, type_j ); - t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]); + t = &( t_LR[ index_lr(tmin, tmax, num_atom_types) ]); - // table = &( LR[type_i][type_j] ); + //table = &( LR[type_i][type_j] ); /* Cubic Spline Interpolation */ r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; + if( r == 0 ) + { + ++r; + } base = (real)(r+1) * t->dx; dif = r_ij - base; //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); - if( update_energies ) { + if ( update_energies ) + { e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + t->vdW[r].a; @@ -423,28 +446,35 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, t->ele[r].a; e_ele *= my_atoms[i].q * my_atoms[j].q; - //data_e_vdW [i] += e_vdW; - data_e_vdW [i] += e_vdW / 2.0; - //data_e_ele [i] += e_ele; - data_e_ele [i] += e_ele / 2.0; + //data_e_vdW[i] += e_vdW; + data_e_vdW[i] += e_vdW / 2.0; + //data_e_ele[i] += e_ele; + data_e_ele[i] += e_ele / 2.0; } - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + CEvd = ((t->CEvd[r].d * dif + t->CEvd[r].c) * dif + t->CEvd[r].b) * dif + t->CEvd[r].a; - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + CEclmb = ((t->CEclmb[r].d * dif + t->CEclmb[r].c) * dif + t->CEclmb[r].b) * dif + t->CEclmb[r].a; CEclmb *= my_atoms[i].q * my_atoms[j].q; - if( control->virial == 0 ) { + if( control->virial == 0 ) + { if ( i < j ) + { rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); + } else + { rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec ); + } //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec ); } - else { // NPT, iNPT or sNPT + /* NPT, iNPT or sNPT */ + else + { /* for pressure coupling, terms not related to bond order derivatives are added directly into pressure vector/tensor */ rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); @@ -467,6 +497,7 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, r_ij, system->my_atoms[i].q, system->my_atoms[j].q, e_ele, data->my_en.e_ele ); #endif + #ifdef TEST_FORCES rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); @@ -479,25 +510,25 @@ CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, } -CUDA_GLOBAL void ker_pol_energy( reax_atom *my_atoms, +CUDA_GLOBAL void k_pol_energy( reax_atom *my_atoms, single_body_parameters *sbp, int n, real *data_e_pol ) { - int type_i; + int i, type_i; real q; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n) return; + i = blockIdx.x * blockDim.x + threadIdx.x; - data_e_pol [i] = 0; + if ( i >= n ) + { + return; + } - //for( i = 0; i < system->n; i++ ) { q = my_atoms[i].q; type_i = my_atoms[i].type; - data_e_pol[i] += + data_e_pol[i] = KCALpMOL_to_EV * (sbp[type_i].chi * q + (sbp[type_i].eta / 2.) * SQR(q)); - //} } @@ -505,17 +536,18 @@ void Cuda_Compute_Polarization_Energy( reax_system *system, simulation_data *dat { int blocks; real *spad = (real *) scratch; + cuda_memset( spad, 0, sizeof(real) * 2 * system->n, "pol_energy" ); blocks = system->n / DEF_BLOCK_SIZE + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>> + + k_pol_energy <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->n, spad ); cudaThreadSynchronize( ); cudaCheckError( ); - /* reduction for polarization energy */ Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_pol, system->n ); @@ -526,22 +558,23 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, storage *workspace, simulation_data *data, reax_list **lists, output_controls *out_control, bool isTabulated ) { - int blocks; - int rblocks; + int blocks, rblocks, update_energy; int size = (2 * system->N + 2 * system->N ) * sizeof(real) + 2 * system->N * sizeof(rvec); - rvec *spad_rvec; real *spad = (real *) scratch; - cuda_memset (spad, 0, size, "pol_energy"); + update_energy = (out_control->energy_update_freq > 0 + && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1); - if (!isTabulated) + cuda_memset( spad, 0, size, "pol_energy" ); + + if ( !isTabulated ) { - ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>> + k_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>> ( system->d_my_atoms, system->reax_param.d_tbp, system->reax_param.d_gp, (control_params *)control->d_control_params, *(dev_workspace), *(*dev_lists + FAR_NBRS), @@ -552,7 +585,7 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, } else { - ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>> + k_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_gp, (control_params *)control->d_control_params, *(dev_workspace), *(*dev_lists + FAR_NBRS), @@ -567,14 +600,20 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, } /* reduction for vdw */ - Cuda_Reduction_Sum( spad, - &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, - system->N ); + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad, + &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, + system->N ); + } - /* reduction for ele */ - Cuda_Reduction_Sum( spad + 2 * system->N, - &((simulation_data *)data->d_simulation_data)->my_en.e_ele, - system->N ); + /* reduction for ele */ + if ( update_energy == TRUE ) + { + Cuda_Reduction_Sum( spad + 2 * system->N, + &((simulation_data *)data->d_simulation_data)->my_en.e_ele, + system->N ); + } /* reduction for ext_press */ spad_rvec = (rvec *) (spad + 4 * system->N); @@ -588,5 +627,8 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); - Cuda_Compute_Polarization_Energy( system, data ); + if ( update_energy == TRUE ) + { + Cuda_Compute_Polarization_Energy( system, data ); + } } diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h index bfc4256d1408e3a2947714d3adde62899b144f55..f4e49a10b65429f0910f5e374a2f15790ff0c89b 100644 --- a/PG-PuReMD/src/cuda/cuda_utils.h +++ b/PG-PuReMD/src/cuda/cuda_utils.h @@ -43,7 +43,7 @@ static inline void __cudaCheckError( const char *file, const int line ) exit( RUNTIME_ERROR ); } -#if defined(DEBUG) +#if defined(DEBUG_FOCUS) /* More careful checking. However, this will affect performance. */ err = cudaDeviceSynchronize( ); if( cudaSuccess != err ) diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c index 19133fcee9aaeeb9dcca9e736a1859920a3acd19..f210c3b4ba45ec46f6bdbc5d4a2f23e24629a02d 100644 --- a/PG-PuReMD/src/forces.c +++ b/PG-PuReMD/src/forces.c @@ -1751,7 +1751,7 @@ int Compute_Forces( reax_system *system, control_params *control, //MPI_Barrier( MPI_COMM_WORLD ); if ( system->my_rank == MASTER_NODE ) { - Update_Timing_Info( &t_start, &(data->timing.qEq) ); + Update_Timing_Info( &t_start, &(data->timing.cm) ); } #endif #if defined(DEBUG_FOCUS) diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c index 131f8a2e955fefd4fec111f5b5e6c1b1ad4f2de7..91783d67ceb452962c64576ea2b424a1445ba539 100644 --- a/PG-PuReMD/src/io_tools.c +++ b/PG-PuReMD/src/io_tools.c @@ -117,9 +117,9 @@ int Init_Output_Files( reax_system *system, control_params *control, sprintf( temp, "%s.log", control->sim_name ); if ( (out_control->log = fopen( temp, "w" )) != NULL ) { - fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s\n", + fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n", "step", "total", "comm", "nbrs", "init", "bonded", "nonb", - "qeq", "matvecs" ); + "charges", "l iters", "retries" ); fflush( out_control->log ); } else @@ -1156,9 +1156,9 @@ void Print_Bond_List2( reax_system *system, reax_list *bonds, char *fname ) void Print_Total_Force( reax_system *system, simulation_data *data, - storage *workspace ) + storage *workspace ) { - int i; + int i; fprintf( stderr, "step: %d\n", data->step ); fprintf( stderr, "%6s\t%-38s\n", "atom", "atom.f[0,1,2]"); @@ -1179,10 +1179,10 @@ void Output_Results( reax_system *system, control_params *control, real t_elapsed, denom; #endif - if ((out_control->energy_update_freq > 0 && + if ( (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) || (out_control->write_steps > 0 && - data->step % out_control->write_steps == 0)) + data->step % out_control->write_steps == 0) ) { /* update system-wide energies */ Compute_System_Energy( system, data, MPI_COMM_WORLD ); @@ -1192,7 +1192,7 @@ void Output_Results( reax_system *system, control_params *control, out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0 ) { -#if defined(DEBUG) && defined(DEBUG_FOCUS) +#if !defined(DEBUG) && !defined(DEBUG_FOCUS) fprintf( out_control->out, "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n", data->step, data->sys_en.e_tot, data->sys_en.e_pot, @@ -1229,19 +1229,21 @@ void Output_Results( reax_system *system, control_params *control, #if defined(LOG_PERFORMANCE) t_elapsed = Get_Timing_Info( data->timing.total ); if ( data->step - data->prev_steps > 0 ) + { denom = 1.0 / out_control->energy_update_freq; - else denom = 1; + } + else + { + denom = 1; + } - fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%6d\n", - data->step, - t_elapsed * denom, - data->timing.comm * denom, - data->timing.nbrs * denom, - data->timing.init_forces * denom, - data->timing.bonded * denom, - data->timing.nonb * denom, - data->timing.qEq * denom, - (int)((data->timing.s_matvecs + data->timing.t_matvecs)*denom) ); + fprintf( out_control->log, "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n", + data->step, t_elapsed * denom, data->timing.comm * denom, + data->timing.nbrs * denom, data->timing.init_forces * denom, + data->timing.bonded * denom, data->timing.nonb * denom, + data->timing.cm * denom, + (int)((data->timing.s_matvecs + data->timing.t_matvecs) * denom), + data->timing.num_retries ); Reset_Timing( &(data->timing) ); fflush( out_control->log ); diff --git a/PG-PuReMD/src/list.h b/PG-PuReMD/src/list.h index df6ec82f24b9002f51e0b93a6461a0d7ca14bbeb..a6a82d204a9c6409553cbd8a4e4ab3a7992959a0 100644 --- a/PG-PuReMD/src/list.h +++ b/PG-PuReMD/src/list.h @@ -25,7 +25,7 @@ #include "reax_types.h" -#ifdef _cplusplus +#ifdef __cplusplus extern "C" { #endif @@ -35,7 +35,7 @@ void Make_List( int, int, int, reax_list* ); void Delete_List( reax_list* ); -#ifdef _cplusplus +#ifdef __cplusplus } #endif diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index 30c2372227d3ce6d7fe3e81e0e4789f1fe61537a..304bb1f6e1825d0953c68014c7125427564b8a9a 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -109,7 +109,7 @@ void Post_Evolve( reax_system* system, control_params* control, int i; rvec diff, cross; - /* remove trans & rot velocity of the center of mass from system */ + /* remove translational and rotational velocity of the center of mass from system */ if ( control->ensemble != NVE && control->remove_CoM_vel && data->step % control->remove_CoM_vel == 0 ) { @@ -118,17 +118,17 @@ void Post_Evolve( reax_system* system, control_params* control, for ( i = 0; i < system->n; i++ ) { - /* remove translational vel */ + /* remove translational term */ rvec_ScaledAdd( system->my_atoms[i].v, -1., data->vcm ); - /* remove rotational */ + /* remove rotational term */ rvec_ScaledSum( diff, 1., system->my_atoms[i].x, -1., data->xcm ); rvec_Cross( cross, data->avcm, diff ); rvec_ScaledAdd( system->my_atoms[i].v, -1., cross ); } } - /* compute kinetic energy of the system */ + /* compute kinetic energy of system */ Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); } @@ -183,27 +183,6 @@ int main( int argc, char* argv[] ) #ifdef HAVE_CUDA - /* Remove this debug information later */ -#if defined(__CUDA_DEBUG_LOG__) - fprintf( stderr, " Size of LR Lookup table %d \n", sizeof(LR_lookup_table) ); -#endif - -#if defined( __SM_35__) - fprintf( stderr, " nbrs block size: %d \n", NBRS_BLOCK_SIZE ); - fprintf( stderr, " nbrs threads per atom: %d \n", NB_KER_THREADS_PER_ATOM ); - - fprintf( stderr, " hbonds block size: %d \n", HB_BLOCK_SIZE ); - fprintf( stderr, " hbonds threads per atom: %d \n", HB_KER_THREADS_PER_ATOM ); - - fprintf( stderr, " vdw block size: %d \n", VDW_BLOCK_SIZE ); - fprintf( stderr, " vdw threads per atom: %d \n", VDW_KER_THREADS_PER_ATOM ); - - fprintf( stderr, " matvec block size: %d \n", MATVEC_BLOCK_SIZE ); - fprintf( stderr, " matvec threads per atom: %d \n", MATVEC_KER_THREADS_PER_ROW); - - fprintf( stderr, " General block size: %d \n", DEF_BLOCK_SIZE ); -#endif - /* allocate main data structures */ system = (reax_system *) smalloc( sizeof(reax_system), "system" ); control = (control_params *) smalloc( sizeof(control_params), "control" ); @@ -249,17 +228,11 @@ int main( int argc, char* argv[] ) Read_System( argv[1], argv[2], argv[3], system, control, data, workspace, out_control, mpi_data ); -#if defined(DEBUG) - fprintf( stderr, "p%d: read simulation info\n", system->my_rank ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - /* setup the CUDA Device for this process */ Setup_Cuda_Environment( system->my_rank, control->nprocs, control->gpus_per_node ); #if defined(DEBUG) print_device_mem_usage( ); - fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id); #endif /* init the blocks sizes for cuda kernels */ @@ -278,67 +251,31 @@ int main( int argc, char* argv[] ) Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); #endif -//#if defined(DEBUG) +#if defined(DEBUG) print_device_mem_usage( ); -//#endif +#endif /* init the blocks sizes for cuda kernels */ init_blocks( system ); -#if defined(DEBUG) - fprintf( stderr, "p%d: initializated data structures\n", system->my_rank ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - //END OF FIRST STEP - - // compute f_0 + /* compute f_0 */ Comm_Atoms( system, control, data, workspace, lists, mpi_data, TRUE ); Sync_Atoms( system ); Sync_Grid( &system->my_grid, &system->d_my_grid ); init_blocks( system ); -#if defined(__CUDA_DEBUG_LOG__) - fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank ); -#endif - - //Second step Cuda_Reset( system, control, data, workspace, lists ); #if defined(__CUDA_DEBUG__) Reset( system, control, data, workspace, lists ); #endif -#if defined(DEBUG) - fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank ); -#endif - - //Third Step Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); #if defined(__CUDA_DEBUG__) Generate_Neighbor_Lists( system, data, workspace, lists ); #endif -// reax_list **temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" ); -// for ( i = 0; i < LIST_N; ++i ) -// { -// temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" ); -// temp_lists[i]->allocated = FALSE; -// } -// Make_List( (*dev_lists + FAR_NBRS)->n, (*dev_lists + FAR_NBRS)->num_intrs, -// TYP_FAR_NEIGHBOR, (*temp_lists + FAR_NBRS) ); -// Output_Sync_Lists( (*temp_lists + FAR_NBRS), (*dev_lists + FAR_NBRS), TYP_FAR_NEIGHBOR ); -// Print_Far_Neighbors( system, temp_lists, control ); - -#if defined(DEBUG) - fprintf( stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank ); -#endif - - //Fourth Step -#if defined(DEBUG) - fprintf( stderr, " Host Compute Forces begin.... \n" ); -#endif - #if defined(__CUDA_DEBUG__) Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data ); @@ -347,20 +284,12 @@ int main( int argc, char* argv[] ) Cuda_Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data ); -#if defined(DEBUG) - fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank ); -#endif - #if defined (__CUDA_DEBUG__) Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); #endif Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); -#if defined(DEBUG) - fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank); -#endif - #if defined(__CUDA_DEBUG__) validate_device( system, data, workspace, lists ); #endif @@ -368,16 +297,13 @@ int main( int argc, char* argv[] ) #if !defined(__CUDA_DEBUG__) Output_Results( system, control, data, lists, out_control, mpi_data ); #endif -#if defined(DEBUG) - fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank); -#endif #if defined(DEBUG) - fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank ); + fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); MPI_Barrier( MPI_COMM_WORLD ); #endif - // start the simulation + /* begin main simulation loop */ ++data->step; retries = 0; while ( data->step <= control->nsteps && retries < MAX_RETRIES ) @@ -423,20 +349,18 @@ int main( int argc, char* argv[] ) fprintf( stderr, " Post Evolve time: %f \n", t_end ); #endif -#if !defined(__CUDA_DEBUG__) if ( ret == SUCCESS ) { + data->timing.num_retries = retries; + +#if !defined(__CUDA_DEBUG__) Output_Results( system, control, data, lists, out_control, mpi_data ); - } #endif -// if ( ret == SUCCESS ) -// { -// Analysis(system, control, data, workspace, lists, out_control, mpi_data); -// } +// Analysis(system, control, data, workspace, lists, out_control, mpi_data); /* dump restart info */ -// if ( ret == SUCCESS && out_control->restart_freq && +// if ( out_control->restart_freq && // (data->step-data->prev_steps) % out_control->restart_freq == 0 ) // { // if( out_control->restart_format == WRITE_ASCII ) @@ -449,8 +373,6 @@ int main( int argc, char* argv[] ) // } // } - if ( ret == SUCCESS ) - { #if defined(DEBUG) fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); MPI_Barrier( MPI_COMM_WORLD ); @@ -462,7 +384,9 @@ int main( int argc, char* argv[] ) else { ++retries; +#if defined(DEBUG) fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step ); +#endif } } @@ -576,16 +500,12 @@ int main( int argc, char* argv[] ) if ( ret == SUCCESS ) { + data->timing.num_retries = retries; + Output_Results( system, control, data, lists, out_control, mpi_data ); - } - if ( ret == SUCCESS ) - { // Analysis(system, control, data, workspace, lists, out_control, mpi_data); - } - if ( ret == SUCCESS ) - { /* dump restart info */ if ( out_control->restart_freq && (data->step - data->prev_steps) % out_control->restart_freq == 0 ) @@ -599,10 +519,7 @@ int main( int argc, char* argv[] ) Write_Binary_Restart( system, control, data, out_control, mpi_data ); } } - } - if ( ret == SUCCESS ) - { #if defined(DEBUG) fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); MPI_Barrier( mpi_data->world ); diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h index 38810bd6e6bb43d38f7e20ac47d69f1241cc10f6..d683df1cb80d925a35b79bdec5f2e11a61ad4c23 100644 --- a/PG-PuReMD/src/reax_types.h +++ b/PG-PuReMD/src/reax_types.h @@ -1456,11 +1456,13 @@ typedef struct /* non-bonded force calculation time */ real nonb; /* atomic charge distribution calculation time */ - real qEq; + real cm; /* num. of steps in iterative linear solver for charge distribution (QEq, first solve) */ - int s_matvecs; + int s_matvecs; /* num. of steps in iterative linear solver for charge distribution (QEq, second solve) */ - int t_matvecs; + int t_matvecs; + /* num. of retries in main sim. loop */ + int num_retries; } reax_timing; diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c index c3778145e1913b16e5299bd5e3c2cf5f3f263b5f..f2a24753f8748cdacf00a93160b687e14d28dfa8 100644 --- a/PG-PuReMD/src/reset_tools.c +++ b/PG-PuReMD/src/reset_tools.c @@ -112,9 +112,10 @@ void Reset_Timing( reax_timing *rt ) rt->init_forces = 0; rt->bonded = 0; rt->nonb = 0; - rt->qEq = 0; + rt->cm = 0; rt->s_matvecs = 0; rt->t_matvecs = 0; + rt->num_retries = 0; } diff --git a/PG-PuReMD/src/tool_box.c b/PG-PuReMD/src/tool_box.c index a6f570c8c2b3fdb2a7d1afd789490f8a1f5fcd57..58772d773252982721b9ba1385ca932123632f02 100644 --- a/PG-PuReMD/src/tool_box.c +++ b/PG-PuReMD/src/tool_box.c @@ -186,7 +186,7 @@ void Update_Timing_Info( real *t_start, real *timing ) struct timeval tim; real t_end; - gettimeofday(&tim, NULL ); + gettimeofday( &tim, NULL ); t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); *timing += (t_end - *t_start); diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c index b7ba111220dd8874726590c294cc72e07cb06b50..331e443057455b8e74b003a0e20a6f81fba3d550 100644 --- a/PG-PuReMD/src/traj.c +++ b/PG-PuReMD/src/traj.c @@ -1032,7 +1032,6 @@ int Append_Frame( reax_system *system, control_params *control, if ( out_control->write_atoms ) { - //Sync atoms here #ifdef HAVE_CUDA Output_Sync_Atoms( system ); #endif @@ -1041,22 +1040,21 @@ int Append_Frame( reax_system *system, control_params *control, if ( out_control->write_bonds ) { - //sync bonds here #ifdef HAVE_CUDA - Output_Sync_Lists((*lists + BONDS), (*dev_lists + BONDS), TYP_BOND); + Output_Sync_Lists( (*lists + BONDS), (*dev_lists + BONDS), TYP_BOND ); #endif Write_Bonds( system, control, (*lists + BONDS), out_control, mpi_data ); } if ( out_control->write_angles ) { - //sync three body interactions here #ifdef HAVE_CUDA - Output_Sync_Lists((*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY); + Output_Sync_Lists( (*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY ); #endif Write_Angles( system, control, (*lists + BONDS), (*lists + THREE_BODIES), out_control, mpi_data ); } + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d: appended frame %d\n", system->my_rank, data->step ); #endif