From c24310e8d3f6be1d17d93752bf8764fb61c18882 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Thu, 24 Aug 2017 17:01:08 -0400
Subject: [PATCH] PG-PuReMD: complete memory changes. Remove max register per
 CUDA function option in build. Other misc. changes.

---
 PG-PuReMD/Makefile.am                     |   8 +-
 PG-PuReMD/src/comm_tools.c                |   8 +-
 PG-PuReMD/src/control.c                   |   1 +
 PG-PuReMD/src/cuda/cuda_allocate.cu       | 116 ++--
 PG-PuReMD/src/cuda/cuda_bond_orders.cu    |  56 +-
 PG-PuReMD/src/cuda/cuda_bonds.cu          |  14 +-
 PG-PuReMD/src/cuda/cuda_charges.cu        |   3 +-
 PG-PuReMD/src/cuda/cuda_forces.cu         | 745 ++++++++++++++--------
 PG-PuReMD/src/cuda/cuda_forces.h          |  15 +-
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu |  71 +--
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h  |  20 +-
 PG-PuReMD/src/cuda/cuda_init_md.cu        |  46 +-
 PG-PuReMD/src/cuda/cuda_integrate.cu      | 165 +++--
 PG-PuReMD/src/cuda/cuda_lin_alg.cu        |  57 +-
 PG-PuReMD/src/cuda/cuda_multi_body.cu     |  46 +-
 PG-PuReMD/src/cuda/cuda_neighbors.cu      | 316 ++-------
 PG-PuReMD/src/cuda/cuda_neighbors.h       |  12 +-
 PG-PuReMD/src/cuda/cuda_random.cu         |  65 ++
 PG-PuReMD/src/cuda/cuda_random.h          |  44 ++
 PG-PuReMD/src/cuda/cuda_system_props.cu   |  74 ++-
 PG-PuReMD/src/cuda/cuda_system_props.h    |   2 +
 PG-PuReMD/src/cuda/cuda_torsion_angles.cu |  25 +-
 PG-PuReMD/src/cuda/cuda_utils.cu          |  24 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu |  32 +-
 PG-PuReMD/src/cuda/cuda_vector.h          |  41 ++
 PG-PuReMD/src/ffield.c                    |  46 +-
 PG-PuReMD/src/ffield.h                    |   2 +-
 PG-PuReMD/src/init_md.c                   |   2 +-
 PG-PuReMD/src/io_tools.c                  |  12 +-
 PG-PuReMD/src/parallelreax.c              |   6 +-
 PG-PuReMD/src/random.c                    |  15 +-
 PG-PuReMD/src/random.h                    |   1 +
 PG-PuReMD/src/reax_types.h                |  30 +-
 PG-PuReMD/src/traj.c                      |  54 +-
 PG-PuReMD/src/vector.h                    |  10 +-
 PuReMD/src/basic_comm.c                   |   2 +-
 PuReMD/src/control.c                      |   6 +-
 PuReMD/src/ffield.c                       |  12 +-
 PuReMD/src/geo_tools.c                    |   4 +-
 PuReMD/src/grid.c                         |   2 +-
 PuReMD/src/init_md.c                      |   8 +-
 PuReMD/src/lookup.c                       |  12 +-
 PuReMD/src/parallelreax.c                 |  14 +-
 PuReMD/src/qEq.c                          |   2 +-
 PuReMD/src/restart.c                      |  12 +-
 PuReMD/src/tool_box.c                     |   2 +-
 PuReMD/src/traj.c                         |   6 +-
 cuda.am                                   |  10 +-
 48 files changed, 1189 insertions(+), 1087 deletions(-)
 create mode 100644 PG-PuReMD/src/cuda/cuda_random.cu
 create mode 100644 PG-PuReMD/src/cuda/cuda_random.h
 create mode 100644 PG-PuReMD/src/cuda/cuda_vector.h

diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index 149e16de..d16fd249 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -35,8 +35,8 @@ include_HEADERS = src/reax_types.h src/index_utils.h \
 
 if USE_CUDA
 bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \
-    src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu src/cuda/cuda_box.cu \
-    src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_list.cu \
+    src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu src/cuda/cuda_box.cu src/cuda/cuda_list.cu \
+    src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_random.cu \
     src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \
     src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \
     src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
@@ -45,8 +45,8 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu
     src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
 include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
     src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
-    src/cuda/cuda_system_props.h
src/cuda/cuda_reduction.h src/cuda/cuda_box.h \ - src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_list.h \ + src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \ + src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_random.h src/cuda/cuda_vector.h \ src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \ src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \ src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \ diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c index a8d46fcb..9e8978dd 100644 --- a/PG-PuReMD/src/comm_tools.c +++ b/PG-PuReMD/src/comm_tools.c @@ -763,8 +763,8 @@ void Comm_Atoms( reax_system *system, control_params *control, Reorder_My_Atoms( system, workspace ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d updated local atoms, n=%d\n", - system->my_rank, system->n ); + fprintf( stderr, "p%d, step %d: updated local atoms, n=%d\n", + system->my_rank, data->step, system->n ); MPI_Barrier( MPI_COMM_WORLD ); #endif @@ -777,8 +777,8 @@ void Comm_Atoms( reax_system *system, control_params *control, Sort_Boundary_Atoms, Unpack_Exchange_Message, TRUE ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: exchanged boundary atoms, N=%d\n", - system->my_rank, system->N ); + fprintf( stderr, "p%d, step %d: exchanged boundary atoms, N=%d\n", + system->my_rank, data->step, system->N ); for ( i = 0; i < MAX_NBRS; ++i ) { fprintf( stderr, "p%d: nbr%d(p%d) str=%d cnt=%d end=%d\n", diff --git a/PG-PuReMD/src/control.c b/PG-PuReMD/src/control.c index 7f7060c1..268b6a17 100644 --- a/PG-PuReMD/src/control.c +++ b/PG-PuReMD/src/control.c @@ -58,6 +58,7 @@ char Read_Control_File( char *control_file, control_params* control, control->geo_format = 1; control->gpus_per_node = 1; + control->random_vel = 0; control->restart = 0; out_control->restart_format = WRITE_BINARY; out_control->restart_freq = 0; diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index c81938c5..5df47e6f 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -2,6 +2,7 @@ #include "cuda_allocate.h" #include "cuda_allocate.h" +#include "cuda_forces.h" #include "cuda_list.h" #include "cuda_neighbors.h" #include "cuda_utils.h" @@ -42,10 +43,10 @@ CUDA_GLOBAL void Init_Nbrs( ivec *nbrs, int N ) void dev_alloc_grid( reax_system *system ) { int total; - grid_cell local_cell; +// grid_cell local_cell; grid *host = &system->my_grid; grid *device = &system->d_my_grid; - ivec *nbrs_x = (ivec *) scratch; +// ivec *nbrs_x = (ivec *) scratch; total = host->ncells[0] * host->ncells[1] * host->ncells[2]; ivec_Copy( device->ncells, host->ncells ); @@ -249,80 +250,85 @@ void dev_alloc_system( reax_system *system ) } -void dev_realloc_system( reax_system *system, int local_cap, int total_cap, char *msg ) +void dev_realloc_system( reax_system *system, int old_total_cap, int total_cap, char *msg ) { int *temp; + reax_atom *temp_atom; temp = (int *) scratch; + temp_atom = (reax_atom*) scratch; /* free the existing storage for atoms, leave other info allocated */ + copy_device( temp_atom, system->d_my_atoms, old_total_cap * sizeof(reax_atom), + "dev_realloc_system::temp" ); cuda_free( system->d_my_atoms, "system::d_my_atoms" ); cuda_malloc( (void **) &system->d_my_atoms, sizeof(reax_atom) * total_cap, TRUE, "system::d_my_atoms" ); + copy_device( system->d_my_atoms, temp, old_total_cap * sizeof(reax_atom), + 
"dev_realloc_system::temp" ); - //TODO: record old total_cap before increase, use here - copy_device( temp, system->d_far_nbrs, system->total_cap * sizeof(int), + copy_device( temp, system->d_far_nbrs, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_far_nbrs, "system::d_far_nbrs" ); cuda_malloc( (void **) &system->d_far_nbrs, system->total_cap * sizeof(int), TRUE, "system::d_far_nbrs" ); - copy_device( system->d_far_nbrs, temp, system->total_cap * sizeof(int), + copy_device( system->d_far_nbrs, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_max_far_nbrs, system->total_cap * sizeof(int), + copy_device( temp, system->d_max_far_nbrs, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_max_far_nbrs, "system::d_max_far_nbrs" ); cuda_malloc( (void **) &system->d_max_far_nbrs, system->total_cap * sizeof(int), TRUE, "system::d_max_far_nbrs" ); - copy_device( system->d_max_far_nbrs, temp, system->total_cap * sizeof(int), + copy_device( system->d_max_far_nbrs, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_bonds, system->total_cap * sizeof(int), + copy_device( temp, system->d_bonds, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_bonds, "system::d_bonds" ); cuda_malloc( (void **) &system->d_bonds, system->total_cap * sizeof(int), TRUE, "system::d_bonds" ); - copy_device( system->d_bonds, temp, system->total_cap * sizeof(int), + copy_device( system->d_bonds, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_max_bonds, system->total_cap * sizeof(int), + copy_device( temp, system->d_max_bonds, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_max_bonds, "system::d_max_bonds" ); cuda_malloc( (void **) &system->d_max_bonds, system->total_cap * sizeof(int), TRUE, "system::d_max_bonds" ); - copy_device( system->d_max_bonds, temp, system->total_cap * sizeof(int), + copy_device( system->d_max_bonds, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_hbonds, system->total_cap * sizeof(int), + copy_device( temp, system->d_hbonds, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_hbonds, "system::d_hbonds" ); cuda_malloc( (void **) &system->d_hbonds, system->total_cap * sizeof(int), TRUE, "system::d_hbonds" ); - copy_device( system->d_hbonds, temp, system->total_cap * sizeof(int), + copy_device( system->d_hbonds, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_max_hbonds, system->total_cap * sizeof(int), + copy_device( temp, system->d_max_hbonds, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_max_hbonds, "system::d_max_hbonds" ); cuda_malloc( (void **) &system->d_max_hbonds, system->total_cap * sizeof(int), TRUE, "system::d_max_hbonds" ); - copy_device( system->d_max_hbonds, temp, system->total_cap * sizeof(int), + copy_device( system->d_max_hbonds, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_cm_entries, system->total_cap * sizeof(int), + copy_device( temp, system->d_cm_entries, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_cm_entries, "system::d_cm_entries" ); cuda_malloc( (void **) &system->d_cm_entries, system->total_cap * sizeof(int), TRUE, "system::d_cm_entries" ); - copy_device( system->d_cm_entries, temp, 
system->total_cap * sizeof(int), + copy_device( system->d_cm_entries, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); - copy_device( temp, system->d_max_cm_entries, system->total_cap * sizeof(int), + copy_device( temp, system->d_max_cm_entries, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); cuda_free( system->d_max_cm_entries, "system::d_max_cm_entries" ); cuda_malloc( (void **) &system->d_max_cm_entries, system->total_cap * sizeof(int), TRUE, "system::d_max_cm_entries" ); - copy_device( system->d_max_cm_entries, temp, system->total_cap * sizeof(int), + copy_device( system->d_max_cm_entries, temp, old_total_cap * sizeof(int), "dev_realloc_system::temp" ); } @@ -336,14 +342,12 @@ void dev_alloc_simulation_data( simulation_data *data ) void dev_alloc_workspace( reax_system *system, control_params *control, storage *workspace, int local_cap, int total_cap, char *msg ) { - int i, total_real, total_rvec, local_int, local_real, local_rvec; + int total_real, total_rvec, local_rvec; workspace->allocated = TRUE; total_real = total_cap * sizeof(real); total_rvec = total_cap * sizeof(rvec); - local_int = local_cap * sizeof(int); - local_real = local_cap * sizeof(real); local_rvec = local_cap * sizeof(rvec); /* communication storage */ @@ -588,8 +592,8 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, mpi_datatypes *mpi_data ) { int i, j, k, p; - int nflag, Nflag, Hflag, mpi_flag, total_send; - int renbr, *indices; + int nflag, Nflag, old_total_cap, mpi_flag, total_send; + int renbr; reallocate_data *realloc; reax_list *far_nbrs; sparse_matrix *H; @@ -601,25 +605,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, realloc = &(dev_workspace->realloc); g = &(system->my_grid); H = &dev_workspace->H; - indices = (int *) host_scratch; - -#if defined(DEBUG) - fprintf( stderr, "p%d@reallocate: n: %d, N: %d, numH: %d\n", - system->my_rank, system->n, system->N, system->numH ); - fprintf( stderr, "p%d@reallocate: local_cap: %d, total_cap: %d, Hcap: %d\n", - system->my_rank, system->local_cap, system->total_cap, - system->Hcap); - fprintf( stderr, "p%d: realloc.far_nbrs: %d\n", - system->my_rank, realloc->far_nbrs ); - fprintf( stderr, "p%d: realloc.H: %d, realloc.Htop: %d\n", - system->my_rank, realloc->H, realloc->Htop ); - fprintf( stderr, "p%d: realloc.Hbonds: %d, realloc.num_hbonds: %d\n", - system->my_rank, realloc->hbonds, realloc->num_hbonds ); - fprintf( stderr, "p%d: realloc.bonds: %d, num_bonds: %d\n", - system->my_rank, realloc->bonds, realloc->num_bonds ); - fprintf( stderr, "p%d: realloc.num_3body: %d\n", - system->my_rank, realloc->num_3body ); -#endif // IMPORTANT: LOOSE ZONES CHECKS ARE DISABLED FOR NOW BY &&'ing with 0!!! 
nflag = FALSE; @@ -635,6 +620,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, (0 && system->N <= LOOSE_ZONE * system->total_cap) ) { Nflag = TRUE; + old_total_cap = system->total_cap; system->total_cap = (int)(system->N * SAFE_ZONE); } @@ -649,7 +635,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, #endif /* system */ - dev_realloc_system( system, system->local_cap, system->total_cap, msg ); + dev_realloc_system( system, old_total_cap, system->total_cap, msg ); /* workspace */ dev_dealloc_workspace( control, workspace ); @@ -659,24 +645,23 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, /* far neighbors */ renbr = (data->step - data->prev_steps) % control->reneighbor == 0; - if ( renbr || realloc->far_nbrs == TRUE ) + if ( renbr && (Nflag == TRUE || realloc->far_nbrs == TRUE) ) { far_nbrs = *dev_lists + FAR_NBRS; - if ( Nflag == TRUE || realloc->far_nbrs == TRUE ) - { #if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: reallocating far_nbrs: far_nbrs=%d, space=%dMB\n", - system->my_rank, system->total_far_nbrs, - (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / - (1024.0 * 1024.0)) ); - fprintf( stderr, "p:%d - *** Reallocating Far Nbrs *** \n", system->my_rank ); + fprintf( stderr, "p%d: reallocating far_nbrs: far_nbrs=%d, space=%dMB\n", + system->my_rank, system->total_far_nbrs, + (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / + (1024.0 * 1024.0)) ); + fprintf( stderr, "p:%d - *** Reallocating Far Nbrs *** \n", system->my_rank ); #endif - Cuda_Reallocate_Neighbor_List( far_nbrs, system->total_cap, system->total_far_nbrs ); - Cuda_Init_Neighbor_Indices( system ); - realloc->far_nbrs = FALSE; - } + Cuda_Reallocate_Neighbor_List( far_nbrs, system->total_cap, system->total_far_nbrs ); + + Cuda_Init_Neighbor_Indices( system ); + + realloc->far_nbrs = FALSE; } /* charge matrix */ @@ -714,7 +699,9 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, #endif Cuda_Reallocate_HBonds_List( (*dev_lists) + HBONDS, system->total_cap, system->total_hbonds ); + Cuda_Init_HBond_Indices( system ); + realloc->hbonds = FALSE; } } @@ -729,23 +716,26 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, #endif Cuda_Reallocate_Bonds_List( (*dev_lists) + BONDS, system->total_cap, system->total_bonds ); + Cuda_Init_Bond_Indices( system ); + realloc->bonds = FALSE; } /* 3-body list */ - if ( Nflag == TRUE || realloc->num_3body > 0 ) + if ( Nflag == TRUE || realloc->thbody == TRUE ) { #if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: reallocating 3body list: num_3body=%d, space=%dMB\n", - system->my_rank, realloc->num_3body, - (int)(realloc->num_3body * sizeof(three_body_interaction_data) / + fprintf( stderr, "p%d: reallocating thbody list: num_thbody=%d, space=%dMB\n", + system->my_rank, system->total_thbodies, + (int)(system->total_thbodies * sizeof(three_body_interaction_data) / (1024*1024)) ); #endif Cuda_Reallocate_Thbodies_List( (*dev_lists) + THREE_BODIES, - (*dev_lists + BONDS)->num_intrs, system->total_thbodies ); - realloc->num_3body = -1; + system->total_thbodies_indices, system->total_thbodies ); + + realloc->thbody = FALSE; } /* grid */ diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu index bb478a3a..f16ddcbc 100644 --- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu @@ -14,6 +14,7 @@ CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms, { int i, type_i; single_body_parameters *sbp_i; + 
storage *workspace; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -22,11 +23,11 @@ CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms, return; } - storage *workspace = & (p_workspace); + workspace = &( p_workspace ); /* Calculate Deltaprime, Deltaprime_boc values */ type_i = my_atoms[i].type; - sbp_i = &(sbp[type_i]); + sbp_i = &( sbp[type_i] ); workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; workspace->Deltap_boc[i] = workspace->total_bond_order[i] - sbp_i->valency_val; @@ -40,18 +41,20 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp, int num_atom_types, int N ) { int i, j, pj, type_i, type_j; - int start_i, end_i, sym_index, num_bonds; + int start_i, end_i; +// int sym_index; real val_i, Deltap_i, Deltap_boc_i; real val_j, Deltap_j, Deltap_boc_j; real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; real exp_p1i, exp_p2i, exp_p1j, exp_p2j; real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; - real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji + real Cf45_ij, Cf45_ji; real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; - real explp1, p_boc1, p_boc2; - single_body_parameters *sbp_i, *sbp_j; + real p_boc1, p_boc2; + single_body_parameters *sbp_i; two_body_parameters *twbp; - bond_order_data *bo_ij, *bo_ji; + bond_order_data *bo_ij; +// bond_order_data *bo_ji; storage *workspace; reax_list *bonds; @@ -64,25 +67,9 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp, workspace = &(p_workspace); bonds = &(p_bonds); - num_bonds = 0; p_boc1 = gp.l[0]; p_boc2 = gp.l[1]; - /* Calculate Deltaprime, Deltaprime_boc values */ - /* - //for( i = 0; i < system->N; ++i ) { - type_i = my_atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; - workspace->Deltap_boc[i] = - workspace->total_bond_order[i] - sbp_i->valency_val; - - //fprintf( stdout, "%d(%d) %24.15f\n", - // i, workspace->bond_mark[i], workspace->total_bond_order[i] ); - workspace->total_bond_order[i] = 0; - //} - */ - /* Corrected Bond Order calculations */ //for( i = 0; i < system->N; ++i ) { type_i = my_atoms[i].type; @@ -103,10 +90,6 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp, bo_ij = &( bonds->select.bond_list[pj].bo_data ); // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); - //TODO - //TODO - //TODO - //TODO //TODO //if( i < j || workspace->bond_mark[j] > 3 ) { if( i < j ) @@ -231,12 +214,6 @@ CUDA_GLOBAL void Cuda_Calculate_BO( reax_atom *my_atoms, global_parameters gp, f4f5 = f4 * f5; /* Bond Order pages 8-9, derivative of f4 and f5 */ - /*temp = twbp->p_boc5 - - twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); - u_ij = temp + twbp->p_boc3 * Deltap_boc_i; - u_ji = temp + twbp->p_boc3 * Deltap_boc_j; - Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; - Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ Cf45_ij = -f4 * exp_f4; Cf45_ji = -f5 * exp_f5; } @@ -390,7 +367,8 @@ CUDA_GLOBAL void Cuda_Update_Uncorrected_BO( storage p_workspace, bo_ij->BO_pi = bo_ji->BO_pi; bo_ij->BO_pi2 = bo_ji->BO_pi2; - workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO + // now keeps total_BO + workspace->total_bond_order[i] += bo_ij->BO; } } } @@ -401,9 +379,8 @@ CUDA_GLOBAL void Cuda_Update_Workspace_After_BO( reax_atom *my_atoms, storage p_workspace, int N ) { int j, type_j; - real explp1; - real p_lp1; - single_body_parameters *sbp_i, *sbp_j; + real explp1, p_lp1; + single_body_parameters *sbp_j; storage *workspace; j = blockIdx.x * blockDim.x + threadIdx.x; @@ -465,7 +442,6 
@@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, rvec temp, ext_press; ivec rel_box; int pk, k, j; - rvec tf_f; /* Initializations */ nbr_j = &(bonds->select.bond_list[pj]); @@ -635,7 +611,7 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, bond_data *nbr_j, *nbr_k; bond_order_data *bo_ij, *bo_ji; dbond_coefficients coef; - int pk, k, j; + int pk, j; rvec tf_f; rvec_MakeZero( tf_f ); @@ -680,7 +656,6 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; rvec_MakeZero( tf_f ); /*2nd,dBO*/ @@ -725,7 +700,6 @@ CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; rvec_MakeZero( tf_f ); /*3rd, dBO*/ diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu index e3592630..a9c3ce30 100644 --- a/PG-PuReMD/src/cuda/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_bonds.cu @@ -31,7 +31,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, storage p_workspace, reax_list p_bonds, int n, int num_atom_types, real *e_bond ) { - int i, j, pj, natoms; + int i, j, pj; int start_i, end_i; int type_i, type_j; real ebond, pow_BOs_be2, exp_be12, CEbo; @@ -59,7 +59,6 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, gp10 = gp.l[10]; gp37 = (int) gp.l[37]; - //for( i = 0; i < natoms; ++i ) { start_i = Dev_Start_Index(i, bonds); end_i = Dev_End_Index(i, bonds); @@ -85,10 +84,10 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); /* calculate the Bond Energy */ - e_bond[ i ] += ebond = - -twbp->De_s * bo_ij->BO_s * exp_be12 + ebond = -twbp->De_s * bo_ij->BO_s * exp_be12 -twbp->De_p * bo_ij->BO_pi -twbp->De_pp * bo_ij->BO_pi2; + e_bond[ i ] += ebond; /* calculate derivatives of Bond Orders */ bo_ij->Cdbo += CEbo; @@ -102,12 +101,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, system->my_atoms[j].orig_id, bo_ij->BO, ebond, data->my_en.e_bond ); #endif + #ifdef TEST_FORCES Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); Add_dBOpinpi2( system, lists, i, pj, -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), workspace->f_be, workspace->f_be ); #endif + /* Stabilisation terminal triple bond */ if ( bo_ij->BO >= 1.00 ) { @@ -122,7 +123,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, hulpov = 1.0 / (1.0 + 25.0 * exphuov); estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); - e_bond [i] += estriph; + e_bond[i] += estriph; decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); @@ -134,12 +135,14 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, bo_ij->Cdbo += decobdbo; workspace->CdDelta[i] += decobdboua; workspace->CdDelta[j] += decobdboub; + #ifdef TEST_ENERGY //fprintf( out_control->ebond, // "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", // system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, // estriph, decobdbo, decobdboua, decobdboub ); #endif + #ifdef TEST_FORCES Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); @@ -149,5 +152,4 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, } } } - // } } diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu 
b/PG-PuReMD/src/cuda/cuda_charges.cu index 72018b1d..9ea97653 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -210,10 +210,9 @@ void cuda_charges_updateq( reax_system *system, real *q ) void Cuda_Calculate_Charges( reax_system *system, storage *workspace, mpi_datatypes *mpi_data ) { - int i, scale; + int scale; real u;//, s_sum, t_sum; rvec2 my_sum, all_sum; - reax_atom *atom; real *q; my_sum[0] = 0.0; diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 71224dd0..4960038c 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -30,19 +30,187 @@ CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control ) } +CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indices, int N ) +{ + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= N ) + { + return; + } + + end_indices[i] = indices[i] + intr_cnt[i]; +} + + +CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N ) +{ + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= N ) + { + return; + } + + my_atoms[i].Hindex = i; +} + + +CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp, + int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N ) +{ + int i, hindex, my_hbonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= N ) + { + return; + } + + hindex = atoms[i].Hindex; + + if ( sbp[ atoms[i].type ].p_hbond == H_ATOM || + sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM ) + { + my_hbonds = hbonds[i]; + indices[hindex] = max_hbonds[i]; + end_indices[hindex] = indices[hindex] + hbonds[i]; + } + else + { + my_hbonds = 0; + indices[hindex] = 0; + end_indices[hindex] = 0; + } + atoms[i].num_hbonds = my_hbonds; +} + + +/* Initialize indices for far neighbors list post reallocation + * + * system: atomic system info. */ +void Cuda_Init_Neighbor_Indices( reax_system *system ) +{ + int blocks; + reax_list *far_nbrs = *dev_lists + FAR_NBRS; + + /* init indices */ + Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbrs->index, system->total_cap ); + + /* init end_indices */ + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_far_nbrs, far_nbrs->index, far_nbrs->end_index, system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +/* Initialize indices for far hydrogen bonds list post reallocation + * + * system: atomic system info. */ +void Cuda_Init_HBond_Indices( reax_system *system ) +{ + int blocks; + int *temp; + reax_list *hbonds = *dev_lists + HBONDS; + + temp = (int *) scratch; + + /* init Hindices */ + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_setup_hindex <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + /* init indices and end_indices */ + Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap ); + + k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, + hbonds->index, hbonds->end_index, system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +/* Initialize indices for far bonds list post reallocation + * + * system: atomic system info. 
*/ +void Cuda_Init_Bond_Indices( reax_system *system ) +{ + int blocks; + reax_list *bonds = *dev_lists + BONDS; + + /* init indices */ + Cuda_Scan_Excl_Sum( system->d_max_bonds, bonds->index, system->total_cap ); + + /* init end_indices */ + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_bonds, bonds->index, bonds->end_index, system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +/* Initialize indices for charge matrix post reallocation + * + * system: atomic system info. + * H: charge matrix */ +void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H ) +{ + int blocks; + + /* init indices */ + Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->N ); + + /* init end_indices */ + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_cm_entries, H->start, H->end, system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +/* Initialize indices for three body list post reallocation + * + * indices: list indices + * entries: num. of entries in list */ +void Cuda_Init_Three_Body_Indices( int *indices, int entries ) +{ + reax_list *thbody = *dev_lists + THREE_BODIES; + + Cuda_Scan_Excl_Sum( indices, thbody->index, entries ); +} + + CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, single_body_parameters *sbp, two_body_parameters *tbp, control_params *control, reax_list far_nbrs, - int num_atom_types, int n, int N, int Hcap, int total_cap, - int *cm_entries, int *max_cm_entries, int *realloc_cm_entries, - int *bonds, int *max_bonds, int *realloc_bonds, - int *hbonds, int *max_hbonds, int *realloc_hbonds ) + int num_atom_types, int n, int N, int total_cap, + int *cm_entries, int *max_cm_entries, + int *bonds, int *max_bonds, + int *hbonds, int *max_hbonds ) { int i, j, pj; int start_i, end_i; int type_i, type_j; int ihb, jhb; int local; + int my_bonds, my_hbonds, my_cm_entries; real cutoff; real r_ij, r2; real C12, C34, C56; @@ -59,6 +227,10 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, return; } + my_bonds = 0; + my_hbonds = 0; + my_cm_entries = 0; + if ( i < N ) { atom_i = &(my_atoms[i]); @@ -71,7 +243,7 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, { local = TRUE; cutoff = control->nonb_cut; - ++cm_entries[i]; + ++my_cm_entries; // ihb = sbp_i->p_hbond; } else @@ -100,18 +272,18 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, { if ( i < j && (j < n || atom_i->orig_id < atom_j->orig_id) ) { - ++cm_entries[i]; + ++my_cm_entries; } else if ( i > j && (j < n || atom_j->orig_id > atom_i->orig_id) ) { - ++cm_entries[i]; + ++my_cm_entries; } } else { if ( i > j && j < n && atom_j->orig_id < atom_i->orig_id ) { - ++cm_entries[i]; + ++my_cm_entries; } } @@ -120,7 +292,7 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n ) { - ++hbonds[i]; + ++my_hbonds; } // if ( i >= n ) @@ -148,13 +320,13 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, * atom j: H bonding atom */ if( ihb == H_ATOM && jhb == H_BONDING_ATOM ) { - ++hbonds[i]; + ++my_hbonds; } /* atom i: H bonding atom, native * atom j: H atom, native */ else if( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n ) { - ++hbonds[i]; + ++my_hbonds; } } } @@ -199,158 +371,90 @@ CUDA_GLOBAL void 
k_estimate_storages( reax_atom *my_atoms, if ( BO >= control->bo_cut ) { - ++bonds[i]; -// atomicAdd( bonds + j, 1 ); + ++my_bonds; } } } } } - else - { - bonds[i] = MIN_BONDS; - hbonds[i] = 0; - } - if ( bonds[i] > max_bonds[i] ) - { - max_bonds[i] = MAX( (int)(bonds[i] * 2), MIN_BONDS ); - *realloc_bonds = TRUE; - } + bonds[i] = my_bonds; + max_bonds[i] = MAX( (int)(my_bonds * 2), MIN_BONDS ); - if ( hbonds[i] > max_hbonds[i] ) - { - max_hbonds[i] = MAX( (int)(hbonds[i] * SAFE_ZONE), MIN_HBONDS ); - *realloc_hbonds = TRUE; - } - - if ( cm_entries[i] > max_cm_entries[i] ) - { - max_cm_entries[i] = MAX( (int)(cm_entries[i] * SAFE_ZONE), MIN_CM_ENTRIES ); - *realloc_cm_entries = TRUE; - } -} - - -/* Copy num. of bonds/hbonds per atom into atom structs (used for MPI messaging) - * - * my_atoms: atom structs - * N: num. of atoms (native + ghost) - * hbonds: num. of hydrogen bonds per atom - * bonds: num. of bonds per atom */ -CUDA_GLOBAL void k_init_system_atoms( reax_atom *my_atoms, int N, - int *hbonds, int *bonds ) -{ - int i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N ) - { - return; - } + hbonds[i] = my_hbonds; + max_hbonds[i] = MAX( (int)(my_hbonds * SAFE_ZONE), MIN_HBONDS ); - my_atoms[i].num_bonds = bonds[i]; - my_atoms[i].num_hbonds = hbonds[i]; + cm_entries[i] = my_cm_entries; + max_cm_entries[i] = MAX( (int)(my_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES ); } -int Cuda_Estimate_Storages( reax_system *system, control_params *control, - reax_list **lists, sparse_matrix *H, int step ) +void Cuda_Estimate_Storages( reax_system *system, control_params *control, + reax_list **lists, int realloc_bonds, int realloc_hbonds, int realloc_cm, + int step ) { - int i, ret, ret_bonds, ret_hbonds, ret_cm; - int blocks = 0; - - ret = SUCCESS; + int blocks; - /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment - * to the provided literal */ - cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), - "Cuda_Estimate_Storages::d_realloc_bonds" ); - cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), - "Cuda_Estimate_Storages::d_realloc_hbonds" ); - cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), - "Cuda_Estimate_Storages::d_realloc_cm_entries" ); - cuda_memset( system->d_bonds, 0, system->total_cap * sizeof(int), - "Cuda_Estimate_Storages::d_bonds" ); - cuda_memset( system->d_hbonds, 0, system->total_cap * sizeof(int), - "Cuda_Estimate_Storages::d_hbonds" ); - cuda_memset( system->d_cm_entries, 0, system->total_cap * sizeof(int), - "Cuda_Estimate_Storages::d_cm_entries" ); - - blocks = (int)CEIL( (real)system->total_cap / ST_BLOCK_SIZE ); + blocks = system->total_cap / ST_BLOCK_SIZE + + (((system->total_cap % ST_BLOCK_SIZE == 0)) ? 
0 : 1); k_estimate_storages <<< blocks, ST_BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types, - system->n, system->N, system->Hcap, system->total_cap, - system->d_cm_entries, system->d_max_cm_entries, system->d_realloc_cm_entries, - system->d_bonds, system->d_max_bonds, system->d_realloc_bonds, - system->d_hbonds, system->d_max_hbonds, system->d_realloc_hbonds ); + system->n, system->N, system->total_cap, + system->d_cm_entries, system->d_max_cm_entries, + system->d_bonds, system->d_max_bonds, + system->d_hbonds, system->d_max_hbonds ); cudaThreadSynchronize( ); cudaCheckError( ); - /* check reallocation flags on device */ - copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_realloc_bonds" ); - copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_realloc_hbonds" ); - copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_realloc_cm_entries" ); - - if ( ret_bonds == TRUE ) + if ( realloc_bonds == TRUE ) { Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds, system->total_cap ); - copy_host_device( &(system->total_bonds), system->d_total_bonds, sizeof(int), cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" ); - - if ( step > 0 ) - { - dev_workspace->realloc.bonds = TRUE; - } - ret = FAILURE; } - if ( system->numH > 0 && control->hbond_cut > 0.0 && ret_hbonds == TRUE ) + if ( system->numH > 0 && control->hbond_cut > 0.0 ) { - Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds, - system->total_cap ); - - copy_host_device( &(system->total_hbonds), system->d_total_hbonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" ); - - if ( step > 0 ) + if ( realloc_hbonds == TRUE ) { - dev_workspace->realloc.hbonds = TRUE; + Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds, + system->total_cap ); + copy_host_device( &(system->total_hbonds), system->d_total_hbonds, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" ); } - ret = FAILURE; } else { - /* if number of hydrogen atoms is 0, disable hydrogen bond functionality */ - if ( system->numH == 0 && step == 0 ) + if ( step == 0 ) { - fprintf( stderr, "[WARNING] DISABLING HYDROGEN BONDS\n" ); +#if defined(DEBUG) + if ( system->numH == 0 ) + { + fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: NO HYDROGEN ATOMS FOUND\n" ); + } +#endif + +#if defined(DEBUG) + if ( control->hbond_cut <= 0.0 ) + { + fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: BOND CUTOFF LENGTH IS ZERO\n" ); + } +#endif + control->hbond_cut = 0.0; k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params ); } } - if ( ret_cm == TRUE ) + if ( realloc_cm == TRUE ) { Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, system->total_cap ); - copy_host_device( &(system->total_cm_entries), system->d_total_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "d_total_cm_entries" ); - - if ( step > 0 ) - { - dev_workspace->realloc.cm = TRUE; - } - ret = FAILURE; + cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" ); } #if defined(DEBUG) @@ -359,13 +463,6 @@ int Cuda_Estimate_Storages( reax_system *system, control_params *control, fprintf( stderr, 
" TOTAL DEVICE HBOND COUNT: %d \n", system->total_hbonds ); fprintf( stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", system->total_cm_entries ); #endif - - k_init_system_atoms <<< blocks, ST_BLOCK_SIZE >>> - ( system->d_my_atoms, system->N, system->d_hbonds, system->d_bonds ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - return ret; } @@ -375,6 +472,7 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr int ret; ret = SUCCESS; + cuda_memset( thbody, 0, system->total_bonds * sizeof(int), "scratch::thbody" ); Estimate_Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>> @@ -390,16 +488,29 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr if ( step == 0 ) { + system->total_thbodies = MAX( (int)(system->total_thbodies * SAFE_ZONE), MIN_3BODIES ); + system->total_thbodies_indices = system->total_bonds; + /* create Three-body list */ - Dev_Make_List( system->total_bonds, system->total_thbodies, + Dev_Make_List( system->total_thbodies_indices, system->total_thbodies, TYP_THREE_BODY, *dev_lists + THREE_BODIES ); } if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs || - (*dev_lists + THREE_BODIES)->n < system->total_bonds ) + system->total_bonds > (*dev_lists + THREE_BODIES)->n ) { - system->total_thbodies = (*dev_lists + THREE_BODIES)->num_intrs * SAFE_ZONE; - dev_workspace->realloc.num_3body = system->total_thbodies; + if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs ) + { + system->total_thbodies = MAX( (int)((*dev_lists + THREE_BODIES)->num_intrs * SAFE_ZONE), + system->total_thbodies ); + } + if ( system->total_bonds > (*dev_lists + THREE_BODIES)->n ) + { + system->total_thbodies_indices = MAX( (int)((*dev_lists + THREE_BODIES)->n * SAFE_ZONE), + system->total_bonds ); + } + + dev_workspace->realloc.thbody = TRUE; ret = FAILURE; } @@ -491,14 +602,17 @@ CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp, two_body_parameters *tbp, storage workspace, control_params *control, - reax_list far_nbrs, reax_list bonds, reax_list hbonds, - LR_lookup_table *t_LR, int n, int N, int num_atom_types, int renbr ) + reax_list far_nbrs_list, reax_list bonds_list, reax_list hbonds_list, + LR_lookup_table *t_LR, int n, int N, int num_atom_types, int renbr, + int *cm_entries, int *max_cm_entries, int *realloc_cm_entries, + int *bonds, int *max_bonds, int *realloc_bonds, + int *hbonds, int *max_hbonds, int *realloc_hbonds ) { int i, j, pj; int start_i, end_i; int type_i, type_j; - int Htop; - int btop_i, ihb, jhb, ihb_top; + int Htop, btop_i, ihb, jhb, ihb_top; + int my_bonds, my_hbonds, my_cm_entries; int local, flag, flag2, flag3; real r_ij, cutoff; single_body_parameters *sbp_i, *sbp_j; @@ -519,9 +633,9 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp atom_i = &(my_atoms[i]); type_i = atom_i->type; - start_i = Dev_Start_Index( i, &far_nbrs ); - end_i = Dev_End_Index( i, &far_nbrs ); - btop_i = Dev_Start_Index( i, &bonds ); + start_i = Dev_Start_Index( i, &far_nbrs_list ); + end_i = Dev_End_Index( i, &far_nbrs_list ); + btop_i = Dev_Start_Index( i, &bonds_list ); sbp_i = &(sbp[type_i]); if ( i < n ) @@ -557,7 +671,7 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp if ( ihb == H_ATOM || ihb == H_BONDING_ATOM ) { - ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds ); + ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds_list ); } else 
{ @@ -568,7 +682,7 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp /* update i-j distance - check if j is within cutoff */ for ( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + nbr_pj = &( far_nbrs_list.select.far_nbr_list[pj] ); j = nbr_pj->nbr; atom_j = &(my_atoms[j]); @@ -641,13 +755,13 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n ) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + hbonds_list.select.hbond_list[ihb_top].nbr = j; + hbonds_list.select.hbond_list[ihb_top].scl = -1; + hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj; //CUDA SPECIFIC - hbonds.select.hbond_list[ihb_top].sym_index = -1; - rvec_MakeZero( hbonds.select.hbond_list[ihb_top].hb_f ); + hbonds_list.select.hbond_list[ihb_top].sym_index = -1; + rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f ); ++ihb_top; } @@ -727,21 +841,21 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp * atom j: H bonding atom */ if ( ihb == H_ATOM && jhb == H_BONDING_ATOM ) { - hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds_list.select.hbond_list[ihb_top].nbr = j; if ( i < j ) { - hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds_list.select.hbond_list[ihb_top].scl = 1; } else { - hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds_list.select.hbond_list[ihb_top].scl = -1; } - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj; //CUDA SPECIFIC - hbonds.select.hbond_list[ihb_top].sym_index = -1; - rvec_MakeZero( hbonds.select.hbond_list[ihb_top].hb_f ); + hbonds_list.select.hbond_list[ihb_top].sym_index = -1; + rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f ); ++ihb_top; } @@ -750,13 +864,13 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n ) { //jhb_top = End_Index( atom_j->Hindex, hbonds ); - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + hbonds_list.select.hbond_list[ihb_top].nbr = j; + hbonds_list.select.hbond_list[ihb_top].scl = -1; + hbonds_list.select.hbond_list[ihb_top].ptr = nbr_pj; //CUDA SPECIFIC - hbonds.select.hbond_list[ihb_top].sym_index = -1; - rvec_MakeZero( hbonds.select.hbond_list[ihb_top].hb_f ); + hbonds_list.select.hbond_list[ihb_top].sym_index = -1; + rvec_MakeZero( hbonds_list.select.hbond_list[ihb_top].hb_f ); ++ihb_top; } @@ -765,11 +879,10 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp /* uncorrected bond orders */ if ( nbr_pj->d <= control->bond_cut && - Dev_BOp( bonds, control->bo_cut, i, btop_i, nbr_pj, + Dev_BOp( bonds_list, control->bo_cut, i, btop_i, nbr_pj, sbp_i, sbp_j, twbp, workspace.dDeltap_self, workspace.total_bond_order ) == TRUE ) { - //num_bonds += 2; ++btop_i; /* TODO: Need to do later... 
since i and j are parallel */ @@ -785,15 +898,40 @@ CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp } } - Dev_Set_End_Index( i, btop_i, &bonds ); + Dev_Set_End_Index( i, btop_i, &bonds_list ); H->end[i] = Htop; // if( local == TRUE ) // { if ( control->hbond_cut > 0.0 && ihb_top > 0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) ) { - Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds ); + Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds_list ); } // } + + my_bonds = btop_i - Dev_Start_Index( i, &bonds_list ); + my_hbonds = ihb_top - Dev_Start_Index( atom_i->Hindex, &hbonds_list ); + my_cm_entries = Htop - H->start[i]; + + /* copy (h)bond info to atom structure + * (needed for atom ownership transfer via MPI) */ + my_atoms[i].num_bonds = my_bonds; + my_atoms[i].num_hbonds = my_hbonds; + + /* reallocation checks */ + if ( my_bonds > max_bonds[i] ) + { + *realloc_bonds = TRUE; + } + + if ( my_hbonds > max_hbonds[i] ) + { + *realloc_hbonds = TRUE; + } + + if ( my_cm_entries > max_cm_entries[i] ) + { + *realloc_cm_entries = TRUE; + } } @@ -859,17 +997,21 @@ CUDA_GLOBAL void New_fix_sym_hbond_indices( reax_atom *my_atoms, reax_list hbond int nbr, nbrstart, nbrend; int start, end; hbond_data *ihbond, *jhbond; - int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + int __THREADS_PER_ATOM__; + int thread_id; + int warp_id; + int lane_id; + + __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM; + thread_id = blockIdx.x * blockDim.x + threadIdx.x; + warp_id = thread_id / __THREADS_PER_ATOM__; if ( warp_id > N ) { return; } + lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); i = warp_id; start = Dev_Start_Index( my_atoms[i].Hindex, &hbonds ); end = Dev_End_Index( my_atoms[i].Hindex, &hbonds ); @@ -932,38 +1074,6 @@ CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n ) } -int Cuda_Validate_Lists( reax_system *system, storage *workspace, - reax_list **lists, control_params *control, - int step, int numH ) -{ - int blocks; - int ret; - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - ret = SUCCESS; - - k_update_bonds <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, *(*lists + BONDS), system->n ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - if ( control->hbond_cut > 0.0 && system->numH > 0 ) - { - k_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, *(*lists + HBONDS), system->n ); - cudaThreadSynchronize( ); - cudaCheckError( ); - } - - /* 3bodies list: since a more accurate estimate of the num. 
- * of three body interactions requires that bond orders have - * been computed, delay validation until for computation */ - - return ret; -} - - CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n ) { int i; @@ -997,23 +1107,69 @@ static void Print_Forces( reax_system *system ) } +CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list p_hbonds, int n, int rank, int step ) +{ + int i, k, pj, start, end; + reax_list *hbonds; + hbond_data *hbond_list, *hbond_jk; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) + { + return; + } + + hbonds = &( p_hbonds ); + start = Dev_Start_Index( my_atoms[i].Hindex, hbonds ); + end = Dev_End_Index( my_atoms[i].Hindex, hbonds ); + hbond_list = hbonds->select.hbond_list; + + for ( pj = start; pj < end; ++pj ) + { + k = hbond_list[pj].nbr; + hbond_jk = &( hbond_list[pj] ); + + printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n", + rank, step, my_atoms[i].Hindex, k, + hbond_jk->hb_f[0], + hbond_jk->hb_f[1], + hbond_jk->hb_f[2] ); + } +} + + +static void Print_HBonds( reax_system *system, int step ) +{ + int blocks; + + blocks = (system->n) / DEF_BLOCK_SIZE + + (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1); + + k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, *(*dev_lists + HBONDS), system->n, system->my_rank, step ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, reax_list bonds, real *total_bond_order, int N ) { - int i, j, pj; + int i, pj; +// int j; int start_i, end_i; - int type_i, type_j; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; +// far_neighbor_data *nbr_pj; +// reax_atom *atom_i, *atom_j; i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) + if ( i >= N ) { return; } - atom_i = &(my_atoms[i]); +// atom_i = &(my_atoms[i]); start_i = Dev_Start_Index(i, &far_nbrs); end_i = Dev_End_Index(i, &far_nbrs); @@ -1068,51 +1224,75 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { - int i, ret, Htop; + int ret, ret_bonds, ret_hbonds, ret_cm; int blocks, hblocks; - ret = Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step ); - - if ( ret == SUCCESS ) - { -// /* init the workspace (bond_mark) */ -// cuda_memset( dev_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" ); + /* init the workspace (bond_mark) */ +// cuda_memset( dev_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" ); // -// blocks = (system->N - system->n) / DEF_BLOCK_SIZE + -// (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1); -// k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>> -// ( system->n, (system->N - system->n), dev_workspace->bond_mark ); -// cudaThreadSynchronize( ); -// cudaCheckError( ); +// blocks = (system->N - system->n) / DEF_BLOCK_SIZE + +// (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1); +// k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>> +// ( system->n, (system->N - system->n), dev_workspace->bond_mark ); +// cudaThreadSynchronize( ); +// cudaCheckError( ); + + /* main kernel */ + blocks = (system->N) / DEF_BLOCK_SIZE + + (((system->N % DEF_BLOCK_SIZE) == 0) ? 
0 : 1); + +// k_init_bond_orders <<< blocks, DEF_BLOCK_SIZE >>> +// ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), +// dev_workspace->total_bond_order, system->N ); +// cudaThreadSynchronize( ); +// cudaCheckError( ); +// +// k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>> +// ( system->d_my_atoms, system->reax_param.d_sbp, +// (control_params *)control->d_control_params, +// *(*dev_lists + HBONDS), system->N ); +// cudaThreadSynchronize( ); +// cudaCheckError( ); + + /* reset reallocation flags on device */ + cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), + "Cuda_Init_Forces::d_realloc_bonds" ); + cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), + "Cuda_Init_Forces::d_realloc_hbonds" ); + cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), + "Cuda_Init_Forces::d_realloc_cm_entries" ); - /* main kernel */ - blocks = (system->N) / DEF_BLOCK_SIZE + - (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1); + k_init_forces <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, + system->reax_param.d_tbp, *dev_workspace, + (control_params *)control->d_control_params, + *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), + *(*dev_lists + HBONDS), d_LR, system->n, + system->N, system->reax_param.num_atom_types, + (((data->step-data->prev_steps) % control->reneighbor) == 0), + system->d_cm_entries, system->d_max_cm_entries, system->d_realloc_cm_entries, + system->d_bonds, system->d_max_bonds, system->d_realloc_bonds, + system->d_hbonds, system->d_max_hbonds, system->d_realloc_hbonds ); + cudaThreadSynchronize( ); + cudaCheckError( ); -// k_init_bond_orders <<< blocks, DEF_BLOCK_SIZE >>> -// ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), -// dev_workspace->total_bond_order, system->N ); -// cudaThreadSynchronize( ); -// cudaCheckError( ); + /* check reallocation flags on device */ + copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" ); + copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" ); + copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" ); -// k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>> -// ( system->d_my_atoms, system->reax_param.d_sbp, -// (control_params *)control->d_control_params, -// *(*dev_lists + HBONDS), system->N ); -// cudaThreadSynchronize( ); -// cudaCheckError( ); + ret = (ret_bonds == FALSE && ret_hbonds == FALSE && ret_cm == FALSE) ? 
SUCCESS : FAILURE; - k_init_forces <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, - system->reax_param.d_tbp, *dev_workspace, - (control_params *)control->d_control_params, - *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), - *(*dev_lists + HBONDS), d_LR, system->n, - system->N, system->reax_param.num_atom_types, - (((data->step-data->prev_steps) % control->reneighbor) == 0) ); - cudaThreadSynchronize( ); - cudaCheckError( ); +#if defined(DEBUG_FOCUS) + fprintf( stderr, "[INFO] p%d, step %d: ret = %d, ret_bonds = %d, ret_hbonds = %d, ret_cm = %d\n", + system->my_rank, data->step, ret, ret_bonds, ret_hbonds, ret_cm ); +#endif + if ( ret == SUCCESS ) + { /* fix sym_index and dbond_index */ New_fix_sym_dbond_indices <<< blocks, BLOCK_SIZE >>> ( *(*dev_lists + BONDS), system->N ); @@ -1137,10 +1317,15 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, // ( *(*dev_lists + BONDS), *dev_workspace, system->N ); // cudaThreadSynchronize( ); // cudaCheckError( ); + } + else + { + Cuda_Estimate_Storages( system, control, dev_lists, + ret_bonds, ret_hbonds, ret_cm, data->step ); - /* validate lists */ - ret = Cuda_Validate_Lists( system, workspace, dev_lists, control, - data->step, system->numH ); + dev_workspace->realloc.bonds = ret_bonds; + dev_workspace->realloc.hbonds = ret_hbonds; + dev_workspace->realloc.cm = ret_cm; } return ret; @@ -1160,12 +1345,14 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { - int i, hbs, hnbrs_blocks, update_energy, ret; + int hbs, hnbrs_blocks, update_energy, ret; int *thbody; static int compute_bonded_part1 = FALSE; - real t_start, t_elapsed; real *spad = (real *) scratch; rvec *rvec_spad; +#if defined(DEBUG) + real t_start, t_elapsed; +#endif update_energy = (out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE; @@ -1174,9 +1361,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, if ( compute_bonded_part1 == FALSE ) { /* 1. Bond Order Interactions */ +#if defined(DEBUG) t_start = Get_Time( ); -#if defined(DEBUG) fprintf( stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS_N, BLOCK_SIZE ); #endif @@ -1206,16 +1393,19 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); +#if defined(DEBUG) t_elapsed = Get_Timing_Info( t_start ); -#if defined(DEBUG) fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError( ), t_elapsed ); fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" ); #endif /* 2. Bond Energy Interactions */ +#if defined(DEBUG) t_start = Get_Time( ); +#endif + cuda_memset( spad, 0, system->N * (2 * sizeof(real)) , "scratch" ); Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof(real)* BLOCK_SIZE >>> @@ -1232,16 +1422,19 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, system->n ); } +#if defined(DEBUG) t_elapsed = Get_Timing_Info( t_start ); -#if defined(DEBUG) fprintf( stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError( ), t_elapsed ); fprintf( stderr, "Cuda_Bond_Energy Done... \n" ); #endif /* 3. 
Atom Energy Interactions */ +#if defined(DEBUG) t_start = Get_Time( ); +#endif + cuda_memset( spad, 0, ( 6 * sizeof(real) * system->n ), "scratch" ); Cuda_Atom_Energy <<< BLOCKS, BLOCK_SIZE >>> @@ -1282,9 +1475,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, system->n ); } +#if defined(DEBUG) t_elapsed = Get_Timing_Info( t_start ); -#if defined(DEBUG) fprintf( stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError( ), t_elapsed ); fprintf( stderr, "test_LonePair_postprocess Done... \n"); @@ -1294,7 +1487,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, } /* 4. Valence Angles Interactions */ +#if defined(DEBUG) t_start = Get_Time( ); +#endif thbody = (int *) scratch; ret = Cuda_Estimate_Storage_Three_Body( system, control, data->step, @@ -1310,9 +1505,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, if ( ret == SUCCESS ) { - Cuda_Init_Three_Body_Indices( thbody, system->total_bonds ); + Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices ); cuda_memset( spad, 0, 6 * sizeof(real) * system->N + sizeof(rvec) * system->N * 2, "scratch" ); + Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_thbp, @@ -1367,19 +1563,22 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); +#if defined(DEBUG) t_elapsed = Get_Timing_Info( t_start ); -#if defined(DEBUG) fprintf( stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed ); fprintf( stderr, "Three_Body_Interactions Done... \n" ); #endif /* 5. Torsion Angles Interactions */ +#if defined(DEBUG) t_start = Get_Time( ); - cuda_memset( spad, 0, - 4 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, +#endif + + cuda_memset( spad, 0, 4 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" ); + Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp, (control_params *) control->d_control_params, *(*dev_lists + BONDS), @@ -1426,9 +1625,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); +#if defined(DEBUG) t_elapsed = Get_Timing_Info( t_start ); -#if defined(DEBUG) fprintf( stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError( ), t_elapsed ); fprintf( stderr, " Four_Body_ Done... \n"); @@ -1437,7 +1636,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, /* 6. 
Hydrogen Bonds Interactions */ if ( control->hbond_cut > 0.0 && system->numH > 0 ) { +#if defined(DEBUG) t_start = Get_Time( ); +#endif + cuda_memset( spad, 0, 2 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" ); @@ -1452,10 +1654,15 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, (control_params *) control->d_control_params, *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + HBONDS), system->n, system->reax_param.num_atom_types, - spad, (rvec *) (spad + 2 * system->n) ); + spad, (rvec *) (spad + 2 * system->n), system->my_rank, data->step ); cudaThreadSynchronize( ); cudaCheckError( ); +// if ( data->step == 10 ) +// { +// Print_HBonds( system, data->step ); +// } + /* reduction for E_HB */ if ( update_energy == TRUE ) { @@ -1498,9 +1705,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaThreadSynchronize( ); cudaCheckError( ); +#if defined(DEBUG) t_elapsed = Get_Timing_Info( t_start ); -#if defined(DEBUG) fprintf( stderr, "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError( ), t_elapsed ); @@ -1564,6 +1771,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, output_controls *out_control, mpi_datatypes *mpi_data ) { int charge_flag, retVal; + static int init_forces_done = FALSE; #if defined(LOG_PERFORMANCE) real t_start = 0; @@ -1587,13 +1795,21 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, charge_flag = FALSE; } - if ( charge_flag == TRUE ) - { - retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control ); - } - else + if ( init_forces_done == FALSE ) { - retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control ); + if ( charge_flag == TRUE ) + { + retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control ); + } + else + { + retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control ); + } + + if ( retVal == SUCCESS ) + { + init_forces_done = TRUE; + } } if ( retVal == SUCCESS ) @@ -1647,6 +1863,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step); MPI_Barrier( MPI_COMM_WORLD ); #endif + #endif //PURE_REAX /********* nonbonded interactions ************/ @@ -1660,6 +1877,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, Update_Timing_Info( &t_start, &(data->timing.nonb) ); } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n", system->my_rank, data->step ); @@ -1676,6 +1894,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, Update_Timing_Info( &t_start, &(data->timing.bonded) ); } #endif + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: total forces computed\n", system->my_rank, data->step ); @@ -1683,6 +1902,8 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, MPI_Barrier( MPI_COMM_WORLD ); #endif + + init_forces_done = FALSE; } return retVal; diff --git a/PG-PuReMD/src/cuda/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h index b618fa29..94d3b73f 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.h +++ b/PG-PuReMD/src/cuda/cuda_forces.h @@ -10,8 +10,16 @@ extern "C" { #endif -int Cuda_Estimate_Storages( reax_system *, control_params *, reax_list **, - sparse_matrix *, int ); +void Cuda_Init_HBond_Indices( reax_system * ); + +void Cuda_Init_Bond_Indices( reax_system * ); + +void 
Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * ); + +void Cuda_Init_Three_Body_Indices( int *, int ); + +void Cuda_Estimate_Storages( reax_system *, control_params *, reax_list **, + int, int, int, int ); int Cuda_Estimate_Storage_Three_Body( reax_system *, control_params *, int, reax_list **, int *, int * ); @@ -22,9 +30,6 @@ int Cuda_Init_Forces( reax_system *, control_params *, simulation_data *, int Cuda_Init_Forces_No_Charges( reax_system *, control_params *, simulation_data *, storage *, reax_list **, output_controls * ); -int Cuda_Validate_Lists( reax_system *, storage *, reax_list **, control_params *, - int, int, int, int ); - int Cuda_Compute_Bonded_Forces( reax_system *, control_params *, simulation_data *, storage *, reax_list **, output_controls * ); diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index 18cdbb57..9043e424 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -33,7 +33,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameters *sbp, hbond_parameters *d_hbp, global_parameters gp, control_params *control, storage p_workspace, reax_list p_bonds, reax_list p_hbonds, int n, - int num_atom_types, real *data_e_hb, rvec *data_ext_press ) + int num_atom_types, real *data_e_hb, rvec *data_ext_press, int rank, int step ) { int i, j, k, pi, pk; int type_i, type_j, type_k; @@ -41,7 +41,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter int hblist[MAX_BONDS]; int itr, top; ivec rel_jk; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; rvec dvec_jk, force, ext_press; @@ -53,9 +53,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter bond_data *bond_list; hbond_data *hbond_list, *hbond_jk; storage *workspace; -#if defined(DEBUG) - int num_hb_intrs; -#endif j = blockIdx.x * blockDim.x + threadIdx.x; @@ -69,15 +66,12 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter hbonds = &( p_hbonds ); hbond_list = hbonds->select.hbond_list; workspace = &( p_workspace ); -#if defined(DEBUG) - num_hb_intrs = 0; -#endif /* loops below discover the Hydrogen bonds between i-j-k triplets. * here j is H atom and there has to be some bond between i and j. * Hydrogen bond is between j and k. * so in this function i->X, j->H, k->Z when we map - * variables onto the ones in the handout.*/ + * variables onto the ones in the handout. 
*/ //for( j = 0; j < system->n; ++j ) if ( sbp[ my_atoms[j].type ].p_hbond == H_ATOM ) { @@ -89,14 +83,14 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter top = 0; /* search bonded atoms to atom j (i.e., hydrogen atom) for potential hydrogen bonding */ - for( pi = start_j; pi < end_j; ++pi ) + for ( pi = start_j; pi < end_j; ++pi ) { pbond_ij = &( bond_list[pi] ); i = pbond_ij->nbr; bo_ij = &(pbond_ij->bo_data); type_i = my_atoms[i].type; - if( sbp[type_i].p_hbond == H_BONDING_ATOM && + if ( sbp[type_i].p_hbond == H_BONDING_ATOM && bo_ij->BO >= HB_THRESHOLD ) { hblist[top++] = pi; @@ -130,12 +124,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter { bo_ij = &(pbond_ij->bo_data); type_i = my_atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]); - -#if defined(DEBUG) - ++num_hb_intrs; -#endif + hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &theta, &cos_theta ); @@ -144,15 +133,30 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk ); +// if ( j == 0 && k == 36 && step == 10 && rank == 0 ) +// { +// printf( "[0] p%05d, top = %d, itr = %d, MAX_BONDS = %d\n", rank, top, itr, MAX_BONDS ); +// printf( "[1] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k, theta, cos_theta, dcos_theta_di, +// dcos_theta_dj, dcos_theta_dk ); +// } + /* hydrogen bond energy */ sin_theta2 = SIN( theta / 2.0 ); - sin_xhz4 = SQR(sin_theta2); + sin_xhz4 = SQR( sin_theta2 ); sin_xhz4 *= sin_xhz4; cos_xhz1 = ( 1.0 - cos_theta ); exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + r_jk / hbp->r0_hb - 2.0 ) ); +// if ( j == 0 && k == 36 && step == 10 && rank == 0 ) +// { +// printf( "[2] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k, +// sin_theta2, sin_xhz4, cos_xhz1, exp_hb2, exp_hb3 ); +// printf( "[3] p%05d %05d, %05d: %12.5f %12.5f %12.5f\n", rank, j, k, +// hbp->p_hb3, hbp->r0_hb, r_jk ); +// } + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; data_e_hb[j] += e_hb; @@ -161,6 +165,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter CEhb3 = -hbp->p_hb3 * (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb; +// if ( j == 0 && k == 36 && step == 10 && rank == 0 ) +// printf( "[4] p%05d %05d, %05d: %12.5f %12.5f %12.5f %12.5f %12.5f\n", rank, j, k, +// e_hb, data_e_hb[j], CEhb1, CEhb2, CEhb3 ); + /*fprintf( stdout, "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, @@ -173,7 +181,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter if ( control->virial == 0 ) { - // dcos terms + /* dcos terms */ //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); //atomic_rvecScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); @@ -184,7 +192,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameter //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk ); - // dr terms + /* dr terms */ rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); @@ -209,7 +217,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom 
*my_atoms, single_body_parameter rvec_Add( hbond_jk->hb_f, force ); rvec_iMultiply( ext_press, rel_jk, force ); rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); - // dr terms + /* dr terms */ rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); rvec_Scale( force, CEhb3 / r_jk, dvec_jk ); @@ -271,7 +279,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame real *sh_hb = t_hb; real *sh_cdbo = t_hb + blockDim.x; rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x); - rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); + rvec *sh_hf = (rvec *)(sh_atomf + blockDim.x); #endif int __THREADS_PER_ATOM__, thread_id, group_id, lane_id; int i, j, k, pi, pk; @@ -282,7 +290,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame int itr, top; int loopcount, count; ivec rel_jk; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; rvec dvec_jk, force, ext_press; @@ -294,9 +302,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame bond_data *bond_list; hbond_data *hbond_list, *hbond_jk; storage *workspace; -#if defined(DEBUG) - int num_hb_intrs; -#endif __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM; thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -314,9 +319,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame hbonds = &( p_hbonds ); hbond_list = hbonds->select.hbond_list; j = group_id; -#if defined(DEBUG) - num_hb_intrs = 0; -#endif /* loops below discover the Hydrogen bonds between i-j-k triplets. here j is H atom and there has to be some bond between i and j. 
@@ -404,13 +406,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *my_atoms, single_body_parame { bo_ij = &(pbond_ij->bo_data); type_i = my_atoms[i].type; - r_ij = pbond_ij->d; hbp = &(d_hbp[ index_hbp(type_i,type_j,type_k,num_atom_types) ]); -#if defined(DEBUG) - ++num_hb_intrs; -#endif - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &theta, &cos_theta ); /* the derivative of cos(theta) */ @@ -645,7 +642,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms, #else extern __shared__ rvec __f[]; #endif - int i, pj,j; + int i, pj; int start, end; storage *workspace; hbond_data *nbr_pj, *sym_index_nbr; @@ -667,7 +664,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms, while ( pj < end ) { nbr_pj = &( hbonds->select.hbond_list[pj] ); - j = nbr_pj->nbr; sym_index_nbr = &(hbonds->select.hbond_list[ nbr_pj->sym_index ]); @@ -743,7 +739,7 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms, #else extern __shared__ rvec __f[]; #endif - int i, pj,j; + int i, pj; int start, end; storage *workspace; hbond_data *nbr_pj, *sym_index_nbr; @@ -778,7 +774,6 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms, while ( pj < end ) { nbr_pj = &(hbonds->select.hbond_list[pj]); - j = nbr_pj->nbr; sym_index_nbr = &(hbonds->select.hbond_list[ nbr_pj->sym_index ]); #if defined(__SM_35__) diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h index 606196b4..aa09d2f7 100644 --- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h @@ -25,24 +25,24 @@ #include "../reax_types.h" -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *, - storage, reax_list ); - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *, - storage, reax_list, int ); - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *, - storage, reax_list, int ); - CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *, single_body_parameters *, hbond_parameters *, global_parameters, control_params *, storage , - reax_list, reax_list, int, int, real *, rvec * ); + reax_list, reax_list, int, int, real *, rvec *, int, int ); CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *, single_body_parameters *, hbond_parameters *, global_parameters , control_params *, storage, reax_list, reax_list, int, int, real *, rvec * ); +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *, + storage, reax_list, int ); + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *, + storage, reax_list ); + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *, + storage, reax_list, int ); + #endif diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index 3162e732..b115f287 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -56,8 +56,7 @@ int Cuda_Init_System( reax_system *system, control_params *control, simulation_data *data, storage *workspace, mpi_datatypes *mpi_data, char *msg ) { - int i, ret; - reax_atom *atom; + int i; int nrecv[MAX_NBRS]; Setup_New_Grid( system, control, MPI_COMM_WORLD ); @@ -111,7 +110,7 @@ int Cuda_Init_System( reax_system *system, control_params *control, /* initialize velocities so that desired init T can be attained */ if ( !control->restart || (control->restart && control->random_vel) ) { - Generate_Initial_Velocities( system, control->T_init ); + Cuda_Generate_Initial_Velocities( system, control->T_init ); } Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); @@ -228,15 +227,11 @@ 
void Cuda_Init_Workspace( reax_system *system, control_params *control, } -int Cuda_Init_Lists( reax_system *system, control_params *control, +void Cuda_Init_Lists( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, mpi_datatypes *mpi_data, char *msg ) { - int ret; - int Htop; - - /* ignore returned error, as system->d_max_far_nbrs was not valid */ - ret = Cuda_Estimate_Neighbors( system, data->step ); + Cuda_Estimate_Neighbors( system ); Dev_Make_List( system->total_cap, system->total_far_nbrs, TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS ); @@ -253,27 +248,21 @@ int Cuda_Init_Lists( reax_system *system, control_params *control, Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists ); /* estimate storage for bonds, hbonds, and sparse matrix */ - Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step ); + Cuda_Estimate_Storages( system, control, dev_lists, + TRUE, TRUE, TRUE, data->step ); dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries ); Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) ); - //MATRIX CHANGES - //workspace->L = NULL; - //workspace->U = NULL; - #if defined(DEBUG_FOCUS) - fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n", - system->my_rank, system->total_cm_entries, dev_workspace->H.m ); - fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n", - system->my_rank, Htop, - (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) ); + fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, space=%dMB\n", + system->my_rank, system->total_cm_entries, + (int)(system->total_cm_entries * sizeof(sparse_matrix_entry) / (1024 * 1024)) ); #endif if ( control->hbond_cut > 0.0 && system->numH > 0 ) { Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS ); - Cuda_Init_HBond_Indices( system ); #if defined(DEBUG_FOCUS) @@ -285,7 +274,6 @@ int Cuda_Init_Lists( reax_system *system, control_params *control, /* bonds list */ Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS ); - Cuda_Init_Bond_Indices( system ); #if defined(DEBUG_FOCUS) @@ -295,10 +283,8 @@ int Cuda_Init_Lists( reax_system *system, control_params *control, #endif /* 3bodies list: since a more accurate estimate of the num. - * of three body interactions requires that bond orders have + * three body interactions requires that bond orders have * been computed, delay estimation until for computation */ - - return SUCCESS; } @@ -308,7 +294,6 @@ void Cuda_Initialize( reax_system *system, control_params *control, mpi_datatypes *mpi_data ) { char msg[MAX_STR]; - real t_start, t_end; /* HOST/DEVICE SCRATCH */ Cuda_Init_ScratchArea( ); @@ -348,20 +333,11 @@ void Cuda_Initialize( reax_system *system, control_params *control, fprintf( stderr, "p%d: initialized workspace\n", system->my_rank ); #endif - //Sync the taper here from host to device. - /* CONTROL */ dev_alloc_control( control ); /* LISTS */ - if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) == - FAILURE ) - { - fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); - fprintf( stderr, "p%d: system could not be initialized! 
terminating.\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); - } + Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ); #if defined(DEBUG) fprintf( stderr, "p%d: initialized lists\n", system->my_rank ); diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu index 77203263..e1de784a 100644 --- a/PG-PuReMD/src/cuda/cuda_integrate.cu +++ b/PG-PuReMD/src/cuda/cuda_integrate.cu @@ -284,7 +284,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control, output_controls *out_control, mpi_datatypes *mpi_data ) { int steps, renbr, ret; - static int verlet_part1_done = FALSE, estimate_nbrs_done = 0; + static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; real dt; #if defined(DEBUG) real t_over_start, t_over_elapsed; @@ -300,11 +300,10 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control, renbr = steps % control->reneighbor == 0 ? TRUE : FALSE; ret = SUCCESS; - Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - if ( verlet_part1_done == FALSE ) { update_velocity_part1( system, dt ); + verlet_part1_done = TRUE; #if defined(DEBUG_FOCUS) @@ -321,43 +320,38 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control, Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr ); Sync_Atoms( system ); - /* synch the Grid to the Device here */ + /* sync grid to device */ Sync_Grid( &system->my_grid, &system->d_my_grid ); init_blocks( system ); - -#if defined(__CUDA_DEBUG_LOG__) - fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n", - system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); -#endif } + Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); + Cuda_Reset( system, control, data, workspace, lists ); - if ( renbr ) + if ( renbr && far_nbrs_done == FALSE ) { #if defined(DEBUG) t_over_start = Get_Time( ); #endif - if ( estimate_nbrs_done == 0 ) + ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + + if ( ret != SUCCESS ) { - //TODO: move far_nbrs reallocation checks outside of renbr frequency check - ret = Cuda_Estimate_Neighbors( system, data->step ); - estimate_nbrs_done = 1; + Cuda_Estimate_Neighbors( system ); } - - if ( ret == SUCCESS && estimate_nbrs_done == 1 ) + if ( ret == SUCCESS ) { - Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); - estimate_nbrs_done = 2; + far_nbrs_done = TRUE; + } #if defined(DEBUG) - t_over_elapsed = Get_Timing_Info( t_over_start ); - fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", - system->my_rank, data->step, t_over_elapsed ); + t_over_elapsed = Get_Timing_Info( t_over_start ); + fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", + system->my_rank, data->step, t_over_elapsed ); #endif - } } if ( ret == SUCCESS ) @@ -371,7 +365,7 @@ int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control, update_velocity_part2( system, dt ); verlet_part1_done = FALSE; - estimate_nbrs_done = 0; + far_nbrs_done = FALSE; } #if defined(DEBUG_FOCUS) @@ -389,7 +383,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system, { int itr, steps, renbr, ret; real *d_my_ekin, *d_total_my_ekin; - static int verlet_part1_done = FALSE, estimate_nbrs_done = 0; + static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; real dt, dt_sqr; real my_ekin, new_ekin; real G_xi_new, v_xi_new, v_xi_old; @@ -406,8 +400,6 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system, steps = data->step - 
data->prev_steps; renbr = steps % control->reneighbor == 0 ? TRUE : FALSE; - Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - if ( verlet_part1_done == FALSE ) { nhNVT_update_velocity_part1( system, dt ); @@ -431,46 +423,44 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system, Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr ); Sync_Atoms( system ); - /* synch the Grid to the Device here */ + /* sync grid to device */ Sync_Grid( &system->my_grid, &system->d_my_grid ); init_blocks( system ); } + Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); + Cuda_Reset( system, control, data, workspace, lists ); - if ( renbr ) + if ( renbr && far_nbrs_done == FALSE ) { #if defined(DEBUG) t_over_start = Get_Time( ); #endif - if ( estimate_nbrs_done == 0 ) + ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + + if ( ret != SUCCESS ) { - //TODO: move far_nbrs reallocation checks outside of renbr frequency check - ret = Cuda_Estimate_Neighbors( system, data->step ); - fprintf( stderr, "Cuda_Estimate_Neighbors (%d)\n", ret ); - estimate_nbrs_done = 1; + Cuda_Estimate_Neighbors( system ); } - - if ( ret == SUCCESS && estimate_nbrs_done == 1 ) + if ( ret == SUCCESS ) { - Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); - estimate_nbrs_done = 2; - + far_nbrs_done = TRUE; + } + #if defined(DEBUG) - t_over_elapsed = Get_Timing_Info( t_over_start ); - fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", - system->my_rank, data->step, t_over_elapsed ); + t_over_elapsed = Get_Timing_Info( t_over_start ); + fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", + system->my_rank, data->step, t_over_elapsed ); #endif - } } if ( ret == SUCCESS ) { ret = Cuda_Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data ); - fprintf( stderr, "Cuda_Compute_Forces (%d)\n", ret ); } if ( ret == SUCCESS ) @@ -514,7 +504,7 @@ int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system, "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_my_ekin" ); verlet_part1_done = FALSE; - estimate_nbrs_done = 0; + far_nbrs_done = FALSE; } #if defined(DEBUG_FOCUS) @@ -534,7 +524,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con output_controls *out_control, mpi_datatypes *mpi_data ) { int steps, renbr, ret; - static int verlet_part1_done = FALSE, estimate_nbrs_done = 0; + static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; real dt, lambda; #if defined(DEBUG) real t_over_start, t_over_elapsed; @@ -550,12 +540,11 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con renbr = steps % control->reneighbor == 0 ? 
TRUE : FALSE; ret = SUCCESS; - Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - if ( verlet_part1_done == FALSE ) { /* velocity verlet, 1st part */ update_velocity_part1( system, dt ); + verlet_part1_done = TRUE; #if defined(DEBUG_FOCUS) @@ -563,6 +552,8 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con MPI_Barrier( MPI_COMM_WORLD ); #endif + Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); + if ( renbr ) { Update_Grid( system, control, mpi_data->world ); @@ -576,39 +567,38 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con Sync_Grid( &system->my_grid, &system->d_my_grid ); init_blocks( system ); - -#if defined(__CUDA_DEBUG_LOG__) - fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n", - system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); -#endif + + Cuda_Reset( system, control, data, workspace, lists ); } + else + { + Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - Cuda_Reset( system, control, data, workspace, lists ); + Cuda_Reset( system, control, data, workspace, lists ); + } - if ( renbr ) + if ( renbr && far_nbrs_done == FALSE ) { #if defined(DEBUG) - t_over_start = Get_Time (); + t_over_start = Get_Time( ); #endif - if ( estimate_nbrs_done == 0 ) + ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + + if ( ret != SUCCESS ) { - //TODO: move far_nbrs reallocation checks outside of renbr frequency check - ret = Cuda_Estimate_Neighbors( system, data->step ); - estimate_nbrs_done = 1; + Cuda_Estimate_Neighbors( system ); } - - if ( ret == SUCCESS && estimate_nbrs_done == 1 ) + if ( ret == SUCCESS ) { - Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); - estimate_nbrs_done = 2; - + far_nbrs_done = TRUE; + } + #if defined(DEBUG) - t_over_elapsed = Get_Timing_Info( t_over_start ); - fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", - system->my_rank, data->step, t_over_elapsed ); + t_over_elapsed = Get_Timing_Info( t_over_start ); + fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", + system->my_rank, data->step, t_over_elapsed ); #endif - } } if ( ret == SUCCESS ) @@ -653,7 +643,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con #endif verlet_part1_done = FALSE; - estimate_nbrs_done = 0; + far_nbrs_done = FALSE; } return ret; @@ -668,7 +658,7 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con output_controls *out_control, mpi_datatypes *mpi_data ) { int steps, renbr, ret; - static int verlet_part1_done = FALSE, estimate_nbrs_done = 0; + static int verlet_part1_done = FALSE, far_nbrs_done = FALSE; real dt; #if defined(DEBUG) real t_over_start, t_over_elapsed; @@ -684,11 +674,10 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con renbr = steps % control->reneighbor == 0 ? 
TRUE : FALSE; ret = SUCCESS; - Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - if ( verlet_part1_done == FALSE ) { update_velocity_part1( system, dt ); + verlet_part1_done = TRUE; #if defined(DEBUG_FOCUS) @@ -711,32 +700,32 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con init_blocks( system ); } + Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); + Cuda_Reset( system, control, data, workspace, lists ); - if ( renbr ) + if ( renbr && far_nbrs_done == FALSE ) { #if defined(DEBUG) t_over_start = Get_Time( ); #endif - if ( estimate_nbrs_done == 0 ) + ret = Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + + if ( ret != SUCCESS ) { - //TODO: move far_nbrs reallocation checks outside of renbr frequency check - ret = Cuda_Estimate_Neighbors( system, data->step ); - estimate_nbrs_done = 1; + Cuda_Estimate_Neighbors( system ); } - - if ( ret == SUCCESS && estimate_nbrs_done == 1 ) + if ( ret == SUCCESS ) { - Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); - estimate_nbrs_done = 2; + far_nbrs_done = TRUE; + } #if defined(DEBUG) - t_over_elapsed = Get_Timing_Info( t_over_start ); - fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", - system->my_rank, data->step, t_over_elapsed ); + t_over_elapsed = Get_Timing_Info( t_over_start ); + fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", + system->my_rank, data->step, t_over_elapsed ); #endif - } } if ( ret == SUCCESS ) @@ -749,12 +738,12 @@ int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* con { update_velocity_part2( system, dt ); - verlet_part1_done = FALSE; - estimate_nbrs_done = 0; - Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); Cuda_Compute_Pressure( system, control, data, mpi_data ); Cuda_Scale_Box( system, control, data, mpi_data ); + + verlet_part1_done = FALSE; + far_nbrs_done = FALSE; } #if defined(DEBUG_FOCUS) diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu index a0db127b..d47a0856 100644 --- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu @@ -570,18 +570,18 @@ void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n) void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size ) { - int blocks; +// int blocks; - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); +// blocks = (n / DEF_BLOCK_SIZE) + +// (( n % DEF_BLOCK_SIZE) == 0 ? 
0 : 1); cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" ); - //One thread per row implementation - //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> - // (*H, a, b, n); - //cudaThreadSynchronize (); - //cudaCheckError (); + /* one thread per row implementation */ +// k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> +// ( *H, a, b, n ); +// cudaThreadSynchronize( ); +// cudaCheckError( ); //One warp per row implementation #if defined(__SM_35__) @@ -628,16 +628,16 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, simulation_data *data ) { - int i, j, n, N, matvecs, scale; + int i, n, matvecs, scale; +// int j, N; rvec2 tmp, alpha, beta; rvec2 my_sum, norm_sqr, b_norm, my_dot; rvec2 sig_old, sig_new; MPI_Comm comm; rvec2 *spad = (rvec2 *) host_scratch; - int a; n = system->n; - N = system->N; +// N = system->N; comm = mpi_data->world; matvecs = 0; scale = sizeof(rvec2) / sizeof(void); @@ -679,7 +679,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, //originally we were using only H->n which was system->n (init_md.c) //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap); - Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap); + Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap ); // compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); @@ -693,7 +693,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:q2:get" ); - Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice,"CG:q2:put" ); @@ -730,8 +730,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); //#endif - my_sum[0] = my_sum[1] = 0; - Cuda_Norm (b, n, my_sum); + my_sum[0] = 0; + my_sum[1] = 0; + Cuda_Norm( b, n, my_sum ); // fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); @@ -750,8 +751,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); //#endif - my_dot[0] = my_dot[1] = 0; - Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n); + my_dot[0] = 0; + my_dot[1] = 0; + Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n ); // fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); @@ -841,7 +843,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, alpha[0] = sig_new[0] / tmp[0]; alpha[1] = sig_new[1] / tmp[1]; - my_dot[0] = my_dot[1] = 0; + my_dot[0] = 0; + my_dot[1] = 0; //#ifdef __CUDA_DEBUG__ // for( j = 0; j < system->n; ++j ) { @@ -861,7 +864,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); //#endif - my_dot[0] = my_dot[1] = 0; + my_dot[0] = 0; + my_dot[1] = 0; Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot ); //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); @@ -971,9 +975,10 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real 
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) { - int i, j, scale; + int i, scale; +// int j; real tmp, alpha, beta, b_norm; - real sig_old, sig_new, sig0; + real sig_old, sig_new; real *spad = (real *) host_scratch; scale = sizeof(real) / sizeof(void); @@ -1027,8 +1032,6 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n, mpi_data->world ); - sig0 = sig_new; - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -1036,7 +1039,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real } #endif - for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) + for ( i = 1; i < 1000 && SQRT(sig_new) / b_norm > tol; ++i ) { //MVAPICH2 copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, @@ -1103,11 +1106,5 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real #endif } - if ( i >= 300 ) - { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } - return i; } diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu index cb741571..9a328061 100644 --- a/PG-PuReMD/src/cuda/cuda_multi_body.cu +++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu @@ -41,9 +41,9 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, real exp_ovun2n, exp_ovun6, exp_ovun8; real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; + real p_lp2, p_lp3; real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - single_body_parameters *sbp_i, *sbp_j; + single_body_parameters *sbp_i; two_body_parameters *twbp; bond_data *pbond; bond_order_data *bo_ij; @@ -59,7 +59,6 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, storage *workspace = &( p_workspace ); /* Initialize parameters */ - p_lp1 = gp.l[15]; p_lp3 = gp.l[5]; p_ovun3 = gp.l[32]; p_ovun4 = gp.l[31]; @@ -78,8 +77,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, inv_expvd2 = 1. / (1. + expvd2 ); /* calculate the energy */ - data_elp [i] += e_lp = - p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + data_elp[i] += e_lp; dElp = p_lp2 * inv_expvd2 + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); @@ -117,14 +116,16 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]); bo_ij = &( bonds->select.bond_list[pj].bo_data ); Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + vov3 = bo_ij->BO - Di - 0.040 * POW(Di, 4.); if ( vov3 > 3. ) { - data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0); + e_lph = p_lp3 * SQR( vov3 - 3.0 ); + data_elp[i] += e_lph; - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); + deahu2dbo = 2. * p_lp3 * (vov3 - 3.); + deahu2dsbo = 2. * p_lp3 * (vov3 - 3.) * + (-1. 
- 0.16 * POW(Di, 3.)); bo_ij->Cdbo += deahu2dbo; workspace->CdDelta[i] += deahu2dsbo; @@ -167,18 +168,11 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, j = bonds->select.bond_list[pj].nbr; type_j = my_atoms[j].type; bo_ij = &(bonds->select.bond_list[pj].bo_data); - sbp_j = &(sbp[ type_j ]); twbp = &(tbp[ index_tbp(type_i, type_j, num_atom_types )]); sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - - /*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n", - i+1, j+1, - dfvl * workspace->Delta_lp_temp[j], - sbp_j->nlp_opt, - workspace->nlp_temp[j] );*/ } exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); @@ -192,7 +186,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8); CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - data_eov [i] += e_ov = sum_ovun1 * CEover1; + e_ov = sum_ovun1 * CEover1; + data_eov[i] += e_ov; CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 )); @@ -212,8 +207,8 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); - data_eun [i] += e_un = - -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + data_eun[i] += e_un; CEunder1 = inv_exp_ovun2n * ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 + @@ -330,8 +325,9 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list p_bonds, storage p_workspace, int n ) { - int i,pj; - bond_data *pbond, *sbond; + int i, pj; + bond_data *sbond; +// bond_data *pbond; bond_data *sym_index_bond; reax_list *bonds; storage *workspace; @@ -348,11 +344,9 @@ CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list p_bonds, for ( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) { - /* - pbond = &(bonds->select.bond_list[pj]); - dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] ); - workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta; - */ +// pbond = &(bonds->select.bond_list[pj]); +// dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] ); +// workspace->CdDelta[i] += dbond_index_bond->ae_CdDelta; sbond = &(bonds->select.bond_list[pj]); sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu index f9c127d6..1ba30fa0 100644 --- a/PG-PuReMD/src/cuda/cuda_neighbors.cu +++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu @@ -49,13 +49,14 @@ CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) /* Generate far neighbor lists by scanning the atoms list and applying cutoffs */ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, - simulation_box my_ext_box, grid g, reax_list far_nbrs, int n, int N ) + simulation_box my_ext_box, grid g, reax_list far_nbrs_list, int n, int N, + int *far_nbrs, int *max_far_nbrs, int *realloc_far_nbrs ) { - int i, j, k, l, m, itr, num_far; + int i, j, k, l, m, itr, num_far, my_num_far; real d, cutoff; ivec c, nbrs_x; rvec dvec; - far_neighbor_data *nbr_data;//, *my_start; + far_neighbor_data *nbr_data; reax_atom *atom1, *atom2; l = blockIdx.x * blockDim.x + threadIdx.x; @@ -65,8 +66,8 @@ CUDA_GLOBAL void 
k_generate_neighbor_lists( reax_atom *my_atoms, return; } - atom1 = &(my_atoms[l]); - num_far = Dev_Start_Index( l, &far_nbrs ); + atom1 = &( my_atoms[l] ); + num_far = Dev_Start_Index( l, &far_nbrs_list ); /* get the coordinates of the atom and compute the grid cell */ if ( l < n ) @@ -132,7 +133,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, if ( d <= cutoff ) { /* commit far neighbor to list */ - nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); + nbr_data = &(far_nbrs_list.select.far_nbr_list[num_far]); nbr_data->nbr = m; nbr_data->d = SQRT( d ); rvec_Copy( nbr_data->dvec, dvec ); @@ -176,7 +177,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, if ( d <= cutoff ) { /* commit far neighbor to list */ - nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); + nbr_data = &(far_nbrs_list.select.far_nbr_list[num_far]); nbr_data->nbr = m; nbr_data->d = SQRT( d ); rvec_Copy( nbr_data->dvec, dvec ); @@ -193,7 +194,14 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, ++itr; } - Dev_Set_End_Index( l, num_far, &far_nbrs ); + Dev_Set_End_Index( l, num_far, &far_nbrs_list ); + + /* reallocation check */ + my_num_far = num_far - Dev_Start_Index( l, &far_nbrs_list ); + if ( my_num_far > max_far_nbrs[l] ) + { + *realloc_far_nbrs = TRUE; + } } @@ -421,10 +429,10 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms, } -void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, +int Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, storage *workspace, reax_list **lists ) { - int i, blocks; + int blocks, ret, ret_far_nbr; #if defined(LOG_PERFORMANCE) real t_start = 0, t_elapsed = 0; @@ -434,12 +442,20 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, } #endif + /* reset reallocation flag on device */ + /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment + * to the provided literal */ + cuda_memset( system->d_realloc_far_nbrs, FALSE, sizeof(int), + "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" ); + /* one thread per atom implementation */ blocks = (system->N / NBRS_BLOCK_SIZE) + ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); + k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>> ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, - *(*dev_lists + FAR_NBRS), system->n, system->N ); + *(*dev_lists + FAR_NBRS), system->n, system->N, + system->d_far_nbrs, system->d_max_far_nbrs, system->d_realloc_far_nbrs ); cudaThreadSynchronize( ); cudaCheckError( ); @@ -454,6 +470,13 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, // cudaThreadSynchronize( ); // cudaCheckError( ); + /* check reallocation flag on device */ + copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" ); + + ret = (ret_far_nbr == FALSE) ? 
SUCCESS : FAILURE; + dev_workspace->realloc.far_nbrs = ret_far_nbr; + #if defined(LOG_PERFORMANCE) if( system->my_rank == MASTER_NODE ) { @@ -467,19 +490,20 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, system->my_rank, data->step ); MPI_Barrier( MPI_COMM_WORLD ); #endif + + return ret; } /* Estimate the number of far neighbors per atom (GPU) */ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms, simulation_box my_ext_box, grid g, int n, int N, int total_cap, - int *far_nbrs, int *max_far_nbrs, int *realloc_far_nbrs ) + int *far_nbrs, int *max_far_nbrs ) { int i, j, k, l, m, itr, num_far; real d, cutoff; ivec c, nbrs_x; rvec dvec; - far_neighbor_data *nbr_data; reax_atom *atom1, *atom2; l = blockIdx.x * blockDim.x + threadIdx.x; @@ -604,13 +628,8 @@ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms, num_far = MIN_NBRS; } - if ( num_far > max_far_nbrs[l] ) - { - max_far_nbrs[l] = MAX( (int)(num_far * SAFE_ZONE), MIN_NBRS ); - *realloc_far_nbrs = TRUE; - } - far_nbrs[l] = num_far; + max_far_nbrs[l] = MAX( (int)(num_far * SAFE_ZONE), MIN_NBRS ); } @@ -619,17 +638,9 @@ CUDA_GLOBAL void k_estimate_neighbors( reax_atom *my_atoms, * system: atomic system info * returns: SUCCESS if reallocation of the far neighbors list is necessary * based on current per-atom far neighbor limits, FAILURE otherwise */ -int Cuda_Estimate_Neighbors( reax_system *system, int step ) +void Cuda_Estimate_Neighbors( reax_system *system ) { - int blocks, ret, ret_far_nbr; - reax_list *far_nbrs; - - ret = SUCCESS; - - /* careful: this wrapper around cudaMemset(...) performs a byte-wide assignment - * to the provided literal */ - cuda_memset( system->d_realloc_far_nbrs, FALSE, sizeof(int), - "Cuda_Estimate_Neighbors::d_realloc_far_nbrs" ); + int blocks; blocks = system->total_cap / DEF_BLOCK_SIZE + ((system->total_cap % DEF_BLOCK_SIZE == 0) ? 
0 : 1); @@ -637,249 +648,12 @@ int Cuda_Estimate_Neighbors( reax_system *system, int step ) k_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, system->n, system->N, system->total_cap, - system->d_far_nbrs, system->d_max_far_nbrs, system->d_realloc_far_nbrs ); + system->d_far_nbrs, system->d_max_far_nbrs ); cudaThreadSynchronize( ); cudaCheckError( ); - /* check reallocation flag on device */ - copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_realloc_far_nbrs" ); - - if ( ret_far_nbr == TRUE ) - { - Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs, - system->total_cap ); - - copy_host_device( &(system->total_far_nbrs), system->d_total_far_nbrs, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" ); - - if ( step > 0 ) - { - dev_workspace->realloc.far_nbrs = TRUE; - } - ret = FAILURE; - } - - return ret; -} - - -CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indices, int N ) -{ - int i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N ) - { - return; - } - - end_indices[i] = indices[i] + intr_cnt[i]; -} - - -CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N ) -{ - int i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N ) - { - return; - } - - my_atoms[i].Hindex = i; -} - - -CUDA_GLOBAL void k_setup_hindex_part1( reax_atom *my_atoms, int * hindex, int n ) -{ - int i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= n ) - { - return; - } - - hindex[i] = my_atoms[i].Hindex; -} - - -CUDA_GLOBAL void k_setup_hindex_part2( reax_atom *my_atoms, int * hindex, int n ) -{ - int i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= n ) - { - return; - } - - if ( hindex[i + 1] - hindex[i] > 0 ) - { - my_atoms[i].Hindex = hindex[i]; - } - else - { - my_atoms[i].Hindex = -1; - } -} - - -CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp, - int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N ) -{ - int i, hindex, my_hbonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N ) - { - return; - } - - hindex = atoms[i].Hindex; - - if ( sbp[ atoms[i].type ].p_hbond == H_ATOM || - sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM ) - { - my_hbonds = hbonds[i]; - indices[hindex] = max_hbonds[i]; - end_indices[hindex] = indices[hindex] + hbonds[i]; - } - else - { - my_hbonds = 0; - indices[hindex] = 0; - end_indices[hindex] = 0; - } - atoms[i].num_hbonds = my_hbonds; - -// hindex = atoms[i].Hindex; -// -// if ( hindex >= 0 ) -// { -// my_hbonds = hbonds[i]; -// indices[hindex] = max_hbonds[i]; -// end_indices[hindex] = indices[hindex] + hbonds[i]; -// } -// else -// { -// my_hbonds = 0; -// } -// atoms[i].num_hbonds = my_hbonds; -} - - -/* Initialize indices for far neighbors list post reallocation - * - * system: atomic system info. */ -void Cuda_Init_Neighbor_Indices( reax_system *system ) -{ - int blocks; - reax_list *far_nbrs = *dev_lists + FAR_NBRS; - - /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbrs->index, system->total_cap ); - - /* init end_indices */ - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_far_nbrs, far_nbrs->index, far_nbrs->end_index, system->N ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_Init_HBond_Indices( reax_system *system ) -{ - int blocks; - int *temp; - reax_list *hbonds = *dev_lists + HBONDS; - - temp = (int *) scratch; -// cuda_memset( temp, 0, 2 * (system->N + 1) * sizeof(int), -// "Cuda_Init_HBond_Indices::temp" ); - - /* init Hindices */ - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_setup_hindex <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->N ); - cudaThreadSynchronize( ); - cudaCheckError( ); - -// blocks = system->n / DEF_BLOCK_SIZE + -// ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); -// -// k_setup_hindex_part1 <<< blocks, DEF_BLOCK_SIZE >>> -// ( system->d_my_atoms, temp, system->n ); -// cudaThreadSynchronize( ); -// cudaCheckError( ); -// -// Cuda_Scan_Excl_Sum( temp, temp + system->n + 1, system->n + 1 ); -// -// k_setup_hindex_part2 <<< blocks, DEF_BLOCK_SIZE >>> -// ( system->d_my_atoms, temp + system->n + 1, system->n ); -// cudaThreadSynchronize( ); -// cudaCheckError( ); - - /* init indices and end_indices */ - Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap ); - - k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, - hbonds->index, hbonds->end_index, system->N ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_Init_Bond_Indices( reax_system *system ) -{ - int blocks; - reax_list *bonds = *dev_lists + BONDS; - - /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_bonds, bonds->index, system->total_cap ); - - /* init end_indices */ - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_bonds, bonds->index, bonds->end_index, system->N ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H ) -{ - int blocks; - - /* init indices */ - Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->N ); - - /* init end_indices */ - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_cm_entries, H->start, H->end, system->N ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_Init_Three_Body_Indices( int *indices, int entries ) -{ - reax_list *thbody = *dev_lists + THREE_BODIES; - - Cuda_Scan_Excl_Sum( indices, thbody->index, entries ); + Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs, + system->total_cap ); + copy_host_device( &(system->total_far_nbrs), system->d_total_far_nbrs, sizeof(int), + cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" ); } diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h index 4d4a9c4e..d7bfc4b9 100644 --- a/PG-PuReMD/src/cuda/cuda_neighbors.h +++ b/PG-PuReMD/src/cuda/cuda_neighbors.h @@ -9,20 +9,12 @@ extern "C" { #endif -void Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** ); +int Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** ); -int Cuda_Estimate_Neighbors( reax_system *, int ); +void Cuda_Estimate_Neighbors( reax_system * ); void Cuda_Init_Neighbor_Indices( reax_system * ); -void Cuda_Init_HBond_Indices( reax_system * ); - -void Cuda_Init_Bond_Indices( reax_system * ); - -void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * ); - -void Cuda_Init_Three_Body_Indices( int *, int ); - #ifdef __cplusplus } #endif diff --git a/PG-PuReMD/src/cuda/cuda_random.cu b/PG-PuReMD/src/cuda/cuda_random.cu new file mode 100644 index 00000000..d0de3700 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_random.cu @@ -0,0 +1,65 @@ +/*---------------------------------------------------------------------- + PuReMD - Purdue ReaxFF Molecular Dynamics Program + + Copyright (2010) Purdue University + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Joseph Fogarty, jcfogart@mail.usf.edu + Sagar Pandit, pandit@usf.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_random.h" + + +/* System random number generator used linear congruance method with + large periodicity for generation of pseudo random number. function + Random returns this random number appropriately scaled so that + 0 <= Random(range) < range */ +CUDA_DEVICE double Cuda_Random( double range ) +{ + //TODO: use cuRAND +// return (random( ) * range) / 2147483647L; + return 0.0; +} + + +/* This function seeds the system pseudo random number generator with + current time. 
Use this function once in the beginning to initialize + the system */ +void Cuda_Randomize( ) +{ + //TODO: use cuRAND +// curandState_t state; +// +// curand_init( time(NULL), 0, 0, &state ); +} + + +/* GRandom returns a random number with Gaussian distribution with mean + and standard deviation "sigma" */ +CUDA_DEVICE double Cuda_GRandom( double mean, double sigma ) +{ + double v1 = Cuda_Random(2.0) - 1.0; + double v2 = Cuda_Random(2.0) - 1.0; + double rsq = v1 * v1 + v2 * v2; + + while (rsq >= 1.0 || rsq == 0.0) + { + v1 = Cuda_Random(2.0) - 1.0; + v2 = Cuda_Random(2.0) - 1.0; + rsq = v1 * v1 + v2 * v2; + } + + return mean + v1 * sigma * SQRT(-2.0 * LOG(rsq) / rsq); +} diff --git a/PG-PuReMD/src/cuda/cuda_random.h b/PG-PuReMD/src/cuda/cuda_random.h new file mode 100644 index 00000000..388359c7 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_random.h @@ -0,0 +1,44 @@ +/*---------------------------------------------------------------------- + PuReMD - Purdue ReaxFF Molecular Dynamics Program + + Copyright (2010) Purdue University + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Joseph Fogarty, jcfogart@mail.usf.edu + Sagar Pandit, pandit@usf.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_RANDOM_H_ +#define __CUDA_RANDOM_H_ + +#include "../reax_types.h" + + +/* System random number generator uses a linear congruence method with + large periodicity for generation of pseudo random numbers. The function + Random returns this random number appropriately scaled so that + 0 <= Random(range) < range */ +CUDA_DEVICE double Cuda_Random( double ); + +/* This function seeds the system pseudo random number generator with + current time.
Use this function once in the begining to initialize + the system */ +void Cuda_Randomize( ); + +/* GRandom return random number with gaussian distribution with mean + and standard deviation "sigma" */ +CUDA_DEVICE double Cuda_GRandom( double, double ); + + +#endif diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu index a7ece36c..f67bdaf3 100644 --- a/PG-PuReMD/src/cuda/cuda_system_props.cu +++ b/PG-PuReMD/src/cuda/cuda_system_props.cu @@ -3,8 +3,10 @@ #include "cuda_copy.h" #include "cuda_utils.h" +#include "cuda_random.h" #include "cuda_reduction.h" #include "cuda_shuffle.h" +#include "cuda_vector.h" #include "../tool_box.h" #include "../vector.h" @@ -27,32 +29,35 @@ CUDA_GLOBAL void center_of_mass_blocks( single_body_parameters *sbp, reax_atom * rvec tmp; real m; - rvec_MakeZero (xcm [threadIdx.x]); - rvec_MakeZero (vcm [vcm_id + threadIdx.x]); - rvec_MakeZero (amcm[amcm_id + threadIdx.x]); - rvec_MakeZero (tmp); + rvec_MakeZero(xcm[threadIdx.x]); + rvec_MakeZero(vcm[vcm_id + threadIdx.x]); + rvec_MakeZero(amcm[amcm_id + threadIdx.x]); + rvec_MakeZero(tmp); - if (i < n){ + if ( i < n ) + { m = sbp [ atoms[i].type ].mass; rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); rvec_Cross (tmp, atoms[i].x, atoms [i].v); rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); } - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + __syncthreads( ); - if ((threadIdx.x < offset)) { + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) + { + if ((threadIdx.x < offset)) + { index = threadIdx.x + offset; rvec_Add (xcm [threadIdx.x], xcm[index]); rvec_Add (vcm [vcm_id + threadIdx.x], vcm[vcm_id + index]); rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); } - __syncthreads (); + __syncthreads( ); } - if ((threadIdx.x == 0)){ + if ((threadIdx.x == 0)) + { rvec_Copy (res_xcm[blockIdx.x], xcm[0]); rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); @@ -883,6 +888,53 @@ extern "C" void dev_sync_simulation_data( simulation_data *data ) } +CUDA_GLOBAL void k_generate_initial_velocities( single_body_parameters *sbp, reax_atom *my_atoms, + real T, int n ) +{ + int i; + real m, scale, norm; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) + { + return; + } + + if ( T <= 0.1 ) + { + rvec_MakeZero( my_atoms[i].v ); + } + else + { + cuda_rvec_Random( my_atoms[i].v ); + + norm = rvec_Norm_Sqr( my_atoms[i].v ); + m = sbp[ my_atoms[i].type ].mass; + scale = SQRT( m * norm / (3.0 * K_B * T) ); + + rvec_Scale( my_atoms[i].v, 1. / scale, my_atoms[i].v ); + } +} + + +void Cuda_Generate_Initial_Velocities( reax_system *system, real T ) +{ + int blocks; + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + + if ( T > 0.1 ) + { + Cuda_Randomize( ); + } + + k_generate_initial_velocities <<< blocks, DEF_BLOCK_SIZE >>> + ( system->reax_param.d_sbp, system->d_my_atoms, T, system->n ); +} + + void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data, MPI_Comm comm ) { diff --git a/PG-PuReMD/src/cuda/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h index ddf8ed4d..9877f6dd 100644 --- a/PG-PuReMD/src/cuda/cuda_system_props.h +++ b/PG-PuReMD/src/cuda/cuda_system_props.h @@ -23,6 +23,8 @@ void dev_sync_simulation_data( simulation_data * ); void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm ); +void Cuda_Generate_Initial_Velocities( reax_system *, real ); + void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm ); void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu index 47c087d2..21cb33d6 100644 --- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu @@ -165,9 +165,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, reax_list p_thb_intrs, storage p_workspace, int n, int num_atom_types, real *data_e_tor, real *data_e_con, rvec *data_ext_press ) { - int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms; + int i, j, k, l, pi, pj, pk, pl, pij, plk; int type_i, type_j, type_k, type_l; - int start_j, end_j, start_k, end_k; + int start_j, end_j; int start_pj, end_pj, start_pk, end_pk; int num_frb_intrs = 0; @@ -217,9 +217,6 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, // sprintf( fname, "tor%d.out", system->my_rank ); // ftor = fopen( fname, "w" ); - //natoms = system->n; - - //for( j = 0; j < natoms; ++j ) { type_j = my_atoms[j].type; Delta_j = workspace->Delta_boc[j]; start_j = Dev_Start_Index(j, bonds); @@ -236,10 +233,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, where j is the central atom. 
Otherwise there is no point in trying to form a 4-body interaction out of this neighborhood */ if ( my_atoms[j].orig_id < my_atoms[k].orig_id && - bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) ) + bo_jk->BO > control->thb_cut && Dev_Num_Entries(pk, thb_intrs) ) { - start_k = Dev_Start_Index(k, bonds); - end_k = Dev_End_Index(k, bonds); pj = pbond_jk->sym_index; // pj points to j on k's list /* do the same check as above: @@ -272,7 +267,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, bo_ij = &( pbond_ij->bo_data ); - if ( bo_ij->BO > control->thb_cut/*0*/ ) + if ( bo_ij->BO > control->thb_cut ) { i = p_ijk->thb; type_i = my_atoms[i].type; @@ -312,8 +307,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); if ( i != l && fbh->cnt && - bo_kl->BO > control->thb_cut/*0*/ && - bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ) + bo_kl->BO > control->thb_cut && + bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut ) { ++num_frb_intrs; r_kl = pbond_kl->d; @@ -340,7 +335,6 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, -1., my_atoms[l].x ); r_li = rvec_Norm( dvec_li ); - /* omega and its derivative */ omega = Calculate_Omega( pbond_ij->dvec, r_ij, pbond_jk->dvec, r_jk, pbond_kl->dvec, r_kl, @@ -365,7 +359,8 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, fbp->V2 * exp_tor1 * (1.0 - cos2omega) + fbp->V3 * (1.0 + cos3omega) ); - data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV; + e_tor = fn10 * sin_ijk * sin_jkl * CV; + data_e_tor[j] += e_tor; dfn11 = (-p_tor3 * exp_tor3_DjDk + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * @@ -398,9 +393,9 @@ CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, global_parameters gp, /* 4-body conjugation energy */ fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; - data_e_con [j] += e_con = - fbp->p_cot1 * fn12 * + e_con = fbp->p_cot1 * fn12 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); + data_e_con[j] += e_con; Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); diff --git a/PG-PuReMD/src/cuda/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu index 7e1757bc..22ac8de6 100644 --- a/PG-PuReMD/src/cuda/cuda_utils.cu +++ b/PG-PuReMD/src/cuda/cuda_utils.cu @@ -11,7 +11,7 @@ extern "C" void cuda_malloc( void **ptr, size_t size, int mem_set, const char *m if ( retVal != cudaSuccess ) { fprintf( stderr, "[ERROR] failed to allocate memory on device for resouce %s\n", msg ); - fprintf( stderr, "CUDA API error code: %d, requested memory size (in bytes): %lu\n", + fprintf( stderr, " [INFO] CUDA API error code: %d, requested memory size (in bytes): %lu\n", retVal, size ); exit( INSUFFICIENT_MEMORY ); } @@ -23,7 +23,7 @@ extern "C" void cuda_malloc( void **ptr, size_t size, int mem_set, const char *m if( retVal != cudaSuccess ) { fprintf( stderr, "[ERROR] failed to memset memory on device for resource %s\n", msg ); - fprintf( stderr, "CUDA API error code: %d, requested memory size (in bytes): %lu\n", + fprintf( stderr, " [INFO] CUDA API error code: %d, requested memory size (in bytes): %lu\n", retVal, size ); exit( INSUFFICIENT_MEMORY ); } @@ -45,9 +45,10 @@ extern "C" void cuda_free( void *ptr, const char *msg ) if( retVal != cudaSuccess ) { - fprintf( stderr, - "[WARNING] failed to release memory on device for resource %s\nCUDA API error code: %d, memory 
address: %ld\n", - msg, retVal, (long int) ptr ); + fprintf( stderr, "[WARNING] failed to release memory on device for resource %s\n", + msg ); + fprintf( stderr, " [INFO] CUDA API error code: %d, memory address: %ld\n", + retVal, (long int) ptr ); return; } } @@ -61,10 +62,9 @@ extern "C" void cuda_memset( void *ptr, int data, size_t count, const char *msg if( retVal != cudaSuccess ) { - fprintf( stderr, - "[ERROR] failed to memset memory on device for resource %s\nCUDA API error code: %d\n", - msg, retVal ); - exit( INSUFFICIENT_MEMORY ); + fprintf( stderr, "[ERROR] failed to memset memory on device for resource %s\n", msg ); + fprintf( stderr, " [INFO] CUDA API error code: %d\n", retVal ); + exit( RUNTIME_ERROR ); } } @@ -86,7 +86,7 @@ extern "C" void copy_host_device( void *host, void *dev, size_t size, if( retVal != cudaSuccess ) { fprintf( stderr, - "[ERROR] could not copy resource %s from host to device\nCUDA API error code: %d n", + "[ERROR] could not copy resource %s from host to device\n [INFO] CUDA API error code: %d n", msg, retVal ); exit( INSUFFICIENT_MEMORY ); } @@ -102,7 +102,7 @@ extern "C" void copy_device( void *dest, void *src, size_t size, const char *msg if( retVal != cudaSuccess ) { fprintf( stderr, - "[ERROR] could not copy resource %s from device to device\nCUDA API error code: %d\n", + "[ERROR] could not copy resource %s from device to device\n [INFO] CUDA API error code: %d\n", msg, retVal ); exit( INSUFFICIENT_MEMORY ); } @@ -140,7 +140,7 @@ extern "C" void print_device_mem_usage( ) if ( retVal != cudaSuccess ) { fprintf( stderr, - "[WARNING] could not get message usage info from device\nCUDA API error code: %d\n", + "[WARNING] could not get message usage info from device\n [INFO] CUDA API error code: %d\n", retVal ); return; } diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu index 21b8d2c8..35e07fe0 100644 --- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu @@ -37,7 +37,8 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, { int i, j, pi, k, pk, t; int type_i, type_j, type_k; - int start_j, end_j, start_pk, end_pk; + int start_j, end_j; +// int start_pk, end_pk; int cnt, num_thb_intrs; real temp, temp_bo_jt, pBOjt7; real p_val1, p_val2, p_val3, p_val4, p_val5; @@ -54,12 +55,12 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, real Cf7ij, Cf7jk, Cf8j, Cf9j; real f7_ij, f7_jk, f8_Dj, f9_Dj; real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; - real r_ij, r_jk; real BOA_ij, BOA_jk; rvec force, ext_press; three_body_header *thbh; three_body_parameters *thbp; - three_body_interaction_data *p_ijk, *p_kji; + three_body_interaction_data *p_ijk; +// three_body_interaction_data *p_kji; bond_data *pbond_ij, *pbond_jk, *pbond_jt; bond_order_data *bo_ij, *bo_jk, *bo_jt; reax_list *bonds; @@ -146,17 +147,14 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, bo_ij = &(pbond_ij->bo_data); BOA_ij = bo_ij->BO - control->thb_cut; - //if( BOA_ij/*bo_ij->BO*/ > 0.0) { if ( BOA_ij > 0.0 && ( j < n || pbond_ij->nbr < n ) ) { i = pbond_ij->nbr; - r_ij = pbond_ij->d; type_i = my_atoms[i].type; - /* first copy 3-body intrs from previously computed ones where i>k. 
- in the second for-loop below, - we compute only new 3-body intrs where i < k */ + /* first copy 3-body intrs from previously computed ones where i > k; + in the second for-loop below, compute only new 3-body intrs where i < k */ // The copy loop commented out because strange asynchronous issues started to surface // Each kernel now manually generates everything @@ -234,7 +232,6 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, if ( j < n && BOA_jk > 0.0 && bo_ij->BO * bo_jk->BO > SQR(control->thb_cut) ) { - r_jk = pbond_jk->d; thbh = &( d_thbh[ index_thbp(type_i, type_j, type_k, num_atom_types) ] ); for ( cnt = 0; cnt < thbh->cnt; ++cnt ) @@ -274,7 +271,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, { expval12theta = p_val1 * (1.0 - expval2theta); } - // To avoid linear Me-H-Me angles (6/6/06) + /* to avoid linear Me-H-Me angles (6/6/06) */ else { expval12theta = p_val1 * -expval2theta; @@ -340,13 +337,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); - /* Similar to above comment regarding if statement */ + /* similar to above comment regarding if statement */ if ( pk < pi ) { e_coa = p_coa1 / (1. + exp_coa2) * - EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) * - EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) * + EXP( -p_coa3 * SQR(workspace->total_bond_order[i] - BOA_ij) ) * + EXP( -p_coa3 * SQR(workspace->total_bond_order[k] - BOA_jk) ) * EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); data_e_coa[j] += e_coa; @@ -356,9 +353,9 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2); CEcoa4 = -2 * p_coa3 * - (workspace->total_bond_order[i]-BOA_ij) * e_coa; + (workspace->total_bond_order[i] - BOA_ij) * e_coa; CEcoa5 = -2 * p_coa3 * - (workspace->total_bond_order[k]-BOA_jk) * e_coa; + (workspace->total_bond_order[k] - BOA_jk) * e_coa; /* END COALITION ENERGY */ /* FORCES */ @@ -562,7 +559,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess( reax_atom *atoms, CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, control_params *control, reax_list p_bonds, int n, int N, int *count ) { - int i, j, k, pi, pk; + int j, pi, pk; int start_j, end_j; int num_thb_intrs; real BOA_ij, BOA_jk; @@ -590,12 +587,9 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, bo_ij = &(pbond_ij->bo_data); BOA_ij = bo_ij->BO - control->thb_cut; - //if( BOA_ij/*bo_ij->BO*/ > 0.0) { if ( BOA_ij > 0.0 && ( j < n || pbond_ij->nbr < n ) ) { - i = pbond_ij->nbr; - for ( pk = start_j; pk < end_j; ++pk ) { if ( pk == pi ) diff --git a/PG-PuReMD/src/cuda/cuda_vector.h b/PG-PuReMD/src/cuda/cuda_vector.h new file mode 100644 index 00000000..ef526938 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_vector.h @@ -0,0 +1,41 @@ +/*---------------------------------------------------------------------- + PuReMD - Purdue ReaxFF Molecular Dynamics Program + + Copyright (2010) Purdue University + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Joseph Fogarty, jcfogart@mail.usf.edu + Sagar Pandit, pandit@usf.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_VECTOR_H_ +#define __CUDA_VECTOR_H_ + +#include "../reax_types.h" + +#include "cuda_random.h" + + +CUDA_DEVICE static inline void cuda_rvec_Random( rvec v ) +{ +// v[0] = Cuda_Random( 2.0 ) - 1.0; +// v[1] = Cuda_Random( 2.0 ) - 1.0; +// v[2] = Cuda_Random( 2.0 ) - 1.0; + v[0] = 0.0; + v[1] = 0.0; + v[2] = 0.0; +} + + +#endif diff --git a/PG-PuReMD/src/ffield.c b/PG-PuReMD/src/ffield.c index 7252674c..4302e57c 100644 --- a/PG-PuReMD/src/ffield.c +++ b/PG-PuReMD/src/ffield.c @@ -31,7 +31,7 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, - control_params *control ) + reax_system *system, control_params *control ) { FILE *fp; char *s; @@ -45,7 +45,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, /* open force field file */ if ( (fp = fopen( ffield_file, "r" ) ) == NULL ) { - fprintf( stderr, "[ERROR] cannot open force filed file! terminating...\n" ); + fprintf( stderr, "[ERROR] p%d: cannot open force field file! terminating...\n", + system->my_rank ); MPI_Abort( MPI_COMM_WORLD, FILE_NOT_FOUND ); } @@ -67,7 +68,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, n = atoi(tmp[0]); if ( n < 1 ) { - fprintf( stderr, "[WARNING] number of globals in ffield file is 0!\n" ); + fprintf( stderr, "[WARNING] p%d: number of globals in ffield file is 0!\n", + system->my_rank ); return SUCCESS; } @@ -146,7 +148,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, } #if defined(DEBUG_FOCUS) - fprintf( stderr, "Atom Name in the force field : %s \n", reax->sbp[i].name ); + fprintf( stderr, "p%d: Atom Name in the force field : %s \n", + system->my_rank, reax->sbp[i].name ); #endif val = atof(tmp[1]); @@ -233,20 +236,20 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, { if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 3 ) { - fprintf( stderr, "[WARNING] inconsistent vdWaals-parameters\n" + fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n" "Force field parameters for element %s\n" "indicate inner wall+shielding, but earlier\n" "atoms indicate different vdWaals-method.\n" "This may cause division-by-zero errors.\n" "Keeping vdWaals-setting for earlier atoms.\n", - reax->sbp[i].name ); + system->my_rank, reax->sbp[i].name ); } else { reax->gp.vdw_type = 3; #if defined(DEBUG) - fprintf( stderr, "vdWaals type for element %s: Shielding+inner-wall", - reax->sbp[i].name ); + fprintf( stderr, "p%d: vdWaals type for element %s: Shielding+inner-wall", + system->my_rank, reax->sbp[i].name ); #endif } } @@ -255,7 +258,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, { if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 2 ) { - fprintf( stderr, "[WARNING] inconsistent vdWaals-parameters\n" ); + fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n", + system->my_rank ); fprintf( stderr, " [INFO] Force field parameters for element %s\n", reax->sbp[i].name ); fprintf( stderr, " [INFO] indicate inner wall without shielding, but earlier\n" ); fprintf( stderr, " [INFO] atoms indicate different vdWaals-method.\n" ); @@ -266,8 +270,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, { reax->gp.vdw_type = 
2; #if defined(DEBUG) - fprintf( stderr, "vdWaals type for element%s: No Shielding,inner-wall", - reax->sbp[i].name ); + fprintf( stderr, "p%d: vdWaals type for element%s: No Shielding,inner-wall", + system->my_rank, reax->sbp[i].name ); #endif } } @@ -279,34 +283,34 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, if ( reax->sbp[i].gamma_w > 0.5 ) { if ( reax->gp.vdw_type != 0 && reax->gp.vdw_type != 1 ) - fprintf( stderr, "[WARNING] inconsistent vdWaals-parameters\n" \ + fprintf( stderr, "[WARNING] p%d: inconsistent vdWaals-parameters\n" \ " [INFO] Force field parameters for element %s\n" \ " [INFO] indicate shielding without inner wall, but earlier\n" \ " [INFO] atoms indicate different vdWaals-method.\n" \ " [INFO] This may cause division-by-zero errors.\n" \ " [INFO] Keeping vdWaals-setting for earlier atoms.\n", - reax->sbp[i].name ); + system->my_rank, reax->sbp[i].name ); else { reax->gp.vdw_type = 1; #if defined(DEBUG) - fprintf( stderr, "vdWaals type for element%s: Shielding,no inner-wall", - reax->sbp[i].name ); + fprintf( stderr, "p%d, vdWaals type for element%s: Shielding,no inner-wall", + system->my_rank, reax->sbp[i].name ); #endif } } else { - fprintf( stderr, "[ERROR] inconsistent vdWaals-parameters\n"\ + fprintf( stderr, "[ERROR] p%d: inconsistent vdWaals-parameters\n" \ " [INFO] No shielding or inner-wall set for element %s\n", - reax->sbp[i].name ); + system->my_rank, reax->sbp[i].name ); MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); } } } #if defined(DEBUG) - fprintf( stderr, "vdWaals type: %d\n", reax->gp.vdw_type ); + fprintf( stderr, "p%d: vdWaals type: %d\n", system->my_rank, reax->gp.vdw_type ); #endif /* Equate vval3 to valf for first-row elements (25/10/2004) */ @@ -315,8 +319,8 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, if ( reax->sbp[i].mass < 21 && reax->sbp[i].valency_val != reax->sbp[i].valency_boc ) { - fprintf( stderr, "[WARNING] changed valency_val to valency_boc for atom type %s\n", - reax->sbp[i].name ); + fprintf( stderr, "[WARNING] p%d: changed valency_val to valency_boc for atom type %s\n", + system->my_rank, reax->sbp[i].name ); reax->sbp[i].valency_val = reax->sbp[i].valency_boc; } } @@ -764,7 +768,7 @@ int Read_Force_Field( char *ffield_file, reax_interaction *reax, sfree( tor_flag, "READ_FFIELD" ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "force field read\n" ); + fprintf( stderr, "p%d: force field read\n", system->my_rank ); #endif return SUCCESS; diff --git a/PG-PuReMD/src/ffield.h b/PG-PuReMD/src/ffield.h index 5cca5e9d..902e8572 100644 --- a/PG-PuReMD/src/ffield.h +++ b/PG-PuReMD/src/ffield.h @@ -29,7 +29,7 @@ extern "C" { #endif -int Read_Force_Field( char*, reax_interaction*, control_params* ); +int Read_Force_Field( char*, reax_interaction*, reax_system *, control_params* ); #ifdef __cplusplus } diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c index 2e406d1a..3d27ecf6 100644 --- a/PG-PuReMD/src/init_md.c +++ b/PG-PuReMD/src/init_md.c @@ -105,7 +105,7 @@ void Generate_Initial_Velocities( reax_system *system, real T ) } else { - Randomize(); + Randomize( ); for ( i = 0; i < system->n; i++ ) { diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c index 91783d67..3cce1777 100644 --- a/PG-PuReMD/src/io_tools.c +++ b/PG-PuReMD/src/io_tools.c @@ -634,25 +634,19 @@ void Print_Grid( grid* g, FILE *out ) fprintf( out, "\t---------------------------------\n" ); fprintf( stderr, "GCELL MARKS:\n" ); - //SUDHIR - //gc_type = g->cells[0][0][0].type; - gc_type = g->cells[ index_grid_3d 
(0, 0, 0, g) ].type; + gc_type = g->cells[ index_grid_3d(0, 0, 0, g) ].type; ivec_MakeZero( gc_str ); x = y = z = 0; for ( x = 0; x < g->ncells[0]; ++x ) for ( y = 0; y < g->ncells[1]; ++y ) for ( z = 0; z < g->ncells[2]; ++z ) - //SUDHIR - //if( g->cells[x][y][z].type != gc_type ){ if ( g->cells[ index_grid_3d(x, y, z, g) ].type != gc_type ) { fprintf( stderr, "\tgcells from(%2d %2d %2d) to (%2d %2d %2d): %d - %s\n", gc_str[0], gc_str[1], gc_str[2], x, y, z, gc_type, gcell_type_text[gc_type] ); - //SUDHIR - //gc_type = g->cells[x][y][z].type; gc_type = g->cells[ index_grid_3d(x, y, z, g) ].type; gc_str[0] = x; gc_str[1] = y; @@ -728,8 +722,6 @@ void Print_Native_GCells( reax_system *system ) for ( j = g->native_str[1]; j < g->native_end[1]; j++ ) for ( k = g->native_str[2]; k < g->native_end[2]; k++ ) { - //SUDHIR - //gc = &( g->cells[i][j][k] ); gc = &( g->cells[ index_grid_3d(i, j, k, g) ] ); fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n", @@ -770,8 +762,6 @@ void Print_All_GCells( reax_system *system ) for ( j = 0; j < g->ncells[1]; j++ ) for ( k = 0; k < g->ncells[2]; k++ ) { - //SUDHIR - //gc = &( g->cells[i][j][k] ); gc = &( g->cells[ index_grid_3d(i, j, k, g) ] ); fprintf( f, "p%d gcell(%2d %2d %2d) of type %d(%s)\n", diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index cc5f9d3d..9ace1c32 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -73,7 +73,7 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file, storage *workspace, output_controls *out_control, mpi_datatypes *mpi_data ) { /* ffield file */ - Read_Force_Field( ffield_file, &(system->reax_param), control ); + Read_Force_Field( ffield_file, &(system->reax_param), system, control ); /* control file */ Read_Control_File( control_file, control, out_control ); @@ -240,7 +240,7 @@ int main( int argc, char* argv[] ) print_device_mem_usage( ); #endif - /* init the blocks sizes for cuda kernels */ + /* init blocks sizes */ init_blocks( system ); /* measure total simulation time after input is read */ @@ -397,7 +397,7 @@ int main( int argc, char* argv[] ) if ( retries >= MAX_RETRIES ) { - fprintf( stderr, "Maximum retries reached for this step (%d). Terminating...\n", + fprintf( stderr, "[ERROR] Maximum retries reached for this step (%d). Terminating...\n", retries ); MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED ); } diff --git a/PG-PuReMD/src/random.c b/PG-PuReMD/src/random.c index ffe55458..f9152830 100644 --- a/PG-PuReMD/src/random.c +++ b/PG-PuReMD/src/random.c @@ -19,30 +19,31 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "reax_types.h" - #include "random.h" + /* System random number generator used linear congruance method with large periodicity for generation of pseudo random number. function Random returns this random number appropriately scaled so that 0 <= Random(range) < range */ -double Random(double range) +double Random( double range ) { - return (random() * range) / 2147483647L; + return (random( ) * range) / 2147483647L; } + /* This function seeds the system pseudo random number generator with current time. 
Use this function once in the begining to initialize the system */ -void Randomize() +void Randomize( ) { - srandom(time(NULL)); + srandom( time(NULL) ); } + /* GRandom return random number with gaussian distribution with mean and standard deviation "sigma" */ -double GRandom(double mean, double sigma) +double GRandom( double mean, double sigma ) { double v1 = Random(2.0) - 1.0; double v2 = Random(2.0) - 1.0; diff --git a/PG-PuReMD/src/random.h b/PG-PuReMD/src/random.h index 66a5d59d..d26de8f0 100644 --- a/PG-PuReMD/src/random.h +++ b/PG-PuReMD/src/random.h @@ -48,4 +48,5 @@ double GRandom( double, double ); } #endif + #endif diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h index c0bac402..a6d16f8e 100644 --- a/PG-PuReMD/src/reax_types.h +++ b/PG-PuReMD/src/reax_types.h @@ -198,12 +198,12 @@ #endif /**************** RESOURCE CONSTANTS **********************/ -/* 20 MB */ -#define HOST_SCRATCH_SIZE (1024 * 1024 * 20) +/* 500 MB */ +#define HOST_SCRATCH_SIZE (1024 * 1024 * 500) #ifdef HAVE_CUDA -/* 20 MB */ -#define DEVICE_SCRATCH_SIZE (1024 * 1024 * 20) -/* 20 MB */ +/* 500 MB */ +#define DEVICE_SCRATCH_SIZE (1024 * 1024 * 500) +/* 500 MB */ #define RES_SCRATCH 0x90 /* BLOCK SIZES for kernels */ @@ -433,21 +433,21 @@ typedef real rvec4[4]; /* header used in restart file */ typedef struct { - /**/ + /* current simulation time step */ int step; - /**/ + /* total num. atoms in simulation */ int bigN; - /**/ + /* thermostat temperature */ real T; - /**/ + /* thermostat ??? */ real xi; - /**/ + /* thermostat ??? */ real v_xi; - /**/ + /* thermostat ??? */ real v_xi_old; - /**/ + /* thermostat ??? */ real G_xi; - /**/ + /* ??? */ rtensor box; } restart_header; @@ -1238,6 +1238,8 @@ typedef struct /* TRUE if charge matrix requires reallocation, FALSE otherwise (GPU) */ int *d_realloc_cm_entries; + /* total num. three body list indices */ + int total_thbodies_indices; /* total num. three body interactions */ int total_thbodies; /* total num.
three body interactions (GPU) */ @@ -1802,6 +1804,8 @@ typedef struct int num_bonds; /* TRUE if three body list needs * to be reallocated, FALSE otherwise */ + int thbody; + /**/ int num_3body; /**/ int gcell_atoms; diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c index 331e4430..af27e1cb 100644 --- a/PG-PuReMD/src/traj.c +++ b/PG-PuReMD/src/traj.c @@ -350,11 +350,11 @@ int Write_Header( reax_system *system, control_params *control, { out_control->trj_offset = 0; Set_My_Trajectory_View( out_control->trj, - out_control->trj_offset, mpi_data->header_line, - mpi_data->world, system->my_rank, - my_hdr_lines, num_hdr_lines ); + out_control->trj_offset, mpi_data->header_line, + mpi_data->world, system->my_rank, + my_hdr_lines, num_hdr_lines ); MPI_File_write_all( out_control->trj, out_control->buffer, - num_hdr_lines, mpi_data->header_line, &status ); + num_hdr_lines, mpi_data->header_line, &status ); out_control->trj_offset = (num_hdr_lines) * HEADER_LINE_LEN; } else @@ -411,10 +411,10 @@ int Write_Init_Desc( reax_system *system, control_params *control, if ( out_control->traj_method == MPI_TRAJ ) { Set_My_Trajectory_View( out_control->trj, out_control->trj_offset, - mpi_data->init_desc_line, mpi_data->world, - me, system->n, system->bigN ); + mpi_data->init_desc_line, mpi_data->world, + me, system->n, system->bigN ); MPI_File_write( out_control->trj, out_control->buffer, system->n, - mpi_data->init_desc_line, &status ); + mpi_data->init_desc_line, &status ); out_control->trj_offset += system->bigN * INIT_DESC_LEN; } else @@ -446,12 +446,12 @@ int Init_Traj( reax_system *system, control_params *control, output_controls *out_control, mpi_datatypes *mpi_data, char *msg ) { char fname[MAX_STR]; - int atom_line_len[ NR_OPT_ATOM ] = { 0, 0, 0, 0, - ATOM_BASIC_LEN, ATOM_wV_LEN, - ATOM_wF_LEN, ATOM_FULL_LEN - }; - int bond_line_len[ NR_OPT_BOND ] = { 0, BOND_BASIC_LEN, BOND_FULL_LEN }; - int angle_line_len[ NR_OPT_ANGLE ] = { 0, ANGLE_BASIC_LEN }; + int atom_line_len[ NR_OPT_ATOM ] = { 0, 0, 0, 0, + ATOM_BASIC_LEN, ATOM_wV_LEN, + ATOM_wF_LEN, ATOM_FULL_LEN + }; + int bond_line_len[ NR_OPT_BOND ] = { 0, BOND_BASIC_LEN, BOND_FULL_LEN }; + int angle_line_len[ NR_OPT_ANGLE ] = { 0, ANGLE_BASIC_LEN }; /* generate trajectory name */ sprintf( fname, "%s.trj", control->sim_name ); @@ -531,11 +531,15 @@ int Init_Traj( reax_system *system, control_params *control, #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d: initiated trajectory\n", system->my_rank ); #endif + Write_Header( system, control, out_control, mpi_data ); + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d: header written\n", system->my_rank ); #endif + Write_Init_Desc( system, control, out_control, mpi_data ); + #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d: atom descriptions written\n", system->my_rank ); #endif @@ -669,11 +673,11 @@ int Write_Frame_Header( reax_system *system, control_params *control, if ( out_control->traj_method == MPI_TRAJ ) { Set_My_Trajectory_View( out_control->trj, out_control->trj_offset, - mpi_data->header_line, mpi_data->world, - me, my_frm_hdr_lines, num_frm_hdr_lines ); + mpi_data->header_line, mpi_data->world, + me, my_frm_hdr_lines, num_frm_hdr_lines ); MPI_File_write_all(out_control->trj, out_control->buffer, my_frm_hdr_lines, - mpi_data->header_line, &status); + mpi_data->header_line, &status); out_control->trj_offset += (num_frm_hdr_lines) * HEADER_LINE_LEN; } else @@ -756,10 +760,10 @@ int Write_Atoms( reax_system *system, control_params *control, if ( out_control->traj_method == MPI_TRAJ ) { 
Set_My_Trajectory_View( out_control->trj, out_control->trj_offset, - mpi_data->atom_line, mpi_data->world, - me, system->n, system->bigN ); + mpi_data->atom_line, mpi_data->world, + me, system->n, system->bigN ); MPI_File_write( out_control->trj, out_control->buffer, system->n, - mpi_data->atom_line, &status ); + mpi_data->atom_line, &status ); out_control->trj_offset += (system->bigN) * out_control->atom_line_len; } else @@ -869,10 +873,10 @@ int Write_Bonds( reax_system *system, control_params *control, reax_list *bonds, if ( out_control->traj_method == MPI_TRAJ ) { Set_My_Trajectory_View( out_control->trj, out_control->trj_offset, - mpi_data->bond_line, mpi_data->world, - me, my_bonds, num_bonds ); + mpi_data->bond_line, mpi_data->world, + me, my_bonds, num_bonds ); MPI_File_write( out_control->trj, out_control->buffer, my_bonds, - mpi_data->bond_line, &status ); + mpi_data->bond_line, &status ); out_control->trj_offset += num_bonds * line_len; } else @@ -990,10 +994,10 @@ int Write_Angles( reax_system *system, control_params *control, if ( out_control->traj_method == MPI_TRAJ ) { Set_My_Trajectory_View( out_control->trj, out_control->trj_offset, - mpi_data->angle_line, mpi_data->world, - me, my_angles, num_angles ); + mpi_data->angle_line, mpi_data->world, + me, my_angles, num_angles ); MPI_File_write( out_control->trj, out_control->buffer, my_angles, - mpi_data->angle_line, &status ); + mpi_data->angle_line, &status ); out_control->trj_offset += num_angles * line_len; } else diff --git a/PG-PuReMD/src/vector.h b/PG-PuReMD/src/vector.h index 14250909..1cfd7950 100644 --- a/PG-PuReMD/src/vector.h +++ b/PG-PuReMD/src/vector.h @@ -23,13 +23,13 @@ #define __VECTOR_H_ #include "reax_types.h" + #include "random.h" #ifdef __cplusplus extern "C" { #endif - #if defined(LAMMPS_REAX) || defined(PURE_REAX) CUDA_HOST_DEVICE static inline int Vector_isZero( real* v, int k ) { @@ -285,11 +285,11 @@ CUDA_HOST_DEVICE static inline void rvec_MakeZero( rvec v ) #if defined(PURE_REAX) -CUDA_HOST_DEVICE static inline void rvec_Random( rvec v ) +static inline void rvec_Random( rvec v ) { - v[0] = Random(2.0) - 1.0; - v[1] = Random(2.0) - 1.0; - v[2] = Random(2.0) - 1.0; + v[0] = Random( 2.0 ) - 1.0; + v[1] = Random( 2.0 ) - 1.0; + v[2] = Random( 2.0 ) - 1.0; } #endif diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c index 55f2867e..96c83976 100644 --- a/PuReMD/src/basic_comm.c +++ b/PuReMD/src/basic_comm.c @@ -299,7 +299,7 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace, workspace->id_all, workspace->rcounts, workspace->displs, MPI_INT, MASTER_NODE, mpi_data->world ); - free( id_list ); + sfree( id_list, "id_list" ); #if defined(DEBUG) if ( system->my_rank == MASTER_NODE ) diff --git a/PuReMD/src/control.c b/PuReMD/src/control.c index 7ef81836..07b46d3a 100644 --- a/PuReMD/src/control.c +++ b/PuReMD/src/control.c @@ -439,9 +439,9 @@ char Read_Control_File( char *control_file, control_params* control, /* free memory allocations at the top */ for ( i = 0; i < MAX_TOKENS; i++ ) - free( tmp[i] ); - free( tmp ); - free( s ); + sfree( tmp[i], "tmp[i]" ); + sfree( tmp, "tmp" ); + sfree( s, "s" ); // fprintf( stderr,"%d %d %10.5f %d %10.5f %10.5f\n", // control->ensemble, control->nsteps, control->dt, diff --git a/PuReMD/src/ffield.c b/PuReMD/src/ffield.c index 29138f5b..b05216bd 100644 --- a/PuReMD/src/ffield.c +++ b/PuReMD/src/ffield.c @@ -766,9 +766,9 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax, /* deallocate helper storage */ for ( i = 0; i < MAX_TOKENS; 
i++ ) - free( tmp[i] ); - free( tmp ); - free( s ); + sfree( tmp[i], "tmp[i]" ); + sfree( tmp, "tmp" ); + sfree( s, "s" ); /* deallocate tor_flag */ @@ -777,12 +777,12 @@ char Read_Force_Field( char *ffield_file, reax_interaction *reax, for ( j = 0; j < reax->num_atom_types; j++ ) { for ( k = 0; k < reax->num_atom_types; k++ ) - free( tor_flag[i][j][k] ); + sfree( tor_flag[i][j][k], "tor_flag[i][j][k]" ); - free( tor_flag[i][j] ); + sfree( tor_flag[i][j], "tor_flag[i][j]" ); } - free( tor_flag[i] ); + sfree( tor_flag[i], "tor_flag[i]" ); } diff --git a/PuReMD/src/geo_tools.c b/PuReMD/src/geo_tools.c index c037840b..dd5115c0 100644 --- a/PuReMD/src/geo_tools.c +++ b/PuReMD/src/geo_tools.c @@ -623,8 +623,8 @@ char Write_PDB(reax_system* system, reax_list* bonds, simulation_data *data, } */ - free(buffer); - free(line); + sfree(buffer, "buffer"); + sfree(line, "line"); return SUCCESS; } diff --git a/PuReMD/src/grid.c b/PuReMD/src/grid.c index 0881e00f..0064f220 100644 --- a/PuReMD/src/grid.c +++ b/PuReMD/src/grid.c @@ -515,7 +515,7 @@ void Reorder_My_Atoms( reax_system *system, storage *workspace ) } /* deallocate old storage */ - free( system->my_atoms ); + sfree( system->my_atoms, "system->my_atoms" ); /* start using clustered storages */ system->my_atoms = new_atoms; system->n = top; diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c index 897552ec..b35a4688 100644 --- a/PuReMD/src/init_md.c +++ b/PuReMD/src/init_md.c @@ -675,8 +675,8 @@ int Init_Lists( reax_system *system, control_params *control, bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) ); #endif - free( hb_top ); - free( bond_top ); + sfree( hb_top, "hb_top" ); + sfree( bond_top, "bond_top" ); return SUCCESS; } @@ -779,8 +779,8 @@ int Init_Lists( reax_system *system, control_params *control, bond_cap * MAX_BONDS * 3 * sizeof(dbond_data) / (1024 * 1024) ); #endif - free( hb_top ); - free( bond_top ); + sfree( hb_top, "hb_top" ); + sfree( bond_top, "bond_top" ); return SUCCESS; } diff --git a/PuReMD/src/lookup.c b/PuReMD/src/lookup.c index e821db03..2a38eb23 100644 --- a/PuReMD/src/lookup.c +++ b/PuReMD/src/lookup.c @@ -304,12 +304,12 @@ int Init_Lookup_Tables( reax_system *system, control_params *control, comm ); } - free(h); - free(fh); - free(fvdw); - free(fCEvd); - free(fele); - free(fCEclmb); + sfree(h, "h"); + sfree(fh, "fh"); + sfree(fvdw, "fvdw"); + sfree(fCEvd, "fCEvd"); + sfree(fele, "fele"); + sfree(fCEclmb, "cCEclmb"); return 1; } diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c index 703a434c..6d0955c7 100644 --- a/PuReMD/src/parallelreax.c +++ b/PuReMD/src/parallelreax.c @@ -233,13 +233,13 @@ int main( int argc, char* argv[] ) MPI_Finalize(); /* de-allocate data structures */ - free( system ); - free( control ); - free( data ); - free( workspace ); - free( lists ); - free( out_control ); - free( mpi_data ); + sfree( system, "system" ); + sfree( control, "control" ); + sfree( data, "data" ); + sfree( workspace, "workspace" ); + sfree( lists, "lists" ); + sfree( out_control, "out_control" ); + sfree( mpi_data, "mpi_data" ); #if defined(TEST_ENERGY) || defined(TEST_FORCES) // Integrate_Results(control); diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c index cba4f83d..ed5c276b 100644 --- a/PuReMD/src/qEq.c +++ b/PuReMD/src/qEq.c @@ -355,7 +355,7 @@ void Calculate_Charges( reax_system *system, storage *workspace, for ( i = system->n; i < system->N; ++i ) system->my_atoms[i].q = q[i]; - free(q); + sfree(q, "q"); } diff --git a/PuReMD/src/restart.c b/PuReMD/src/restart.c index 
4b9fa853..b687d82a 100644 --- a/PuReMD/src/restart.c +++ b/PuReMD/src/restart.c @@ -111,7 +111,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control, fclose( fres ); } - free(buffer); + sfree(buffer, "buffer"); } @@ -206,8 +206,8 @@ void Write_Restart( reax_system *system, control_params *control, fprintf( fres, "%s", buffer ); fclose( fres ); } - free(buffer); - free(line); + sfree(buffer, "buffer"); + sfree(line, "line"); } @@ -467,9 +467,9 @@ void Read_Restart( char *res_file, reax_system *system, fclose( fres ); /* free memory allocations at the top */ for ( i = 0; i < MAX_TOKENS; i++ ) - free( tmp[i] ); - free( tmp ); - free( s ); + sfree( tmp[i], "tmp[i]" ); + sfree( tmp, "tmp" ); + sfree( s, "s" ); data->step = data->prev_steps; // nsteps is updated based on the number of steps in the previous run diff --git a/PuReMD/src/tool_box.c b/PuReMD/src/tool_box.c index 02d9e3d9..a0892870 100644 --- a/PuReMD/src/tool_box.c +++ b/PuReMD/src/tool_box.c @@ -48,7 +48,7 @@ int SumScan( int n, int me, int root, MPI_Comm comm ) MPI_Scatter( nbuf, 1, MPI_INT, &my_order, 1, MPI_INT, root, comm ); - free( nbuf ); + sfree( nbuf, "nbuf" ); } else { diff --git a/PuReMD/src/traj.c b/PuReMD/src/traj.c index ab321a2e..e455d74b 100644 --- a/PuReMD/src/traj.c +++ b/PuReMD/src/traj.c @@ -75,7 +75,7 @@ int Reallocate_Output_Buffer( output_controls *out_control, int req_space, MPI_Comm comm ) { if ( out_control->buffer_len > 0 ) - free( out_control->buffer ); + sfree( out_control->buffer, "out_control->buffer" ); out_control->buffer_len = (int)(req_space * SAFE_ZONE); out_control->buffer = (char*) malloc(out_control->buffer_len * sizeof(char)); @@ -1122,8 +1122,8 @@ int End_Traj( int my_rank, output_controls *out_control ) fclose( out_control->strj ); #endif - free( out_control->buffer ); - free( out_control->line ); + sfree( out_control->buffer, "out_control->buffer" ); + sfree( out_control->line, "out_control->line" ); return SUCCESS; } diff --git a/cuda.am b/cuda.am index e5e86003..eb61fb31 100644 --- a/cuda.am +++ b/cuda.am @@ -7,12 +7,8 @@ AM_V_NVCC = $(AM_V_NVCC_@AM_V@) AM_V_NVCC_ = $(AM_V_NVCC_@AM_DEFAULT_V@) -AM_V_NVCC_0 = @echo " NVCC" $@; - -# these are default values for the maximun register count parameter -# passed to nvcc compiler (you might need to change it sometimes; all you need -# is to set it as an environment variable). -MAX_REG_COUNT ?=48 +AM_V_NVCC_0 = @echo " NVCC " $@; +AM_V_NVCC_1 = .cu.o: - $(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -maxrregcount=$(MAX_REG_COUNT) -o $@ -c $< + $(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -o $@ -c $< -- GitLab
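The new Cuda_Randomize above is left as a stub with a TODO to switch to cuRAND, and cuda_rvec_Random currently zeroes its output. Below is a minimal sketch of what a cuRAND-backed device RNG could look like; the k_setup_rng kernel and the curand_* helper names are illustrative assumptions, not part of this patch (CUDA_GLOBAL and CUDA_DEVICE are the macros from reax_types.h).

#include <curand_kernel.h>

/* seed one RNG state per thread: the same seed with distinct subsequences
 * gives each thread an independent stream (hypothetical setup kernel) */
CUDA_GLOBAL void k_setup_rng( curandState_t *states, unsigned long long seed, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i < n )
    {
        curand_init( seed, i, 0, &states[i] );
    }
}

/* uniform double in (0.0, range], analogous to Cuda_Random */
CUDA_DEVICE static inline double curand_Random( curandState_t *state, double range )
{
    return curand_uniform_double( state ) * range;
}

/* Gaussian deviate with the given mean and standard deviation, analogous to Cuda_GRandom */
CUDA_DEVICE static inline double curand_GRandom( curandState_t *state, double mean, double sigma )
{
    return mean + sigma * curand_normal_double( state );
}

The states array would be allocated once with cuda_malloc and seeded by k_setup_rng before first use; cuda_rvec_Random could then take a curandState_t pointer and fill each component with curand_Random( state, 2.0 ) - 1.0 instead of returning zeros.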
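In k_generate_initial_velocities above, each local atom's random velocity is rescaled so that m * |v|^2 = 3 * K_B * T, i.e. every atom starts at the equipartition speed for the requested temperature (velocities are simply zeroed for T <= 0.1). A small host-side sanity check of that property is sketched below; the helper name is hypothetical and it assumes the usual host-side fields (my_atoms[i].v, my_atoms[i].type, reax_param.sbp).

/* instantaneous temperature from the local velocities:
 *   T = sum_i m_i * |v_i|^2 / (3 * n * K_B)
 * with the scaling used in the kernel this should reproduce the requested T */
static real Check_Initial_Temperature( reax_system *system )
{
    int i;
    real e_kin;

    e_kin = 0.0;

    for ( i = 0; i < system->n; ++i )
    {
        e_kin += system->reax_param.sbp[ system->my_atoms[i].type ].mass
            * rvec_Norm_Sqr( system->my_atoms[i].v );
    }

    return e_kin / (3.0 * system->n * K_B);
}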
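Throughout the PuReMD tree, bare free() calls are replaced with sfree( ptr, msg ) so that a bad deallocation can be traced back to a named resource. The project's own sfree is part of its tool box utilities and may differ in detail; the sketch below only illustrates the shape such a guarded deallocator typically takes.

#include <stdio.h>
#include <stdlib.h>

/* checked deallocator: warn (with the resource name) instead of freeing a NULL pointer */
void sfree( void *ptr, const char *msg )
{
    if ( ptr == NULL )
    {
        fprintf( stderr, "[WARNING] trying to free the already NULL pointer %s!\n", msg );
        return;
    }

    free( ptr );
}

A call such as sfree( out_control->buffer, "out_control->buffer" ) then reports which resource was involved if the pointer was never allocated.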