From 5299a3154c6112e9f002a919442ed32ed8fc5b8f Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Sun, 16 Jul 2017 23:19:55 -0400 Subject: [PATCH] PG-PuReMD: minor project refactoring. Separate CUDA code. Change header includes. --- PG-PuReMD/Makefile.am | 38 +- PG-PuReMD/src/allocate.c | 3 +- PG-PuReMD/src/allocate.h | 4 +- PG-PuReMD/src/analyze.c | 3 + PG-PuReMD/src/analyze.h | 12 +- PG-PuReMD/src/basic_comm.h | 20 +- PG-PuReMD/src/bond_orders.c | 1 + PG-PuReMD/src/bond_orders.h | 34 +- PG-PuReMD/src/bonds.c | 28 +- PG-PuReMD/src/bonds.h | 13 +- PG-PuReMD/src/box.c | 3 + PG-PuReMD/src/box.h | 35 +- PG-PuReMD/src/center_mass.cu | 551 -------- PG-PuReMD/src/center_mass.h | 49 - PG-PuReMD/src/charges.c | 102 +- PG-PuReMD/src/charges.h | 9 +- PG-PuReMD/src/comm_tools.c | 2 + PG-PuReMD/src/comm_tools.h | 23 +- PG-PuReMD/src/control.h | 10 + PG-PuReMD/src/{ => cuda}/cuda_allocate.cu | 8 +- PG-PuReMD/src/{ => cuda}/cuda_allocate.h | 2 +- PG-PuReMD/src/{ => cuda}/cuda_bond_orders.cu | 5 +- PG-PuReMD/src/{ => cuda}/cuda_bond_orders.h | 5 +- PG-PuReMD/src/{ => cuda}/cuda_bonds.cu | 7 +- PG-PuReMD/src/{ => cuda}/cuda_bonds.h | 18 +- PG-PuReMD/src/{ => cuda}/cuda_charges.cu | 95 +- PG-PuReMD/src/{ => cuda}/cuda_charges.h | 5 +- PG-PuReMD/src/{ => cuda}/cuda_copy.cu | 3 +- PG-PuReMD/src/{ => cuda}/cuda_copy.h | 10 +- PG-PuReMD/src/{ => cuda}/cuda_environment.cu | 0 PG-PuReMD/src/{ => cuda}/cuda_environment.h | 8 +- PG-PuReMD/src/{ => cuda}/cuda_forces.cu | 227 +++- PG-PuReMD/src/{ => cuda}/cuda_forces.h | 5 +- PG-PuReMD/src/{ => cuda}/cuda_helpers.h | 2 +- .../src/{ => cuda}/cuda_hydrogen_bonds.cu | 8 +- PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h | 48 + PG-PuReMD/src/cuda/cuda_init_md.cu | 412 ++++++ PG-PuReMD/src/cuda/cuda_init_md.h | 22 + PG-PuReMD/src/cuda/cuda_integrate.cu | 249 ++++ PG-PuReMD/src/{ => cuda}/cuda_integrate.h | 10 +- PG-PuReMD/src/cuda/cuda_lin_alg.cu | 1113 +++++++++++++++++ PG-PuReMD/src/{ => cuda}/cuda_lin_alg.h | 39 +- PG-PuReMD/src/{ => cuda}/cuda_list.cu | 9 +- PG-PuReMD/src/{ => cuda}/cuda_list.h | 4 +- PG-PuReMD/src/{ => cuda}/cuda_lookup.cu | 5 +- PG-PuReMD/src/{ => cuda}/cuda_lookup.h | 6 +- PG-PuReMD/src/{ => cuda}/cuda_multi_body.cu | 5 +- PG-PuReMD/src/{ => cuda}/cuda_multi_body.h | 25 +- PG-PuReMD/src/{ => cuda}/cuda_neighbors.cu | 8 +- PG-PuReMD/src/{ => cuda}/cuda_neighbors.h | 5 +- PG-PuReMD/src/{ => cuda}/cuda_nonbonded.cu | 5 +- PG-PuReMD/src/{ => cuda}/cuda_nonbonded.h | 14 +- PG-PuReMD/src/{ => cuda}/cuda_post_evolve.cu | 5 +- PG-PuReMD/src/{ => cuda}/cuda_post_evolve.h | 6 +- PG-PuReMD/src/{ => cuda}/cuda_reduction.cu | 6 +- PG-PuReMD/src/{ => cuda}/cuda_reduction.h | 15 +- PG-PuReMD/src/{ => cuda}/cuda_reset_tools.cu | 2 +- PG-PuReMD/src/{ => cuda}/cuda_reset_tools.h | 6 +- PG-PuReMD/src/{ => cuda}/cuda_shuffle.h | 3 +- PG-PuReMD/src/cuda/cuda_system_props.cu | 1026 +++++++++++++++ PG-PuReMD/src/{ => cuda}/cuda_system_props.h | 15 +- .../src/{ => cuda}/cuda_torsion_angles.cu | 7 +- .../src/{ => cuda}/cuda_torsion_angles.h | 32 +- PG-PuReMD/src/{ => cuda}/cuda_utils.cu | 17 + PG-PuReMD/src/{ => cuda}/cuda_utils.h | 32 +- .../src/{ => cuda}/cuda_valence_angles.cu | 5 +- .../src/{ => cuda}/cuda_valence_angles.h | 6 +- PG-PuReMD/src/{ => cuda}/cuda_validation.cu | 9 +- PG-PuReMD/src/{ => cuda}/cuda_validation.h | 18 +- PG-PuReMD/src/cuda_hydrogen_bonds.h | 66 - PG-PuReMD/src/cuda_init_md.cu | 14 - PG-PuReMD/src/cuda_init_md.h | 15 - PG-PuReMD/src/cuda_integrate.cu | 105 -- PG-PuReMD/src/cuda_lin_alg.cu | 624 --------- 
PG-PuReMD/src/cuda_system_props.cu | 406 ------ PG-PuReMD/src/ffield.c | 3 +- PG-PuReMD/src/ffield.h | 10 + PG-PuReMD/src/forces.c | 217 +--- PG-PuReMD/src/forces.h | 11 +- PG-PuReMD/src/geo_tools.c | 3 + PG-PuReMD/src/geo_tools.h | 21 +- PG-PuReMD/src/grid.c | 6 +- PG-PuReMD/src/grid.h | 14 + PG-PuReMD/src/hydrogen_bonds.c | 4 +- PG-PuReMD/src/hydrogen_bonds.h | 12 +- PG-PuReMD/src/init_md.c | 377 ------ PG-PuReMD/src/init_md.h | 15 +- PG-PuReMD/src/integrate.c | 147 +-- PG-PuReMD/src/integrate.h | 13 +- PG-PuReMD/src/io_tools.c | 4 +- PG-PuReMD/src/io_tools.h | 107 +- PG-PuReMD/src/lin_alg.c | 503 +------- PG-PuReMD/src/lin_alg.h | 29 +- PG-PuReMD/src/list.c | 8 +- PG-PuReMD/src/list.h | 11 +- PG-PuReMD/src/lookup.c | 12 +- PG-PuReMD/src/lookup.h | 12 +- PG-PuReMD/src/multi_body.c | 1 + PG-PuReMD/src/multi_body.h | 12 +- PG-PuReMD/src/neighbors.c | 6 +- PG-PuReMD/src/neighbors.h | 14 +- PG-PuReMD/src/nonbonded.c | 8 +- PG-PuReMD/src/nonbonded.h | 16 +- PG-PuReMD/src/parallelreax.c | 33 +- PG-PuReMD/src/random.c | 2 + PG-PuReMD/src/random.h | 15 +- PG-PuReMD/src/reax_types.h | 15 +- PG-PuReMD/src/reset_tools.c | 4 +- PG-PuReMD/src/reset_tools.h | 4 +- PG-PuReMD/src/restart.c | 3 + PG-PuReMD/src/restart.h | 19 +- PG-PuReMD/src/system_props.c | 157 +-- PG-PuReMD/src/system_props.h | 19 +- PG-PuReMD/src/tool_box.h | 3 +- PG-PuReMD/src/torsion_angles.c | 3 +- PG-PuReMD/src/torsion_angles.h | 12 +- PG-PuReMD/src/traj.c | 2 +- PG-PuReMD/src/traj.h | 10 +- PG-PuReMD/src/valence_angles.h | 8 + PG-PuReMD/src/vector.h | 5 +- 120 files changed, 3967 insertions(+), 3710 deletions(-) delete mode 100644 PG-PuReMD/src/center_mass.cu delete mode 100644 PG-PuReMD/src/center_mass.h rename PG-PuReMD/src/{ => cuda}/cuda_allocate.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_allocate.h (96%) rename PG-PuReMD/src/{ => cuda}/cuda_bond_orders.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_bond_orders.h (98%) rename PG-PuReMD/src/{ => cuda}/cuda_bonds.cu (98%) rename PG-PuReMD/src/{ => cuda}/cuda_bonds.h (69%) rename PG-PuReMD/src/{ => cuda}/cuda_charges.cu (68%) rename PG-PuReMD/src/{ => cuda}/cuda_charges.h (89%) rename PG-PuReMD/src/{ => cuda}/cuda_copy.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_copy.h (93%) rename PG-PuReMD/src/{ => cuda}/cuda_environment.cu (100%) rename PG-PuReMD/src/{ => cuda}/cuda_environment.h (56%) rename PG-PuReMD/src/{ => cuda}/cuda_forces.cu (89%) rename PG-PuReMD/src/{ => cuda}/cuda_forces.h (86%) rename PG-PuReMD/src/{ => cuda}/cuda_helpers.h (97%) rename PG-PuReMD/src/{ => cuda}/cuda_hydrogen_bonds.cu (99%) create mode 100644 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h create mode 100644 PG-PuReMD/src/cuda/cuda_init_md.cu create mode 100644 PG-PuReMD/src/cuda/cuda_init_md.h create mode 100644 PG-PuReMD/src/cuda/cuda_integrate.cu rename PG-PuReMD/src/{ => cuda}/cuda_integrate.h (86%) create mode 100644 PG-PuReMD/src/cuda/cuda_lin_alg.cu rename PG-PuReMD/src/{ => cuda}/cuda_lin_alg.h (52%) rename PG-PuReMD/src/{ => cuda}/cuda_list.cu (96%) rename PG-PuReMD/src/{ => cuda}/cuda_list.h (98%) rename PG-PuReMD/src/{ => cuda}/cuda_lookup.cu (98%) rename PG-PuReMD/src/{ => cuda}/cuda_lookup.h (56%) rename PG-PuReMD/src/{ => cuda}/cuda_multi_body.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_multi_body.h (58%) rename PG-PuReMD/src/{ => cuda}/cuda_neighbors.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_neighbors.h (95%) rename PG-PuReMD/src/{ => cuda}/cuda_nonbonded.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_nonbonded.h (79%) rename PG-PuReMD/src/{ => cuda}/cuda_post_evolve.cu (95%) rename 
PG-PuReMD/src/{ => cuda}/cuda_post_evolve.h (60%) rename PG-PuReMD/src/{ => cuda}/cuda_reduction.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_reduction.h (96%) rename PG-PuReMD/src/{ => cuda}/cuda_reset_tools.cu (98%) rename PG-PuReMD/src/{ => cuda}/cuda_reset_tools.h (94%) rename PG-PuReMD/src/{ => cuda}/cuda_shuffle.h (97%) create mode 100644 PG-PuReMD/src/cuda/cuda_system_props.cu rename PG-PuReMD/src/{ => cuda}/cuda_system_props.h (65%) rename PG-PuReMD/src/{ => cuda}/cuda_torsion_angles.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_torsion_angles.h (57%) rename PG-PuReMD/src/{ => cuda}/cuda_utils.cu (88%) rename PG-PuReMD/src/{ => cuda}/cuda_utils.h (80%) rename PG-PuReMD/src/{ => cuda}/cuda_valence_angles.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_valence_angles.h (98%) rename PG-PuReMD/src/{ => cuda}/cuda_validation.cu (99%) rename PG-PuReMD/src/{ => cuda}/cuda_validation.h (97%) delete mode 100644 PG-PuReMD/src/cuda_hydrogen_bonds.h delete mode 100644 PG-PuReMD/src/cuda_init_md.cu delete mode 100644 PG-PuReMD/src/cuda_init_md.h delete mode 100644 PG-PuReMD/src/cuda_integrate.cu delete mode 100644 PG-PuReMD/src/cuda_lin_alg.cu delete mode 100644 PG-PuReMD/src/cuda_system_props.cu diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am index b0c1c871..3b051035 100644 --- a/PG-PuReMD/Makefile.am +++ b/PG-PuReMD/Makefile.am @@ -34,25 +34,25 @@ include_HEADERS = src/reax_types.h src/index_utils.h \ src/integrate.h src/init_md.h if USE_CUDA -bin_pg_puremd_SOURCES += src/cuda_utils.cu src/cuda_allocate.cu src/cuda_environment.cu \ - src/cuda_system_props.cu src/cuda_reduction.cu src/center_mass.cu \ - src/cuda_copy.cu src/cuda_reset_tools.cu src/cuda_list.cu \ - src/cuda_neighbors.cu src/cuda_bond_orders.cu src/cuda_bonds.cu \ - src/cuda_multi_body.cu src/cuda_valence_angles.cu \ - src/cuda_torsion_angles.cu src/cuda_hydrogen_bonds.cu src/cuda_forces.cu \ - src/cuda_charges.cu src/cuda_lin_alg.cu \ - src/cuda_nonbonded.cu src/cuda_integrate.cu src/cuda_post_evolve.cu \ - src/cuda_init_md.cu src/cuda_validation.cu src/cuda_lookup.cu -include_HEADERS += src/cuda_helpers.h src/cuda_shuffle.h \ - src/cuda_utils.h src/cuda_allocate.h src/cuda_environment.h \ - src/cuda_system_props.h src/cuda_reduction.h src/center_mass.h \ - src/cuda_copy.h src/cuda_reset_tools.h src/cuda_list.h \ - src/cuda_neighbors.h src/cuda_bond_orders.h src/cuda_bonds.h \ - src/cuda_multi_body.h src/cuda_valence_angles.h \ - src/cuda_torsion_angles.h src/cuda_hydrogen_bonds.h src/cuda_forces.h \ - src/cuda_charges.h src/cuda_lin_alg.h \ - src/cuda_nonbonded.h src/cuda_integrate.h src/cuda_post_evolve.h \ - src/cuda_init_md.h src/cuda_validation.h src/cuda_lookup.h +bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \ + src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu \ + src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_list.cu \ + src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \ + src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \ + src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \ + src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \ + src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \ + src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu +include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \ + src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \ + 
src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h \ + src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_list.h \ + src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \ + src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \ + src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \ + src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \ + src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \ + src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h # dummy source to cause C linking nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c index 1d85b8f9..54614694 100644 --- a/PG-PuReMD/src/allocate.c +++ b/PG-PuReMD/src/allocate.c @@ -20,7 +20,6 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" -#include "index_utils.h" #if defined(PURE_REAX) #include "allocate.h" @@ -36,6 +35,8 @@ #include "reax_vector.h" #endif +#include "index_utils.h" + /* allocate space for my_atoms important: we cannot know the exact number of atoms that will fall into a diff --git a/PG-PuReMD/src/allocate.h b/PG-PuReMD/src/allocate.h index 5fd27315..a2876453 100644 --- a/PG-PuReMD/src/allocate.h +++ b/PG-PuReMD/src/allocate.h @@ -24,11 +24,11 @@ #include "reax_types.h" + #ifdef __cplusplus extern "C" { #endif - int PreAllocate_Space( reax_system*, control_params*, storage* ); void Allocate_System( reax_system*, int, int, char* ); @@ -53,9 +53,9 @@ void Deallocate_MPI_Buffers( mpi_datatypes * ); void ReAllocate( reax_system*, control_params*, simulation_data*, storage*, reax_list**, mpi_datatypes* ); - #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/analyze.c b/PG-PuReMD/src/analyze.c index 283d7e47..0f47ba48 100644 --- a/PG-PuReMD/src/analyze.c +++ b/PG-PuReMD/src/analyze.c @@ -19,7 +19,10 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "analyze.h" + #include "box.h" #include "list.h" #include "vector.h" diff --git a/PG-PuReMD/src/analyze.h b/PG-PuReMD/src/analyze.h index e4703341..a772dcb2 100644 --- a/PG-PuReMD/src/analyze.h +++ b/PG-PuReMD/src/analyze.h @@ -24,7 +24,17 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Analysis( reax_system*, control_params*, simulation_data*, storage*, - reax_list**, output_controls*, mpi_datatypes* ); + reax_list**, output_controls*, mpi_datatypes* ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/basic_comm.h b/PG-PuReMD/src/basic_comm.h index e1effc50..4d8f1c34 100644 --- a/PG-PuReMD/src/basic_comm.h +++ b/PG-PuReMD/src/basic_comm.h @@ -24,33 +24,43 @@ #include "reax_types.h" + #ifdef __cplusplus extern "C" { #endif void real_packer( void*, mpi_out_data* ); + void rvec_packer( void*, mpi_out_data* ); + void rvec2_packer( void*, mpi_out_data* ); + void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer); void real_unpacker( void*, void*, mpi_out_data* ); + void rvec_unpacker( void*, void*, mpi_out_data* ); + void rvec2_unpacker( void*, void*, mpi_out_data* ); + void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype, - int, coll_unpacker ); + int, coll_unpacker ); real Parallel_Norm( real*, int, MPI_Comm ); + real Parallel_Dot( real*, real*, int, MPI_Comm ); + real Parallel_Vector_Acc( real*, int, MPI_Comm ); +#if defined(TEST_FORCES) +void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* ); + +void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* ); +#endif #ifdef __cplusplus } #endif -#if defined(TEST_FORCES) -void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* ); -void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* ); -#endif #endif diff --git a/PG-PuReMD/src/bond_orders.c b/PG-PuReMD/src/bond_orders.c index 4e023e97..da23e002 100644 --- a/PG-PuReMD/src/bond_orders.c +++ b/PG-PuReMD/src/bond_orders.c @@ -31,6 +31,7 @@ #include "reax_list.h" #include "reax_vector.h" #endif + #include "index_utils.h" diff --git a/PG-PuReMD/src/bond_orders.h b/PG-PuReMD/src/bond_orders.h index 1975e20b..8cfa2e18 100644 --- a/PG-PuReMD/src/bond_orders.h +++ b/PG-PuReMD/src/bond_orders.h @@ -24,6 +24,7 @@ #include "reax_types.h" + typedef struct { real C1dbo, C2dbo, C3dbo; @@ -32,28 +33,45 @@ typedef struct real C1dDelta, C2dDelta, C3dDelta; } dbond_coefficients; + +#ifdef __cplusplus +extern "C" { +#endif + #ifdef TEST_FORCES void Get_dBO( reax_system*, reax_list**, int, int, real, rvec* ); + void Get_dBOpinpi2( reax_system*, reax_list**, - int, int, real, real, rvec*, rvec* ); + int, int, real, real, rvec*, rvec* ); void Add_dBO( reax_system*, reax_list**, int, int, real, rvec* ); + void Add_dBOpinpi2( reax_system*, reax_list**, - int, int, real, real, rvec*, rvec* ); + int, int, real, real, rvec*, rvec* ); void Add_dBO_to_Forces( reax_system*, reax_list**, int, int, real ); + void Add_dBOpinpi2_to_Forces( reax_system*, reax_list**, - int, int, real, real ); + int, int, real, real ); void Add_dDelta( reax_system*, reax_list**, int, real, rvec* ); + void Add_dDelta_to_Forces( reax_system *, reax_list**, int, real ); #endif void Add_dBond_to_Forces( int, int, storage*, reax_list** ); -void Add_dBond_to_Forces_NPT( int, int, simulation_data*, - storage*, reax_list** ); -int BOp(storage*, reax_list*, real, int, int, far_neighbor_data*, - 
single_body_parameters*, single_body_parameters*, two_body_parameters*); + +void Add_dBond_to_Forces_NPT( int, int, simulation_data*, storage*, reax_list** ); + +int BOp( storage*, reax_list*, real, int, int, far_neighbor_data*, + single_body_parameters*, single_body_parameters*, two_body_parameters* ); + void BO( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls* ); + storage*, reax_list**, output_controls* ); + +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/bonds.c b/PG-PuReMD/src/bonds.c index 9c2839eb..8fb160ec 100644 --- a/PG-PuReMD/src/bonds.c +++ b/PG-PuReMD/src/bonds.c @@ -20,25 +20,27 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" -#include "index_utils.h" + #if defined(PURE_REAX) -#include "bonds.h" -#include "bond_orders.h" -#include "list.h" -#include "tool_box.h" -#include "vector.h" + #include "bonds.h" + #include "bond_orders.h" + #include "list.h" + #include "tool_box.h" + #include "vector.h" #elif defined(LAMMPS_REAX) -#include "reax_bonds.h" -#include "reax_bond_orders.h" -#include "reax_list.h" -#include "reax_tool_box.h" -#include "reax_vector.h" + #include "reax_bonds.h" + #include "reax_bond_orders.h" + #include "reax_list.h" + #include "reax_tool_box.h" + #include "reax_vector.h" #endif +#include "index_utils.h" + void Bonds( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, reax_list **lists, - output_controls *out_control ) + simulation_data *data, storage *workspace, reax_list **lists, + output_controls *out_control ) { int i, j, pj, natoms; int start_i, end_i; diff --git a/PG-PuReMD/src/bonds.h b/PG-PuReMD/src/bonds.h index 2aa3c1f9..89090386 100644 --- a/PG-PuReMD/src/bonds.h +++ b/PG-PuReMD/src/bonds.h @@ -24,6 +24,17 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Bonds( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls* ); + storage*, reax_list**, output_controls* ); + +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/box.c b/PG-PuReMD/src/box.c index 86ebd6eb..525f24e5 100644 --- a/PG-PuReMD/src/box.c +++ b/PG-PuReMD/src/box.c @@ -19,7 +19,10 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "box.h" + #include "comm_tools.h" #include "io_tools.h" #include "system_props.h" diff --git a/PG-PuReMD/src/box.h b/PG-PuReMD/src/box.h index 841e3679..00e51d06 100644 --- a/PG-PuReMD/src/box.h +++ b/PG-PuReMD/src/box.h @@ -24,30 +24,51 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + /* initializes simulation boxes */ void Setup_Big_Box( real, real, real, real, real, real, simulation_box* ); + void Init_Box( rtensor, simulation_box* ); + //void Setup_My_Box( reax_system*, control_params* ); + //void Setup_My_Ext_Box( reax_system*, control_params* ); + void Setup_Environment( reax_system*, control_params*, mpi_datatypes* ); /* scales simulation box for NPT ensembles */ void Scale_Box( reax_system*, control_params*, - simulation_data*, mpi_datatypes* ); + simulation_data*, mpi_datatypes* ); /* applies transformation to/from Cartesian/ Triclinic coordinates */ /* use -1 flag for Cartesian -> Triclinic and +1 for otherway */ -// void Transform( rvec, simulation_box*, char, rvec ); -// void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec ); -// void Inc_on_T3_Gen( rvec, rvec, simulation_box* ); -// int Get_Nbr_Box( simulation_box*, int, int, int ); -// rvec Get_Nbr_Box_Press( simulation_box*, int, int, int ); -// void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec ); +//void Transform( rvec, simulation_box*, char, rvec ); + +//void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec ); + +//void Inc_on_T3_Gen( rvec, rvec, simulation_box* ); + +//int Get_Nbr_Box( simulation_box*, int, int, int ); + +//rvec Get_Nbr_Box_Press( simulation_box*, int, int, int ); + +//void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec ); /* these functions assume that the coordinates are in triclinic system this function returns cartesian norm but triclinic distance vector */ //real Sq_Distance_on_T3( rvec, rvec, simulation_box*, rvec ); + //void Inc_on_T3( rvec, rvec, simulation_box* ); + //real Metric_Product( rvec, rvec, simulation_box* ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/center_mass.cu b/PG-PuReMD/src/center_mass.cu deleted file mode 100644 index 725cafbb..00000000 --- a/PG-PuReMD/src/center_mass.cu +++ /dev/null @@ -1,551 +0,0 @@ -#include "center_mass.h" -#include "vector.h" -#include "cuda_shuffle.h" - -CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) -{ - extern __shared__ rvec xcm[]; - extern __shared__ rvec vcm[]; - extern __shared__ rvec amcm[]; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - //unsigned int xcm_id = threadIdx.x; - unsigned int vcm_id = blockDim.x; - unsigned int amcm_id = 2 *(blockDim.x); - - unsigned int index = 0; - rvec tmp; - real m; - - rvec_MakeZero (xcm [threadIdx.x]); - rvec_MakeZero (vcm [vcm_id + threadIdx.x]); - rvec_MakeZero (amcm[amcm_id + threadIdx.x]); - rvec_MakeZero (tmp); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); - rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); - rvec_Cross (tmp, atoms[i].x, atoms [i].v); - rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); - } - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (xcm [threadIdx.x], xcm[index]); - rvec_Add (vcm [vcm_id + 
threadIdx.x], vcm[vcm_id + index]); - rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)){ - rvec_Copy (res_xcm[blockIdx.x], xcm[0]); - rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); - rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); - } -} - -#if defined( __SM_35__) -CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_xcm, - size_t n) -{ - extern __shared__ rvec my_xcm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int xcm_id = threadIdx.x; - unsigned int index = 0; - rvec xcm; - real m; - - rvec_MakeZero (xcm); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (xcm , m, atoms [i].x); - } - __syncthreads (); - - for (int z = 16; z >= 1; z /= 2){ - xcm[0] += shfl( xcm[0], z); - xcm[1] += shfl( xcm[1], z); - xcm[2] += shfl( xcm[2], z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0) - rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm ); - __syncthreads (); - - for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (my_xcm [threadIdx.x], my_xcm[index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)) - rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]); -} - -CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_vcm, - size_t n) -{ - extern __shared__ rvec my_vcm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - rvec vcm; - real m; - - rvec_MakeZero (vcm); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (vcm , m, atoms [i].v); - } - __syncthreads (); - - for (int z = 16; z >= 1; z /= 2){ - vcm[0] += shfl( vcm[0], z); - vcm[1] += shfl( vcm[1], z); - vcm[2] += shfl( vcm[2], z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0) - rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm ); - __syncthreads (); - - for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (my_vcm [threadIdx.x], my_vcm[index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)) - rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]); -} - -CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_amcm, - size_t n) -{ - extern __shared__ rvec my_amcm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - rvec amcm; - real m; - rvec tmp; - - rvec_MakeZero (amcm); - rvec_MakeZero( tmp ); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_Cross (tmp, atoms[i].x, atoms [i].v); - rvec_ScaledAdd (amcm, m, tmp); - } - __syncthreads (); - - for (int z = 16; z >= 1; z /= 2){ - amcm[0] += shfl( amcm[0], z); - amcm[1] += shfl( amcm[1], z); - amcm[2] += shfl( amcm[2], z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0) - rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm ); - __syncthreads (); - - - for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (my_amcm[threadIdx.x], my_amcm[index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)){ - rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]); - } -} - -#endif - - -CUDA_GLOBAL void center_of_mass (rvec *xcm, - rvec *vcm, - rvec *amcm, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) -{ - extern __shared__ rvec sh_xcm[]; - extern __shared__ rvec sh_vcm[]; - extern __shared__ rvec 
sh_amcm[]; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - unsigned int xcm_id = threadIdx.x; - unsigned int vcm_id = blockDim.x; - unsigned int amcm_id = 2 * (blockDim.x); - - unsigned int index = 0; - rvec t_xcm, t_vcm, t_amcm; - - rvec_MakeZero (t_xcm); - rvec_MakeZero (t_vcm); - rvec_MakeZero (t_amcm); - - if (i < n){ - rvec_Copy ( t_xcm, xcm[threadIdx.x]); - rvec_Copy ( t_vcm, vcm[threadIdx.x]); - rvec_Copy ( t_amcm, amcm[threadIdx.x]); - } - - rvec_Copy (sh_xcm[xcm_id], t_xcm); - rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm); - rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm); - - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { - - if (threadIdx.x < offset) { - index = threadIdx.x + offset; - rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]); - rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]); - rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]); - } - __syncthreads (); - } - - if (threadIdx.x == 0){ - rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]); - rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]); - rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]); - } -} - -CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) -{ - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - - unsigned int xx_i = threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int xz_i = 2 * blockDim.x; - unsigned int yy_i = 3 * blockDim.x; - unsigned int yz_i = 4 * blockDim.x; - unsigned int zz_i = 5 * blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = - yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xx[ xx_i ] = diff[0] * diff[0] * m; - xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m; - xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m; - yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m; - yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m; - zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m; - } - __syncthreads (); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - xx[ threadIdx.x ] += xx[ index ]; - xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; - xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ]; - yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ]; - yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ]; - zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 ] = xx [ 0 ]; - results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ]; - results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ]; - results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ]; - results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ]; - results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ]; - } -} - -CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n) -{ - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - - unsigned int xx_i 
= threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int xz_i = 2 * blockDim.x; - unsigned int yy_i = 3 * blockDim.x; - unsigned int yz_i = 4 * blockDim.x; - unsigned int zz_i = 5 * blockDim.x; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = - yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - - if (i < n) - { - xx [ xx_i ] = input [ threadIdx.x*6 + 0 ]; - xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ]; - xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ]; - yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ]; - yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ]; - zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ]; - } - __syncthreads (); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if (threadIdx.x < offset ) - { - index = threadIdx.x + offset; - xx [ threadIdx.x ] += xx [ index ]; - xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ]; - xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ]; - yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ]; - yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ]; - zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) - { - output[0] = xx[0]; - output[1] = xy[xy_i]; - output[2] = xz[xz_i]; - output[3] = xz[yy_i]; - output[4] = xz[yz_i]; - output[5] = xz[zz_i]; - } -} - -#if defined( __SM_35__) - -CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) -{ - extern __shared__ real my_results_xx[]; - extern __shared__ real my_results_xy[]; - - unsigned int xx_i = threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - real xx = 0; - real xy = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xx = diff[0] * diff[0] * m; - xy = diff[0] * diff[1] * m; - } - __syncthreads (); - - for (int z = 16; z <= 1; z++){ - xx += shfl( xx, z); - xy += shfl( xy, z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0){ - my_results_xx[threadIdx.x >> 5] = xx; - my_results_xy[threadIdx.x >> 5] = xy; - } - __syncthreads (); - - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - my_results_xx[ threadIdx.x ] += my_results_xx[ index ]; - my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 ] = my_results_xx [ 0 ]; - results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ]; - } -} - -CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) -{ - extern __shared__ real my_results_xz[]; - extern __shared__ real my_results_yy[]; - - unsigned int yy_i = blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - real xz = 0; - real yy = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xz = diff[0] * diff[2] * m; - yy = diff[1] * diff[1] * m; - } - 
__syncthreads (); - - for (int z = 16; z <= 1; z++){ - xz += shfl( xz, z); - yy += shfl( yy, z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0){ - my_results_xz[threadIdx.x >> 5] = xz; - my_results_yy[threadIdx.x >> 5] = yy; - } - __syncthreads (); - - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - my_results_xz[ threadIdx.x ] += my_results_xz [ index ]; - my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ]; - results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ]; - } -} - -CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) -{ - extern __shared__ real my_results_yz[]; - extern __shared__ real my_results_zz[]; - - unsigned int zz_i = blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - real yz = 0; - real zz = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - yz = diff[1] * diff[2] * m; - zz = diff[2] * diff[2] * m; - } - __syncthreads (); - - for (int z = 16; z <= 1; z++){ - yz += shfl( yz, z); - zz += shfl( zz, z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0){ - my_results_yz[threadIdx.x >> 5] = yz; - my_results_zz[threadIdx.x >> 5] = zz; - } - __syncthreads (); - - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - my_results_yz[ threadIdx.x ] += my_results_yz [ index ]; - my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ]; - results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ]; - } -} - -#endif diff --git a/PG-PuReMD/src/center_mass.h b/PG-PuReMD/src/center_mass.h deleted file mode 100644 index 113971ad..00000000 --- a/PG-PuReMD/src/center_mass.h +++ /dev/null @@ -1,49 +0,0 @@ - -#ifndef __CENTER_MASS_H__ -#define __CENTER_MASS_H__ - -#include "reax_types.h" -#include "reax_types.h" - -CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n); - -#if defined(__SM_35__) -CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *, reax_atom *, - rvec *res_xcm, - size_t n); -CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *, reax_atom *, - rvec *res_vcm, - size_t n); -CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *, reax_atom *, - rvec *res_amcm, - size_t n); -#endif - - -CUDA_GLOBAL void center_of_mass (rvec *xcm, - rvec *vcm, - rvec *amcm, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n); - -CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n); - -CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n); - -#if defined(__SM_35__) -CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t ); -CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t ); -CUDA_GLOBAL 
void compute_center_mass_yz_zz (single_body_parameters *, reax_atom *, real *, real , real , real , size_t ); -#endif - -#endif diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c index 6d695f56..8f53b65d 100644 --- a/PG-PuReMD/src/charges.c +++ b/PG-PuReMD/src/charges.c @@ -19,6 +19,8 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "charges.h" #include "allocate.h" @@ -27,12 +29,6 @@ #include "lin_alg.h" #include "tool_box.h" -#ifdef HAVE_CUDA - #include "cuda_charges.h" - #include "cuda_lin_alg.h" - #include "cuda_validation.h" -#endif - int compare_matrix_entry(const void *v1, const void *v2) { @@ -406,46 +402,6 @@ void Calculate_Charges( reax_system *system, storage *workspace, } -#ifdef HAVE_CUDA -void Cuda_Calculate_Charges( reax_system *system, storage *workspace, - mpi_datatypes *mpi_data ) -{ - int i, scale; - real u;//, s_sum, t_sum; - rvec2 my_sum, all_sum; - reax_atom *atom; - real *q; - - my_sum[0] = 0.0; - my_sum[1] = 0.0; - scale = sizeof(real) / sizeof(void); - q = (real *) host_scratch; - memset( q, 0, system->N * sizeof (real)); - - cuda_charges_x( system, my_sum ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n", - my_sum[0], my_sum[1] ); -#endif - - MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world ); - - u = all_sum[0] / all_sum[1]; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "Device: u: %f \n", u ); -#endif - - cuda_charges_st( system, workspace, q, u ); - - Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer ); - - cuda_charges_updateq( system, q ); -} -#endif - - void QEq( reax_system *system, control_params *control, simulation_data *data, storage *workspace, output_controls *out_control, mpi_datatypes *mpi_data ) @@ -504,57 +460,3 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, } #endif } - - -#ifdef HAVE_CUDA -void Cuda_QEq( reax_system *system, control_params *control, simulation_data - *data, storage *workspace, output_controls *out_control, mpi_datatypes - *mpi_data ) -{ - int s_matvecs, t_matvecs; - - Cuda_Init_MatVec( system, workspace ); - - //if (data->step > 0) { - // compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b"); - // compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x"); - // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); - // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); - //} - -//#ifdef __CUDA_DEBUG__ -// Init_MatVec( system, data, control, workspace, mpi_data ); -//#endif - -#if defined(DEBUG) - fprintf( stderr, "p%d: initialized qEq\n", system->my_rank ); - //Print_Linear_System( system, control, workspace, data->step ); -#endif - - //MATRIX CHANGES - s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H, - dev_workspace->b, control->q_err, dev_workspace->x, mpi_data, - out_control->log, data); - t_matvecs = 0; - //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs); - -#if defined(DEBUG) - fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); -#endif - - Cuda_Calculate_Charges( system, workspace, mpi_data ); - -#if defined(DEBUG) - fprintf( stderr, "p%d: computed charges\n", system->my_rank ); - //Print_Charges( system ); -#endif - -#if defined(LOG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - data->timing.s_matvecs += s_matvecs; - data->timing.t_matvecs += t_matvecs; - } -#endif -} -#endif diff --git 
a/PG-PuReMD/src/charges.h b/PG-PuReMD/src/charges.h index faad0d09..08af5641 100644 --- a/PG-PuReMD/src/charges.h +++ b/PG-PuReMD/src/charges.h @@ -25,11 +25,16 @@ #include "reax_types.h" +#ifdef __cplusplus +extern "C" { +#endif + void QEq( reax_system*, control_params*, simulation_data*, storage*, output_controls*, mpi_datatypes* ); -void Cuda_QEq( reax_system*, control_params*, simulation_data*, - storage*, output_controls*, mpi_datatypes* ); +#ifdef __cplusplus +} +#endif #endif diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c index 5a832aff..a8d46fcb 100644 --- a/PG-PuReMD/src/comm_tools.c +++ b/PG-PuReMD/src/comm_tools.c @@ -19,6 +19,8 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "comm_tools.h" #include "grid.h" diff --git a/PG-PuReMD/src/comm_tools.h b/PG-PuReMD/src/comm_tools.h index 3b0b645f..a0e8d7e5 100644 --- a/PG-PuReMD/src/comm_tools.h +++ b/PG-PuReMD/src/comm_tools.h @@ -24,21 +24,36 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Check_MPI_Error( int, const char * ); + void Setup_Comm( reax_system*, control_params*, mpi_datatypes* ); + void Update_Comm( reax_system* ); void Sort_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* ); + void Estimate_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* ); + void Unpack_Exchange_Message( reax_system*, int, void*, int, - neighbor_proc*, int ); + neighbor_proc*, int ); + void Unpack_Estimate_Message( reax_system*, int, void*, int, - neighbor_proc*, int ); + neighbor_proc*, int ); int SendRecv( reax_system*, mpi_datatypes*_data, MPI_Datatype, int*, - message_sorter, unpacker, int ); + message_sorter, unpacker, int ); void Comm_Atoms( reax_system*, control_params*, simulation_data*, storage*, - reax_list**, mpi_datatypes*, int ); + reax_list**, mpi_datatypes*, int ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/control.h b/PG-PuReMD/src/control.h index c6c6ce6c..24cf0451 100644 --- a/PG-PuReMD/src/control.h +++ b/PG-PuReMD/src/control.h @@ -24,6 +24,16 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + char Read_Control_File( char*, control_params*, output_controls* ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu similarity index 99% rename from PG-PuReMD/src/cuda_allocate.cu rename to PG-PuReMD/src/cuda/cuda_allocate.cu index 7970e9f6..5c722e56 100644 --- a/PG-PuReMD/src/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -6,10 +6,10 @@ #include "cuda_neighbors.h" #include "cuda_utils.h" -#include "allocate.h" -#include "index_utils.h" -#include "tool_box.h" -#include "vector.h" +#include "../allocate.h" +#include "../index_utils.h" +#include "../tool_box.h" +#include "../vector.h" extern "C" { diff --git a/PG-PuReMD/src/cuda_allocate.h b/PG-PuReMD/src/cuda/cuda_allocate.h similarity index 96% rename from PG-PuReMD/src/cuda_allocate.h rename to PG-PuReMD/src/cuda/cuda_allocate.h index 57124046..0d78d932 100644 --- a/PG-PuReMD/src/cuda_allocate.h +++ b/PG-PuReMD/src/cuda/cuda_allocate.h @@ -1,7 +1,7 @@ #ifndef __CUDA_ALLOCATE_H_ #define __CUDA_ALLOCATE_H_ -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus extern "C" { diff --git a/PG-PuReMD/src/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu similarity index 99% rename from PG-PuReMD/src/cuda_bond_orders.cu rename to PG-PuReMD/src/cuda/cuda_bond_orders.cu 
index 6e4344aa..bb478a3a 100644 --- a/PG-PuReMD/src/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu @@ -2,11 +2,12 @@ #include "cuda_bond_orders.h" #include "cuda_list.h" -#include "index_utils.h" -#include "bond_orders.h" #include "cuda_utils.h" #include "cuda_reduction.h" +#include "../index_utils.h" +#include "../bond_orders.h" + CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms, single_body_parameters *sbp, storage p_workspace, int N ) diff --git a/PG-PuReMD/src/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h similarity index 98% rename from PG-PuReMD/src/cuda_bond_orders.h rename to PG-PuReMD/src/cuda/cuda_bond_orders.h index 8be3a592..a957b11b 100644 --- a/PG-PuReMD/src/cuda_bond_orders.h +++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h @@ -2,10 +2,9 @@ #ifndef __CUDA_BOND_ORDERS_H__ #define __CUDA_BOND_ORDERS_H__ -#include "reax_types.h" -#include "reax_types.h" +#include "../reax_types.h" -#include "vector.h" +#include "../vector.h" extern "C" { diff --git a/PG-PuReMD/src/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu similarity index 98% rename from PG-PuReMD/src/cuda_bonds.cu rename to PG-PuReMD/src/cuda/cuda_bonds.cu index 81f3444b..e3592630 100644 --- a/PG-PuReMD/src/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_bonds.cu @@ -19,13 +19,12 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "reax_types.h" - -#include "index_utils.h" -#include "reax_types.h" +#include "cuda_bonds.h" #include "cuda_list.h" +#include "../index_utils.h" + CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, single_body_parameters *sbp, two_body_parameters *tbp, diff --git a/PG-PuReMD/src/cuda_bonds.h b/PG-PuReMD/src/cuda/cuda_bonds.h similarity index 69% rename from PG-PuReMD/src/cuda_bonds.h rename to PG-PuReMD/src/cuda/cuda_bonds.h index d8a7d273..fd9126be 100644 --- a/PG-PuReMD/src/cuda_bonds.h +++ b/PG-PuReMD/src/cuda/cuda_bonds.h @@ -22,16 +22,12 @@ #ifndef __CUDA_BONDS_H_ #define __CUDA_BONDS_H_ -#include "reax_types.h" - -CUDA_GLOBAL void Cuda_Bonds( reax_atom *, - global_parameters , - single_body_parameters *, - two_body_parameters *, - storage , - reax_list , - int , int , - real * - ); +#include "../reax_types.h" + + +CUDA_GLOBAL void Cuda_Bonds( reax_atom *, global_parameters, + single_body_parameters *, two_body_parameters *, storage, + reax_list, int, int, real * ); + #endif diff --git a/PG-PuReMD/src/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu similarity index 68% rename from PG-PuReMD/src/cuda_charges.cu rename to PG-PuReMD/src/cuda/cuda_charges.cu index c8410952..ada6bf2f 100644 --- a/PG-PuReMD/src/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -21,12 +21,13 @@ #include "cuda_charges.h" -#include "reax_types.h" +#include "cuda_lin_alg.h" #include "cuda_reduction.h" #include "cuda_utils.h" - #include "cuda_validation.h" +#include "../basic_comm.h" + CUDA_GLOBAL void k_init_matvec( reax_atom *my_atoms, single_body_parameters *sbp, storage p_workspace, int n ) @@ -204,3 +205,93 @@ void cuda_charges_updateq( reax_system *system, real *q ) cudaThreadSynchronize( ); cudaCheckError( ); } + + +void Cuda_Calculate_Charges( reax_system *system, storage *workspace, + mpi_datatypes *mpi_data ) +{ + int i, scale; + real u;//, s_sum, t_sum; + rvec2 my_sum, all_sum; + reax_atom *atom; + real *q; + + my_sum[0] = 0.0; + my_sum[1] = 0.0; + scale = sizeof(real) / sizeof(void); + q = (real *) host_scratch; + memset( q, 0, system->N * sizeof (real)); + + 
cuda_charges_x( system, my_sum ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n", + my_sum[0], my_sum[1] ); +#endif + + MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world ); + + u = all_sum[0] / all_sum[1]; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "Device: u: %f \n", u ); +#endif + + cuda_charges_st( system, workspace, q, u ); + + Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer ); + + cuda_charges_updateq( system, q ); +} + + +void Cuda_QEq( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + int s_matvecs, t_matvecs; + + Cuda_Init_MatVec( system, workspace ); + + //if (data->step > 0) { + // compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b"); + // compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x"); + // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); + // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); + //} + +//#ifdef __CUDA_DEBUG__ +// Init_MatVec( system, data, control, workspace, mpi_data ); +//#endif + +#if defined(DEBUG) + fprintf( stderr, "p%d: initialized qEq\n", system->my_rank ); + //Print_Linear_System( system, control, workspace, data->step ); +#endif + + //MATRIX CHANGES + s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H, + dev_workspace->b, control->q_err, dev_workspace->x, mpi_data, + out_control->log, data); + t_matvecs = 0; + //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs); + +#if defined(DEBUG) + fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); +#endif + + Cuda_Calculate_Charges( system, workspace, mpi_data ); + +#if defined(DEBUG) + fprintf( stderr, "p%d: computed charges\n", system->my_rank ); + //Print_Charges( system ); +#endif + +#if defined(LOG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + data->timing.s_matvecs += s_matvecs; + data->timing.t_matvecs += t_matvecs; + } +#endif +} diff --git a/PG-PuReMD/src/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h similarity index 89% rename from PG-PuReMD/src/cuda_charges.h rename to PG-PuReMD/src/cuda/cuda_charges.h index 2d421389..d1922a48 100644 --- a/PG-PuReMD/src/cuda_charges.h +++ b/PG-PuReMD/src/cuda/cuda_charges.h @@ -22,7 +22,7 @@ #ifndef __CUDA_CHARGES_H_ #define __CUDA_CHARGES_H_ -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus extern "C" { @@ -37,6 +37,9 @@ void cuda_charges_st( reax_system *, storage *, real *, real ); void cuda_charges_updateq( reax_system *, real * ); +void Cuda_QEq( reax_system*, control_params*, simulation_data*, + storage*, output_controls*, mpi_datatypes* ); + #ifdef __cplusplus } diff --git a/PG-PuReMD/src/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu similarity index 99% rename from PG-PuReMD/src/cuda_copy.cu rename to PG-PuReMD/src/cuda/cuda_copy.cu index a3bfca30..42055875 100644 --- a/PG-PuReMD/src/cuda_copy.cu +++ b/PG-PuReMD/src/cuda/cuda_copy.cu @@ -2,7 +2,8 @@ #include "cuda_copy.h" #include "cuda_utils.h" -#include "vector.h" + +#include "../vector.h" /* Copy grid info from host to device */ diff --git a/PG-PuReMD/src/cuda_copy.h b/PG-PuReMD/src/cuda/cuda_copy.h similarity index 93% rename from PG-PuReMD/src/cuda_copy.h rename to PG-PuReMD/src/cuda/cuda_copy.h index 51c4314c..72bf992c 100644 --- a/PG-PuReMD/src/cuda_copy.h +++ b/PG-PuReMD/src/cuda/cuda_copy.h @@ -1,24 +1,30 @@ #ifndef __CUDA_COPY_H_ #define __CUDA_COPY_H_ -#include 
"reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif void Sync_Atoms( reax_system * ); + void Sync_Grid( grid *, grid * ); + void Sync_System( reax_system * ); void Prep_Device_For_Output( reax_system *, simulation_data * ); + void Output_Sync_Lists( reax_list *host, reax_list *device, int type ); + void Output_Sync_Atoms( reax_system * ); -void Output_Sync_Simulation_Data( simulation_data *, simulation_data * ); +void Output_Sync_Simulation_Data( simulation_data *, simulation_data * ); #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu similarity index 100% rename from PG-PuReMD/src/cuda_environment.cu rename to PG-PuReMD/src/cuda/cuda_environment.cu diff --git a/PG-PuReMD/src/cuda_environment.h b/PG-PuReMD/src/cuda/cuda_environment.h similarity index 56% rename from PG-PuReMD/src/cuda_environment.h rename to PG-PuReMD/src/cuda/cuda_environment.h index f8ae3cd0..1cbcc92c 100644 --- a/PG-PuReMD/src/cuda_environment.h +++ b/PG-PuReMD/src/cuda/cuda_environment.h @@ -2,15 +2,19 @@ #ifndef __CUDA_ENVIRONMENT_H__ #define __CUDA_ENVIRONMENT_H__ +#include "../reax_types.h" + + #ifdef __cplusplus extern "C" { #endif -void Setup_Cuda_Environment (int, int, int); -void Cleanup_Cuda_Environment (); +void Setup_Cuda_Environment( int, int, int ); +void Cleanup_Cuda_Environment( ); #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu similarity index 89% rename from PG-PuReMD/src/cuda_forces.cu rename to PG-PuReMD/src/cuda/cuda_forces.cu index 831a5694..a790b1a8 100644 --- a/PG-PuReMD/src/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -1,24 +1,27 @@ #include "cuda_forces.h" -#include "reax_types.h" -#include "cuda_list.h" -#include "cuda_utils.h" +#include "cuda_bonds.h" +#include "cuda_bond_orders.h" +#include "cuda_charges.h" #include "cuda_helpers.h" +#include "cuda_hydrogen_bonds.h" +#include "cuda_lin_alg.h" +#include "cuda_list.h" +#include "cuda_multi_body.h" #include "cuda_neighbors.h" -#include "cuda_bond_orders.h" +#include "cuda_nonbonded.h" #include "cuda_reduction.h" -#include "cuda_bonds.h" -#include "cuda_multi_body.h" -#include "cuda_valence_angles.h" #include "cuda_torsion_angles.h" -#include "cuda_hydrogen_bonds.h" -#include "tool_box.h" -#include "cuda_nonbonded.h" +#include "cuda_utils.h" +#include "cuda_valence_angles.h" +#include "cuda_validation.h" -#include "index_utils.h" -#include "vector.h" -#include "forces.h" +#include "../basic_comm.h" +#include "../forces.h" +#include "../index_utils.h" +#include "../tool_box.h" +#include "../vector.h" CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control ) @@ -1719,3 +1722,201 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control Cuda_NonBonded_Energy( system, control, workspace, data, lists, out_control, (control->tabulate == 0) ? false: true ); } + + +void Cuda_Compute_Total_Force( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + reax_list **lists, mpi_datatypes *mpi_data ) +{ + rvec *f; + + f = (rvec *) host_scratch; + memset( f, 0, sizeof(rvec) * system->N ); + + Cuda_Total_Forces( system, control, data, workspace ); + +#if defined(PURE_REAX) + /* now all forces are computed to their partially-final values + * based on the neighbors information each processor has had. 
+ * final values of force on each atom needs to be computed by adding up + * all partially-final pieces */ + + //MVAPICH2 + copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N , + cudaMemcpyDeviceToHost, "total_force:f:get" ); + + Coll( system, mpi_data, f, mpi_data->mpi_rvec, + sizeof(rvec) / sizeof(void), rvec_unpacker ); + + copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N, + cudaMemcpyHostToDevice, "total_force:f:put" ); + + Cuda_Total_Forces_PURE( system, dev_workspace ); +#endif + +} + + +int Cuda_Compute_Forces( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, reax_list **lists, + output_controls *out_control, mpi_datatypes *mpi_data ) +{ + int charge_flag, retVal; + +#if defined(LOG_PERFORMANCE) + real t_start = 0; + + //MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + t_start = Get_Time( ); + } +#endif + + retVal = SUCCESS; + + /********* init forces ************/ + if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 ) + { + charge_flag = TRUE; + } + else + { + charge_flag = FALSE; + } + + if ( charge_flag == TRUE ) + { + retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control ); + +// int i; +// static reax_list **temp_lists; +// +// if ( data->step == 0 ) +// { +// temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" ); +// for ( i = 0; i < LIST_N; ++i ) +// { +// temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" ); +// temp_lists[i]->allocated = FALSE; +// } +// Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs, +// TYP_BOND, *temp_lists + BONDS ); +// Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs, +// TYP_HBOND, *temp_lists + HBONDS ); +// } +// else +// { +// Delete_List( *temp_lists + BONDS ); +// Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs, +// TYP_BOND, *temp_lists + BONDS ); +// Delete_List( *temp_lists + HBONDS ); +// Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs, +// TYP_HBOND, *temp_lists + HBONDS ); +// +// } +// Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND ); +// Print_Bonds( system, temp_lists, control ); +// Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND ); +// Print_HBonds( system, temp_lists, control, data->step ); +// Print_HBond_Indices( system, temp_lists, control, data->step ); +// exit( 0 ); + } + else + { + retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control ); + } + + if ( retVal == SUCCESS ) + { + //validate_sparse_matrix( system, workspace ); + +#if defined(LOG_PERFORMANCE) + //MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &(data->timing.init_forces) ); + } +#endif + + /********* bonded interactions ************/ + retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); + +#if defined(LOG_PERFORMANCE) + //MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &(data->timing.bonded) ); + } +#endif + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d @ step%d: completed bonded\n", + system->my_rank, data->step ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + } + + if ( retVal == SUCCESS ) + { + /**************** charges ************************/ +#if defined(PURE_REAX) + if ( charge_flag == TRUE ) + { + Cuda_QEq( system, control, data, 
workspace, out_control, mpi_data ); + } + +#if defined(LOG_PERFORMANCE) + //MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &(data->timing.qEq) ); + } +#endif + +#if defined(DEBUG_FOCUS) + fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step); + MPI_Barrier( MPI_COMM_WORLD ); +#endif +#endif //PURE_REAX + + /********* nonbonded interactions ************/ + Cuda_Compute_NonBonded_Forces( system, control, data, workspace, + lists, out_control, mpi_data ); + +#if defined(LOG_PERFORMANCE) + //MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &(data->timing.nonb) ); + } +#endif +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n", + system->my_rank, data->step ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + + /*********** total force ***************/ + Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data ); + +#if defined(LOG_PERFORMANCE) + //MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &(data->timing.bonded) ); + } +#endif +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d @ step%d: total forces computed\n", + system->my_rank, data->step ); + //Print_Total_Force( system, data, workspace ); + MPI_Barrier( MPI_COMM_WORLD ); + +#endif + +// Print_Forces( system ); + } + + return retVal; +} diff --git a/PG-PuReMD/src/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h similarity index 86% rename from PG-PuReMD/src/cuda_forces.h rename to PG-PuReMD/src/cuda/cuda_forces.h index 9dc3da66..4abdb52f 100644 --- a/PG-PuReMD/src/cuda_forces.h +++ b/PG-PuReMD/src/cuda/cuda_forces.h @@ -2,7 +2,7 @@ #ifndef __CUDA_FORCES_H__ #define __CUDA_FORCES_H__ -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus @@ -32,6 +32,9 @@ void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *, simulation_data *, storage *, reax_list **, output_controls *, mpi_datatypes * ); +int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*, + storage*, reax_list**, output_controls*, mpi_datatypes* ); + void Print_Forces( reax_system * ); diff --git a/PG-PuReMD/src/cuda_helpers.h b/PG-PuReMD/src/cuda/cuda_helpers.h similarity index 97% rename from PG-PuReMD/src/cuda_helpers.h rename to PG-PuReMD/src/cuda/cuda_helpers.h index a4943a5f..b14f45b3 100644 --- a/PG-PuReMD/src/cuda_helpers.h +++ b/PG-PuReMD/src/cuda/cuda_helpers.h @@ -1,7 +1,7 @@ #ifndef __CUDA_HELPERS__ #define __CUDA_HELPERS__ -#include "reax_types.h" +#include "../reax_types.h" CUDA_DEVICE static inline int cuda_strcmp( char * a, diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu similarity index 99% rename from PG-PuReMD/src/cuda_hydrogen_bonds.cu rename to PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu index 95eda081..18cdbb57 100644 --- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu @@ -21,16 +21,14 @@ #include "cuda_hydrogen_bonds.h" -#include "reax_types.h" -#include "index_utils.h" - #include "cuda_valence_angles.h" #include "cuda_helpers.h" #include "cuda_list.h" -#include "vector.h" - #include "cuda_shuffle.h" +#include "../index_utils.h" +#include "../vector.h" + CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameters *sbp, hbond_parameters *d_hbp, global_parameters gp, control_params *control, diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h 
b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h new file mode 100644 index 00000000..606196b4 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h @@ -0,0 +1,48 @@ +/*---------------------------------------------------------------------- + PuReMD - Purdue ReaxFF Molecular Dynamics Program + + Copyright (2010) Purdue University + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Joseph Fogarty, jcfogart@mail.usf.edu + Sagar Pandit, pandit@usf.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_HBONDS_H_ +#define __CUDA_HBONDS_H_ + +#include "../reax_types.h" + + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *, + storage, reax_list ); + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *, + storage, reax_list, int ); + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *, + storage, reax_list, int ); + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *, + single_body_parameters *, hbond_parameters *, + global_parameters, control_params *, storage , + reax_list, reax_list, int, int, real *, rvec * ); + +CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *, + single_body_parameters *, hbond_parameters *, + global_parameters , control_params *, storage, + reax_list, reax_list, int, int, real *, rvec * ); + + +#endif diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu new file mode 100644 index 00000000..fb1ac0df --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -0,0 +1,412 @@ + +#include "cuda_init_md.h" + +#include "cuda_allocate.h" +#include "cuda_list.h" +#include "cuda_copy.h" +#include "cuda_forces.h" +#include "cuda_integrate.h" +#include "cuda_neighbors.h" +#include "cuda_reset_tools.h" +#include "cuda_system_props.h" +#include "cuda_utils.h" +#include "cuda_validation.h" + +#if defined(PURE_REAX) + #include "../box.h" + #include "../comm_tools.h" + #include "../grid.h" + #include "../init_md.h" + #include "../integrate.h" + #include "../io_tools.h" +#ifdef __cplusplus +extern "C" { +#endif + #include "../lookup.h" +#ifdef __cplusplus +} +#endif + #include "../random.h" + #include "../reset_tools.h" + #include "../tool_box.h" + #include "../vector.h" +#elif defined(LAMMPS_REAX) + #include "../reax_box.h" + #include "../reax_comm_tools.h" + #include "../reax_grid.h" + #include "../reax_init_md.h" + #include "../reax_integrate.h" + #include "../reax_io_tools.h" + #include "../reax_list.h" + #include "../reax_lookup.h" + #include "../reax_random.h" + #include "../reax_reset_tools.h" + #include "../reax_tool_box.h" + #include "../reax_vector.h" +#endif + + +void Cuda_Init_ScratchArea( ) +{ + cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" ); + + host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" ); +} + + +int Cuda_Init_System( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + mpi_datatypes *mpi_data, char *msg ) +{ + int i, ret; + reax_atom 
*atom; + int nrecv[MAX_NBRS]; + + Setup_New_Grid( system, control, MPI_COMM_WORLD ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d GRID:\n", system->my_rank ); + Print_Grid( &(system->my_grid), stderr ); +#endif + + Bin_My_Atoms( system, &(workspace->realloc) ); + Reorder_My_Atoms( system, workspace ); + + /* estimate N and total capacity */ + for ( i = 0; i < MAX_NBRS; ++i ) + { + nrecv[i] = 0; + } + + MPI_Barrier( MPI_COMM_WORLD ); + system->max_recved = 0; + system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv, + Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE ); + system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP ); + Bin_Boundary_Atoms( system ); + + /* Sync atoms here to continue the computation */ + dev_alloc_system( system ); + Sync_System( system ); + + /* estimate numH and Hcap */ + Cuda_Reset_Atoms( system, control ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d: n=%d local_cap=%d\n", + system->my_rank, system->n, system->local_cap ); + fprintf( stderr, "p%d: N=%d total_cap=%d\n", + system->my_rank, system->N, system->total_cap ); + fprintf( stderr, "p%d: numH=%d H_cap=%d\n", + system->my_rank, system->numH, system->Hcap ); +#endif + + Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D ); + + Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D ); + +// if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE ) +// { +// return FAILURE; +// } + + /* initialize velocities so that desired init T can be attained */ + if ( !control->restart || (control->restart && control->random_vel) ) + { + Generate_Initial_Velocities( system, control->T_init ); + } + + Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); + + return SUCCESS; +} + + +void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, + simulation_data *data, char *msg ) +{ + dev_alloc_simulation_data( data ); + + Reset_Simulation_Data( data ); + + if ( !control->restart ) + { + data->step = data->prev_steps = 0; + } + + switch ( control->ensemble ) + { + case NVE: + data->N_f = 3 * system->bigN; + Cuda_Evolve = Velocity_Verlet_NVE; + control->virial = 0; + break; + + case bNVT: + data->N_f = 3 * system->bigN + 1; + Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT; + control->virial = 0; + break; + + case nhNVT: + fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" ); + data->N_f = 3 * system->bigN + 1; + Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein; + control->virial = 0; + if ( !control->restart || (control->restart && control->random_vel) ) + { + data->therm.G_xi = control->Tau_T * + (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->therm.v_xi_old = 0; + data->therm.xi = 0; + } + break; + + case sNPT: /* Semi-Isotropic NPT */ + data->N_f = 3 * system->bigN + 4; + Cuda_Evolve = Velocity_Verlet_Berendsen_NPT; + control->virial = 1; + if ( !control->restart ) + { + Reset_Pressures( data ); + } + break; + + case iNPT: /* Isotropic NPT */ + data->N_f = 3 * system->bigN + 2; + Cuda_Evolve = Velocity_Verlet_Berendsen_NPT; + control->virial = 1; + if ( !control->restart ) + { + Reset_Pressures( data ); + } + break; + + case NPT: /* Anisotropic NPT */ + data->N_f = 3 * system->bigN + 9; + Cuda_Evolve = Velocity_Verlet_Berendsen_NPT; + control->virial = 1; + + fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); + 
break; + + default: + fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); + } + + /* initialize the timer(s) */ + MPI_Barrier( MPI_COMM_WORLD ); + if ( system->my_rank == MASTER_NODE ) + { + data->timing.start = Get_Time( ); + +#if defined(LOG_PERFORMANCE) + Reset_Timing( &data->timing ); +#endif + } + +#if defined(DEBUG) + fprintf( stderr, "data->N_f: %8.3f\n", data->N_f ); +#endif +} + + +void Cuda_Init_Workspace( reax_system *system, control_params *control, + storage *workspace, char *msg ) +{ + dev_alloc_workspace( system, control, dev_workspace, + system->local_cap, system->total_cap, msg ); + + memset( &(workspace->realloc), 0, sizeof(reallocate_data) ); + Cuda_Reset_Workspace( system, workspace ); + + /* Initialize the Taper function */ + Init_Taper( control, dev_workspace ); +} + + +int Cuda_Init_Lists( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, reax_list **lists, + mpi_datatypes *mpi_data, char *msg ) +{ + int ret; + int Htop; + + /* ignore returned error, as system->d_max_far_nbrs was not valid */ + ret = Cuda_Estimate_Neighbors( system, data->step ); + + Dev_Make_List( system->total_cap, system->total_far_nbrs, + TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n", + system->my_rank, system->total_far_nbrs, + (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) ); + fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap ); +#endif + + Cuda_Init_Neighbor_Indices( system ); + + Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists ); + + /* estimate storage for bonds and hbonds */ + Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step ); + + /* estimate storage for charge sparse matrix */ +// Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists ); + + dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries ); + + Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) ); + + //MATRIX CHANGES + //workspace->L = NULL; + //workspace->U = NULL; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n", + system->my_rank, system->total_cm_entries, dev_workspace->H.m ); + fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n", + system->my_rank, Htop, + (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) ); +#endif + + if ( control->hbond_cut > 0.0 && system->numH > 0 ) + { + Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS ); +// Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS ); + + Cuda_Init_HBond_Indices( system ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n", + system->my_rank, system->total_hbonds, + (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) ); +#endif + } + + /* bonds list */ + Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS ); +// Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS ); + + Cuda_Init_Bond_Indices( system ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n", + system->my_rank, total_bonds, + (int)(total_bonds * sizeof(bond_data) / (1024 * 1024)) ); +#endif + + /* 3bodies list: since a more 
accurate estimate of the num. + * of three body interactions requires that bond orders have + * been computed, delay estimation until for computation */ + + return SUCCESS; +} + + +void Cuda_Initialize( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + reax_list **lists, output_controls *out_control, + mpi_datatypes *mpi_data ) +{ + char msg[MAX_STR]; + real t_start, t_end; + + /* HOST/DEVICE SCRATCH */ + Cuda_Init_ScratchArea( ); + + /* MPI_DATATYPES */ + if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE ) + { + fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n", + system->my_rank ); + fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); + } + + /* SYSTEM */ + if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE ) + { + fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); + fprintf( stderr, "p%d: system could not be initialized! terminating.\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); + } + + /* GRID */ + dev_alloc_grid( system ); + Sync_Grid( &system->my_grid, &system->d_my_grid ); + + //validate_grid( system ); + + /* SIMULATION_DATA */ + Cuda_Init_Simulation_Data( system, control, data, msg ); + + /* WORKSPACE */ + Cuda_Init_Workspace( system, control, workspace, msg ); + +#if defined(DEBUG) + fprintf( stderr, "p%d: initialized workspace\n", system->my_rank ); +#endif + + //Sync the taper here from host to device. + + /* CONTROL */ + dev_alloc_control( control ); + + /* LISTS */ + if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) == + FAILURE ) + { + fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); + fprintf( stderr, "p%d: system could not be initialized! terminating.\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); + } + +#if defined(DEBUG) + fprintf( stderr, "p%d: initialized lists\n", system->my_rank ); +#endif + + /* OUTPUT Files */ + if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE ) + { + fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); + fprintf( stderr, "p%d: could not open output files! terminating...\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); + } + +#if defined(DEBUG) + fprintf( stderr, "p%d: output files opened\n", system->my_rank ); +#endif + + /* Lookup Tables */ + if ( control->tabulate ) + { + if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE ) + { + fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); + fprintf( stderr, "p%d: couldn't create lookup table! 
terminating.\n", + system->my_rank ); + MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); + } + +#if defined(DEBUG) + fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank ); +#endif + } + +#if defined(DEBUG) + fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank ); +#endif +} + + diff --git a/PG-PuReMD/src/cuda/cuda_init_md.h b/PG-PuReMD/src/cuda/cuda_init_md.h new file mode 100644 index 00000000..328674a5 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_init_md.h @@ -0,0 +1,22 @@ + +#ifndef __CUDA_INIT_MD_H__ +#define __CUDA_INIT_MD_H__ + +#include "../reax_types.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Init_ScratchArea( ); + +void Cuda_Initialize( reax_system*, control_params*, simulation_data*, + storage*, reax_list**, output_controls*, mpi_datatypes* ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu new file mode 100644 index 00000000..dcb97292 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_integrate.cu @@ -0,0 +1,249 @@ + +#include "cuda_integrate.h" + +#include "cuda_allocate.h" +#include "cuda_forces.h" +#include "cuda_integrate.h" +#include "cuda_copy.h" +#include "cuda_neighbors.h" +#include "cuda_reset_tools.h" +#include "cuda_system_props.h" +#include "cuda_utils.h" + +#include "../comm_tools.h" +#include "../grid.h" +#include "../vector.h" + + +CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms, + single_body_parameters *sbp, real dt, int n ) +{ + real inv_m; + rvec dx; + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) + { + return; + } + + /* velocity verlet, 1st part */ + atom = &(my_atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); +} + + +void bNVT_update_velocity_part1( reax_system *system, real dt ) +{ + int blocks; + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms, + single_body_parameters *sbp, real dt, int n ) +{ + reax_atom *atom; + real inv_m; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) + { + return; + } + + /* velocity verlet, 2nd part */ + atom = &(my_atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); +} + + +void bNVT_update_velocity_part2( reax_system *system, real dt ) +{ + int blocks; + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n ) +{ + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) + { + return; + } + + /* Scale velocities and positions at t+dt */ + atom = &(my_atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); +} + + +void bNVT_scale_velocities( reax_system *system, real lambda ) +{ + int blocks; + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, lambda, system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control, + simulation_data *data, storage *workspace, reax_list **lists, + output_controls *out_control, mpi_datatypes *mpi_data ) +{ + int i, steps, renbr, ret; + static int verlet_part1_done = FALSE, estimate_nbrs_done = 0; + real inv_m, dt, lambda; + rvec dx; + reax_atom *atom; + int *bond_top, *hb_top; + int Htop, num_3body; + int total_hbonds, count, total_bonds; + int bond_cap, cap_3body; + real t_over_start, t_over_elapsed; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + + dt = control->dt; + steps = data->step - data->prev_steps; + renbr = steps % control->reneighbor == 0 ? TRUE : FALSE; + ret = SUCCESS; + + Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); + + if ( verlet_part1_done == FALSE ) + { + /* velocity verlet, 1st part */ + bNVT_update_velocity_part1( system, dt ); + verlet_part1_done = TRUE; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + + if ( renbr ) + { + Update_Grid( system, control, mpi_data->world ); + } + + Output_Sync_Atoms( system ); + Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr ); + Sync_Atoms( system ); + + /* synch the Grid to the Device here */ + Sync_Grid( &system->my_grid, &system->d_my_grid ); + + init_blocks( system ); + +#if defined(__CUDA_DEBUG_LOG__) + fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n", + system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); +#endif + } + + Cuda_Reset( system, control, data, workspace, lists ); + + if ( renbr ) + { +#if defined(DEBUG) + t_over_start = Get_Time (); +#endif + + if ( estimate_nbrs_done == 0 ) + { + //TODO: move far_nbrs reallocation checks outside of renbr frequency check + ret = Cuda_Estimate_Neighbors( system, data->step ); + estimate_nbrs_done = 1; + } + + if ( ret == SUCCESS && estimate_nbrs_done == 1 ) + { + Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + estimate_nbrs_done = 2; + +#if defined(DEBUG) + t_over_elapsed = Get_Timing_Info( t_over_start ); + fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", + system->my_rank, data->step, t_over_elapsed ); +#endif + } + } + + if ( ret == SUCCESS ) + { + ret = Cuda_Compute_Forces( system, control, data, workspace, + lists, out_control, mpi_data ); + } + + if ( ret == SUCCESS ) + { + /* velocity verlet, 2nd part */ + bNVT_update_velocity_part2( system, dt ); + +#if defined(DEBUG_FOCUS) + fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + + /* 
temperature scaler */ + Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); + + lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); + if ( lambda < MIN_dT ) + { + lambda = MIN_dT; + } + else if (lambda > MAX_dT ) + { + lambda = MAX_dT; + } + lambda = SQRT( lambda ); + + /* Scale velocities and positions at t+dt */ + bNVT_scale_velocities( system, lambda ); + + Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "p%d @ step%d: scaled velocities\n", + system->my_rank, data->step ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + + verlet_part1_done = FALSE; + estimate_nbrs_done = 0; + } + + return ret; +} diff --git a/PG-PuReMD/src/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_integrate.h similarity index 86% rename from PG-PuReMD/src/cuda_integrate.h rename to PG-PuReMD/src/cuda/cuda_integrate.h index b71e14e3..2797b3e3 100644 --- a/PG-PuReMD/src/cuda_integrate.h +++ b/PG-PuReMD/src/cuda/cuda_integrate.h @@ -22,18 +22,26 @@ #ifndef __CUDA_INTEGRATE_H_ #define __CUDA_INTEGRATE_H_ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif void bNVT_update_velocity_part1( reax_system *, real ); + void bNVT_update_velocity_part2( reax_system *, real ); + void bNVT_scale_velocities( reax_system *, real ); +int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*, + simulation_data*, storage*, reax_list**, output_controls*, + mpi_datatypes* ); + #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu new file mode 100644 index 00000000..dc7a2fc3 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu @@ -0,0 +1,1113 @@ +/*---------------------------------------------------------------------- + PuReMD - Purdue ReaxFF Molecular Dynamics Program + + Copyright (2010) Purdue University + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Joseph Fogarty, jcfogart@mail.usf.edu + Sagar Pandit, pandit@usf.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_lin_alg.h" + +#include "cuda_shuffle.h" +#include "cuda_utils.h" +#include "cuda_reduction.h" + +#include "../basic_comm.h" + + +//one thread per row +CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results, + int rows ) +{ + int i, col; + real results_row; + real val; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= rows ) + { + return; + } + + results_row = 0; + + for (int c = H.start[i]; c < H.end[i]; c++) + { + col = H.entries [c].j; + val = H.entries[c].val; + + results_row += val * vec[col]; + } + + results[i] = results_row; +} + + +//32 thread warp per matrix row. 
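/* A minimal, self-contained sketch of the grid-sizing idiom used by the
 * wrappers in this file and in cuda_integrate.cu above: each launch needs
 * ceil(n / DEF_BLOCK_SIZE) blocks so that every row/atom index below n is
 * covered exactly once while the kernels' early-return guard (i >= n)
 * drops the padding threads. The helper name calc_blocks is illustrative
 * only; the existing code inlines the equivalent expression at each call
 * site. */
static inline int calc_blocks( int n, int block_size )
{
    /* integer ceiling division, equivalent to
     * n / block_size + ((n % block_size == 0) ? 0 : 1) */
    return (n + block_size - 1) / block_size;
}

/* usage sketch, assuming a kernel with an (i >= n) guard such as k_matvec:
 *
 *   int blocks = calc_blocks( n, DEF_BLOCK_SIZE );
 *   k_matvec <<< blocks, DEF_BLOCK_SIZE >>> ( H, x, b, n );
 *   cudaDeviceSynchronize( );
 */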
+//invoked as follows +// <<< system->N, 32 >>> +//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows) +CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results, + int num_rows ) +{ +#if defined(__SM_35__) + real vals; + int x; +#else + extern __shared__ real vals[]; +#endif + int jj; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; + int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1); + int row_start; + int row_end; + // one warp per row + int row = warp_id; + +#if defined(__SM_35__) + vals = 0; +#else + vals[threadIdx.x] = 0; +#endif + + if (row < num_rows) + { + row_start = H.start[row]; + row_end = H.end[row]; + + // compute running sum per thread + for ( jj = row_start + lane; jj < row_end; + jj += MATVEC_KER_THREADS_PER_ROW ) +#if defined(__SM_35__) + { + vals += H.entries[jj].val * vec[ H.entries[jj].j ]; + } + } +#else + { + vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ]; + } + } + + __syncthreads( ); +#endif + + // parallel reduction in shared memory + //SIMD instructions with a WARP are synchronous -- so we do not need to synch here +#if defined(__SM_35__) + for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2) + { + vals += shfl( vals, x ); + } + + if (lane == 0 && row < num_rows) + { + results[row] = vals; + } +#else + if (lane < 16) + { + vals[threadIdx.x] += vals[threadIdx.x + 16]; + } + __syncthreads( ); + if (lane < 8) + { + vals[threadIdx.x] += vals[threadIdx.x + 8]; + } + __syncthreads( ); + if (lane < 4) + { + vals[threadIdx.x] += vals[threadIdx.x + 4]; + } + __syncthreads( ); + if (lane < 2) + { + vals[threadIdx.x] += vals[threadIdx.x + 2]; + } + __syncthreads( ); + if (lane < 1) + { + vals[threadIdx.x] += vals[threadIdx.x + 1]; + } + __syncthreads( ); + + // first thread writes the result + if (lane == 0 && row < num_rows) + { + results[row] = vals[threadIdx.x]; + } +#endif +} + + +//one thread per row +CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results, + int rows ) +{ + int i, c, col; + rvec2 results_row; + real val; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= rows) + { + return; + } + + results_row[0] = 0.0; + results_row[1] = 0.0; + + for (c = H.start[i]; c < H.end[i]; c++) + { + col = H.entries [c].j; + val = H.entries[c].val; + + results_row[0] += val * vec [col][0]; + results_row[1] += val * vec [col][1]; + } + + results[i][0] = results_row[0]; + results[i][1] = results_row[1]; +} + + +//32 thread warp per matrix row. 
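/* A minimal sketch of the one-warp-per-row CSR mat-vec strategy that
 * k_matvec_csr and k_dual_matvec_csr implement, written against a plain
 * CSR layout (row_ptr/col_ind/val) instead of the sparse_matrix_entry
 * struct, and using __shfl_down_sync (CUDA 9+) in place of the shfl()
 * wrapper from cuda_shuffle.h used on __SM_35__ builds. Assumes a warp
 * size of 32; all names here are illustrative, not part of this code base. */
__global__ void k_csr_spmv_warp_per_row( const int *row_ptr, const int *col_ind,
        const double *val, const double *x, double *y, int num_rows )
{
    const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = thread_id / 32;        /* one warp per matrix row */
    const int lane = thread_id & 31;
    double sum = 0.0;

    if ( row < num_rows )
    {
        /* each lane strides over the nonzeros of its row */
        for ( int jj = row_ptr[row] + lane; jj < row_ptr[row + 1]; jj += 32 )
        {
            sum += val[jj] * x[col_ind[jj]];
        }
    }

    /* warp-level tree reduction; lanes of a warp run in lockstep under the
     * _sync intrinsic, so no __syncthreads is needed, matching the comment
     * in the kernels above */
    for ( int offset = 16; offset >= 1; offset >>= 1 )
    {
        sum += __shfl_down_sync( 0xffffffff, sum, offset );
    }

    if ( lane == 0 && row < num_rows )
    {
        y[row] = sum;
    }
}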
+//invoked as follows +// <<< system->N, 32 >>> +//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows) +CUDA_GLOBAL void k_dual_matvec_csr( sparse_matrix H, rvec2 *vec, + rvec2 *results, int num_rows ) +{ +#if defined(__SM_35__) + rvec2 rvals; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; + int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1); + int row_start; + int row_end; + // one warp per row + int row = warp_id; + + rvals[0] = 0; + rvals[1] = 0; + + if (row < num_rows) + { + row_start = H.start[row]; + row_end = H.end[row]; + + for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) + { + rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; + rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; + } + } + + for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2) + { + rvals[0] += shfl( rvals[0], s); + rvals[1] += shfl( rvals[1], s); + } + + if (lane == 0 && row < num_rows) + { + results[row][0] = rvals[0]; + results[row][1] = rvals[1]; + } + +#else + extern __shared__ rvec2 rvals[]; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / 32; + int lane = thread_id & (32 - 1); + int row_start; + int row_end; + // one warp per row + //int row = warp_id; + int row = warp_id; + + rvals[threadIdx.x][0] = 0; + rvals[threadIdx.x][1] = 0; + + if (row < num_rows) + { + row_start = H.start[row]; + row_end = H.end[row]; + + // compute running sum per thread + for(int jj = row_start + lane; jj < row_end; jj += 32) + { + rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; + rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; + } + } + + __syncthreads( ); + + // parallel reduction in shared memory + //SIMD instructions with a WARP are synchronous -- so we do not need to synch here + if (lane < 16) + { + rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; + rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; + } + __syncthreads( ); + if (lane < 8) + { + rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; + rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; + } + __syncthreads( ); + if (lane < 4) + { + rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; + rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; + } + __syncthreads( ); + if (lane < 2) + { + rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; + rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; + } + __syncthreads( ); + if (lane < 1) + { + rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; + rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; + } + __syncthreads( ); + + // first thread writes the result + if (lane == 0 && row < num_rows) + { + results[row][0] = rvals[threadIdx.x][0]; + results[row][1] = rvals[threadIdx.x][1]; + } + +#endif +} + + +void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count ) +{ + //res = ax + by + //use the cublas here + int blocks; + + blocks = (count / DEF_BLOCK_SIZE) + + ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> + ( res, a, x, b, y, count ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count ) +{ + //res = a*b - vector multiplication + //use the cublas here. + int blocks; + + blocks = (count / DEF_BLOCK_SIZE) + + ((count % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + + k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>> + ( res, a, b, count ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n) +{ + storage *workspace; + int j; + + j = blockIdx.x * blockDim.x + threadIdx.x; + + if ( j >= n ) + { + return; + } + + workspace = &( p_workspace ); + + //for( j = 0; j < system->n; ++j ) { + // residual + workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; + workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; + + // apply diagonal pre-conditioner + workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; + workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; + //} +} + + +void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n ) +{ + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> + (*workspace, b, n); + + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, + real alpha_0, real alpha_1, int n, rvec2 *my_dot ) +{ + storage *workspace; + rvec2 alpha; + int j; + + j = blockIdx.x * blockDim.x + threadIdx.x; + + if ( j >= n ) + { + return; + } + + workspace = &( p_workspace ); + alpha[0] = alpha_0; + alpha[1] = alpha_1; + my_dot[j][0] = my_dot[j][1] = 0.0; + + //for( j = 0; j < system->n; ++j ) { + // update x + x[j][0] += alpha[0] * workspace->d2[j][0]; + x[j][1] += alpha[1] * workspace->d2[j][1]; + + // update residual + workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; + workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; + + // apply diagonal pre-conditioner + workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; + workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; + + // dot product: r.p + my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; + my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; + //} +} + + +void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha, + int n, rvec2 result ) +{ + int blocks; + rvec2 *tmp = (rvec2 *) scratch; + + cuda_memset( tmp, 0, sizeof(rvec2) * ( 2 * n + 1), + "cuda_dualcg_preconditioner" ); + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> + (*workspace, x, alpha[0], alpha[1], n, tmp); + + cudaThreadSynchronize( ); + cudaCheckError( ); + + //Reduction to calculate my_dot + k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> + ( tmp, tmp + n, n); + + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> + ( tmp + n, tmp + 2*n, blocks); + + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( result, (tmp + 2*n), sizeof(rvec2), + cudaMemcpyDeviceToHost, "my_dot" ); +} + + +void Cuda_Norm( rvec2 *arr, int n, rvec2 result ) +{ + int blocks; + rvec2 *tmp = (rvec2 *) scratch; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + + k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> + (arr, tmp, n, INITIAL); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> + (tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), + cudaMemcpyDeviceToHost, "cuda_norm_rvec2" ); +} + + +void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n ) +{ + int blocks; + rvec2 *tmp = (rvec2 *) scratch; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> + ( a, b, tmp, n ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> + //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * BLOCKS_POW_2 >>> + ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), + cudaMemcpyDeviceToHost, "cuda_dot" ); +} + + +void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) +{ + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> + ( x, a, b[0], b[1], c, n); + + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n ) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= n) + { + return; + } + + dst[i] = src[i][index]; +} + + +void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n ) +{ + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>> + ( dst, src, index, n); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= n) + { + return; + } + + dst[i][index] = src[i]; +} + + +void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n) +{ + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>> + ( dst, src, index, n); + + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size ) +{ + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); + + cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" ); + + //One thread per row implementation + //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> + // (*H, a, b, n); + //cudaThreadSynchronize (); + //cudaCheckError (); + + //One warp per row implementation +#if defined(__SM_35__) + k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> +#else + k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, + sizeof(rvec2) * MATVEC_BLOCK_SIZE >>> +#endif + ( *H, a, b, n ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size ) +{ + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE) == 0 ? 
0 : 1); + + cuda_memset( b, 0, sizeof(real) * size, "dual_matvec:result" ); + + //one thread per row implementation + //k_matvec <<< blocks, DEF_BLOCK_SIZE >>> + // (*H, a, b, n); + //cudaThreadSynchronize (); + //cudaCheckError (); + +#if defined(__SM_35__) + k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> +#else + k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, + sizeof(real) * MATVEC_BLOCK_SIZE>>> +#endif + (*H, a, b, n); + + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, + rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, + simulation_data *data ) +{ + int i, j, n, N, matvecs, scale; + rvec2 tmp, alpha, beta; + rvec2 my_sum, norm_sqr, b_norm, my_dot; + rvec2 sig_old, sig_new; + MPI_Comm comm; + rvec2 *spad = (rvec2 *) host_scratch; + int a; + + n = system->n; + N = system->N; + comm = mpi_data->world; + matvecs = 0; + scale = sizeof(rvec2) / sizeof(void); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + matvecs = 0; + t_start = matvec_time = dot_time = 0; + t_start = Get_Time( ); + } +#endif + + //MVAPICH2 +//#ifdef __CUDA_DEBUG__ +// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer ); +//#endif + +// check_zeros_device( x, system->N, "x" ); + + copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" ); + Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); + copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" ); + +// check_zeros_device( x, system->N, "x" ); + +// compare_rvec2 (workspace->x, x, N, "x"); +// if (data->step > 0) { +// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b"); +// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x"); +// +// exit (0); +// } + + +//#ifdef __CUDA_DEBUG__ +// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N ); +//#endif + //originally we were using only H->n which was system->n (init_md.c) + //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap); + + Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap); + +// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); + +// if (data->step > 0) exit (0); + + // tryQEq + //MVAPICH2 +//#ifdef __CUDA_DEBUG__ +// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); +//#endif + + copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, + cudaMemcpyDeviceToHost, "CG:q2:get" ); + Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, + cudaMemcpyHostToDevice,"CG:q2:put" ); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &matvec_time ); + } +#endif + +//#ifdef __CUDA_DEBUG__ +// for( j = 0; j < system->n; ++j ) { +// // residual +// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0]; +// workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1]; +// // apply diagonal pre-conditioner +// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; +// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; +// } +//#endif + + Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n ); + +// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2"); +// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2"); + + /* norm 
of b */ +//#ifdef __CUDA_DEBUG__ +// my_sum[0] = my_sum[1] = 0; +// for( j = 0; j < n; ++j ) { +// my_sum[0] += SQR( workspace->b[j][0] ); +// my_sum[1] += SQR( workspace->b[j][1] ); +// } +// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); +//#endif + + my_sum[0] = my_sum[1] = 0; + Cuda_Norm (b, n, my_sum); + +// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); + + MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm ); + b_norm[0] = SQRT( norm_sqr[0] ); + b_norm[1] = SQRT( norm_sqr[1] ); + //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] ); + + /* dot product: r.d */ +//#ifdef __CUDA_DEBUG__ +// my_dot[0] = my_dot[1] = 0; +// for( j = 0; j < n; ++j ) { +// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0]; +// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1]; +// } +// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); +//#endif + + my_dot[0] = my_dot[1] = 0; + Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n); + +// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); + + MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); + + //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &dot_time ); + } +#endif + + for ( i = 1; i < 300; ++i ) + { + //MVAPICH2 +//#ifdef __CUDA_DEBUG__ +// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer); +//#endif + + copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, + cudaMemcpyDeviceToHost, "cg:d2:get" ); + Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); + copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, + cudaMemcpyHostToDevice, "cg:d2:put" ); + + //print_device_rvec2 (dev_workspace->d2, N); + +//#ifdef __CUDA_DEBUG__ +// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N ); +//#endif + + Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, + system->total_cap ); + + /* + fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n ); + fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n"); + print_sparse_matrix_host (&workspace->H); + fprintf (stderr, " ******* HOST Vector ***************\n"); + print_host_rvec2 (workspace->d2, system->N); + fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n"); + print_sparse_matrix (&dev_workspace->H); + fprintf (stderr, " ******* Device Vector ***************\n"); + print_device_rvec2 (dev_workspace->d2, system->N); + */ + //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); + + // tryQEq + // MVAPICH2 +//#ifdef __CUDA_DEBUG__ +// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); +//#endif + + copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, + cudaMemcpyDeviceToHost, "cg:q2:get" ); + Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); + copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, + cudaMemcpyHostToDevice, "cg:q2:put" ); + +// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &matvec_time ); + } +#endif + + /* dot product: d.q */ +//#ifdef __CUDA_DEBUG__ +// my_dot[0] = my_dot[1] = 0; +// for( j = 0; j < n; ++j ) { +// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0]; +// my_dot[1] 
+= workspace->d2[j][1] * workspace->q2[j][1]; +// } +// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); +//#endif + + my_dot[0] = my_dot[1] = 0; + Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n); + //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); + + MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm ); + //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] ); + + alpha[0] = sig_new[0] / tmp[0]; + alpha[1] = sig_new[1] / tmp[1]; + my_dot[0] = my_dot[1] = 0; + +//#ifdef __CUDA_DEBUG__ +// for( j = 0; j < system->n; ++j ) { +// // update x +// workspace->x[j][0] += alpha[0] * workspace->d2[j][0]; +// workspace->x[j][1] += alpha[1] * workspace->d2[j][1]; +// // update residual +// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; +// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; +// // apply diagonal pre-conditioner +// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; +// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; +// // dot product: r.p +// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0]; +// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1]; +// } +// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); +//#endif + + my_dot[0] = my_dot[1] = 0; + Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot ); + + //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); + +// compare_rvec2 (workspace->x, dev_workspace->x, N, "x"); +// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2"); +// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2"); + + sig_old[0] = sig_new[0]; + sig_old[1] = sig_new[1]; + MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); + + //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &dot_time ); + } +#endif + + if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol ) + { + break; + } + + beta[0] = sig_new[0] / sig_old[0]; + beta[1] = sig_new[1] / sig_old[1]; + +//#ifdef __CUDA_DEBUG__ +// for( j = 0; j < system->n; ++j ) { +// // d = p + beta * d +// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0]; +// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1]; +// } +//#endif + + Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta, + dev_workspace->d2, system->n ); + +// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2"); + } + + + if ( SQRT(sig_new[0]) / b_norm[0] <= tol ) + { + //for( j = 0; j < n; ++j ) + // workspace->t[j] = workspace->x[j][1]; + //fprintf (stderr, "Getting started with Cuda_CG1 \n"); + + Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n ); + + //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); + //compare_array (workspace->t, dev_workspace->t, system->n, "t"); + + matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t, + mpi_data, fout ); + + //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs ); + //for( j = 0; j < n; ++j ) + // workspace->x[j][1] = workspace->t[j]; + + Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n ); + } + else if ( SQRT(sig_new[1]) / b_norm[1] <= tol ) + { + //for( j = 0; j < n; ++j ) + // workspace->s[j] = workspace->x[j][0]; + + Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n ); + + //compare_array (workspace->s, dev_workspace->s, 
system->n, "s"); + //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); + + //fprintf (stderr, "Getting started with Cuda_CG2 \n"); + + matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s, + mpi_data, fout ); + + //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs ); + //for( j = 0; j < system->n; ++j ) + // workspace->x[j][0] = workspace->s[j]; + + Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n ); + } + + if ( i >= 300 ) + { + fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n", + system->my_rank, i ); + fprintf( stderr, " [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] ); + fprintf( stderr, " [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] ); + } + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n", + i + 1, matvecs, matvec_time, dot_time ); + } +#endif + + return (i + 1) + matvecs; +} + + +int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real + *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) +{ + int i, j, scale; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new, sig0; + real *spad = (real *) host_scratch; + + scale = sizeof(real) / sizeof(void); + + /* x is on the device */ + //MVAPICH2 + memset( spad, 0, sizeof(real) * system->total_cap ); + copy_host_device( spad, x, sizeof(real) * system->total_cap, + cudaMemcpyDeviceToHost, "cuda_cg:x:get" ); + Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); + + //MVAPICH2 + copy_host_device( spad, x, sizeof(real) * system->total_cap, + cudaMemcpyHostToDevice, "cuda_cg:x:put" ); + Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap ); + + // tryQEq + // MVAPICH2 + copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, + cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); + Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); + + //MVAPICH2 + copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, + cudaMemcpyHostToDevice, "cuda_cg:q:put" ); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &matvec_time ); + } +#endif + + Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q, + system->n ); + //for( j = 0; j < system->n; ++j ) + // workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r, + dev_workspace->Hdia_inv, system->n ); + + //TODO do the parallel_norm on the device for the local sum + copy_host_device( spad, b, sizeof(real) * system->n, + cudaMemcpyDeviceToHost, "cuda_cg:b:get" ); + b_norm = Parallel_Norm( spad, system->n, mpi_data->world ); + + //TODO do the parallel dot on the device for the local sum + copy_host_device( spad, dev_workspace->r, sizeof(real) * system->total_cap, + cudaMemcpyDeviceToHost, "cuda_cg:r:get" ); + copy_host_device( spad + system->total_cap, dev_workspace->d, sizeof(real) * system->total_cap, + cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); + sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n, + mpi_data->world ); + + sig0 = sig_new; + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &dot_time ); + } +#endif + + for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) + { + //MVAPICH2 + copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, + 
cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); + Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); + copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, + cudaMemcpyHostToDevice, "cuda_cg:d:put" ); + + Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap ); + + //tryQEq + copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, + cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); + Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); + copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, + cudaMemcpyHostToDevice, "cuda_cg:q:get" ); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &matvec_time ); + } +#endif + + //TODO do the parallel dot on the device for the local sum + copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n, + cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); + copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n, + cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); + tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world ); + + alpha = sig_new / tmp; + //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n ); + Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n ); + + //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n ); + Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0, + dev_workspace->r, system->n ); + /* pre-conditioning */ + //for( j = 0; j < system->n; ++j ) + // workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r, + dev_workspace->Hdia_inv, system->n ); + + sig_old = sig_new; + + //TODO do the parallel dot on the device for the local sum + copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n, + cudaMemcpyDeviceToHost, "cuda_cg:r:get" ); + copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n, + cudaMemcpyDeviceToHost, "cuda_cg:p:get" ); + sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world ); + //fprintf (stderr, "Device: sig_new: %f \n", sig_new ); + + beta = sig_new / sig_old; + Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, + dev_workspace->d, system->n ); + +#if defined(CG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + Update_Timing_Info( &t_start, &dot_time ); + } +#endif + } + + if ( i >= 300 ) + { + fprintf( stderr, "CG convergence failed!\n" ); + return i; + } + + return i; +} diff --git a/PG-PuReMD/src/cuda_lin_alg.h b/PG-PuReMD/src/cuda/cuda_lin_alg.h similarity index 52% rename from PG-PuReMD/src/cuda_lin_alg.h rename to PG-PuReMD/src/cuda/cuda_lin_alg.h index a7e3cc5f..aa31c126 100644 --- a/PG-PuReMD/src/cuda_lin_alg.h +++ b/PG-PuReMD/src/cuda/cuda_lin_alg.h @@ -22,29 +22,44 @@ #ifndef __CUDA_LIN_ALG_H_ #define __CUDA_LIN_ALG_H_ -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus extern "C" { #endif +void Cuda_Vector_Sum( real *, real, real *, real, real *, int ); -void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count); -void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count); -void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n); -void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *, rvec2 alpha, int n, rvec2 result); -void Cuda_Norm(rvec2 *arr, int n, rvec2 result); -void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n); -void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 
*, rvec2 , rvec2 *c, int n); -void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n); -void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n); -void Cuda_Dual_Matvec(sparse_matrix *, rvec2 *, rvec2 *, int , int); -void Cuda_Matvec(sparse_matrix *, real *, real *, int , int); +void Cuda_CG_Preconditioner( real *, real *, real *, int ); +void Cuda_CG_Diagonal_Preconditioner( storage *, rvec2 *, int ); + +void Cuda_DualCG_Preconditioner( storage *, rvec2 *, rvec2, int, rvec2 ); + +void Cuda_Norm( rvec2 *, int, rvec2 ); + +void Cuda_Dot( rvec2 *, rvec2 *, rvec2, int ); + +void Cuda_Vector_Sum_Rvec2( rvec2 *, rvec2 *, rvec2, rvec2 *, int ); + +void Cuda_RvecCopy_From( real *, rvec2 *, int, int ); + +void Cuda_RvecCopy_To( rvec2 *, real *, int, int ); + +void Cuda_Dual_Matvec( sparse_matrix *, rvec2 *, rvec2 *, int , int ); + +void Cuda_Matvec( sparse_matrix *, real *, real *, int , int ); + +int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*, + rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * ); + +int Cuda_CG( reax_system*, storage*, sparse_matrix*, + real*, real, real*, mpi_datatypes*, FILE* ); #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_list.cu b/PG-PuReMD/src/cuda/cuda_list.cu similarity index 96% rename from PG-PuReMD/src/cuda_list.cu rename to PG-PuReMD/src/cuda/cuda_list.cu index 21d8d091..9d0626f1 100644 --- a/PG-PuReMD/src/cuda_list.cu +++ b/PG-PuReMD/src/cuda/cuda_list.cu @@ -19,15 +19,14 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "reax_types.h" #include "cuda_utils.h" #if defined(PURE_REAX) - #include "list.h" - #include "tool_box.h" + #include "../list.h" + #include "../tool_box.h" #elif defined(LAMMPS_REAX) - #include "reax_list.h" - #include "reax_tool_box.h" + #include "../reax_list.h" + #include "../reax_tool_box.h" #endif diff --git a/PG-PuReMD/src/cuda_list.h b/PG-PuReMD/src/cuda/cuda_list.h similarity index 98% rename from PG-PuReMD/src/cuda_list.h rename to PG-PuReMD/src/cuda/cuda_list.h index 0b4e7aa0..fe06f4ce 100644 --- a/PG-PuReMD/src/cuda_list.h +++ b/PG-PuReMD/src/cuda/cuda_list.h @@ -22,13 +22,15 @@ #ifndef __CUDA_LIST_H_ #define __CUDA_LIST_H_ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif void Dev_Make_List( int, int, int, reax_list* ); + void Dev_Delete_List( reax_list* ); #ifdef __cplusplus diff --git a/PG-PuReMD/src/cuda_lookup.cu b/PG-PuReMD/src/cuda/cuda_lookup.cu similarity index 98% rename from PG-PuReMD/src/cuda_lookup.cu rename to PG-PuReMD/src/cuda/cuda_lookup.cu index 837a3c71..01bc8a79 100644 --- a/PG-PuReMD/src/cuda_lookup.cu +++ b/PG-PuReMD/src/cuda/cuda_lookup.cu @@ -1,8 +1,9 @@ #include "cuda_lookup.h" -#include "index_utils.h" + #include "cuda_utils.h" -#include "reax_types.h" + +#include "../index_utils.h" void copy_LR_table_to_device( reax_system *system, control_params *control, diff --git a/PG-PuReMD/src/cuda_lookup.h b/PG-PuReMD/src/cuda/cuda_lookup.h similarity index 56% rename from PG-PuReMD/src/cuda_lookup.h rename to PG-PuReMD/src/cuda/cuda_lookup.h index 88f5cfce..87026f7d 100644 --- a/PG-PuReMD/src/cuda_lookup.h +++ b/PG-PuReMD/src/cuda/cuda_lookup.h @@ -2,16 +2,18 @@ #ifndef __CUDA_LOOKUP_H__ #define __CUDA_LOOKUP_H__ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif -void copy_LR_table_to_device (reax_system *, control_params *, int *); +void copy_LR_table_to_device( reax_system *, control_params *, int * ); 
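+
+/* copy_LR_table_to_device (above): presumably transfers the host-built
+ * long-range (LR) interaction lookup table into device memory during setup;
+ * the definition resides in cuda_lookup.cu. */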
#ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu similarity index 99% rename from PG-PuReMD/src/cuda_multi_body.cu rename to PG-PuReMD/src/cuda/cuda_multi_body.cu index 09a12963..cb741571 100644 --- a/PG-PuReMD/src/cuda_multi_body.cu +++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu @@ -19,12 +19,13 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "reax_types.h" #include "cuda_multi_body.h" -#include "index_utils.h" + #include "cuda_helpers.h" #include "cuda_list.h" +#include "../index_utils.h" + CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, single_body_parameters *sbp, two_body_parameters *tbp, diff --git a/PG-PuReMD/src/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_multi_body.h similarity index 58% rename from PG-PuReMD/src/cuda_multi_body.h rename to PG-PuReMD/src/cuda/cuda_multi_body.h index 332e6f06..06014b3a 100644 --- a/PG-PuReMD/src/cuda_multi_body.h +++ b/PG-PuReMD/src/cuda/cuda_multi_body.h @@ -22,21 +22,14 @@ #ifndef __CUDA_MULTI_BODY_H_ #define __CUDA_MULTI_BODY_H_ -#include "reax_types.h" - -CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *, - global_parameters , - single_body_parameters *, - two_body_parameters *, - storage , - reax_list , - int , - int , - real *, - real *, - real * - ); - -CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess (reax_list, storage, int ); +#include "../reax_types.h" + + +CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *, global_parameters, + single_body_parameters *, two_body_parameters *, storage, + reax_list, int, int, real *, real *, real *); + +CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list, storage, int ); + #endif diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu similarity index 99% rename from PG-PuReMD/src/cuda_neighbors.cu rename to PG-PuReMD/src/cuda/cuda_neighbors.cu index f9a20ebd..b1f2b85d 100644 --- a/PG-PuReMD/src/cuda_neighbors.cu +++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu @@ -21,15 +21,13 @@ #include "cuda_neighbors.h" -#include "reax_types.h" - #include "cuda_list.h" #include "cuda_utils.h" #include "cuda_reduction.h" -#include "vector.h" -#include "index_utils.h" -#include "tool_box.h" +#include "../index_utils.h" +#include "../tool_box.h" +#include "../vector.h" CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) diff --git a/PG-PuReMD/src/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h similarity index 95% rename from PG-PuReMD/src/cuda_neighbors.h rename to PG-PuReMD/src/cuda/cuda_neighbors.h index f7d7cb15..4d4a9c4e 100644 --- a/PG-PuReMD/src/cuda_neighbors.h +++ b/PG-PuReMD/src/cuda/cuda_neighbors.h @@ -2,14 +2,13 @@ #ifndef __CUDA_NEIGHBORS_H__ #define __CUDA_NEIGHBORS_H__ -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus extern "C" { #endif - void Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** ); int Cuda_Estimate_Neighbors( reax_system *, int ); @@ -24,9 +23,9 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * ); void Cuda_Init_Three_Body_Indices( int *, int ); - #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu similarity index 99% rename from PG-PuReMD/src/cuda_nonbonded.cu rename to PG-PuReMD/src/cuda/cuda_nonbonded.cu index 93bca2da..25c0b17d 100644 --- a/PG-PuReMD/src/cuda_nonbonded.cu +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu @@ -25,10 +25,9 
@@ #include "cuda_utils.h" #include "cuda_reduction.h" #include "cuda_shuffle.h" -#include "vector.h" -#include "reax_types.h" -#include "index_utils.h" +#include "../index_utils.h" +#include "../vector.h" //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy( diff --git a/PG-PuReMD/src/cuda_nonbonded.h b/PG-PuReMD/src/cuda/cuda_nonbonded.h similarity index 79% rename from PG-PuReMD/src/cuda_nonbonded.h rename to PG-PuReMD/src/cuda/cuda_nonbonded.h index 1c9916bf..238d49d7 100644 --- a/PG-PuReMD/src/cuda_nonbonded.h +++ b/PG-PuReMD/src/cuda/cuda_nonbonded.h @@ -19,15 +19,17 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#ifndef __NONBONDED_H_ -#define __NONBONDED_H_ +#ifndef __CUDA_NONBONDED_H_ +#define __CUDA_NONBONDED_H_ -#include "reax_types.h" +#include "../reax_types.h" void Cuda_Compute_Polarization_Energy( reax_system *, simulation_data *); -void Cuda_NonBonded_Energy ( reax_system *, control_params *, - storage *, simulation_data *, reax_list **, - output_controls *, bool ); + +void Cuda_NonBonded_Energy( reax_system *, control_params *, + storage *, simulation_data *, reax_list **, + output_controls *, bool ); + #endif diff --git a/PG-PuReMD/src/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu similarity index 95% rename from PG-PuReMD/src/cuda_post_evolve.cu rename to PG-PuReMD/src/cuda/cuda_post_evolve.cu index 9a478192..828a0e4b 100644 --- a/PG-PuReMD/src/cuda_post_evolve.cu +++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu @@ -1,9 +1,10 @@ #include "cuda_post_evolve.h" -#include "reax_types.h" -#include "vector.h" + #include "cuda_utils.h" +#include "../vector.h" + CUDA_GLOBAL void ker_post_evolve( reax_atom *my_atoms, simulation_data *data, int n ) diff --git a/PG-PuReMD/src/cuda_post_evolve.h b/PG-PuReMD/src/cuda/cuda_post_evolve.h similarity index 60% rename from PG-PuReMD/src/cuda_post_evolve.h rename to PG-PuReMD/src/cuda/cuda_post_evolve.h index dcdcd50c..a1a0571a 100644 --- a/PG-PuReMD/src/cuda_post_evolve.h +++ b/PG-PuReMD/src/cuda/cuda_post_evolve.h @@ -2,16 +2,18 @@ #ifndef __CUDA_POST_EVOLVE_H__ #define __CUDA_POST_EVOLVE_H__ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif -void post_evolve_velocities (reax_system *, simulation_data *); +void post_evolve_velocities( reax_system *, simulation_data * ); #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu similarity index 99% rename from PG-PuReMD/src/cuda_reduction.cu rename to PG-PuReMD/src/cuda/cuda_reduction.cu index 02d800ee..01bd3c81 100644 --- a/PG-PuReMD/src/cuda_reduction.cu +++ b/PG-PuReMD/src/cuda/cuda_reduction.cu @@ -4,10 +4,10 @@ #include "cuda_shuffle.h" #include "cuda_utils.h" -#include "vector.h" +#include "../vector.h" -#include "cub/cub/device/device_reduce.cuh" -#include "cub/cub/device/device_scan.cuh" +#include "../cub/cub/device/device_reduce.cuh" +#include "../cub/cub/device/device_scan.cuh" //struct RvecSum diff --git a/PG-PuReMD/src/cuda_reduction.h b/PG-PuReMD/src/cuda/cuda_reduction.h similarity index 96% rename from PG-PuReMD/src/cuda_reduction.h rename to PG-PuReMD/src/cuda/cuda_reduction.h index 15ca538f..cf9efc5d 100644 --- a/PG-PuReMD/src/cuda_reduction.h +++ b/PG-PuReMD/src/cuda/cuda_reduction.h @@ -2,32 +2,45 @@ #ifndef __CUDA_REDUCTION_H__ #define __CUDA_REDUCTION_H__ -#include "reax_types.h" +#include "../reax_types.h" #define INITIAL 0 #define FINAL 1 void Cuda_Reduction_Sum( int *, int *, 
size_t ); + void Cuda_Reduction_Sum( real *, real *, size_t ); + //void Cuda_Reduction_Sum( rvec *, rvec *, size_t ); + void Cuda_Reduction_Max( int *, int *, size_t ); + void Cuda_Scan_Excl_Sum( int *, int *, size_t ); CUDA_GLOBAL void k_reduction( const real *, real *, const size_t ); + CUDA_GLOBAL void k_reduction_rvec( rvec *, rvec *, size_t ); + CUDA_GLOBAL void k_reduction_rvec2( rvec2 *, rvec2 *, size_t ); + CUDA_GLOBAL void k_norm( const real *, real *, const size_t, int ); + CUDA_GLOBAL void k_dot( const real *, const real *, real *, const size_t ); CUDA_GLOBAL void k_vector_sum( real*, real, real*, real, real*, int ); + CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *, rvec2 *, real, real, rvec2 *, int ); + CUDA_GLOBAL void k_rvec2_mul( rvec2*, rvec2*, rvec2*, int ); + CUDA_GLOBAL void k_vector_mul( real*, real*, real*, int ); + CUDA_GLOBAL void k_norm_rvec2( const rvec2 *, rvec2 *, const size_t, int ); + CUDA_GLOBAL void k_dot_rvec2( const rvec2 *, rvec2 *, rvec2 *, const size_t ); diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu similarity index 98% rename from PG-PuReMD/src/cuda_reset_tools.cu rename to PG-PuReMD/src/cuda/cuda_reset_tools.cu index 27cb4580..ca435269 100644 --- a/PG-PuReMD/src/cuda_reset_tools.cu +++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu @@ -5,7 +5,7 @@ #include "cuda_utils.h" #include "cuda_reduction.h" -#include "reset_tools.h" +#include "../reset_tools.h" extern "C" diff --git a/PG-PuReMD/src/cuda_reset_tools.h b/PG-PuReMD/src/cuda/cuda_reset_tools.h similarity index 94% rename from PG-PuReMD/src/cuda_reset_tools.h rename to PG-PuReMD/src/cuda/cuda_reset_tools.h index f158afec..2e90b8eb 100644 --- a/PG-PuReMD/src/cuda_reset_tools.h +++ b/PG-PuReMD/src/cuda/cuda_reset_tools.h @@ -2,13 +2,13 @@ #ifndef __CUDA_RESET_TOOLS_H__ #define __CUDA_RESET_TOOLS_H__ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif - void Cuda_Reset_Workspace( reax_system *, storage * ); void Cuda_Reset_Atoms( reax_system *, control_params * ); @@ -19,9 +19,9 @@ int Cuda_Reset_Neighbor_Lists( reax_system *, control_params *, void Cuda_Reset( reax_system*, control_params*, simulation_data*, storage*, reax_list** ); - #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_shuffle.h b/PG-PuReMD/src/cuda/cuda_shuffle.h similarity index 97% rename from PG-PuReMD/src/cuda_shuffle.h rename to PG-PuReMD/src/cuda/cuda_shuffle.h index f8dfddfa..0d687271 100644 --- a/PG-PuReMD/src/cuda_shuffle.h +++ b/PG-PuReMD/src/cuda/cuda_shuffle.h @@ -22,8 +22,7 @@ #ifndef __CUDA_SHUFFLE_H_ #define __CUDA_SHUFFLE_H_ -#include "reax_types.h" -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu new file mode 100644 index 00000000..54957d00 --- /dev/null +++ b/PG-PuReMD/src/cuda/cuda_system_props.cu @@ -0,0 +1,1026 @@ + +#include "cuda_system_props.h" + +#include "cuda_utils.h" +#include "cuda_reduction.h" +#include "cuda_copy.h" +#include "cuda_shuffle.h" + +#include "../vector.h" + + +CUDA_GLOBAL void center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms, + rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ) +{ + extern __shared__ rvec xcm[]; + extern __shared__ rvec vcm[]; + extern __shared__ rvec amcm[]; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + //unsigned int xcm_id = threadIdx.x; + unsigned int vcm_id = blockDim.x; + unsigned int amcm_id = 2 *(blockDim.x); + + 
unsigned int index = 0; + rvec tmp; + real m; + + rvec_MakeZero (xcm [threadIdx.x]); + rvec_MakeZero (vcm [vcm_id + threadIdx.x]); + rvec_MakeZero (amcm[amcm_id + threadIdx.x]); + rvec_MakeZero (tmp); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); + rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); + rvec_Cross (tmp, atoms[i].x, atoms [i].v); + rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); + } + __syncthreads (); + + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (xcm [threadIdx.x], xcm[index]); + rvec_Add (vcm [vcm_id + threadIdx.x], vcm[vcm_id + index]); + rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)){ + rvec_Copy (res_xcm[blockIdx.x], xcm[0]); + rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); + rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); + } +} + + +#if defined( __SM_35__) +CUDA_GLOBAL void center_of_mass_blocks_xcm( single_body_parameters *sbp, reax_atom *atoms, + rvec *res_xcm, size_t n ) +{ + extern __shared__ rvec my_xcm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int xcm_id = threadIdx.x; + unsigned int index = 0; + rvec xcm; + real m; + + rvec_MakeZero (xcm); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (xcm , m, atoms [i].x); + } + __syncthreads (); + + for (int z = 16; z >= 1; z /= 2){ + xcm[0] += shfl( xcm[0], z); + xcm[1] += shfl( xcm[1], z); + xcm[2] += shfl( xcm[2], z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0) + rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm ); + __syncthreads (); + + for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (my_xcm [threadIdx.x], my_xcm[index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)) + rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]); +} + + +CUDA_GLOBAL void center_of_mass_blocks_vcm( single_body_parameters *sbp, reax_atom *atoms, + rvec *res_vcm, size_t n ) +{ + extern __shared__ rvec my_vcm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + rvec vcm; + real m; + + rvec_MakeZero (vcm); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (vcm , m, atoms [i].v); + } + __syncthreads (); + + for (int z = 16; z >= 1; z /= 2){ + vcm[0] += shfl( vcm[0], z); + vcm[1] += shfl( vcm[1], z); + vcm[2] += shfl( vcm[2], z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0) + rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm ); + __syncthreads (); + + for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (my_vcm [threadIdx.x], my_vcm[index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)) + rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]); +} + + +CUDA_GLOBAL void center_of_mass_blocks_amcm( single_body_parameters *sbp, reax_atom *atoms, + rvec *res_amcm, size_t n ) +{ + extern __shared__ rvec my_amcm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + rvec amcm; + real m; + rvec tmp; + + rvec_MakeZero (amcm); + rvec_MakeZero( tmp ); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_Cross (tmp, atoms[i].x, atoms [i].v); + rvec_ScaledAdd (amcm, m, tmp); + } + __syncthreads (); + + for (int z = 16; z >= 1; z /= 2){ + amcm[0] += shfl( amcm[0], z); + amcm[1] += shfl( amcm[1], z); + 
amcm[2] += shfl( amcm[2], z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0) + rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm ); + __syncthreads (); + + + for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (my_amcm[threadIdx.x], my_amcm[index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)){ + rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]); + } +} +#endif + + +CUDA_GLOBAL void center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm, + rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ) +{ + extern __shared__ rvec sh_xcm[]; + extern __shared__ rvec sh_vcm[]; + extern __shared__ rvec sh_amcm[]; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned int xcm_id = threadIdx.x; + unsigned int vcm_id = blockDim.x; + unsigned int amcm_id = 2 * (blockDim.x); + + unsigned int index = 0; + rvec t_xcm, t_vcm, t_amcm; + + rvec_MakeZero (t_xcm); + rvec_MakeZero (t_vcm); + rvec_MakeZero (t_amcm); + + if (i < n){ + rvec_Copy ( t_xcm, xcm[threadIdx.x]); + rvec_Copy ( t_vcm, vcm[threadIdx.x]); + rvec_Copy ( t_amcm, amcm[threadIdx.x]); + } + + rvec_Copy (sh_xcm[xcm_id], t_xcm); + rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm); + rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm); + + __syncthreads (); + + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + + if (threadIdx.x < offset) { + index = threadIdx.x + offset; + rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]); + rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]); + rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]); + } + __syncthreads (); + } + + if (threadIdx.x == 0){ + rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]); + rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]); + rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]); + } +} + + +CUDA_GLOBAL void compute_center_mass( single_body_parameters *sbp, + reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, + size_t n ) +{ + extern __shared__ real xx[]; + extern __shared__ real xy[]; + extern __shared__ real xz[]; + extern __shared__ real yy[]; + extern __shared__ real yz[]; + extern __shared__ real zz[]; + + unsigned int xx_i = threadIdx.x; + unsigned int xy_i = blockDim.x; + unsigned int xz_i = 2 * blockDim.x; + unsigned int yy_i = 3 * blockDim.x; + unsigned int yz_i = 4 * blockDim.x; + unsigned int zz_i = 5 * blockDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + + rvec diff, xcm; + real m = 0; + rvec_MakeZero (diff); + xcm[0] = xcm0; + xcm[1] = xcm1; + xcm[2] = xcm2; + + + xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = + yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; + + if (i < n){ + m = sbp[ atoms[i].type ].mass; + rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); + xx[ xx_i ] = diff[0] * diff[0] * m; + xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m; + xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m; + yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m; + yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m; + zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m; + } + __syncthreads (); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ + if (threadIdx.x < offset){ + index = threadIdx.x + offset; + xx[ threadIdx.x ] += xx[ index ]; + xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; + xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ]; + yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ]; + yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ]; + 
zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
+        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
+        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
+        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
+        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass( real *input, real *output, size_t n )
+{
+    /* all extern __shared__ arrays alias the same dynamic shared memory
+     * block, so each logical array is addressed through its own offset */
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    xx[xx_i] = xy[xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] =
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n)
+    {
+        xx[ xx_i ] = input[ threadIdx.x*6 + 0 ];
+        xy[ xy_i + threadIdx.x ] = input[ threadIdx.x*6 + 1 ];
+        xz[ xz_i + threadIdx.x ] = input[ threadIdx.x*6 + 2 ];
+        yy[ yy_i + threadIdx.x ] = input[ threadIdx.x*6 + 3 ];
+        yz[ yz_i + threadIdx.x ] = input[ threadIdx.x*6 + 4 ];
+        zz[ zz_i + threadIdx.x ] = input[ threadIdx.x*6 + 5 ];
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            index = threadIdx.x + offset;
+            xx[ threadIdx.x ] += xx[ index ];
+            xy[ xy_i + threadIdx.x ] += xy[ xy_i + index ];
+            xz[ xz_i + threadIdx.x ] += xz[ xz_i + index ];
+            yy[ yy_i + threadIdx.x ] += yy[ yy_i + index ];
+            yz[ yz_i + threadIdx.x ] += yz[ yz_i + index ];
+            zz[ zz_i + threadIdx.x ] += zz[ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0)
+    {
+        output[0] = xx[0];
+        output[1] = xy[xy_i];
+        output[2] = xz[xz_i];
+        output[3] = yy[yy_i];
+        output[4] = yz[yz_i];
+        output[5] = zz[zz_i];
+    }
+}
+
+
+#if defined( __SM_35__)
+CUDA_GLOBAL void compute_center_mass_xx_xy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    /* the two extern __shared__ arrays alias the same shared memory block;
+     * the xy partial sums are therefore stored at offset xy_i */
+    extern __shared__ real my_results_xx[];
+    extern __shared__ real my_results_xy[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xx = 0;
+    real xy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx = diff[0] * diff[0] * m;
+        xy = diff[0] * diff[1] * m;
+    }
+    __syncthreads ();
+
+    /* warp-level reduction via shuffle */
+    for (int z = 16; z >= 1; z /= 2){
+        xx += shfl( xx, z);
+        xy += shfl( xy, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_xx[threadIdx.x >> 5] = xx;
+        my_results_xy[xy_i + (threadIdx.x >> 5)] = xy;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
+            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy[ xy_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass_xz_yy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xz[];
+    extern __shared__ real my_results_yy[];
+
+    unsigned int yy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xz = 0;
+    real yy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xz = diff[0] * diff[2] * m;
+        yy = diff[1] * diff[1] * m;
+    }
+    __syncthreads ();
+
+    /* warp-level reduction via shuffle */
+    for (int z = 16; z >= 1; z /= 2){
+        xz += shfl( xz, z);
+        yy += shfl( yy, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_xz[threadIdx.x >> 5] = xz;
+        my_results_yy[yy_i + (threadIdx.x >> 5)] = yy;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xz[ threadIdx.x ] += my_results_xz[ index ];
+            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy[ yy_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
+        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass_yz_zz( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_yz[];
+    extern __shared__ real my_results_zz[];
+
+    unsigned int zz_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real yz = 0;
+    real zz = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n)
+    {
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        yz = diff[1] * diff[2] * m;
+        zz = diff[2] * diff[2] * m;
+    }
+    __syncthreads ();
+
+    /* warp-level reduction via shuffle */
+    for (int z = 16; z >= 1; z /= 2){
+        yz += shfl( yz, z);
+        zz += shfl( zz, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_yz[threadIdx.x >> 5] = yz;
+        my_results_zz[zz_i + (threadIdx.x >> 5)] = zz;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_yz[ threadIdx.x ] += my_results_yz[ index ];
+            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz[ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
+        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
+    }
+}
+#endif
+
+
+CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms,
+        real *block_results, int n )
+{
+#if defined(__SM_35__)
+    extern __shared__ real my_sbp[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real sdata = 0;
+
+    if (i < n)
+    {
+        sdata = sbp[ my_atoms[i].type ].mass;
+    }
+    __syncthreads( );
+
+    for (int z = 16; z >= 1; z /= 2)
+    {
+        sdata += shfl( sdata, z);
+    }
+
+    if (threadIdx.x % 32 == 0)
+    {
+        my_sbp[threadIdx.x >> 5] = sdata;
+    }
+
+    __syncthreads( );
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = my_sbp[0];
+    }
+
+#else
+    extern
__shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if (i < n) + { + x = sbp[ my_atoms[i].type ].mass; + } + + sdata[ threadIdx.x ] = x; + __syncthreads( ); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads( ); + } + + if (threadIdx.x == 0) + { + block_results[ blockIdx.x] = sdata [0]; + } + +#endif +} + + +extern "C" void dev_compute_total_mass( reax_system *system, real *local_val ) +{ + real *block_mass = (real *) scratch; + cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" ); + + k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>> + (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>> + (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), + cudaMemcpyDeviceToHost, "total_mass:tmp"); +} + + +CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, + real *block_results, int n ) +{ +#if defined(__SM_35__) + extern __shared__ real my_sbpdot[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real sdata = 0; + rvec p; + + if (i < n) + { + sdata = sbp[ my_atoms[i].type ].mass; + rvec_Scale( p, sdata, my_atoms[ i ].v ); + sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); + } + + __syncthreads( ); + + for(int z = 16; z >=1; z/=2) + { + sdata += shfl( sdata, z); + } + + if (threadIdx.x % 32 == 0) + { + my_sbpdot[threadIdx.x >> 5] = sdata; + } + + __syncthreads( ); + + for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset) + { + my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset]; + } + + __syncthreads( ); + } + + if (threadIdx.x == 0) + { + block_results[blockIdx.x] = my_sbpdot[0]; + } + +#else + extern __shared__ real sdata []; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real m = 0; + rvec p; + + if (i < n) + { + m = sbp[ my_atoms[i].type ].mass; + rvec_Scale( p, m, my_atoms[ i ].v ); + m = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); + } + + sdata[ threadIdx.x ] = m; + __syncthreads( ); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads( ); + } + + if (threadIdx.x == 0) + { + block_results[blockIdx.x] = sdata[0]; + } +#endif +} + +extern "C" void dev_compute_kinetic_energy( reax_system *system, + simulation_data *data, real *local_val ) +{ + real *block_energy = (real *) scratch; + cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" ); + + k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>> + (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>> + (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( local_val, block_energy + BLOCKS_POW_2, + //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, + sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" ); + //copy_device (block_energy + 
BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, + // sizeof (real), "kinetic_energy"); +} + + +extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, + rvec vcm, rvec amcm ) +{ + rvec *l_xcm, *l_vcm, *l_amcm; + rvec *r_scratch = (rvec *)scratch; + +#if defined( __SM_35__) + // xcm + cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" ); + l_xcm = r_scratch; + + center_of_mass_blocks_xcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>> + ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>> + (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + copy_host_device( xcm, l_xcm + BLOCKS_POW_2, + sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" ); + + // vcm + cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" ); + l_vcm = r_scratch; + + center_of_mass_blocks_vcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>> + ( system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>> + (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec), + cudaMemcpyDeviceToHost, "momentum:vcm" ); + + // amcm + cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); + l_amcm = r_scratch; + + center_of_mass_blocks_amcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>> + ( system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>> + (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec), + cudaMemcpyDeviceToHost, "momemtum:amcm" ); + +#else + cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" ); + + l_xcm = r_scratch; + l_vcm = r_scratch + (BLOCKS_POW_2 + 1); + l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); + + center_of_mass_blocks <<< BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> + ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + center_of_mass <<< 1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> + ( l_xcm, l_vcm, l_amcm, l_xcm + BLOCKS_POW_2, l_vcm + BLOCKS_POW_2, + l_amcm + BLOCKS_POW_2, BLOCKS_POW_2 ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" ); + copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" ); + copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" ); +#endif +} + + +extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm ) +{ +#if defined(__SM_35__) + real *partial_results = (real *) scratch; + cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" ); + + compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], 
my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); + + compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); + + compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); + + compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> + (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2, + sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" ); + +#else + real *partial_results = (real *) scratch; + //real *local_results; + + cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp"); + //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1)); + + compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize( ); + cudaCheckError( ); + + compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> + (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, + sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results"); +#endif +} + + +extern "C" void dev_sync_simulation_data( simulation_data *data ) +{ + Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data ); +} + + +void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data, + MPI_Comm comm ) +{ + int i; + rvec p; + real m; + + data->my_en.e_kin = 0.0; + + dev_compute_kinetic_energy( system, data, &data->my_en.e_kin ); + + MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin, + 1, MPI_DOUBLE, MPI_SUM, comm ); + + data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B); + + // avoid T being an absolute zero, might cause F.P.E! + if ( FABS(data->therm.T) < ALMOST_ZERO ) + { + data->therm.T = ALMOST_ZERO; + } +} + + +void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data, + MPI_Comm comm ) +{ + int i; + real tmp; + + //compute local total mass of the system + dev_compute_total_mass( system, &tmp ); + + MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm ); + + data->inv_M = 1. / data->M; +} + + +void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, + mpi_datatypes *mpi_data, MPI_Comm comm ) +{ + int i; + real m, det; //xx, xy, xz, yy, yz, zz; + real tmp_mat[6], tot_mat[6]; + rvec my_xcm, my_vcm, my_amcm, my_avcm; + rvec tvec, diff; + rtensor mat, inv; + + rvec_MakeZero( my_xcm ); // position of CoM + rvec_MakeZero( my_vcm ); // velocity of CoM + rvec_MakeZero( my_amcm ); // angular momentum of CoM + rvec_MakeZero( my_avcm ); // angular velocity of CoM + + /* Compute the position, vel. and ang. 
momentum about the centre of mass */ + dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm ); + + MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm ); + MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm ); + MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm ); + + rvec_Scale( data->xcm, data->inv_M, data->xcm ); + rvec_Scale( data->vcm, data->inv_M, data->vcm ); + rvec_Cross( tvec, data->xcm, data->vcm ); + rvec_ScaledAdd( data->amcm, -data->M, tvec ); + data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); + + /* Calculate and then invert the inertial tensor */ + for ( i = 0; i < 6; ++i ) + { + tmp_mat[i] = 0; + } + + dev_compute_inertial_tensor( system, tmp_mat, my_xcm ); + + MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm ); + + if ( system->my_rank == MASTER_NODE ) + { + mat[0][0] = tot_mat[3] + tot_mat[5]; // yy + zz; + mat[0][1] = mat[1][0] = -tot_mat[1]; // -xy; + mat[0][2] = mat[2][0] = -tot_mat[2]; // -xz; + mat[1][1] = tot_mat[0] + tot_mat[5]; // xx + zz; + mat[2][1] = mat[1][2] = -tot_mat[4]; // -yz; + mat[2][2] = tot_mat[0] + tot_mat[3]; // xx + yy; + + /* invert the inertial tensor */ + det = ( mat[0][0] * mat[1][1] * mat[2][2] + + mat[0][1] * mat[1][2] * mat[2][0] + + mat[0][2] * mat[1][0] * mat[2][1] ) - + ( mat[0][0] * mat[1][2] * mat[2][1] + + mat[0][1] * mat[1][0] * mat[2][2] + + mat[0][2] * mat[1][1] * mat[2][0] ); + + inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; + inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; + inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; + inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; + inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; + inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; + inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; + inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; + inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; + + if ( det > ALMOST_ZERO ) + { + rtensor_Scale( inv, 1. 
/ det, inv ); + } + else + { + rtensor_MakeZero( inv ); + } + + /* Compute the angular velocity about the centre of mass */ + rtensor_MatVec( data->avcm, inv, data->amcm ); + } + + MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm ); + + /* Compute the rotational energy */ + data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); + +#if defined(DEBUG) + fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", + data->xcm[0], data->xcm[1], data->xcm[2] ); + fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", + data->amcm[0], data->amcm[1], data->amcm[2] ); + /* fprintf( stderr, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", + mat[0][0], mat[0][1], mat[0][2], + mat[1][0], mat[1][1], mat[1][2], + mat[2][0], mat[2][1], mat[2][2] ); + fprintf( stderr, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", + inv[0][0], inv[0][1], inv[0][2], + inv[1][0], inv[1][1], inv[1][2], + inv[2][0], inv[2][1], inv[2][2] ); */ + fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", + data->avcm[0], data->avcm[1], data->avcm[2] ); +#endif +} + + diff --git a/PG-PuReMD/src/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h similarity index 65% rename from PG-PuReMD/src/cuda_system_props.h rename to PG-PuReMD/src/cuda/cuda_system_props.h index ce6fccc1..66f620b3 100644 --- a/PG-PuReMD/src/cuda_system_props.h +++ b/PG-PuReMD/src/cuda/cuda_system_props.h @@ -2,24 +2,35 @@ #ifndef __CUDA_SYSTEM_PROPS_H__ #define __CUDA_SYSTEM_PROPS_H__ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif - void dev_compute_total_mass( reax_system *, real * ); + void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * ); + void dev_compute_momentum( reax_system *, rvec, rvec, rvec ); + void dev_compute_inertial_tensor( reax_system *, real *, rvec my_xcm ); void dev_sync_simulation_data( simulation_data * ); + //void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * ); +void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm ); + +void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm ); + +void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, + mpi_datatypes*, MPI_Comm ); #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu similarity index 99% rename from PG-PuReMD/src/cuda_torsion_angles.cu rename to PG-PuReMD/src/cuda/cuda_torsion_angles.cu index e70c378b..47c087d2 100644 --- a/PG-PuReMD/src/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu @@ -19,13 +19,14 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "reax_types.h" -#include "index_utils.h" #include "cuda_torsion_angles.h" -#include "vector.h" + #include "cuda_list.h" #include "cuda_helpers.h" +#include "../index_utils.h" +#include "../vector.h" + #define MIN_SINE 1e-10 diff --git a/PG-PuReMD/src/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_torsion_angles.h similarity index 57% rename from PG-PuReMD/src/cuda_torsion_angles.h rename to PG-PuReMD/src/cuda/cuda_torsion_angles.h index 235e91b0..a7d9c3cb 100644 --- a/PG-PuReMD/src/cuda_torsion_angles.h +++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.h @@ -19,24 +19,18 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ -#ifndef __TORSION_ANGLES_H_ -#define __TORSION_ANGLES_H_ - -#include "reax_types.h" -#include "reax_types.h" - -CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *, - global_parameters , - four_body_header *, - control_params *, - reax_list , reax_list , - storage , - int , int , - real *, real *, - rvec *); - -CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *, - storage , - reax_list , int ); +#ifndef __CUDA_TORSION_ANGLES_H_ +#define __CUDA_TORSION_ANGLES_H_ + +#include "../reax_types.h" + + +CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *, global_parameters, + four_body_header *, control_params *, reax_list, reax_list, + storage, int, int, real *, real *, rvec * ); + +CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess( reax_atom *, + storage, reax_list, int ); + #endif diff --git a/PG-PuReMD/src/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu similarity index 88% rename from PG-PuReMD/src/cuda_utils.cu rename to PG-PuReMD/src/cuda/cuda_utils.cu index 5899a1ec..7e1757bc 100644 --- a/PG-PuReMD/src/cuda_utils.cu +++ b/PG-PuReMD/src/cuda/cuda_utils.cu @@ -149,3 +149,20 @@ extern "C" void print_device_mem_usage( ) total, (long long int)total/(1024.0*1024.0), free, (long long int)free/(1024.0*1024.0) ); } + + +extern "C" void init_blocks( reax_system *system ) +{ + compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n ); + compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 ); + + compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N ); + compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N ); + + compute_matvec_blocks( &MATVEC_BLOCKS, system->N ); + +#if defined(__CUDA_DEBUG_LOG__) + fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d - N:%d \n", + MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N ); +#endif +} diff --git a/PG-PuReMD/src/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h similarity index 80% rename from PG-PuReMD/src/cuda_utils.h rename to PG-PuReMD/src/cuda/cuda_utils.h index 3d63d5e3..bfc4256d 100644 --- a/PG-PuReMD/src/cuda_utils.h +++ b/PG-PuReMD/src/cuda/cuda_utils.h @@ -1,7 +1,7 @@ #ifndef __CUDA_UTILS_H_ #define __CUDA_UTILS_H_ -#include "reax_types.h" +#include "../reax_types.h" #ifdef __cplusplus @@ -9,22 +9,33 @@ extern "C" { #endif void cuda_malloc( void **, size_t, int, const char * ); + void cuda_free( void *, const char * ); + void cuda_memset( void *, int , size_t , const char * ); + void copy_host_device( void *, void *, size_t, enum cudaMemcpyKind, const char * ); + void copy_device( void *, void *, size_t, const char * ); void compute_blocks( int *, int *, int ); + void compute_matvec_blocks( int *, int ); + void compute_nearest_pow_2( int, int * ); +void init_blocks( reax_system * ); + void print_device_mem_usage( ); + #ifdef __cplusplus #define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) static inline void __cudaCheckError( const char *file, const int line ) { - cudaError err = cudaGetLastError(); + cudaError err; + + err = cudaGetLastError(); if ( cudaSuccess != err ) { fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line ); @@ -32,19 +43,22 @@ static inline void __cudaCheckError( const char *file, const int line ) exit( RUNTIME_ERROR ); } +#if defined(DEBUG) /* More careful checking. However, this will affect performance. 
*/ -// err = cudaDeviceSynchronize(); -// if( cudaSuccess != err ) -// { -// exit( -1 ); -// } + err = cudaDeviceSynchronize( ); + if( cudaSuccess != err ) + { + exit( RUNTIME_ERROR ); + } +#endif return; } #endif -#endif - #ifdef __cplusplus } #endif + + +#endif diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu similarity index 99% rename from PG-PuReMD/src/cuda_valence_angles.cu rename to PG-PuReMD/src/cuda/cuda_valence_angles.cu index d778c3b2..21b8d2c8 100644 --- a/PG-PuReMD/src/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu @@ -21,9 +21,10 @@ #include "cuda_valence_angles.h" -#include "index_utils.h" #include "cuda_list.h" -#include "vector.h" + +#include "../index_utils.h" +#include "../vector.h" /* Compute 3-body interactions, in which the main role is played by diff --git a/PG-PuReMD/src/cuda_valence_angles.h b/PG-PuReMD/src/cuda/cuda_valence_angles.h similarity index 98% rename from PG-PuReMD/src/cuda_valence_angles.h rename to PG-PuReMD/src/cuda/cuda_valence_angles.h index 65109597..d8abac25 100644 --- a/PG-PuReMD/src/cuda_valence_angles.h +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.h @@ -22,8 +22,10 @@ #ifndef __CUDA_VALENCE_ANGLES_H_ #define __CUDA_VALENCE_ANGLES_H_ -#include "reax_types.h" -#include "vector.h" +#include "../reax_types.h" + +#include "../vector.h" + CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *, global_parameters, single_body_parameters *, three_body_header *, control_params *, diff --git a/PG-PuReMD/src/cuda_validation.cu b/PG-PuReMD/src/cuda/cuda_validation.cu similarity index 99% rename from PG-PuReMD/src/cuda_validation.cu rename to PG-PuReMD/src/cuda/cuda_validation.cu index 34a42430..34ebf6e5 100644 --- a/PG-PuReMD/src/cuda_validation.cu +++ b/PG-PuReMD/src/cuda/cuda_validation.cu @@ -1,13 +1,12 @@ -#include "reax_types.h" #include "cuda_validation.h" #include "cuda_utils.h" -#include "index_utils.h" -#include "list.h" -#include "tool_box.h" -#include "vector.h" +#include "../index_utils.h" +#include "../list.h" +#include "../tool_box.h" +#include "../vector.h" bool check_zero( real p1, real p2 ) diff --git a/PG-PuReMD/src/cuda_validation.h b/PG-PuReMD/src/cuda/cuda_validation.h similarity index 97% rename from PG-PuReMD/src/cuda_validation.h rename to PG-PuReMD/src/cuda/cuda_validation.h index 42eb37a4..7faa773b 100644 --- a/PG-PuReMD/src/cuda_validation.h +++ b/PG-PuReMD/src/cuda/cuda_validation.h @@ -3,50 +3,60 @@ #ifndef __CUDA_VALIDATION_H__ #define __CUDA_VALIDATION_H__ -#include "reax_types.h" +#include "../reax_types.h" + #ifdef __cplusplus extern "C" { #endif - int validate_neighbors( reax_system *, reax_list **lists ); + int validate_sym_dbond_indices( reax_system *system, storage *workspace, reax_list **lists ); int validate_bonds( reax_system *, storage *, reax_list ** ); + int validate_hbonds( reax_system *, storage *, reax_list ** ); + int validate_sparse_matrix( reax_system *, storage * ); int validate_grid( reax_system * ); + int validate_workspace( reax_system *, storage * ); int validate_data( reax_system *, simulation_data * ); + int validate_three_bodies( reax_system *, storage *, reax_list ** ); + int validate_atoms( reax_system *, reax_list ** ); int print_sparse_matrix( sparse_matrix *H ); + int print_sparse_matrix_host( sparse_matrix *H ); int print_host_rvec2( rvec2 *, int ); + int print_device_rvec2( rvec2 *, int ); int print_host_array( real *, int ); + int print_device_array( real *, int ); void compare_rvec2( rvec2 *host, rvec2 *device, int N, const char 
*msg ); + void compare_array( real *host, real *device, int N, const char *msg ); int check_zeros_host( rvec2 *host, int n, const char * ); -int check_zeros_device( rvec2 *device, int n, const char * ); - +int check_zeros_device( rvec2 *device, int n, const char * ); #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda_hydrogen_bonds.h deleted file mode 100644 index 7e1644f1..00000000 --- a/PG-PuReMD/src/cuda_hydrogen_bonds.h +++ /dev/null @@ -1,66 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD - Purdue ReaxFF Molecular Dynamics Program - - Copyright (2010) Purdue University - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Joseph Fogarty, jcfogart@mail.usf.edu - Sagar Pandit, pandit@usf.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. - ----------------------------------------------------------------------*/ - -#ifndef __HBONDS_H_ -#define __HBONDS_H_ - -#include "reax_types.h" -#include "reax_types.h" - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs ( reax_atom *, - storage , - reax_list ); - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL ( reax_atom *, - storage , - reax_list, int ); - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess ( reax_atom *, - storage , - reax_list , int ); - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *, - single_body_parameters *, - hbond_parameters *, - global_parameters , - control_params *, - storage , - reax_list , - reax_list , - int , - int , - real *, - rvec *); - -CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *, - single_body_parameters *, - hbond_parameters *, - global_parameters , - control_params *, - storage , - reax_list , - reax_list , - int , - int , - real *, - rvec *); - -#endif diff --git a/PG-PuReMD/src/cuda_init_md.cu b/PG-PuReMD/src/cuda_init_md.cu deleted file mode 100644 index 044e8e73..00000000 --- a/PG-PuReMD/src/cuda_init_md.cu +++ /dev/null @@ -1,14 +0,0 @@ - -#include "cuda_init_md.h" - -#include "reax_types.h" -#include "cuda_utils.h" - -#include "tool_box.h" - -void Cuda_Init_ScratchArea( ) -{ - cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" ); - - host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" ); -} diff --git a/PG-PuReMD/src/cuda_init_md.h b/PG-PuReMD/src/cuda_init_md.h deleted file mode 100644 index cf7b5249..00000000 --- a/PG-PuReMD/src/cuda_init_md.h +++ /dev/null @@ -1,15 +0,0 @@ - -#ifndef __CUDA_INIT_MD_H__ -#define __CUDA_INIT_MD_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -void Cuda_Init_ScratchArea( ); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/PG-PuReMD/src/cuda_integrate.cu b/PG-PuReMD/src/cuda_integrate.cu deleted file mode 100644 index 936c6816..00000000 --- a/PG-PuReMD/src/cuda_integrate.cu +++ /dev/null @@ -1,105 +0,0 @@ - -#include "cuda_integrate.h" -#include "reax_types.h" - -#include "vector.h" -#include "cuda_utils.h" - - -CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms, - single_body_parameters *sbp, real dt, int n ) -{ - 
real inv_m; - rvec dx; - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= n ) - { - return; - } - - /* velocity verlet, 1st part */ - atom = &(my_atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); -} - - -void bNVT_update_velocity_part1( reax_system *system, real dt ) -{ - int blocks; - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms, - single_body_parameters *sbp, real dt, int n ) -{ - reax_atom *atom; - real inv_m; - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= n ) - { - return; - } - - /* velocity verlet, 2nd part */ - atom = &(my_atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); -} - - -void bNVT_update_velocity_part2( reax_system *system, real dt ) -{ - int blocks; - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n ) -{ - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= n ) - { - return; - } - - /* Scale velocities and positions at t+dt */ - atom = &(my_atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); -} - - -void bNVT_scale_velocities( reax_system *system, real lambda ) -{ - int blocks; - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, lambda, system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); -} diff --git a/PG-PuReMD/src/cuda_lin_alg.cu b/PG-PuReMD/src/cuda_lin_alg.cu deleted file mode 100644 index 4f37d577..00000000 --- a/PG-PuReMD/src/cuda_lin_alg.cu +++ /dev/null @@ -1,624 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD - Purdue ReaxFF Molecular Dynamics Program - - Copyright (2010) Purdue University - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Joseph Fogarty, jcfogart@mail.usf.edu - Sagar Pandit, pandit@usf.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. 
- ----------------------------------------------------------------------*/ - -#include "cuda_lin_alg.h" - -#include "reax_types.h" - -#include "cuda_shuffle.h" -#include "cuda_utils.h" -#include "cuda_reduction.h" - - -//one thread per row -CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results, - int rows ) -{ - int i, col; - real results_row; - real val; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= rows ) - { - return; - } - - results_row = 0; - - for (int c = H.start[i]; c < H.end[i]; c++) - { - col = H.entries [c].j; - val = H.entries[c].val; - - results_row += val * vec[col]; - } - - results[i] = results_row; -} - - -//32 thread warp per matrix row. -//invoked as follows -// <<< system->N, 32 >>> -//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows) -CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results, - int num_rows ) -{ -#if defined(__SM_35__) - real vals; - int x; -#else - extern __shared__ real vals[]; -#endif - int jj; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; - int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1); - int row_start; - int row_end; - // one warp per row - int row = warp_id; - -#if defined(__SM_35__) - vals = 0; -#else - vals[threadIdx.x] = 0; -#endif - - if (row < num_rows) - { - row_start = H.start[row]; - row_end = H.end[row]; - - // compute running sum per thread - for ( jj = row_start + lane; jj < row_end; - jj += MATVEC_KER_THREADS_PER_ROW ) -#if defined(__SM_35__) - { - vals += H.entries[jj].val * vec[ H.entries[jj].j ]; - } - } -#else - { - vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ]; - } - } - - __syncthreads( ); -#endif - - // parallel reduction in shared memory - //SIMD instructions with a WARP are synchronous -- so we do not need to synch here -#if defined(__SM_35__) - for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2) - { - vals += shfl( vals, x ); - } - - if (lane == 0 && row < num_rows) - { - results[row] = vals; - } -#else - if (lane < 16) - { - vals[threadIdx.x] += vals[threadIdx.x + 16]; - } - __syncthreads( ); - if (lane < 8) - { - vals[threadIdx.x] += vals[threadIdx.x + 8]; - } - __syncthreads( ); - if (lane < 4) - { - vals[threadIdx.x] += vals[threadIdx.x + 4]; - } - __syncthreads( ); - if (lane < 2) - { - vals[threadIdx.x] += vals[threadIdx.x + 2]; - } - __syncthreads( ); - if (lane < 1) - { - vals[threadIdx.x] += vals[threadIdx.x + 1]; - } - __syncthreads( ); - - // first thread writes the result - if (lane == 0 && row < num_rows) - { - results[row] = vals[threadIdx.x]; - } -#endif -} - - -//one thread per row -CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results, - int rows ) -{ - int i, c, col; - rvec2 results_row; - real val; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= rows) - { - return; - } - - results_row[0] = 0.0; - results_row[1] = 0.0; - - for (c = H.start[i]; c < H.end[i]; c++) - { - col = H.entries [c].j; - val = H.entries[c].val; - - results_row[0] += val * vec [col][0]; - results_row[1] += val * vec [col][1]; - } - - results[i][0] = results_row[0]; - results[i][1] = results_row[1]; -} - - -//32 thread warp per matrix row. 
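
k_matvec_csr above (and the dual-vector variant that follows) is a standard warp-per-row CSR sparse matrix-vector product: each 32-thread warp owns one matrix row, its lanes stride across that row's nonzeros, and a tree reduction combines the 32 partial sums before lane 0 writes the row result. The kernel below restates the single-vector case using modern warp shuffles; the flat start/end/col/val arrays and the kernel name are stand-ins for the project's sparse_matrix layout, and a launch with 32 threads per row and a warp-aligned block size is assumed.

#include <cuda_runtime.h>

#define THREADS_PER_ROW 32

/* start[r]..end[r] index the nonzeros of row r; col[] holds column indices
 * and val[] the matrix values (mirroring H.start, H.end, H.entries[].j,
 * H.entries[].val in the deleted code) */
__global__ void k_csr_matvec_warp( const int *start, const int *end,
        const int *col, const double *val, const double *x, double *y,
        int num_rows )
{
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int row = thread_id / THREADS_PER_ROW;
    int lane = thread_id % THREADS_PER_ROW;
    double sum = 0.0;

    if ( row < num_rows )
    {
        /* running partial sum per lane, striding over the row */
        for ( int jj = start[row] + lane; jj < end[row]; jj += THREADS_PER_ROW )
        {
            sum += val[jj] * x[ col[jj] ];
        }
    }

    /* intra-warp tree reduction: shuffle-based analogue of the
     * shared-memory ladder in the deleted kernel */
    for ( int offset = THREADS_PER_ROW / 2; offset >= 1; offset /= 2 )
    {
        sum += __shfl_down_sync( 0xffffffff, sum, offset );
    }

    if ( lane == 0 && row < num_rows )
    {
        y[row] = sum;
    }
}
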
-//invoked as follows -// <<< system->N, 32 >>> -//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows) -CUDA_GLOBAL void k_dual_matvec_csr( sparse_matrix H, rvec2 *vec, - rvec2 *results, int num_rows ) -{ -#if defined(__SM_35__) - rvec2 rvals; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; - int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1); - int row_start; - int row_end; - // one warp per row - int row = warp_id; - - rvals[0] = 0; - rvals[1] = 0; - - if (row < num_rows) - { - row_start = H.start[row]; - row_end = H.end[row]; - - for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) - { - rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; - rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; - } - } - - for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2) - { - rvals[0] += shfl( rvals[0], s); - rvals[1] += shfl( rvals[1], s); - } - - if (lane == 0 && row < num_rows) - { - results[row][0] = rvals[0]; - results[row][1] = rvals[1]; - } - -#else - extern __shared__ rvec2 rvals[]; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / 32; - int lane = thread_id & (32 - 1); - int row_start; - int row_end; - // one warp per row - //int row = warp_id; - int row = warp_id; - - rvals[threadIdx.x][0] = 0; - rvals[threadIdx.x][1] = 0; - - if (row < num_rows) - { - row_start = H.start[row]; - row_end = H.end[row]; - - // compute running sum per thread - for(int jj = row_start + lane; jj < row_end; jj += 32) - { - rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; - rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; - } - } - - __syncthreads( ); - - // parallel reduction in shared memory - //SIMD instructions with a WARP are synchronous -- so we do not need to synch here - if (lane < 16) - { - rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; - rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; - } - __syncthreads( ); - if (lane < 8) - { - rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; - rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; - } - __syncthreads( ); - if (lane < 4) - { - rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; - rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; - } - __syncthreads( ); - if (lane < 2) - { - rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; - rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; - } - __syncthreads( ); - if (lane < 1) - { - rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; - rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; - } - __syncthreads( ); - - // first thread writes the result - if (lane == 0 && row < num_rows) - { - results[row][0] = rvals[threadIdx.x][0]; - results[row][1] = rvals[threadIdx.x][1]; - } - -#endif -} - - -void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count ) -{ - //res = ax + by - //use the cublas here - int blocks; - - blocks = (count / DEF_BLOCK_SIZE) + - ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> - ( res, a, x, b, y, count ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count ) -{ - //res = a*b - vector multiplication - //use the cublas here. - int blocks; - - blocks = (count / DEF_BLOCK_SIZE) + - ((count % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - - k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>> - ( res, a, b, count ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n) -{ - storage *workspace; - int j; - - j = blockIdx.x * blockDim.x + threadIdx.x; - - if ( j >= n ) - { - return; - } - - workspace = &( p_workspace ); - - //for( j = 0; j < system->n; ++j ) { - // residual - workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; - workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; - - // apply diagonal pre-conditioner - workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; - workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; - //} -} - - -void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n ) -{ - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> - (*workspace, b, n); - - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, - real alpha_0, real alpha_1, int n, rvec2 *my_dot ) -{ - storage *workspace; - rvec2 alpha; - int j; - - j = blockIdx.x * blockDim.x + threadIdx.x; - - if ( j >= n ) - { - return; - } - - workspace = &( p_workspace ); - alpha[0] = alpha_0; - alpha[1] = alpha_1; - my_dot[j][0] = my_dot[j][1] = 0.0; - - //for( j = 0; j < system->n; ++j ) { - // update x - x[j][0] += alpha[0] * workspace->d2[j][0]; - x[j][1] += alpha[1] * workspace->d2[j][1]; - - // update residual - workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; - workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; - - // apply diagonal pre-conditioner - workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; - workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; - - // dot product: r.p - my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; - my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; - //} -} - - -void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha, - int n, rvec2 result ) -{ - int blocks; - rvec2 *tmp = (rvec2 *) scratch; - - cuda_memset( tmp, 0, sizeof(rvec2) * ( 2 * n + 1), - "cuda_dualcg_preconditioner" ); - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> - (*workspace, x, alpha[0], alpha[1], n, tmp); - - cudaThreadSynchronize( ); - cudaCheckError( ); - - //Reduction to calculate my_dot - k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> - ( tmp, tmp + n, n); - - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> - ( tmp + n, tmp + 2*n, blocks); - - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device( result, (tmp + 2*n), sizeof(rvec2), - cudaMemcpyDeviceToHost, "my_dot" ); -} - - -void Cuda_Norm( rvec2 *arr, int n, rvec2 result ) -{ - int blocks; - rvec2 *tmp = (rvec2 *) scratch; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - - k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> - (arr, tmp, n, INITIAL); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> - (tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), - cudaMemcpyDeviceToHost, "cuda_norm_rvec2" ); -} - - -void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n ) -{ - int blocks; - rvec2 *tmp = (rvec2 *) scratch; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> - ( a, b, tmp, n ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> - //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * BLOCKS_POW_2 >>> - ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), - cudaMemcpyDeviceToHost, "cuda_dot" ); -} - - -void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) -{ - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> - ( x, a, b[0], b[1], c, n); - - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n ) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= n) - { - return; - } - - dst[i] = src[i][index]; -} - - -void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n ) -{ - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>> - ( dst, src, index, n); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= n) - { - return; - } - - dst[i][index] = src[i]; -} - - -void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n) -{ - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - - k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>> - ( dst, src, index, n); - - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size ) -{ - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - - cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" ); - - //One thread per row implementation - //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> - // (*H, a, b, n); - //cudaThreadSynchronize (); - //cudaCheckError (); - - //One warp per row implementation -#if defined(__SM_35__) - k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> -#else - k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, - sizeof(rvec2) * MATVEC_BLOCK_SIZE >>> -#endif - ( *H, a, b, n ); - cudaThreadSynchronize( ); - cudaCheckError( ); -} - - -void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size ) -{ - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE) == 0 ? 
0 : 1); - - cuda_memset( b, 0, sizeof(real) * size, "dual_matvec:result" ); - - //one thread per row implementation - //k_matvec <<< blocks, DEF_BLOCK_SIZE >>> - // (*H, a, b, n); - //cudaThreadSynchronize (); - //cudaCheckError (); - -#if defined(__SM_35__) - k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> -#else - k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, - sizeof(real) * MATVEC_BLOCK_SIZE>>> -#endif - (*H, a, b, n); - - cudaThreadSynchronize( ); - cudaCheckError( ); -} diff --git a/PG-PuReMD/src/cuda_system_props.cu b/PG-PuReMD/src/cuda_system_props.cu deleted file mode 100644 index 3202f64a..00000000 --- a/PG-PuReMD/src/cuda_system_props.cu +++ /dev/null @@ -1,406 +0,0 @@ - -#include "cuda_system_props.h" - -#include "cuda_utils.h" -#include "cuda_reduction.h" -#include "center_mass.h" -#include "cuda_copy.h" -#include "cuda_shuffle.h" - -#include "vector.h" - - -CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, - real *block_results, int n ) -{ -#if defined(__SM_35__) - extern __shared__ real my_sbp[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real sdata = 0; - - if (i < n) - { - sdata = sbp[ my_atoms[i].type ].mass; - } - __syncthreads( ); - - for(int z = 16; z >=1; z/=2) - { - sdata += shfl( sdata, z); - } - - if (threadIdx.x % 32 == 0) - { - my_sbp[threadIdx.x >> 5] = sdata; - } - - __syncthreads( ); - - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset]; - } - - __syncthreads( ); - } - - if(threadIdx.x == 0) - { - block_results[blockIdx.x] = my_sbp[0]; - } - -#else - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if (i < n) - { - x = sbp[ my_atoms[i].type ].mass; - } - - sdata[ threadIdx.x ] = x; - __syncthreads( ); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if (threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads( ); - } - - if (threadIdx.x == 0) - { - block_results[ blockIdx.x] = sdata [0]; - } - -#endif -} - - -extern "C" void dev_compute_total_mass( reax_system *system, real *local_val ) -{ - real *block_mass = (real *) scratch; - cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" ); - - k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>> - (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>> - (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), - cudaMemcpyDeviceToHost, "total_mass:tmp"); -} - - -CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, - real *block_results, int n ) -{ -#if defined(__SM_35__) - extern __shared__ real my_sbpdot[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real sdata = 0; - rvec p; - - if (i < n) - { - sdata = sbp[ my_atoms[i].type ].mass; - rvec_Scale( p, sdata, my_atoms[ i ].v ); - sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); - } - - __syncthreads( ); - - for(int z = 16; z >=1; z/=2) - { - sdata += shfl( sdata, z); - } - - if (threadIdx.x % 32 == 0) - { - my_sbpdot[threadIdx.x >> 5] = sdata; - } - - __syncthreads( ); - - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1) - 
{ - if (threadIdx.x < offset) - { - my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset]; - } - - __syncthreads( ); - } - - if (threadIdx.x == 0) - { - block_results[blockIdx.x] = my_sbpdot[0]; - } - -#else - extern __shared__ real sdata []; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real m = 0; - rvec p; - - if (i < n) - { - m = sbp[ my_atoms[i].type ].mass; - rvec_Scale( p, m, my_atoms[ i ].v ); - m = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); - } - - sdata[ threadIdx.x ] = m; - __syncthreads( ); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if (threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads( ); - } - - if (threadIdx.x == 0) - { - block_results[blockIdx.x] = sdata[0]; - } -#endif -} - -extern "C" void dev_compute_kinetic_energy( reax_system *system, - simulation_data *data, real *local_val ) -{ - real *block_energy = (real *) scratch; - cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" ); - - k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>> - (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>> - (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device( local_val, block_energy + BLOCKS_POW_2, - //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, - sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" ); - //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, - // sizeof (real), "kinetic_energy"); -} - - -extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, - rvec vcm, rvec amcm ) -{ - rvec *l_xcm, *l_vcm, *l_amcm; - rvec *r_scratch = (rvec *)scratch; - -#if defined( __SM_35__) - // xcm - cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" ); - l_xcm = r_scratch; - - center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>> - (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - copy_host_device( xcm, l_xcm + BLOCKS_POW_2, - sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" ); - - // vcm - cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" ); - l_vcm = r_scratch; - - center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>> - (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec), - cudaMemcpyDeviceToHost, "momentum:vcm" ); - - // amcm - cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); - l_amcm = r_scratch; - - center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n ); - cudaThreadSynchronize( ); - cudaCheckError( ); - - k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * 
BLOCKS_POW_2) >>> - (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec), - cudaMemcpyDeviceToHost, "momemtum:amcm" ); - -#else - cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" ); - - l_xcm = r_scratch; - l_vcm = r_scratch + (BLOCKS_POW_2 + 1); - l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); - - center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> - (l_xcm, l_vcm, l_amcm, - l_xcm + BLOCKS_POW_2, - l_vcm + BLOCKS_POW_2, - l_amcm + BLOCKS_POW_2, - BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" ); - copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" ); - copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" ); -#endif -} - - -extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm ) -{ -#if defined(__SM_35__) - real *partial_results = (real *) scratch; - cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" ); - - compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> - (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2, - sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" ); - -#else - real *partial_results = (real *) scratch; - //real *local_results; - - cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp"); - //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1)); - - compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize( ); - cudaCheckError( ); - - compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> - (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize( ); - cudaCheckError( ); - - copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, - sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results"); -#endif -} - - -extern "C" void dev_sync_simulation_data( simulation_data *data ) -{ - 
Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data ); -} - - -/* -CUDA_GLOBAL void ker_kinetic_energy (reax_atom *my_atoms, - single_body_parameters *sbp, int n, real *block_results) -{ - extern __shared__ real sken[]; - rvec p; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - m = sbp[my_atoms[i].type].mass; - rvec_Scale( p, m, my_atoms[i].v ); - x = 0.5 * rvec_Dot( p, my_atoms[i].v ); - } - sken[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sken[threadIdx.x] += sken[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sken[0]; - } -} - - -void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *p_ekin) -{ - real *spad = (real *) scratch; - cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "kinetic_energy"); - - ker_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad, spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (p_ekin, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, - sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy"); -} -*/ diff --git a/PG-PuReMD/src/ffield.c b/PG-PuReMD/src/ffield.c index 443d9051..d985339b 100644 --- a/PG-PuReMD/src/ffield.c +++ b/PG-PuReMD/src/ffield.c @@ -20,7 +20,8 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" - #if defined(PURE_REAX) + +#if defined(PURE_REAX) #include "ffield.h" #include "tool_box.h" #elif defined(LAMMPS_REAX) diff --git a/PG-PuReMD/src/ffield.h b/PG-PuReMD/src/ffield.h index 9aa2a27f..313c3e67 100644 --- a/PG-PuReMD/src/ffield.h +++ b/PG-PuReMD/src/ffield.h @@ -24,6 +24,16 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + char Read_Force_Field( char*, reax_interaction*, control_params* ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c index c57527fe..19133fce 100644 --- a/PG-PuReMD/src/forces.c +++ b/PG-PuReMD/src/forces.c @@ -21,15 +21,6 @@ #include "reax_types.h" -#include "index_utils.h" -#ifdef HAVE_CUDA - #include "cuda_forces.h" - #include "cuda_lin_alg.h" - #include "cuda_neighbors.h" - #include "cuda_utils.h" - #include "cuda_validation.h" -#endif - #if defined(PURE_REAX) #include "forces.h" #include "bond_orders.h" @@ -63,11 +54,7 @@ #include "reax_vector.h" #endif - -#ifdef HAVE_CUDA -void Cuda_Total_Forces( reax_system *, control_params *, simulation_data *, storage * ); -void Cuda_Total_Forces_PURE( reax_system *, storage * ); -#endif +#include "index_utils.h" interaction_function Interaction_Functions[NUM_INTRS]; @@ -221,41 +208,6 @@ void Compute_Total_Force( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -void Cuda_Compute_Total_Force( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, mpi_datatypes *mpi_data ) -{ - rvec *f; - - f = (rvec *) host_scratch; - memset( f, 0, sizeof(rvec) * system->N ); - - Cuda_Total_Forces( system, control, data, workspace ); - -#if defined(PURE_REAX) - /* now all forces are computed to their partially-final values - * based on the neighbors 
information each processor has had. - * final values of force on each atom needs to be computed by adding up - * all partially-final pieces */ - - //MVAPICH2 - copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N , - cudaMemcpyDeviceToHost, "total_force:f:get" ); - - Coll( system, mpi_data, f, mpi_data->mpi_rvec, - sizeof(rvec) / sizeof(void), rvec_unpacker ); - - copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N, - cudaMemcpyHostToDevice, "total_force:f:put" ); - - Cuda_Total_Forces_PURE( system, dev_workspace ); -#endif - -} -#endif - - // Essentially no-cuda copies of cuda kernels, to be used only in the mpi-not-gpu version //////////////////////// // HBOND ISSUE @@ -1851,173 +1803,6 @@ int Compute_Forces( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -int Cuda_Compute_Forces( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, reax_list **lists, - output_controls *out_control, mpi_datatypes *mpi_data ) -{ - int charge_flag, retVal; - -#if defined(LOG_PERFORMANCE) - real t_start = 0; - - //MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - t_start = Get_Time( ); - } -#endif - - retVal = SUCCESS; - - /********* init forces ************/ - if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 ) - { - charge_flag = TRUE; - } - else - { - charge_flag = FALSE; - } - - if ( charge_flag == TRUE ) - { - retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control ); - -// int i; -// static reax_list **temp_lists; -// -// if ( data->step == 0 ) -// { -// temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" ); -// for ( i = 0; i < LIST_N; ++i ) -// { -// temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" ); -// temp_lists[i]->allocated = FALSE; -// } -// Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs, -// TYP_BOND, *temp_lists + BONDS ); -// Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs, -// TYP_HBOND, *temp_lists + HBONDS ); -// } -// else -// { -// Delete_List( *temp_lists + BONDS ); -// Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs, -// TYP_BOND, *temp_lists + BONDS ); -// Delete_List( *temp_lists + HBONDS ); -// Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs, -// TYP_HBOND, *temp_lists + HBONDS ); -// -// } -// Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND ); -// Print_Bonds( system, temp_lists, control ); -// Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND ); -// Print_HBonds( system, temp_lists, control, data->step ); -// Print_HBond_Indices( system, temp_lists, control, data->step ); -// exit( 0 ); - } - else - { - retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control ); - } - - if ( retVal == SUCCESS ) - { - //validate_sparse_matrix( system, workspace ); - -#if defined(LOG_PERFORMANCE) - //MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &(data->timing.init_forces) ); - } -#endif - - /********* bonded interactions ************/ - retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); - -#if defined(LOG_PERFORMANCE) - //MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &(data->timing.bonded) ); - } -#endif - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ 
step%d: completed bonded\n", - system->my_rank, data->step ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - } - - if ( retVal == SUCCESS ) - { - /**************** charges ************************/ -#if defined(PURE_REAX) - if ( charge_flag == TRUE ) - { - Cuda_QEq( system, control, data, workspace, out_control, mpi_data ); - } - -#if defined(LOG_PERFORMANCE) - //MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &(data->timing.qEq) ); - } -#endif - -#if defined(DEBUG_FOCUS) - fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step); - MPI_Barrier( MPI_COMM_WORLD ); -#endif -#endif //PURE_REAX - - /********* nonbonded interactions ************/ - Cuda_Compute_NonBonded_Forces( system, control, data, workspace, - lists, out_control, mpi_data ); - -#if defined(LOG_PERFORMANCE) - //MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &(data->timing.nonb) ); - } -#endif -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n", - system->my_rank, data->step ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - - /*********** total force ***************/ - Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data ); - -#if defined(LOG_PERFORMANCE) - //MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &(data->timing.bonded) ); - } -#endif -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ step%d: total forces computed\n", - system->my_rank, data->step ); - //Print_Total_Force( system, data, workspace ); - MPI_Barrier( MPI_COMM_WORLD ); - -#endif - -// Print_Forces( system ); - } - - return retVal; -} -#endif - - int validate_device( reax_system *system, simulation_data *data, storage *workspace, reax_list **lists ) { diff --git a/PG-PuReMD/src/forces.h b/PG-PuReMD/src/forces.h index 6b4218e8..0579f092 100644 --- a/PG-PuReMD/src/forces.h +++ b/PG-PuReMD/src/forces.h @@ -28,6 +28,10 @@ extern interaction_function Interaction_Functions[NUM_INTRS]; +#ifdef __cplusplus +extern "C" { +#endif + void Init_Force_Functions( control_params* ); int Compute_Forces( reax_system*, control_params*, simulation_data*, @@ -36,10 +40,11 @@ int Compute_Forces( reax_system*, control_params*, simulation_data*, void Estimate_Storages( reax_system*, control_params*, reax_list**, int*, int*, int*, int* ); -int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls*, mpi_datatypes* ); - int validate_device( reax_system *, simulation_data *, storage *, reax_list ** ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/geo_tools.c b/PG-PuReMD/src/geo_tools.c index b97123a9..dff292e7 100644 --- a/PG-PuReMD/src/geo_tools.c +++ b/PG-PuReMD/src/geo_tools.c @@ -19,7 +19,10 @@ <http://www.gnu.org/licenses/>. 
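
The header hunks in this patch (forces.h above; ffield.h earlier; geo_tools.h, grid.h, hydrogen_bonds.h, init_md.h, integrate.h, and io_tools.h below) all apply the same change: every exported prototype is wrapped in an extern "C" block. With the CUDA sources now compiled as C++ under src/cuda/, prototypes included from the C side would otherwise be name-mangled and fail to link against the C object files. A schematic header showing the shape of the change; the guard macro, file, and function names are placeholders, not part of the project:

#ifndef __EXAMPLE_H__
#define __EXAMPLE_H__

#include "reax_types.h"

#ifdef __cplusplus
extern "C" {
#endif

/* declarations keep C linkage whether included from .c or .cu translation units */
void Example_Function( reax_system *, control_params * );

#ifdef __cplusplus
}
#endif

#endif
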
----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "geo_tools.h" + #include "allocate.h" #include "box.h" #include "tool_box.h" diff --git a/PG-PuReMD/src/geo_tools.h b/PG-PuReMD/src/geo_tools.h index 80786856..628e8f74 100644 --- a/PG-PuReMD/src/geo_tools.h +++ b/PG-PuReMD/src/geo_tools.h @@ -29,10 +29,6 @@ // CUSTOM ATOM: serial element name x y z #define CUSTOM_ATOM_FORMAT " %d %s %s %lf %lf %lf" -char Read_Geo( char*, reax_system*, control_params*, - simulation_data*, storage*, mpi_datatypes* ); - - /*PDB format : http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/guide2.2_frame.html @@ -114,10 +110,23 @@ COLUMNS DATA TYPE FIELD DEFINITION #define PDB_ATOM_FORMAT_O_LENGTH 81 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n" + +#ifdef __cplusplus +extern "C" { +#endif + +char Read_Geo( char*, reax_system*, control_params*, + simulation_data*, storage*, mpi_datatypes* ); + char Read_PDB( char*, reax_system*, control_params*, - simulation_data*, storage*, mpi_datatypes* ); + simulation_data*, storage*, mpi_datatypes* ); char Write_PDB( reax_system*, reax_list*, simulation_data*, - control_params*, mpi_datatypes*, output_controls* ); + control_params*, mpi_datatypes*, output_controls* ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/grid.c b/PG-PuReMD/src/grid.c index 3714766c..d893f6c6 100644 --- a/PG-PuReMD/src/grid.c +++ b/PG-PuReMD/src/grid.c @@ -19,15 +19,17 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "grid.h" + #include "allocate.h" +#include "index_utils.h" #include "io_tools.h" #include "reset_tools.h" #include "tool_box.h" #include "vector.h" -#include "index_utils.h" - /* determines the exchange boundaries with nbrs in terms of gcells */ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm ) diff --git a/PG-PuReMD/src/grid.h b/PG-PuReMD/src/grid.h index ad51e699..cb124da7 100644 --- a/PG-PuReMD/src/grid.h +++ b/PG-PuReMD/src/grid.h @@ -24,10 +24,24 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Setup_New_Grid( reax_system*, control_params*, MPI_Comm ); + void Update_Grid( reax_system*, control_params*, MPI_Comm ); + void Bin_My_Atoms( reax_system*, reallocate_data* ); + void Reorder_My_Atoms( reax_system*, storage* ); + void Bin_Boundary_Atoms( reax_system* ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/hydrogen_bonds.c b/PG-PuReMD/src/hydrogen_bonds.c index 5743feb5..dfd7abac 100644 --- a/PG-PuReMD/src/hydrogen_bonds.c +++ b/PG-PuReMD/src/hydrogen_bonds.c @@ -21,8 +21,6 @@ #include "reax_types.h" -#include "index_utils.h" - #if defined(PURE_REAX) #include "hydrogen_bonds.h" #include "bond_orders.h" @@ -37,6 +35,8 @@ #include "reax_vector.h" #endif +#include "index_utils.h" + // DANIEL // This function is taken straight from PuReMD, with minimal changes to accomodate the new datastructures diff --git a/PG-PuReMD/src/hydrogen_bonds.h b/PG-PuReMD/src/hydrogen_bonds.h index 346f0045..e4f58e10 100644 --- a/PG-PuReMD/src/hydrogen_bonds.h +++ b/PG-PuReMD/src/hydrogen_bonds.h @@ -24,7 +24,17 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Hydrogen_Bonds( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls* ); + storage*, reax_list**, output_controls* ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/init_md.c 
b/PG-PuReMD/src/init_md.c index 595724bc..2e406d1a 100644 --- a/PG-PuReMD/src/init_md.c +++ b/PG-PuReMD/src/init_md.c @@ -23,17 +23,6 @@ #include <stddef.h> -#ifdef HAVE_CUDA - #include "cuda_allocate.h" - #include "cuda_list.h" - #include "cuda_copy.h" - #include "cuda_forces.h" - #include "cuda_init_md.h" - #include "cuda_neighbors.h" - #include "cuda_reset_tools.h" - #include "cuda_validation.h" -#endif - #if defined(PURE_REAX) #include "init_md.h" #include "allocate.h" @@ -239,76 +228,6 @@ int Init_System( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -int Cuda_Init_System( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - mpi_datatypes *mpi_data, char *msg ) -{ - int i, ret; - reax_atom *atom; - int nrecv[MAX_NBRS]; - - Setup_New_Grid( system, control, MPI_COMM_WORLD ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d GRID:\n", system->my_rank ); - Print_Grid( &(system->my_grid), stderr ); -#endif - - Bin_My_Atoms( system, &(workspace->realloc) ); - Reorder_My_Atoms( system, workspace ); - - /* estimate N and total capacity */ - for ( i = 0; i < MAX_NBRS; ++i ) - { - nrecv[i] = 0; - } - - MPI_Barrier( MPI_COMM_WORLD ); - system->max_recved = 0; - system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv, - Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE ); - system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP ); - Bin_Boundary_Atoms( system ); - - /* Sync atoms here to continue the computation */ - dev_alloc_system( system ); - Sync_System( system ); - - /* estimate numH and Hcap */ - Cuda_Reset_Atoms( system, control ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: n=%d local_cap=%d\n", - system->my_rank, system->n, system->local_cap ); - fprintf( stderr, "p%d: N=%d total_cap=%d\n", - system->my_rank, system->N, system->total_cap ); - fprintf( stderr, "p%d: numH=%d H_cap=%d\n", - system->my_rank, system->numH, system->Hcap ); -#endif - - Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D ); - - Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D ); - -// if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE ) -// { -// return FAILURE; -// } - - /* initialize velocities so that desired init T can be attained */ - if ( !control->restart || (control->restart && control->random_vel) ) - { - Generate_Initial_Velocities( system, control->T_init ); - } - - Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); - - return SUCCESS; -} -#endif - - /************************ initialize simulation data ************************/ void Init_Simulation_Data( reax_system *system, control_params *control, simulation_data *data, char *msg ) @@ -411,102 +330,6 @@ void Init_Simulation_Data( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, - simulation_data *data, char *msg ) -{ - dev_alloc_simulation_data( data ); - - Reset_Simulation_Data( data ); - - if ( !control->restart ) - { - data->step = data->prev_steps = 0; - } - - switch ( control->ensemble ) - { - case NVE: - data->N_f = 3 * system->bigN; - Cuda_Evolve = Velocity_Verlet_NVE; - control->virial = 0; - break; - - case bNVT: - data->N_f = 3 * system->bigN + 1; - Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT; - control->virial = 0; - break; - - case nhNVT: - fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" ); - data->N_f = 3 * system->bigN + 1; - Cuda_Evolve = 
Velocity_Verlet_Nose_Hoover_NVT_Klein; - control->virial = 0; - if ( !control->restart || (control->restart && control->random_vel) ) - { - data->therm.G_xi = control->Tau_T * - (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->therm.v_xi_old = 0; - data->therm.xi = 0; - } - break; - - case sNPT: /* Semi-Isotropic NPT */ - data->N_f = 3 * system->bigN + 4; - Cuda_Evolve = Velocity_Verlet_Berendsen_NPT; - control->virial = 1; - if ( !control->restart ) - { - Reset_Pressures( data ); - } - break; - - case iNPT: /* Isotropic NPT */ - data->N_f = 3 * system->bigN + 2; - Cuda_Evolve = Velocity_Verlet_Berendsen_NPT; - control->virial = 1; - if ( !control->restart ) - { - Reset_Pressures( data ); - } - break; - - case NPT: /* Anisotropic NPT */ - data->N_f = 3 * system->bigN + 9; - Cuda_Evolve = Velocity_Verlet_Berendsen_NPT; - control->virial = 1; - - fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); - break; - - default: - fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); - } - - /* initialize the timer(s) */ - MPI_Barrier( MPI_COMM_WORLD ); - if ( system->my_rank == MASTER_NODE ) - { - data->timing.start = Get_Time( ); - -#if defined(LOG_PERFORMANCE) - Reset_Timing( &data->timing ); -#endif - } - -#if defined(DEBUG) - fprintf( stderr, "data->N_f: %8.3f\n", data->N_f ); -#endif -} -#endif - - #elif defined(LAMMPS_REAX) int Init_System( reax_system *system, char *msg ) { @@ -603,22 +426,6 @@ void Init_Workspace( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -void Cuda_Init_Workspace( reax_system *system, control_params *control, - storage *workspace, char *msg ) -{ - dev_alloc_workspace( system, control, dev_workspace, - system->local_cap, system->total_cap, msg ); - - memset( &(workspace->realloc), 0, sizeof(reallocate_data) ); - Cuda_Reset_Workspace( system, workspace ); - - /* Initialize the Taper function */ - Init_Taper( control, dev_workspace ); -} -#endif - - /************** setup communication data structures **************/ int Init_MPI_Datatypes( reax_system *system, storage *workspace, mpi_datatypes *mpi_data, char *msg ) @@ -885,88 +692,6 @@ int Init_Lists( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -int Cuda_Init_Lists( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, reax_list **lists, - mpi_datatypes *mpi_data, char *msg ) -{ - int ret; - int Htop; - - /* ignore returned error, as system->d_max_far_nbrs was not valid */ - ret = Cuda_Estimate_Neighbors( system, data->step ); - - Dev_Make_List( system->total_cap, system->total_far_nbrs, - TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n", - system->my_rank, system->total_far_nbrs, - (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) ); - fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap ); -#endif - - Cuda_Init_Neighbor_Indices( system ); - - Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists ); - - /* estimate storage for bonds and hbonds */ - Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step ); - - /* estimate storage for charge sparse matrix */ -// Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, 
dev_lists ); - - dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries ); - - Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) ); - - //MATRIX CHANGES - //workspace->L = NULL; - //workspace->U = NULL; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n", - system->my_rank, system->total_cm_entries, dev_workspace->H.m ); - fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n", - system->my_rank, Htop, - (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) ); -#endif - - if ( control->hbond_cut > 0.0 && system->numH > 0 ) - { - Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS ); -// Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS ); - - Cuda_Init_HBond_Indices( system ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n", - system->my_rank, system->total_hbonds, - (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) ); -#endif - } - - /* bonds list */ - Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS ); -// Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS ); - - Cuda_Init_Bond_Indices( system ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n", - system->my_rank, total_bonds, - (int)(total_bonds * sizeof(bond_data) / (1024 * 1024)) ); -#endif - - /* 3bodies list: since a more accurate estimate of the num. - * of three body interactions requires that bond orders have - * been computed, delay estimation until for computation */ - - return SUCCESS; -} -#endif - - #if defined(PURE_REAX) void Initialize( reax_system *system, control_params *control, simulation_data *data, storage *workspace, @@ -1106,108 +831,6 @@ void Pure_Initialize( reax_system *system, control_params *control, } -#ifdef HAVE_CUDA -void Cuda_Initialize( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control, - mpi_datatypes *mpi_data ) -{ - char msg[MAX_STR]; - real t_start, t_end; - - /* HOST/DEVICE SCRATCH */ - Cuda_Init_ScratchArea( ); - - /* MPI_DATATYPES */ - if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE ) - { - fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n", - system->my_rank ); - fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); - } - - /* SYSTEM */ - if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE ) - { - fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); - fprintf( stderr, "p%d: system could not be initialized! terminating.\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); - } - - /* GRID */ - dev_alloc_grid( system ); - Sync_Grid( &system->my_grid, &system->d_my_grid ); - - //validate_grid( system ); - - /* SIMULATION_DATA */ - Cuda_Init_Simulation_Data( system, control, data, msg ); - - /* WORKSPACE */ - Cuda_Init_Workspace( system, control, workspace, msg ); - -#if defined(DEBUG) - fprintf( stderr, "p%d: initialized workspace\n", system->my_rank ); -#endif - - //Sync the taper here from host to device. 
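
Cuda_Init_System and Cuda_Init_Lists above size device-side structures the way the host code does: pad the current count by a safety factor and clamp it from below, as in MAX( (int)(system->N * SAFE_ZONE), MIN_CAP ), so that modest growth between reneighborings does not force an immediate reallocation. A small host-side illustration of that sizing rule; the constant values below are placeholders rather than the project's actual definitions:

#include <stdio.h>

#define SAFE_ZONE 1.2
#define MIN_CAP 50

/* padded capacity: never smaller than MIN_CAP, otherwise count * SAFE_ZONE */
static int padded_capacity( int n )
{
    int cap = (int) (n * SAFE_ZONE);

    return (cap > MIN_CAP) ? cap : MIN_CAP;
}

int main( void )
{
    printf( "N = %4d  ->  capacity = %d\n", 1000, padded_capacity( 1000 ) );
    printf( "N = %4d  ->  capacity = %d\n", 10, padded_capacity( 10 ) );

    return 0;
}
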
- - /* CONTROL */ - dev_alloc_control( control ); - - /* LISTS */ - if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) == - FAILURE ) - { - fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); - fprintf( stderr, "p%d: system could not be initialized! terminating.\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); - } - -#if defined(DEBUG) - fprintf( stderr, "p%d: initialized lists\n", system->my_rank ); -#endif - - /* OUTPUT Files */ - if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE ) - { - fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); - fprintf( stderr, "p%d: could not open output files! terminating...\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); - } - -#if defined(DEBUG) - fprintf( stderr, "p%d: output files opened\n", system->my_rank ); -#endif - - /* Lookup Tables */ - if ( control->tabulate ) - { - if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE ) - { - fprintf( stderr, "p%d: %s\n", system->my_rank, msg ); - fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n", - system->my_rank ); - MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); - } - -#if defined(DEBUG) - fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank ); -#endif - } - -#if defined(DEBUG) - fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank ); -#endif -} -#endif - - #elif defined(LAMMPS_REAX) void Initialize( reax_system *system, control_params *control, simulation_data *data, storage *workspace, diff --git a/PG-PuReMD/src/init_md.h b/PG-PuReMD/src/init_md.h index 5a66e4fb..c5222cbd 100644 --- a/PG-PuReMD/src/init_md.h +++ b/PG-PuReMD/src/init_md.h @@ -25,14 +25,25 @@ #include "reax_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +void Generate_Initial_Velocities( reax_system *, real ); + +int Init_MPI_Datatypes( reax_system *, storage *, mpi_datatypes *, char * ); + void Initialize( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls*, mpi_datatypes* ); void Pure_Initialize( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls*, mpi_datatypes* ); -void Cuda_Initialize( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls*, mpi_datatypes* ); +void Init_Taper( control_params *, storage * ); + +#ifdef __cplusplus +} +#endif #endif diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c index 88b406b5..b0200897 100644 --- a/PG-PuReMD/src/integrate.c +++ b/PG-PuReMD/src/integrate.c @@ -19,6 +19,8 @@ <http://www.gnu.org/licenses/>. 
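
The integrate.c hunk that follows removes Cuda_Velocity_Verlet_Berendsen_NVT, whose temperature coupling computes the Berendsen rescaling factor lambda = sqrt( 1 + (dt / Tau_T) * (T_target / T - 1) ), clamping the argument before taking the square root and then scaling every velocity by lambda. A host-side restatement of just that arithmetic; the MIN_dT and MAX_dT clamp values here are placeholders for the project's constants:

#include <stdio.h>
#include <math.h>

#define MIN_dT 0.0
#define MAX_dT 4.0

/* Berendsen velocity-rescaling factor, clamped as in the deleted driver code */
static double berendsen_lambda( double dt, double tau_T, double T_target,
        double T_current )
{
    double lambda = 1.0 + (dt / tau_T) * (T_target / T_current - 1.0);

    if ( lambda < MIN_dT )
    {
        lambda = MIN_dT;
    }
    else if ( lambda > MAX_dT )
    {
        lambda = MAX_dT;
    }

    return sqrt( lambda );
}

int main( void )
{
    /* gentle scaling toward a 300 K target from a current 320 K */
    printf( "lambda = %f\n", berendsen_lambda( 0.25, 500.0, 300.0, 320.0 ) );

    return 0;
}
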
----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "integrate.h" #include "allocate.h" @@ -33,14 +35,6 @@ #include "tool_box.h" #include "vector.h" -#ifdef HAVE_CUDA - #include "cuda_allocate.h" - #include "cuda_integrate.h" - #include "cuda_copy.h" - #include "cuda_neighbors.h" - #include "cuda_reset_tools.h" -#endif - int Velocity_Verlet_NVE( reax_system* system, control_params* control, simulation_data *data, storage *workspace, reax_list **lists, @@ -339,143 +333,6 @@ int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control, } -#ifdef HAVE_CUDA -int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control, - simulation_data *data, storage *workspace, reax_list **lists, - output_controls *out_control, mpi_datatypes *mpi_data ) -{ - int i, steps, renbr, ret; - static int verlet_part1_done = FALSE, estimate_nbrs_done = 0; - real inv_m, dt, lambda; - rvec dx; - reax_atom *atom; - int *bond_top, *hb_top; - int Htop, num_3body; - int total_hbonds, count, total_bonds; - int bond_cap, cap_3body; - real t_over_start, t_over_elapsed; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - - dt = control->dt; - steps = data->step - data->prev_steps; - renbr = steps % control->reneighbor == 0 ? TRUE : FALSE; - ret = SUCCESS; - - Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - - if ( verlet_part1_done == FALSE ) - { - /* velocity verlet, 1st part */ - bNVT_update_velocity_part1( system, dt ); - verlet_part1_done = TRUE; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - - if ( renbr ) - { - Update_Grid( system, control, mpi_data->world ); - } - - Output_Sync_Atoms( system ); - Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr ); - Sync_Atoms( system ); - - /* synch the Grid to the Device here */ - Sync_Grid( &system->my_grid, &system->d_my_grid ); - - init_blocks( system ); - -#if defined(__CUDA_DEBUG_LOG__) - fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n", - system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); -#endif - } - - Cuda_Reset( system, control, data, workspace, lists ); - - if ( renbr ) - { -#if defined(DEBUG) - t_over_start = Get_Time (); -#endif - - if ( estimate_nbrs_done == 0 ) - { - //TODO: move far_nbrs reallocation checks outside of renbr frequency check - ret = Cuda_Estimate_Neighbors( system, data->step ); - estimate_nbrs_done = 1; - } - - if ( ret == SUCCESS && estimate_nbrs_done == 1 ) - { - Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); - estimate_nbrs_done = 2; - -#if defined(DEBUG) - t_over_elapsed = Get_Timing_Info( t_over_start ); - fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n", - system->my_rank, data->step, t_over_elapsed ); -#endif - } - } - - if ( ret == SUCCESS ) - { - ret = Cuda_Compute_Forces( system, control, data, workspace, - lists, out_control, mpi_data ); - } - - if ( ret == SUCCESS ) - { - /* velocity verlet, 2nd part */ - bNVT_update_velocity_part2( system, dt ); - -#if defined(DEBUG_FOCUS) - fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - - /* temperature scaler */ - Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); - - lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); - if ( 
lambda < MIN_dT ) - { - lambda = MIN_dT; - } - else if (lambda > MAX_dT ) - { - lambda = MAX_dT; - } - lambda = SQRT( lambda ); - - /* Scale velocities and positions at t+dt */ - bNVT_scale_velocities( system, lambda ); - - Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ step%d: scaled velocities\n", - system->my_rank, data->step ); - MPI_Barrier( MPI_COMM_WORLD ); -#endif - - verlet_part1_done = FALSE; - estimate_nbrs_done = 0; - } - - return ret; -} -#endif - - /* uses Berendsen-type coupling for both T and P. * All box dimensions are scaled by the same amount, * there is no change in the angles between axes. */ diff --git a/PG-PuReMD/src/integrate.h b/PG-PuReMD/src/integrate.h index 63fa9cbf..9a25c761 100644 --- a/PG-PuReMD/src/integrate.h +++ b/PG-PuReMD/src/integrate.h @@ -24,6 +24,11 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + int Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls*, mpi_datatypes* ); @@ -49,9 +54,9 @@ int Velocity_Verlet_Flexible_NPT( reax_system*, control_params*, output_controls*, mpi_datatypes* ); */ -//CUDA SPECIFIC FUNCTIONS -int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*, - simulation_data*, storage*, reax_list**, output_controls*, - mpi_datatypes* ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c index c7c0f2fe..131f8a2e 100644 --- a/PG-PuReMD/src/io_tools.c +++ b/PG-PuReMD/src/io_tools.c @@ -20,7 +20,7 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" -#include "index_utils.h" + #if defined(PURE_REAX) #include "io_tools.h" #include "basic_comm.h" @@ -41,6 +41,8 @@ #include "reax_vector.h" #endif +#include "index_utils.h" + print_interaction Print_Interactions[NUM_INTRS]; diff --git a/PG-PuReMD/src/io_tools.h b/PG-PuReMD/src/io_tools.h index 6ae2d6d8..f83c9686 100644 --- a/PG-PuReMD/src/io_tools.h +++ b/PG-PuReMD/src/io_tools.h @@ -25,45 +25,71 @@ #include "reax_types.h" +#ifdef __cplusplus +extern "C" { +#endif + int Init_Output_Files( reax_system*, control_params*, - output_controls*, mpi_datatypes*, char* ); + output_controls*, mpi_datatypes*, char* ); + int Close_Output_Files( reax_system*, control_params*, - output_controls*, mpi_datatypes* ); - -void Print_Box( simulation_box*, char*, FILE* ); - -void Print_Grid( grid*, FILE* ); -void Print_GCell_Exchange_Bounds( int, neighbor_proc* ); -void Print_Native_GCells( reax_system* ); -void Print_All_GCells( reax_system*); - -void Print_Init_Atoms( reax_system*, storage* ); -void Print_My_Atoms( reax_system* ); -void Print_My_Ext_Atoms( reax_system* ); - -void Print_Far_Neighbors( reax_system*, reax_list**, control_params *); -void Print_Sparse_Matrix( reax_system*, sparse_matrix* ); -void Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* ); -void Print_Linear_System( reax_system*, control_params*, storage*, int ); -void Print_LinSys_Soln( reax_system*, real*, real*, real* ); -void Print_Charges( reax_system* ); -void Print_HBonds( reax_system*, reax_list**, control_params *, int ); -void Print_HBond_Indices( reax_system*, reax_list**, control_params *, int ); -void Print_Bonds( reax_system*, reax_list**, control_params *); -void Print_Bond_List2( reax_system*, reax_list*, char* ); -void Print_Total_Force( reax_system*, simulation_data*, storage* ); -void Output_Results( reax_system*, control_params*, 
simulation_data*, - reax_list**, output_controls*, mpi_datatypes* ); + output_controls*, mpi_datatypes* ); + +void Print_Box( simulation_box*, char*, FILE* ); + +void Print_Grid( grid*, FILE* ); + +void Print_GCell_Exchange_Bounds( int, neighbor_proc* ); + +void Print_Native_GCells( reax_system* ); + +void Print_All_GCells( reax_system*); + +void Print_Init_Atoms( reax_system*, storage* ); + +void Print_My_Atoms( reax_system* ); + +void Print_My_Ext_Atoms( reax_system* ); + +void Print_Far_Neighbors( reax_system*, reax_list**, control_params *); + +void Print_Sparse_Matrix( reax_system*, sparse_matrix* ); + +void Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* ); + +void Print_Linear_System( reax_system*, control_params*, storage*, int ); + +void Print_LinSys_Soln( reax_system*, real*, real*, real* ); + +void Print_Charges( reax_system* ); + +void Print_HBonds( reax_system*, reax_list**, control_params *, int ); + +void Print_HBond_Indices( reax_system*, reax_list**, control_params *, int ); + +void Print_Bonds( reax_system*, reax_list**, control_params *); + +void Print_Bond_List2( reax_system*, reax_list*, char* ); + +void Print_Total_Force( reax_system*, simulation_data*, storage* ); + +void Output_Results( reax_system*, control_params*, simulation_data*, + reax_list**, output_controls*, mpi_datatypes* ); #if defined(DEBUG_FOCUS) || defined(TEST_FORCES) || defined(TEST_ENERGY) void Debug_Marker_Bonded( output_controls*, int ); + void Debug_Marker_Nonbonded( output_controls*, int ); -void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*, - simulation_data*, output_controls* ); -void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*, - simulation_data*, output_controls* ); -void Print_Bond_List( reax_system*, control_params*, simulation_data*, - reax_list**, output_controls* ); + +void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*, + simulation_data*, output_controls* ); + +void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*, + simulation_data*, output_controls* ); + +void Print_Bond_List( reax_system*, control_params*, simulation_data*, + reax_list**, output_controls* ); + /*void Dummy_Printer( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls* ); void Print_Bond_Orders( reax_system*, control_params*, simulation_data*, @@ -89,23 +115,28 @@ void Print_Total_Force( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls* ); void Compare_Total_Forces( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls* );*/ + //void Print_Total_Force( reax_system*, control_params* ); + void Print_Force_Files( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls*, - mpi_datatypes * ); + storage*, reax_list**, output_controls*, mpi_datatypes * ); + //void Init_Force_Test_Functions( ); int fn_qsort_intcmp( const void *, const void * ); void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*, - simulation_data*, output_controls* ); + simulation_data*, output_controls* ); void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*, - simulation_data*, output_controls* ); + simulation_data*, output_controls* ); void Print_Bond_List( reax_system*, control_params*, simulation_data*, - reax_list**, output_controls*); + reax_list**, output_controls*); +#endif +#ifdef __cplusplus +} #endif diff --git a/PG-PuReMD/src/lin_alg.c 
b/PG-PuReMD/src/lin_alg.c index bac272a0..e9ce62e7 100644 --- a/PG-PuReMD/src/lin_alg.c +++ b/PG-PuReMD/src/lin_alg.c @@ -19,6 +19,8 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "lin_alg.h" #include "basic_comm.h" @@ -27,9 +29,7 @@ #include "vector.h" #ifdef HAVE_CUDA - #include "cuda_lin_alg.h" - #include "cuda_utils.h" - #include "cuda_validation.h" + #include "cuda/cuda_validation.h" #endif #if defined(CG_PERFORMANCE) @@ -100,13 +100,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2 #endif #ifdef HAVE_CUDA - check_zeros_host (x, system->N, "x"); + check_zeros_host( x, system->N, "x" ); #endif Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer ); #ifdef HAVE_CUDA - check_zeros_host (x, system->N, "x"); + check_zeros_host( x, system->N, "x" ); #endif dual_Sparse_MatVec( H, x, workspace->q2, N ); @@ -285,352 +285,6 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2 } -#ifdef HAVE_CUDA -int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, - rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, - simulation_data *data ) -{ - int i, j, n, N, matvecs, scale; - rvec2 tmp, alpha, beta; - rvec2 my_sum, norm_sqr, b_norm, my_dot; - rvec2 sig_old, sig_new; - MPI_Comm comm; - rvec2 *spad = (rvec2 *) host_scratch; - int a; - - n = system->n; - N = system->N; - comm = mpi_data->world; - matvecs = 0; - scale = sizeof(rvec2) / sizeof(void); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - matvecs = 0; - t_start = matvec_time = dot_time = 0; - t_start = Get_Time( ); - } -#endif - - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer ); -//#endif - -// check_zeros_device( x, system->N, "x" ); - - copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" ); - Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); - copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" ); - -// check_zeros_device( x, system->N, "x" ); - -// compare_rvec2 (workspace->x, x, N, "x"); -// if (data->step > 0) { -// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b"); -// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x"); -// -// exit (0); -// } - - -//#ifdef __CUDA_DEBUG__ -// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N ); -//#endif - //originally we were using only H->n which was system->n (init_md.c) - //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap); - - Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap); - -// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - -// if (data->step > 0) exit (0); - - // tryQEq - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); -//#endif - - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, - cudaMemcpyDeviceToHost, "CG:q2:get" ); - Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker); - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, - cudaMemcpyHostToDevice,"CG:q2:put" ); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &matvec_time ); - } -#endif - -//#ifdef __CUDA_DEBUG__ -// for( j = 0; 
j < system->n; ++j ) { -// // residual -// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0]; -// workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1]; -// // apply diagonal pre-conditioner -// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; -// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; -// } -//#endif - - Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n ); - -// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2"); -// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2"); - - /* norm of b */ -//#ifdef __CUDA_DEBUG__ -// my_sum[0] = my_sum[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_sum[0] += SQR( workspace->b[j][0] ); -// my_sum[1] += SQR( workspace->b[j][1] ); -// } -// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); -//#endif - - my_sum[0] = my_sum[1] = 0; - Cuda_Norm (b, n, my_sum); - -// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); - - MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm ); - b_norm[0] = SQRT( norm_sqr[0] ); - b_norm[1] = SQRT( norm_sqr[1] ); - //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] ); - - /* dot product: r.d */ -//#ifdef __CUDA_DEBUG__ -// my_dot[0] = my_dot[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0]; -// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1]; -// } -// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = my_dot[1] = 0; - Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n); - -// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); - - MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); - - //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &dot_time ); - } -#endif - - for ( i = 1; i < 300; ++i ) - { - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer); -//#endif - - copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, - cudaMemcpyDeviceToHost, "cg:d2:get" ); - Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); - copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, - cudaMemcpyHostToDevice, "cg:d2:put" ); - - //print_device_rvec2 (dev_workspace->d2, N); - -//#ifdef __CUDA_DEBUG__ -// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N ); -//#endif - - Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, - system->total_cap ); - - /* - fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n ); - fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n"); - print_sparse_matrix_host (&workspace->H); - fprintf (stderr, " ******* HOST Vector ***************\n"); - print_host_rvec2 (workspace->d2, system->N); - fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n"); - print_sparse_matrix (&dev_workspace->H); - fprintf (stderr, " ******* Device Vector ***************\n"); - print_device_rvec2 (dev_workspace->d2, system->N); - */ - //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - - // tryQEq - // MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); -//#endif - - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, - cudaMemcpyDeviceToHost, "cg:q2:get" ); - 
Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, - cudaMemcpyHostToDevice, "cg:q2:put" ); - -// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &matvec_time ); - } -#endif - - /* dot product: d.q */ -//#ifdef __CUDA_DEBUG__ -// my_dot[0] = my_dot[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0]; -// my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1]; -// } -// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = my_dot[1] = 0; - Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n); - //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); - - MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] ); - - alpha[0] = sig_new[0] / tmp[0]; - alpha[1] = sig_new[1] / tmp[1]; - my_dot[0] = my_dot[1] = 0; - -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // update x -// workspace->x[j][0] += alpha[0] * workspace->d2[j][0]; -// workspace->x[j][1] += alpha[1] * workspace->d2[j][1]; -// // update residual -// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; -// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; -// // apply diagonal pre-conditioner -// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; -// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; -// // dot product: r.p -// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0]; -// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1]; -// } -// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = my_dot[1] = 0; - Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot ); - - //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); - -// compare_rvec2 (workspace->x, dev_workspace->x, N, "x"); -// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2"); -// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2"); - - sig_old[0] = sig_new[0]; - sig_old[1] = sig_new[1]; - MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); - - //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &dot_time ); - } -#endif - - if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol ) - { - break; - } - - beta[0] = sig_new[0] / sig_old[0]; - beta[1] = sig_new[1] / sig_old[1]; - -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // d = p + beta * d -// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0]; -// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1]; -// } -//#endif - - Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta, - dev_workspace->d2, system->n ); - -// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2"); - } - - - if ( SQRT(sig_new[0]) / b_norm[0] <= tol ) - { - //for( j = 0; j < n; ++j ) - // workspace->t[j] = workspace->x[j][1]; - //fprintf (stderr, "Getting started with Cuda_CG1 \n"); - - Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n ); - - //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); - //compare_array (workspace->t, dev_workspace->t, system->n, "t"); 
- - matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t, - mpi_data, fout ); - - //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs ); - //for( j = 0; j < n; ++j ) - // workspace->x[j][1] = workspace->t[j]; - - Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n ); - } - else if ( SQRT(sig_new[1]) / b_norm[1] <= tol ) - { - //for( j = 0; j < n; ++j ) - // workspace->s[j] = workspace->x[j][0]; - - Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n ); - - //compare_array (workspace->s, dev_workspace->s, system->n, "s"); - //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); - - //fprintf (stderr, "Getting started with Cuda_CG2 \n"); - - matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s, - mpi_data, fout ); - - //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs ); - //for( j = 0; j < system->n; ++j ) - // workspace->x[j][0] = workspace->s[j]; - - Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n ); - } - - if ( i >= 300 ) - { - fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n", - system->my_rank, i ); - fprintf( stderr, " [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] ); - fprintf( stderr, " [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] ); - } - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n", - i + 1, matvecs, matvec_time, dot_time ); - } -#endif - - return (i + 1) + matvecs; -} -#endif - - void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) { int i, j, k, si; @@ -745,153 +399,6 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b, } -#ifdef HAVE_CUDA -int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real - *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) -{ - int i, j, scale; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new, sig0; - real *spad = (real *) host_scratch; - - scale = sizeof(real) / sizeof(void); - - /* x is on the device */ - //MVAPICH2 - memset( spad, 0, sizeof(real) * system->total_cap ); - copy_host_device( spad, x, sizeof(real) * system->total_cap, - cudaMemcpyDeviceToHost, "cuda_cg:x:get" ); - Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); - - //MVAPICH2 - copy_host_device( spad, x, sizeof(real) * system->total_cap, - cudaMemcpyHostToDevice, "cuda_cg:x:put" ); - Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap ); - - // tryQEq - // MVAPICH2 - copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, - cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); - Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); - - //MVAPICH2 - copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, - cudaMemcpyHostToDevice, "cuda_cg:q:put" ); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &matvec_time ); - } -#endif - - Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q, - system->n ); - //for( j = 0; j < system->n; ++j ) - // workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition - Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r, - dev_workspace->Hdia_inv, system->n ); - - //TODO do the parallel_norm on the device for the local sum - copy_host_device( spad, b, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, "cuda_cg:b:get" ); - b_norm = 
Parallel_Norm( spad, system->n, mpi_data->world ); - - //TODO do the parallel dot on the device for the local sum - copy_host_device( spad, dev_workspace->r, sizeof(real) * system->total_cap, - cudaMemcpyDeviceToHost, "cuda_cg:r:get" ); - copy_host_device( spad + system->total_cap, dev_workspace->d, sizeof(real) * system->total_cap, - cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); - sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n, - mpi_data->world ); - - sig0 = sig_new; - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &dot_time ); - } -#endif - - for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) - { - //MVAPICH2 - copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, - cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); - Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); - copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, - cudaMemcpyHostToDevice, "cuda_cg:d:put" ); - - Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap ); - - //tryQEq - copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, - cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); - Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); - copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, - cudaMemcpyHostToDevice, "cuda_cg:q:get" ); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &matvec_time ); - } -#endif - - //TODO do the parallel dot on the device for the local sum - copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); - copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); - tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world ); - - alpha = sig_new / tmp; - //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n ); - Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n ); - - //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n ); - Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0, - dev_workspace->r, system->n ); - /* pre-conditioning */ - //for( j = 0; j < system->n; ++j ) - // workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; - Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r, - dev_workspace->Hdia_inv, system->n ); - - sig_old = sig_new; - - //TODO do the parallel dot on the device for the local sum - copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, "cuda_cg:r:get" ); - copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, "cuda_cg:p:get" ); - sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world ); - //fprintf (stderr, "Device: sig_new: %f \n", sig_new ); - - beta = sig_new / sig_old; - Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, - dev_workspace->d, system->n ); - -#if defined(CG_PERFORMANCE) - if ( system->my_rank == MASTER_NODE ) - { - Update_Timing_Info( &t_start, &dot_time ); - } -#endif - } - - if ( i >= 300 ) - { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } - - return i; -} -#endif - - int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) { diff --git a/PG-PuReMD/src/lin_alg.h 
b/PG-PuReMD/src/lin_alg.h index f401fb2d..3663978e 100644 --- a/PG-PuReMD/src/lin_alg.h +++ b/PG-PuReMD/src/lin_alg.h @@ -24,23 +24,32 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + int GMRES( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); + real*, real, real*, mpi_datatypes*, FILE* ); + int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); + real*, real, real*, mpi_datatypes*, FILE* ); + int dual_CG( reax_system*, storage*, sparse_matrix*, - rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *); + rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *); + int CG( reax_system*, storage*, sparse_matrix*, real*, real, real*, mpi_datatypes*, FILE* ); + int PCG( reax_system*, storage*, sparse_matrix*, real*, real, - sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* ); + sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* ); + int sCG( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); + real*, real, real*, mpi_datatypes*, FILE* ); + +#ifdef __cplusplus +} +#endif -//CUDA Functions -int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*, - rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *); -int Cuda_CG( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); #endif diff --git a/PG-PuReMD/src/list.c b/PG-PuReMD/src/list.c index 05213cb3..69736afb 100644 --- a/PG-PuReMD/src/list.c +++ b/PG-PuReMD/src/list.c @@ -22,11 +22,11 @@ #include "reax_types.h" #if defined(PURE_REAX) -#include "list.h" -#include "tool_box.h" + #include "list.h" + #include "tool_box.h" #elif defined(LAMMPS_REAX) -#include "reax_list.h" -#include "reax_tool_box.h" + #include "reax_list.h" + #include "reax_tool_box.h" #endif diff --git a/PG-PuReMD/src/list.h b/PG-PuReMD/src/list.h index 1f29f5f8..df6ec82f 100644 --- a/PG-PuReMD/src/list.h +++ b/PG-PuReMD/src/list.h @@ -24,17 +24,21 @@ #include "reax_types.h" + #ifdef _cplusplus extern "C" { #endif - void Print_List( reax_list* ); void Make_List( int, int, int, reax_list* ); void Delete_List( reax_list* ); +#ifdef _cplusplus +} +#endif + #if defined(LAMMPS_REAX) || defined(PURE_REAX) static inline int Num_Entries( int i, reax_list *l ) { @@ -60,12 +64,7 @@ static inline void Set_End_Index( int i, int val, reax_list *l ) { l->end_index[i] = val; } - #endif -#ifdef _cplusplus -} -#endif - #endif diff --git a/PG-PuReMD/src/lookup.c b/PG-PuReMD/src/lookup.c index 2c6652f9..b071ea89 100644 --- a/PG-PuReMD/src/lookup.c +++ b/PG-PuReMD/src/lookup.c @@ -21,12 +21,6 @@ #include "reax_types.h" -#include "index_utils.h" - -#ifdef HAVE_CUDA - #include "cuda_lookup.h" -#endif - #if defined(PURE_REAX) #include "lookup.h" #include "nonbonded.h" @@ -37,6 +31,12 @@ #include "reax_tool_box.h" #endif +#include "index_utils.h" + +#ifdef HAVE_CUDA + #include "cuda/cuda_lookup.h" +#endif + /* Fills solution into x. Warning: will modify c and d! 
*/ void Tridiagonal_Solve( const real *a, const real *b, diff --git a/PG-PuReMD/src/lookup.h b/PG-PuReMD/src/lookup.h index f6e45bd1..4db34ce0 100644 --- a/PG-PuReMD/src/lookup.h +++ b/PG-PuReMD/src/lookup.h @@ -26,7 +26,17 @@ //extern LR_lookup_table **LR; + +#ifdef _cplusplus +extern "C" { +#endif + int Init_Lookup_Tables( reax_system*, control_params*, real *, - mpi_datatypes*, char* ); + mpi_datatypes*, char* ); + +#ifdef _cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/multi_body.c b/PG-PuReMD/src/multi_body.c index b480d3bb..aab4957d 100644 --- a/PG-PuReMD/src/multi_body.c +++ b/PG-PuReMD/src/multi_body.c @@ -32,6 +32,7 @@ #include "reax_list.h" #include "reax_vector.h" #endif + #include "index_utils.h" diff --git a/PG-PuReMD/src/multi_body.h b/PG-PuReMD/src/multi_body.h index aaed59e5..9cc865b4 100644 --- a/PG-PuReMD/src/multi_body.h +++ b/PG-PuReMD/src/multi_body.h @@ -24,7 +24,17 @@ #include "reax_types.h" + +#ifdef _cplusplus +extern "C" { +#endif + void Atom_Energy( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls* ); + storage*, reax_list**, output_controls* ); + +#ifdef _cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/neighbors.c b/PG-PuReMD/src/neighbors.c index 753ecc36..e938329a 100644 --- a/PG-PuReMD/src/neighbors.c +++ b/PG-PuReMD/src/neighbors.c @@ -19,14 +19,16 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "neighbors.h" + +#include "index_utils.h" #include "io_tools.h" #include "list.h" #include "tool_box.h" #include "vector.h" -#include "index_utils.h" - int compare_far_nbrs( const void *p1, const void *p2 ) { diff --git a/PG-PuReMD/src/neighbors.h b/PG-PuReMD/src/neighbors.h index 0a1e3daf..37c3642b 100644 --- a/PG-PuReMD/src/neighbors.h +++ b/PG-PuReMD/src/neighbors.h @@ -31,8 +31,18 @@ int, int*, int*, int*, int, int, int, real, rvec, ivec );*/ -void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*, - reax_list** ); + +#ifdef _cplusplus +extern "C" { +#endif + +void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*, reax_list** ); + int Estimate_NumNeighbors( reax_system*, reax_list** ); +#ifdef _cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/nonbonded.c b/PG-PuReMD/src/nonbonded.c index 8edd2b11..e073ec62 100644 --- a/PG-PuReMD/src/nonbonded.c +++ b/PG-PuReMD/src/nonbonded.c @@ -20,7 +20,7 @@ ----------------------------------------------------------------------*/ #include "reax_types.h" -#include "index_utils.h" + #if defined(PURE_REAX) #include "nonbonded.h" #include "bond_orders.h" @@ -34,10 +34,12 @@ #include "reax_vector.h" #endif +#include "index_utils.h" + void vdW_Coulomb_Energy( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control ) + simulation_data *data, storage *workspace, + reax_list **lists, output_controls *out_control ) { int i, j, pj, natoms; int start_i, end_i, orig_i, orig_j; diff --git a/PG-PuReMD/src/nonbonded.h b/PG-PuReMD/src/nonbonded.h index 81613be5..45137bf8 100644 --- a/PG-PuReMD/src/nonbonded.h +++ b/PG-PuReMD/src/nonbonded.h @@ -24,14 +24,24 @@ #include "reax_types.h" + +#ifdef _cplusplus +extern "C" { +#endif + void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls* ); + storage*, reax_list**, output_controls* ); void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*, - 
simulation_data*, storage*, - reax_list**, output_controls* ); + simulation_data*, storage*, reax_list**, output_controls* ); void Compute_Polarization_Energy( reax_system*, simulation_data* ); void LR_vdW_Coulomb( reax_system*, real *, int, int, real, LR_data* ); + +#ifdef _cplusplus +} +#endif + + #endif diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index 4d677687..30c23722 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -40,13 +40,13 @@ #include "vector.h" #ifdef HAVE_CUDA - #include "cuda_copy.h" - #include "cuda_environment.h" - #include "cuda_neighbors.h" - #include "cuda_post_evolve.h" - #include "cuda_reset_tools.h" - #include "cuda_utils.h" - #include "cuda_validation.h" + #include "cuda/cuda_copy.h" + #include "cuda/cuda_environment.h" + #include "cuda/cuda_neighbors.h" + #include "cuda/cuda_post_evolve.h" + #include "cuda/cuda_reset_tools.h" + #include "cuda/cuda_utils.h" + #include "cuda/cuda_validation.h" #endif evolve_function Evolve; @@ -156,25 +156,6 @@ int Cuda_Post_Evolve( reax_system* system, control_params* control, #endif -#ifdef HAVE_CUDA -void init_blocks( reax_system *system ) -{ - compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n ); - compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 ); - - compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N ); - compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N ); - - compute_matvec_blocks( &MATVEC_BLOCKS, system->N ); - -#if defined(__CUDA_DEBUG_LOG__) - fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d - N:%d \n", - MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N ); -#endif -} -#endif - - static void usage( char* argv[] ) { fprintf( stderr, "usage: ./%s geometry ffield control\n", argv[0] ); diff --git a/PG-PuReMD/src/random.c b/PG-PuReMD/src/random.c index 2811a6b5..ffe55458 100644 --- a/PG-PuReMD/src/random.c +++ b/PG-PuReMD/src/random.c @@ -19,6 +19,8 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "random.h" /* System random number generator used linear congruance method with diff --git a/PG-PuReMD/src/random.h b/PG-PuReMD/src/random.h index a3ce3526..66a5d59d 100644 --- a/PG-PuReMD/src/random.h +++ b/PG-PuReMD/src/random.h @@ -24,19 +24,28 @@ #include "reax_types.h" + +#ifdef _cplusplus +extern "C" { +#endif + /* System random number generator used linear congruance method with large periodicity for generation of pseudo random number. function Random returns this random number appropriately scaled so that 0 <= Random(range) < range */ -double Random(double); +double Random( double ); /* This function seeds the system pseudo random number generator with current time. 
Use this function once in the begining to initialize the system */ -void Randomize(); +void Randomize( ); /* GRandom return random number with gaussian distribution with mean and standard deviation "sigma" */ -double GRandom(double, double); +double GRandom( double, double ); + +#ifdef _cplusplus +} +#endif #endif diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h index c39277b8..38810bd6 100644 --- a/PG-PuReMD/src/reax_types.h +++ b/PG-PuReMD/src/reax_types.h @@ -96,6 +96,14 @@ #define FABS fabs #define FMOD fmod +/* transcendental constant pi */ +#if defined(M_PI) + /* GNU C library (libc), defined in math.h */ + #define PI (M_PI) +#else + #define PI 3.14159265 +#endif + #define SQR(x) ((x)*(x)) #define CUBE(x) ((x)*(x)*(x)) #define DEG2RAD(a) ((a)*PI/180.0) @@ -104,13 +112,6 @@ #define MIN(x,y) (((x) < (y)) ? (x) : (y)) #define MAX3(x,y,z) MAX( MAX(x,y), z) -/* transcendental constant pi */ -#if defined(M_PI) - /* GNU C library (libc), defined in math.h */ - #define PI (M_PI) -#else - #define PI 3.14159265 -#endif /* ??? */ #define C_ele 332.06371 /* ??? */ diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c index a605cc79..c3778145 100644 --- a/PG-PuReMD/src/reset_tools.c +++ b/PG-PuReMD/src/reset_tools.c @@ -21,8 +21,6 @@ #include "reax_types.h" -#include "index_utils.h" - #if defined(PURE_REAX) #include "reset_tools.h" #include "list.h" @@ -35,6 +33,8 @@ #include "reax_vector.h" #endif +#include "index_utils.h" + void Reset_Atoms( reax_system* system, control_params *control ) { diff --git a/PG-PuReMD/src/reset_tools.h b/PG-PuReMD/src/reset_tools.h index 34f38760..001b7f57 100644 --- a/PG-PuReMD/src/reset_tools.h +++ b/PG-PuReMD/src/reset_tools.h @@ -24,11 +24,11 @@ #include "reax_types.h" + #ifdef __cplusplus extern "C" { #endif - void Reset_Pressures( simulation_data* ); void Reset_Simulation_Data( simulation_data* ); @@ -49,9 +49,9 @@ void Reset( reax_system*, control_params*, simulation_data*, storage*, reax_list void Reset_Test_Forces( reax_system*, storage* ); #endif - #ifdef __cplusplus } #endif + #endif diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c index 967e025d..6b8ddcdf 100644 --- a/PG-PuReMD/src/restart.c +++ b/PG-PuReMD/src/restart.c @@ -19,7 +19,10 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ +#include "reax_types.h" + #include "restart.h" + #include "allocate.h" #include "box.h" #include "tool_box.h" diff --git a/PG-PuReMD/src/restart.h b/PG-PuReMD/src/restart.h index 39a5dcd5..3d13a5a1 100644 --- a/PG-PuReMD/src/restart.h +++ b/PG-PuReMD/src/restart.h @@ -24,6 +24,7 @@ #include "reax_types.h" + #define RESTART_HEADER "%8d%12d%8.3f%8.3f%8.3f%8.3f%8.3f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n" #define RESTART_HEADER_LINE_LEN 200 /* step, system->bigN, data->therm.T, data->therm.xi, @@ -39,16 +40,26 @@ #define READ_RESTART_HEADER " %d %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf" #define READ_RESTART_LINE " %d %d %s %lf %lf %lf %lf %lf %lf" + +#ifdef __cplusplus +extern "C" { +#endif + void Write_Binary_Restart( reax_system*, control_params*, - simulation_data*, output_controls*, mpi_datatypes* ); + simulation_data*, output_controls*, mpi_datatypes* ); void Write_Restart( reax_system*, control_params*, - simulation_data*, output_controls*, mpi_datatypes* ); + simulation_data*, output_controls*, mpi_datatypes* ); void Read_Binary_Restart( char*, reax_system*, control_params*, - simulation_data*, storage*, mpi_datatypes* ); + simulation_data*, storage*, mpi_datatypes* ); void Read_Restart( char*, reax_system*, control_params*, - simulation_data*, storage*, mpi_datatypes* ); + simulation_data*, storage*, mpi_datatypes* ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/system_props.c b/PG-PuReMD/src/system_props.c index e2852a4b..ea4465c5 100644 --- a/PG-PuReMD/src/system_props.c +++ b/PG-PuReMD/src/system_props.c @@ -21,10 +21,6 @@ #include "reax_types.h" -#ifdef HAVE_CUDA - #include "cuda_system_props.h" -#endif - #if defined(PURE_REAX) #include "system_props.h" #include "tool_box.h" @@ -35,6 +31,10 @@ #include "reax_vector.h" #endif +#ifdef HAVE_CUDA + #include "cuda/cuda_system_props.h" +#endif + void Temperature_Control( control_params *control, simulation_data *data ) { @@ -83,29 +83,6 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data, data->my_en.e_kin += 0.5 * rvec_Dot( p, system->my_atoms[i].v ); } - MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin, - 1, MPI_DOUBLE, MPI_SUM, comm ); - - data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B); - - // avoid T being an absolute zero, might cause F.P.E! 
- if ( FABS(data->therm.T) < ALMOST_ZERO ) - data->therm.T = ALMOST_ZERO; -} - - -#ifdef HAVE_CUDA -void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data, - MPI_Comm comm ) -{ - int i; - rvec p; - real m; - - data->my_en.e_kin = 0.0; - - dev_compute_kinetic_energy( system, data, &data->my_en.e_kin ); - MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -117,7 +94,6 @@ void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data, data->therm.T = ALMOST_ZERO; } } -#endif void Compute_System_Energy( reax_system *system, simulation_data *data, @@ -130,7 +106,7 @@ void Compute_System_Energy( reax_system *system, simulation_data *data, #ifdef HAVE_CUDA //Cuda Wrapper here - dev_sync_simulation_data ( data ); + dev_sync_simulation_data( data ); #endif my_en[0] = data->my_en.e_bond; @@ -205,23 +181,6 @@ void Compute_Total_Mass( reax_system *system, simulation_data *data, } -#ifdef HAVE_CUDA -void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data, - MPI_Comm comm ) -{ - int i; - real tmp; - - //compute local total mass of the system - dev_compute_total_mass( system, &tmp ); - - MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm ); - - data->inv_M = 1. / data->M; -} -#endif - - void Compute_Center_of_Mass( reax_system *system, simulation_data *data, mpi_datatypes *mpi_data, MPI_Comm comm ) { @@ -342,112 +301,6 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data, } -#ifdef HAVE_CUDA -void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, - mpi_datatypes *mpi_data, MPI_Comm comm ) -{ - int i; - real m, det; //xx, xy, xz, yy, yz, zz; - real tmp_mat[6], tot_mat[6]; - rvec my_xcm, my_vcm, my_amcm, my_avcm; - rvec tvec, diff; - rtensor mat, inv; - - rvec_MakeZero( my_xcm ); // position of CoM - rvec_MakeZero( my_vcm ); // velocity of CoM - rvec_MakeZero( my_amcm ); // angular momentum of CoM - rvec_MakeZero( my_avcm ); // angular velocity of CoM - - /* Compute the position, vel. and ang. 
momentum about the centre of mass */ - dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm ); - - MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm ); - MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm ); - MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm ); - - rvec_Scale( data->xcm, data->inv_M, data->xcm ); - rvec_Scale( data->vcm, data->inv_M, data->vcm ); - rvec_Cross( tvec, data->xcm, data->vcm ); - rvec_ScaledAdd( data->amcm, -data->M, tvec ); - data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); - - /* Calculate and then invert the inertial tensor */ - for ( i = 0; i < 6; ++i ) - { - tmp_mat[i] = 0; - } - - dev_compute_inertial_tensor( system, tmp_mat, my_xcm ); - - MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm ); - - if ( system->my_rank == MASTER_NODE ) - { - mat[0][0] = tot_mat[3] + tot_mat[5]; // yy + zz; - mat[0][1] = mat[1][0] = -tot_mat[1]; // -xy; - mat[0][2] = mat[2][0] = -tot_mat[2]; // -xz; - mat[1][1] = tot_mat[0] + tot_mat[5]; // xx + zz; - mat[2][1] = mat[1][2] = -tot_mat[4]; // -yz; - mat[2][2] = tot_mat[0] + tot_mat[3]; // xx + yy; - - /* invert the inertial tensor */ - det = ( mat[0][0] * mat[1][1] * mat[2][2] + - mat[0][1] * mat[1][2] * mat[2][0] + - mat[0][2] * mat[1][0] * mat[2][1] ) - - ( mat[0][0] * mat[1][2] * mat[2][1] + - mat[0][1] * mat[1][0] * mat[2][2] + - mat[0][2] * mat[1][1] * mat[2][0] ); - - inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; - inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; - inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; - inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; - inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; - inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; - inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; - inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; - inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; - - if ( det > ALMOST_ZERO ) - { - rtensor_Scale( inv, 1. 
/ det, inv ); - } - else - { - rtensor_MakeZero( inv ); - } - - /* Compute the angular velocity about the centre of mass */ - rtensor_MatVec( data->avcm, inv, data->amcm ); - } - - MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm ); - - /* Compute the rotational energy */ - data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); - -#if defined(DEBUG) - fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", - data->xcm[0], data->xcm[1], data->xcm[2] ); - fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", - data->amcm[0], data->amcm[1], data->amcm[2] ); - /* fprintf( stderr, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", - mat[0][0], mat[0][1], mat[0][2], - mat[1][0], mat[1][1], mat[1][2], - mat[2][0], mat[2][1], mat[2][2] ); - fprintf( stderr, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", - inv[0][0], inv[0][1], inv[0][2], - inv[1][0], inv[1][1], inv[1][2], - inv[2][0], inv[2][1], inv[2][2] ); */ - fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", - data->avcm[0], data->avcm[1], data->avcm[2] ); -#endif -} -#endif - - /* IMPORTANT: This function assumes that current kinetic energy * the system is already computed * diff --git a/PG-PuReMD/src/system_props.h b/PG-PuReMD/src/system_props.h index 5efff3c5..f04a9590 100644 --- a/PG-PuReMD/src/system_props.h +++ b/PG-PuReMD/src/system_props.h @@ -24,6 +24,11 @@ #include "reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Temperature_Control( control_params*, simulation_data* ); void Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm ); @@ -33,16 +38,16 @@ void Compute_System_Energy( reax_system*, simulation_data*, MPI_Comm ); void Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm ); void Compute_Center_of_Mass( reax_system*, simulation_data*, - mpi_datatypes*, MPI_Comm ); + mpi_datatypes*, MPI_Comm ); void Compute_Pressure( reax_system*, control_params*, - simulation_data*, mpi_datatypes* ); + simulation_data*, mpi_datatypes* ); + //void Compute_Pressure( reax_system*, simulation_data* ); -//CUDA Functions -void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm ); -void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm ); -void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, - mpi_datatypes*, MPI_Comm ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/tool_box.h b/PG-PuReMD/src/tool_box.h index ecaee197..a1f55910 100644 --- a/PG-PuReMD/src/tool_box.h +++ b/PG-PuReMD/src/tool_box.h @@ -29,7 +29,6 @@ extern "C" { #endif - /* from comm_tools.h */ int SumScan( int, int, int, MPI_Comm ); @@ -76,7 +75,6 @@ void *scalloc( size_t, size_t, const char* ); void sfree( void*, const char* ); - #ifdef __cplusplus } #endif @@ -227,4 +225,5 @@ static inline real DistSqr_to_Special_Point( rvec cp, rvec x ) } #endif + #endif diff --git a/PG-PuReMD/src/torsion_angles.c b/PG-PuReMD/src/torsion_angles.c index 58e71f4e..29cfb444 100644 --- a/PG-PuReMD/src/torsion_angles.c +++ b/PG-PuReMD/src/torsion_angles.c @@ -21,7 +21,6 @@ #include "reax_types.h" -#include "index_utils.h" #if defined(PURE_REAX) #include "torsion_angles.h" #include "bond_orders.h" @@ -36,6 +35,8 @@ #include "reax_vector.h" #endif +#include "index_utils.h" + #define MIN_SINE 1e-10 diff --git a/PG-PuReMD/src/torsion_angles.h b/PG-PuReMD/src/torsion_angles.h index d0762a4e..454f0679 100644 --- a/PG-PuReMD/src/torsion_angles.h +++ b/PG-PuReMD/src/torsion_angles.h @@ -24,7 +24,17 @@ #include 
"reax_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Torsion_Angles( reax_system*, control_params*, simulation_data*, - storage*, reax_list**, output_controls* ); + storage*, reax_list**, output_controls* ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c index d561a45f..b7ba1112 100644 --- a/PG-PuReMD/src/traj.c +++ b/PG-PuReMD/src/traj.c @@ -32,7 +32,7 @@ #endif #ifdef HAVE_CUDA - #include "cuda_copy.h" + #include "cuda/cuda_copy.h" #endif diff --git a/PG-PuReMD/src/traj.h b/PG-PuReMD/src/traj.h index 8f09c4a7..13435ecb 100644 --- a/PG-PuReMD/src/traj.h +++ b/PG-PuReMD/src/traj.h @@ -22,10 +22,8 @@ #ifndef __TRAJ_H__ #define __TRAJ_H__ - #include "reax_types.h" - #define MAX_TRJ_LINE_LEN 120 #define MAX_TRJ_BUFFER_SIZE (MAX_TRJ_LINE_LEN * 100) @@ -80,6 +78,10 @@ enum ANGLE_LINE_OPTS }; +#ifdef __cplusplus +extern "C" { +#endif + int Init_Traj( reax_system*, control_params*, output_controls*, mpi_datatypes*, char* ); int End_Traj( int, output_controls* ); @@ -87,5 +89,9 @@ int End_Traj( int, output_controls* ); int Append_Frame( reax_system*, control_params*, simulation_data*, reax_list**, output_controls*, mpi_datatypes* ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/valence_angles.h b/PG-PuReMD/src/valence_angles.h index c7a56eaa..1958b0cb 100644 --- a/PG-PuReMD/src/valence_angles.h +++ b/PG-PuReMD/src/valence_angles.h @@ -25,6 +25,10 @@ #include "reax_types.h" +#ifdef __cplusplus +extern "C" { +#endif + void Valence_Angles( reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls* ); @@ -32,5 +36,9 @@ void Calculate_Theta( rvec, real, rvec, real, real*, real* ); void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PG-PuReMD/src/vector.h b/PG-PuReMD/src/vector.h index adfe7da2..14250909 100644 --- a/PG-PuReMD/src/vector.h +++ b/PG-PuReMD/src/vector.h @@ -136,6 +136,7 @@ CUDA_HOST_DEVICE static inline void rvec_Copy( rvec dest, rvec src ) dest[2] = src[2]; } + CUDA_HOST_DEVICE static inline void rvec_Scale( rvec ret, real c, rvec v ) { ret[0] = c * v[0]; @@ -497,8 +498,8 @@ CUDA_HOST_DEVICE static inline void rtensor_Transpose( rtensor ret, rtensor t ) CUDA_HOST_DEVICE static inline real rtensor_Det( rtensor t ) { return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) + - t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) + - t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) ); + t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) + + t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) ); } -- GitLab