From a085b0203209a319158d2a85dcb673428a9d8174 Mon Sep 17 00:00:00 2001 From: KAO <ohearnk@seawolf.cis.gvsu.edu> Date: Fri, 25 Nov 2016 17:50:29 -0500 Subject: [PATCH] PuReMD-GPU: begin build system refactor. --- PuReMD-GPU/Makefile.am | 26 +- PuReMD-GPU/src/allocate.c | 277 ++++++++++++ PuReMD-GPU/src/allocate.h | 8 - PuReMD-GPU/src/bond_orders.cu | 20 +- PuReMD-GPU/src/{box.cu => box.c} | 0 PuReMD-GPU/src/box.h | 3 +- .../src/{allocate.cu => cuda_allocate.cu} | 377 ++++------------ PuReMD-GPU/src/cuda_allocate.h | 31 ++ PuReMD-GPU/src/cuda_helpers.h | 22 +- .../{system_props.cu => cuda_system_props.cu} | 402 ++---------------- PuReMD-GPU/src/cuda_system_props.h | 32 ++ PuReMD-GPU/src/forces.cu | 13 +- PuReMD-GPU/src/four_body_interactions.cu | 18 +- PuReMD-GPU/src/init_md.cu | 11 +- PuReMD-GPU/src/integrate.cu | 3 + PuReMD-GPU/src/mytypes.h | 5 + PuReMD-GPU/src/neighbors.cu | 2 +- PuReMD-GPU/src/random.h | 6 +- PuReMD-GPU/src/single_body_interactions.cu | 32 +- PuReMD-GPU/src/system_props.c | 348 +++++++++++++++ PuReMD-GPU/src/system_props.h | 7 - PuReMD-GPU/src/{testmd.cu => testmd.c} | 1 + PuReMD-GPU/src/three_body_interactions.cu | 30 +- PuReMD-GPU/src/{traj.cu => traj.c} | 9 +- PuReMD-GPU/src/traj.h | 8 +- PuReMD-GPU/src/two_body_interactions.cu | 6 +- PuReMD-GPU/src/{vector.cu => vector.c} | 0 PuReMD-GPU/src/vector.h | 48 +-- 28 files changed, 937 insertions(+), 808 deletions(-) create mode 100644 PuReMD-GPU/src/allocate.c rename PuReMD-GPU/src/{box.cu => box.c} (100%) rename PuReMD-GPU/src/{allocate.cu => cuda_allocate.cu} (66%) create mode 100644 PuReMD-GPU/src/cuda_allocate.h rename PuReMD-GPU/src/{system_props.cu => cuda_system_props.cu} (56%) create mode 100644 PuReMD-GPU/src/cuda_system_props.h create mode 100644 PuReMD-GPU/src/system_props.c rename PuReMD-GPU/src/{testmd.cu => testmd.c} (99%) rename PuReMD-GPU/src/{traj.cu => traj.c} (99%) rename PuReMD-GPU/src/{vector.cu => vector.c} (100%) diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am index 1a914816..f4da052f 100644 --- a/PuReMD-GPU/Makefile.am +++ b/PuReMD-GPU/Makefile.am @@ -18,29 +18,33 @@ NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict #NVCCFLAGS += --ptxas-options -v bin_PROGRAMS = bin/puremd-gpu -bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/restart.c src/param.c src/pdb_tools.c \ - src/GMRES.cu src/QEq.cu src/allocate.cu src/bond_orders.cu \ - src/box.cu src/forces.cu src/four_body_interactions.cu \ +bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \ + src/restart.c src/param.c src/pdb_tools.c src/box.c \ + src/GMRES.cu src/QEq.cu src/allocate.c src/bond_orders.cu \ + src/forces.cu src/four_body_interactions.cu \ src/grid.cu src/init_md.cu src/integrate.cu src/list.cu \ src/lookup.cu src/neighbors.cu \ src/reset_utils.cu src/single_body_interactions.cu \ - src/system_props.cu src/three_body_interactions.cu \ - src/traj.cu src/two_body_interactions.cu src/vector.cu \ - src/testmd.cu \ + src/system_props.c src/three_body_interactions.cu \ + src/traj.c src/two_body_interactions.cu src/vector.c \ src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/reduction.cu \ - src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu - + src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu \ + src/cuda_allocate.cu \ + src/cuda_system_props.cu \ + src/testmd.c include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \ - src/restart.h src/param.h src/pdb_tools.h \ + src/restart.h src/param.h src/pdb_tools.h src/box.h \ src/GMRES.h src/QEq.h src/allocate.h src/bond_orders.h \ - src/box.h src/forces.h src/four_body_interactions.h \ + src/forces.h src/four_body_interactions.h \ src/grid.h src/init_md.h src/integrate.h src/list.h \ src/lookup.h src/neighbors.h \ src/reset_utils.h src/single_body_interactions.h \ src/system_props.h src/three_body_interactions.h \ src/traj.h src/two_body_interactions.h src/vector.h \ src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/reduction.h \ - src/center_mass.h src/helpers.h src/validation.h src/matvec.h + src/center_mass.h src/helpers.h src/validation.h src/matvec.h \ + src/cuda_allocate.h \ + src/cuda_system_props.h # dummy source to cause C linking nodist_EXTRA_bin_puremd_gpu_SOURCES = src/dummy.c diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c new file mode 100644 index 00000000..b776a6a7 --- /dev/null +++ b/PuReMD-GPU/src/allocate.c @@ -0,0 +1,277 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "allocate.h" + +#include "list.h" + + +void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) +{ + Delete_List( far_nbrs, TYP_HOST ); + if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_HOST )){ + fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); + exit( INIT_ERR ); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", + num_intrs, far_nbrs->num_intrs ); + fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", + num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); +#endif +} + + +int Allocate_Matrix( sparse_matrix *H, int n, int m ) +{ + H->n = n; + H->m = m; + if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL ) + return 0; + + if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL ) + return 0; + + if( (H->entries = + (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL ) + return 0; + + return 1; +} + + +void Deallocate_Matrix( sparse_matrix *H ) +{ + free(H->start); + free(H->entries); + free(H->end); +} + + +int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) +{ + Deallocate_Matrix( H ); + if( !Allocate_Matrix( H, n, m ) ) { + fprintf(stderr, "not enough space for %s matrix. terminating!\n", name); + exit( 1 ); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", + name, n, m ); + fprintf( stderr, "memory allocated: %s = %ldMB\n", + name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); +#endif + return 1; +} + + +int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, + list *hbonds ) +{ + int i, num_hbonds; + + num_hbonds = 0; + /* find starting indexes for each H and the total number of hbonds */ + for( i = 1; i < n; ++i ) + hb_top[i] += hb_top[i-1]; + num_hbonds = hb_top[n-1]; + + if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds, TYP_HOST ) ) { + fprintf( stderr, "not enough space for hbonds list. terminating!\n" ); + exit( INIT_ERR ); + } + + for( i = 0; i < n; ++i ) + if( h_index[i] == 0 ){ + Set_Start_Index( 0, 0, hbonds ); + Set_End_Index( 0, 0, hbonds ); + } + else if( h_index[i] > 0 ){ + Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); + Set_End_Index( h_index[i], hb_top[i-1], hbonds ); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds ); + fprintf( stderr, "memory allocated: hbonds = %ldMB\n", + num_hbonds * sizeof(hbond_data) / (1024*1024) ); +#endif + return 1; +} + + +int Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) +{ + int i; + int *hb_top; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "reallocating hbonds\n" ); +#endif + hb_top = (int *)calloc( n, sizeof(int) ); + for( i = 0; i < n; ++i ) + if( h_index[i] >= 0 ) + hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS); + + Delete_List( hbonds, TYP_HOST ); + + Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); + + free( hb_top ); + + return 1; +} + + +int Allocate_Bond_List( int n, int *bond_top, list *bonds ) +{ + int i, num_bonds; + + num_bonds = 0; + /* find starting indexes for each atom and the total number of bonds */ + for( i = 1; i < n; ++i ) + bond_top[i] += bond_top[i-1]; + num_bonds = bond_top[n-1]; + + if( !Make_List(n, num_bonds, TYP_BOND, bonds, TYP_HOST ) ) { + fprintf( stderr, "not enough space for bonds list. terminating!\n" ); + exit( INIT_ERR ); + } + + Set_Start_Index( 0, 0, bonds ); + Set_End_Index( 0, 0, bonds ); + for( i = 1; i < n; ++i ) { + Set_Start_Index( i, bond_top[i-1], bonds ); + Set_End_Index( i, bond_top[i-1], bonds ); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "memory allocated: bonds = %ldMB\n", + num_bonds * sizeof(bond_data) / (1024*1024) ); +#endif + return 1; +} + + +int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body ) +{ + int i; + int *bond_top; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "reallocating bonds\n" ); +#endif + bond_top = (int *)calloc( n, sizeof(int) ); + *est_3body = 0; + for( i = 0; i < n; ++i ){ + *est_3body += SQR( Num_Entries( i, bonds ) ); + bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS ); + } + + Delete_List( bonds, TYP_HOST ); + + Allocate_Bond_List( n, bond_top, bonds ); + *num_bonds = bond_top[n-1]; + + free( bond_top ); + + return 1; +} + + +void Reallocate( reax_system *system, static_storage *workspace, list **lists, + int nbr_flag ) +{ + int num_bonds, est_3body; + reallocate_data *realloc; + grid *g; + + realloc = &(workspace->realloc); + g = &(system->g); + + if( realloc->num_far > 0 && nbr_flag ) { + fprintf (stderr, " Reallocating neighbors \n"); + Reallocate_Neighbor_List( (*lists)+FAR_NBRS, + system->N, realloc->num_far * SAFE_ZONE ); + realloc->num_far = -1; + } + + if( realloc->Htop > 0 ){ + fprintf (stderr, " Reallocating Matrix \n"); + Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); + realloc->Htop = -1; + + Deallocate_Matrix( &workspace->L ); + Deallocate_Matrix( &workspace->U ); + } + + if( realloc->hbonds > 0 ){ + fprintf (stderr, " Reallocating hbonds \n"); + Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, + (*lists)+HBONDS ); + realloc->hbonds = -1; + } + + num_bonds = est_3body = -1; + if( realloc->bonds > 0 ){ + fprintf (stderr, " Reallocating bonds \n"); + Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body ); + realloc->bonds = -1; + realloc->num_3body = MAX( realloc->num_3body, est_3body ); + } + + if( realloc->num_3body > 0 ) { + fprintf (stderr, " Reallocating 3Body \n"); + Delete_List( (*lists)+THREE_BODIES, TYP_HOST ); + + if( num_bonds == -1 ) + num_bonds = ((*lists)+BONDS)->num_intrs; + realloc->num_3body *= SAFE_ZONE; + + if( !Make_List( num_bonds, realloc->num_3body, + TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST ) ) { + fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); + exit( INIT_ERR ); + } + realloc->num_3body = -1; +#if defined(DEBUG_FOCUS) + fprintf( stderr, "reallocating 3 bodies\n" ); + fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body ); + fprintf( stderr, "reallocated 3body memory: %ldMB\n", + realloc->num_3body*sizeof(three_body_interaction_data)/ + (1024*1024) ); +#endif + } + + if( realloc->gcell_atoms > -1 ){ +#if defined(DEBUG_FOCUS) + fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); +#endif + + free (g->atoms); + g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], + sizeof (int) * workspace->realloc.gcell_atoms); + realloc->gcell_atoms = -1; + } +} diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h index 7ab146bb..4ec4b541 100644 --- a/PuReMD-GPU/src/allocate.h +++ b/PuReMD-GPU/src/allocate.h @@ -32,13 +32,5 @@ int Allocate_HBond_List( int, int, int*, int*, list* ); int Allocate_Bond_List( int, int*, list* ); -//Cuda Functions -int Cuda_Allocate_Matrix( sparse_matrix*, int, int ); -int Cuda_Allocate_HBond_List( int, int, int*, int*, list* ); -int Cuda_Allocate_Bond_List( int, int*, list* ); -void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int ); - -GLOBAL void Init_HBond_Indexes ( int *, int *, list , int ); -GLOBAL void Init_Bond_Indexes ( int *, list , int ); #endif diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.cu index d39f1b41..bb72417b 100644 --- a/PuReMD-GPU/src/bond_orders.cu +++ b/PuReMD-GPU/src/bond_orders.cu @@ -36,6 +36,7 @@ inline real Cf45( real p1, real p2 ) ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) ); } + #ifdef TEST_FORCES void Get_dBO( reax_system *system, list **lists, int i, int pj, real C, rvec *v ) @@ -66,7 +67,8 @@ void Get_dBOpinpi2( reax_system *system, list **lists, start_pj = Start_Index(pj, dBOs); end_pj = End_Index(pj, dBOs); - for( k = start_pj; k < end_pj; ++k ) { + for( k = start_pj; k < end_pj; ++k ) + { dbo_k = &(dBOs->select.dbo_list[k]); rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi ); rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 ); @@ -179,7 +181,6 @@ void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C ) } - HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, list p_bonds, list p_dBOs, int *top ) { @@ -367,7 +368,6 @@ HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, #endif - void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, simulation_data *data, static_storage *workspace, list **lists ) @@ -521,10 +521,10 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, temp[0], temp[1], temp[2] ); */ } + ///////////////////////////////////////////////////////////// //Cuda Functions ///////////////////////////////////////////////////////////// - HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, simulation_data *data, static_storage *workspace, list *bonds ) @@ -646,6 +646,7 @@ HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, rvec_Add( data->ext_press, ext_press ); } + ///////////////////////////////////////////////////////////// //Cuda Functions ///////////////////////////////////////////////////////////// @@ -761,6 +762,7 @@ void Add_dBond_to_Forces( int i, int pj, reax_system *system, /*3rd, dBOpi2*/ } + HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, static_storage *workspace, list *bonds ) { @@ -890,6 +892,7 @@ HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, } } + HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds) { int pk; @@ -910,6 +913,7 @@ HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list } } + /* Locate j on i's list. This function assumes that j is there for sure! And this is the case given our method of neighbor generation*/ @@ -1377,7 +1381,6 @@ GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *atoms, global_paramete /* A very important and crucial assumption here is that each segment belonging to a different atom in nbrhoods->nbr_list is sorted in its own. This can either be done in the general coordinator function or here */ - GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, two_body_parameters *tbp, static_storage workspace, list bonds, list dDeltas, list dBOs, int num_atom_types, int N ) @@ -1398,7 +1401,6 @@ GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *atoms, global_parameters g_ bond_order_data *bo_ij, *bo_ji; single_body_parameters *sbp_i, *sbp_j; - #if defined(TEST_FORCES) int k, pk, start_j, end_j; int top_dbo=0, top_dDelta=0; @@ -1433,7 +1435,6 @@ GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *atoms, global_parameters g_ // fprintf( stderr, "done with uncorrected bond orders\n" ); - /* Corrected Bond Order calculations */ //for( i = 0; i < system->N; ++i ) { type_i = atoms[i].type; @@ -1778,6 +1779,7 @@ workspace.dDelta_lp_temp[j] = workspace.Clp[j]; #endif } + GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage workspace, list bonds, int N ) { int i, j, pj; @@ -1812,6 +1814,7 @@ GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage workspace, list bonds, } } + GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, static_storage workspace, int N ) { @@ -1867,8 +1870,8 @@ GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *atoms, global_p } -//Import from the forces file. +//Import from the forces file. GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, static_storage workspace, list p_bonds, int ensemble, int N) { @@ -1889,6 +1892,7 @@ GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, } } + GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, static_storage workspace, list p_bonds, int ensemble, int N) { diff --git a/PuReMD-GPU/src/box.cu b/PuReMD-GPU/src/box.c similarity index 100% rename from PuReMD-GPU/src/box.cu rename to PuReMD-GPU/src/box.c diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h index ed8cc9f4..4d97e424 100644 --- a/PuReMD-GPU/src/box.h +++ b/PuReMD-GPU/src/box.h @@ -21,8 +21,10 @@ #ifndef __BOX_H__ #define __BOX_H__ + #include "mytypes.h" + /* Initializes box from CRYST1 line of PDB */ void Init_Box_From_CRYST(real, real, real, real, real, real, simulation_box*/*, int*/); @@ -98,7 +100,6 @@ void Print_Box_Information( simulation_box*, FILE* ); //CUDA Device Functions //HOST_DEVICE inline void Inc_on_T3( rvec, rvec, simulation_box* ); - HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box ) { int i; diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/cuda_allocate.cu similarity index 66% rename from PuReMD-GPU/src/allocate.cu rename to PuReMD-GPU/src/cuda_allocate.cu index bae0dde8..5f2b322b 100644 --- a/PuReMD-GPU/src/allocate.cu +++ b/PuReMD-GPU/src/cuda_allocate.cu @@ -18,27 +18,17 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "allocate.h" +#include "cuda_allocate.h" + #include "list.h" #include "cuda_utils.h" #include "reduction.h" -void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) -{ - Delete_List( far_nbrs, TYP_HOST ); - if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_HOST )){ - fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); - exit( INIT_ERR ); - } -#if defined(DEBUG_FOCUS) - fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", - num_intrs, far_nbrs->num_intrs ); - fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", - num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); -#endif -} +GLOBAL void Init_HBond_Indexes ( int *, int *, list , int ); +GLOBAL void Init_Bond_Indexes ( int *, list , int ); + void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) { @@ -57,23 +47,6 @@ void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) } -int Allocate_Matrix( sparse_matrix *H, int n, int m ) -{ - H->n = n; - H->m = m; - if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL ) - return 0; - - if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL ) - return 0; - - if( (H->entries = - (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL ) - return 0; - - return 1; -} - int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m ) { H->n = n; @@ -87,13 +60,6 @@ int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m ) } -void Deallocate_Matrix( sparse_matrix *H ) -{ - free(H->start); - free(H->entries); - free(H->end); -} - void Cuda_Deallocate_Matrix( sparse_matrix *H ) { cuda_free(H->start, RES_SPARSE_MATRIX_INDEX); @@ -106,23 +72,6 @@ void Cuda_Deallocate_Matrix( sparse_matrix *H ) } -int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) -{ - Deallocate_Matrix( H ); - if( !Allocate_Matrix( H, n, m ) ) { - fprintf(stderr, "not enough space for %s matrix. terminating!\n", name); - exit( 1 ); - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", - name, n, m ); - fprintf( stderr, "memory allocated: %s = %ldMB\n", - name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); -#endif - return 1; -} - int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) { Cuda_Deallocate_Matrix( H ); @@ -142,56 +91,6 @@ int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) } -int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, - list *hbonds ) -{ - int i, num_hbonds; - - num_hbonds = 0; - /* find starting indexes for each H and the total number of hbonds */ - for( i = 1; i < n; ++i ) - hb_top[i] += hb_top[i-1]; - num_hbonds = hb_top[n-1]; - - if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds, TYP_HOST ) ) { - fprintf( stderr, "not enough space for hbonds list. terminating!\n" ); - exit( INIT_ERR ); - } - - for( i = 0; i < n; ++i ) - if( h_index[i] == 0 ){ - Set_Start_Index( 0, 0, hbonds ); - Set_End_Index( 0, 0, hbonds ); - } - else if( h_index[i] > 0 ){ - Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); - Set_End_Index( h_index[i], hb_top[i-1], hbonds ); - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds ); - fprintf( stderr, "memory allocated: hbonds = %ldMB\n", - num_hbonds * sizeof(hbond_data) / (1024*1024) ); -#endif - return 1; -} - -GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N ) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= N) return; - - if( h_index[index] == 0 ){ - Set_Start_Index( 0, 0, &hbonds ); - Set_End_Index( 0, 0, &hbonds ); - } - else if( h_index[index] > 0 ){ - Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); - Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); - } -} - int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list *hbonds ) { int i, num_hbonds; @@ -225,27 +124,6 @@ int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list return 1; } -int Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) -{ - int i; - int *hb_top; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating hbonds\n" ); -#endif - hb_top = (int *)calloc( n, sizeof(int) ); - for( i = 0; i < n; ++i ) - if( h_index[i] >= 0 ) - hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS); - - Delete_List( hbonds, TYP_HOST ); - - Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); - - free( hb_top ); - - return 1; -} int Cuda_Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) { @@ -281,21 +159,6 @@ int Cuda_Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) return 1; } -GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N ) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= N) return; - - if( index == 0 ){ - Set_Start_Index( 0, 0, &bonds ); - Set_End_Index( 0, 0, &bonds ); - } - else if( index > 0 ){ - Set_Start_Index( index, b_top[index-1], &bonds ); - Set_End_Index( index, b_top[index-1], &bonds ); - } -} int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds ) { @@ -326,93 +189,6 @@ int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds ) } -int Allocate_Bond_List( int n, int *bond_top, list *bonds ) -{ - int i, num_bonds; - - num_bonds = 0; - /* find starting indexes for each atom and the total number of bonds */ - for( i = 1; i < n; ++i ) - bond_top[i] += bond_top[i-1]; - num_bonds = bond_top[n-1]; - - if( !Make_List(n, num_bonds, TYP_BOND, bonds, TYP_HOST ) ) { - fprintf( stderr, "not enough space for bonds list. terminating!\n" ); - exit( INIT_ERR ); - } - - Set_Start_Index( 0, 0, bonds ); - Set_End_Index( 0, 0, bonds ); - for( i = 1; i < n; ++i ) { - Set_Start_Index( i, bond_top[i-1], bonds ); - Set_End_Index( i, bond_top[i-1], bonds ); - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "memory allocated: bonds = %ldMB\n", - num_bonds * sizeof(bond_data) / (1024*1024) ); -#endif - return 1; -} - - -int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body ) -{ - int i; - int *bond_top; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating bonds\n" ); -#endif - bond_top = (int *)calloc( n, sizeof(int) ); - *est_3body = 0; - for( i = 0; i < n; ++i ){ - *est_3body += SQR( Num_Entries( i, bonds ) ); - bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS ); - } - - Delete_List( bonds, TYP_HOST ); - - Allocate_Bond_List( n, bond_top, bonds ); - *num_bonds = bond_top[n-1]; - - free( bond_top ); - - return 1; -} - -void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n) -{ - extern __shared__ int sh_input[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = SQR (Num_Entries( i, &bonds ) ); - bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS ); - } - sh_input[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sh_input[0]; - } -} - - int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body ) { int i; @@ -450,6 +226,7 @@ int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body ) return i; } + int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count ) { int i; @@ -542,83 +319,6 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH ); */ -void Reallocate( reax_system *system, static_storage *workspace, list **lists, - int nbr_flag ) -{ - int num_bonds, est_3body; - reallocate_data *realloc; - grid *g; - - realloc = &(workspace->realloc); - g = &(system->g); - - if( realloc->num_far > 0 && nbr_flag ) { - fprintf (stderr, " Reallocating neighbors \n"); - Reallocate_Neighbor_List( (*lists)+FAR_NBRS, - system->N, realloc->num_far * SAFE_ZONE ); - realloc->num_far = -1; - } - - if( realloc->Htop > 0 ){ - fprintf (stderr, " Reallocating Matrix \n"); - Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); - realloc->Htop = -1; - - Deallocate_Matrix( &workspace->L ); - Deallocate_Matrix( &workspace->U ); - } - - if( realloc->hbonds > 0 ){ - fprintf (stderr, " Reallocating hbonds \n"); - Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, - (*lists)+HBONDS ); - realloc->hbonds = -1; - } - - num_bonds = est_3body = -1; - if( realloc->bonds > 0 ){ - fprintf (stderr, " Reallocating bonds \n"); - Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body ); - realloc->bonds = -1; - realloc->num_3body = MAX( realloc->num_3body, est_3body ); - } - - if( realloc->num_3body > 0 ) { - fprintf (stderr, " Reallocating 3Body \n"); - Delete_List( (*lists)+THREE_BODIES, TYP_HOST ); - - if( num_bonds == -1 ) - num_bonds = ((*lists)+BONDS)->num_intrs; - realloc->num_3body *= SAFE_ZONE; - - if( !Make_List( num_bonds, realloc->num_3body, - TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST ) ) { - fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); - exit( INIT_ERR ); - } - realloc->num_3body = -1; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating 3 bodies\n" ); - fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body ); - fprintf( stderr, "reallocated 3body memory: %ldMB\n", - realloc->num_3body*sizeof(three_body_interaction_data)/ - (1024*1024) ); -#endif - } - - if( realloc->gcell_atoms > -1 ){ -#if defined(DEBUG_FOCUS) - fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); -#endif - - free (g->atoms); - g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], - sizeof (int) * workspace->realloc.gcell_atoms); - realloc->gcell_atoms = -1; - } -} - void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *lists, int nbr_flag, int step ) { @@ -724,3 +424,68 @@ void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *list realloc->gcell_atoms = -1; } } + + +GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N ) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= N) return; + + if( h_index[index] == 0 ){ + Set_Start_Index( 0, 0, &hbonds ); + Set_End_Index( 0, 0, &hbonds ); + } + else if( h_index[index] > 0 ){ + Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); + Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); + } +} + + +GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N ) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= N) return; + + if( index == 0 ){ + Set_Start_Index( 0, 0, &bonds ); + Set_End_Index( 0, 0, &bonds ); + } + else if( index > 0 ){ + Set_Start_Index( index, b_top[index-1], &bonds ); + Set_End_Index( index, b_top[index-1], &bonds ); + } +} + + +void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n) +{ + extern __shared__ int sh_input[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = SQR (Num_Entries( i, &bonds ) ); + bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS ); + } + sh_input[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sh_input[0]; + } +} diff --git a/PuReMD-GPU/src/cuda_allocate.h b/PuReMD-GPU/src/cuda_allocate.h new file mode 100644 index 00000000..1de435a0 --- /dev/null +++ b/PuReMD-GPU/src/cuda_allocate.h @@ -0,0 +1,31 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_ALLOCATE_H_ +#define __CUDA_ALLOCATE_H_ + +#include "mytypes.h" + +int Cuda_Allocate_Matrix( sparse_matrix*, int, int ); +int Cuda_Allocate_HBond_List( int, int, int*, int*, list* ); +int Cuda_Allocate_Bond_List( int, int*, list* ); +void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int ); + +#endif diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h index adbf73ce..292d5613 100644 --- a/PuReMD-GPU/src/cuda_helpers.h +++ b/PuReMD-GPU/src/cuda_helpers.h @@ -25,7 +25,7 @@ #include "mytypes.h" -DEVICE static inline int cuda_strcmp(char *a, char *b, int len) +DEVICE inline int cuda_strcmp(char *a, char *b, int len) { char *src, *dst; @@ -52,8 +52,7 @@ DEVICE static inline int cuda_strcmp(char *a, char *b, int len) } -#if __CUDA_ARCH__ < 600 -DEVICE static inline real atomicAdd(double* address, double val) +DEVICE inline real myAtomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -67,22 +66,21 @@ DEVICE static inline real atomicAdd(double* address, double val) while (assumed != old); return __longlong_as_double(old); } -#endif -DEVICE static inline void atomic_rvecAdd( rvec ret, rvec v ) +DEVICE inline void atomic_rvecAdd( rvec ret, rvec v ) { - atomicAdd( (double*)&ret[0], (double)v[0] ); - atomicAdd( (double*)&ret[1], (double)v[1] ); - atomicAdd( (double*)&ret[2], (double)v[2] ); + MYATOMICADD( (double*)&ret[0], (double)v[0] ); + MYATOMICADD( (double*)&ret[1], (double)v[1] ); + MYATOMICADD( (double*)&ret[2], (double)v[2] ); } -DEVICE static inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v ) +DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v ) { - atomicAdd( (double*)&ret[0], (double)(c * v[0]) ); - atomicAdd( (double*)&ret[1], (double)(c * v[1]) ); - atomicAdd( (double*)&ret[2], (double)(c * v[2]) ); + MYATOMICADD( (double*)&ret[0], (double)(c * v[0]) ); + MYATOMICADD( (double*)&ret[1], (double)(c * v[1]) ); + MYATOMICADD( (double*)&ret[2], (double)(c * v[2]) ); } diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/cuda_system_props.cu similarity index 56% rename from PuReMD-GPU/src/system_props.cu rename to PuReMD-GPU/src/cuda_system_props.cu index e0bb2626..dfbc5892 100644 --- a/PuReMD-GPU/src/system_props.cu +++ b/PuReMD-GPU/src/cuda_system_props.cu @@ -18,57 +18,21 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "system_props.h" +#include "cuda_system_props.h" + #include "box.h" #include "vector.h" -#include "cuda_utils.h" +#include "center_mass.h" #include "cuda_copy.h" +#include "cuda_utils.h" #include "reduction.h" -#include "center_mass.h" -#include "validation.h" - - -real Get_Time( ) -{ - struct timeval tim; - - gettimeofday(&tim, NULL ); - return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); -} - - -real Get_Timing_Info( real t_start ) -{ - struct timeval tim; - real t_end; - - gettimeofday(&tim, NULL ); - t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); - return (t_end - t_start); -} -void Temperature_Control( control_params *control, simulation_data *data, - output_controls *out_control ) -{ - real tmp; - - if( control->T_mode == 1 ) { // step-wise temperature control - if( (data->step - data->prev_steps) % - ((int)(control->T_freq / control->dt)) == 0 ) { - if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) ) - control->T += control->T_rate; - else control->T = control->T_final; - } - } - else if( control->T_mode == 2 ) { // constant slope control - tmp = control->T_rate * control->dt / control->T_freq; +GLOBAL void k_Compute_Total_Mass(single_body_parameters *, reax_atom *, real *, size_t ); +GLOBAL void k_Compute_Kinetic_Energy(single_body_parameters *, reax_atom *, unsigned int , real *); +GLOBAL void k_Kinetic_Energy_Reduction(simulation_data *, real *, int); - if( fabs( control->T - control->T_final ) >= fabs( tmp ) ) - control->T += tmp; - } -} void prep_dev_system (reax_system *system) { @@ -77,21 +41,6 @@ void prep_dev_system (reax_system *system) } -void Compute_Total_Mass( reax_system *system, simulation_data *data ) -{ - int i; - int blocks; - int block_size; - real *partial_sums = 0; - - data->M = 0; - - for( i = 0; i < system->N; i++ ) - data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass; - - data->inv_M = 1. / data->M; -} - void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data ) { real *partial_sums = (real *) scratch; @@ -133,158 +82,6 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data ) } -GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) -{ - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - x = sbp [ atoms[ i ].type ].mass; - - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } -} - - -void Compute_Center_of_Mass( reax_system *system, simulation_data *data, - FILE *fout ) -{ - int i; - real m, xx, xy, xz, yy, yz, zz, det; - rvec tvec, diff; - rtensor mat, inv; - - int blocks; - int block_size; - rvec *l_xcm, *l_vcm, *l_amcm; - real t_start, t_end; - - rvec_MakeZero( data->xcm ); // position of CoM - rvec_MakeZero( data->vcm ); // velocity of CoM - rvec_MakeZero( data->amcm ); // angular momentum of CoM - rvec_MakeZero( data->avcm ); // angular velocity of CoM - - /* Compute the position, velocity and angular momentum about the CoM */ - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - - rvec_ScaledAdd( data->xcm, m, system->atoms[i].x ); - rvec_ScaledAdd( data->vcm, m, system->atoms[i].v ); - - rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v ); - rvec_ScaledAdd( data->amcm, m, tvec ); - - /*fprintf( fout,"%3d %g %g %g\n", - i+1, - system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); - fprintf( fout, "vcm: %g %g %g\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - */ - } - - rvec_Scale( data->xcm, data->inv_M, data->xcm ); - rvec_Scale( data->vcm, data->inv_M, data->vcm ); - - rvec_Cross( tvec, data->xcm, data->vcm ); - rvec_ScaledAdd( data->amcm, -data->M, tvec ); - - data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); - - /* Calculate and then invert the inertial tensor */ - xx = xy = xz = yy = yz = zz = 0; - - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - - rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); - xx += diff[0] * diff[0] * m; - xy += diff[0] * diff[1] * m; - xz += diff[0] * diff[2] * m; - yy += diff[1] * diff[1] * m; - yz += diff[1] * diff[2] * m; - zz += diff[2] * diff[2] * m; - } - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " xx: %f \n", xx); - fprintf (stderr, " xy: %f \n", xy); - fprintf (stderr, " xz: %f \n", xz); - fprintf (stderr, " yy: %f \n", yy); - fprintf (stderr, " yz: %f \n", yz); - fprintf (stderr, " zz: %f \n", zz); -#endif - - mat[0][0] = yy + zz; - mat[0][1] = mat[1][0] = -xy; - mat[0][2] = mat[2][0] = -xz; - mat[1][1] = xx + zz; - mat[2][1] = mat[1][2] = -yz; - mat[2][2] = xx + yy; - - /* invert the inertial tensor */ - det = ( mat[0][0] * mat[1][1] * mat[2][2] + - mat[0][1] * mat[1][2] * mat[2][0] + - mat[0][2] * mat[1][0] * mat[2][1] ) - - ( mat[0][0] * mat[1][2] * mat[2][1] + - mat[0][1] * mat[1][0] * mat[2][2] + - mat[0][2] * mat[1][1] * mat[2][0] ); - - inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; - inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; - inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; - inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; - inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; - inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; - inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; - inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; - inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; - - if( fabs(det) > ALMOST_ZERO ) - rtensor_Scale( inv, 1./det, inv ); - else - rtensor_MakeZero( inv ); - - /* Compute the angular velocity about the centre of mass */ - rtensor_MatVec( data->avcm, inv, data->amcm ); - data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); - -#if defined(DEBUG) - fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", - data->xcm[0], data->xcm[1], data->xcm[2] ); - fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", - data->amcm[0], data->amcm[1], data->amcm[2] ); - /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", - mat[0][0], mat[0][1], mat[0][2], - mat[1][0], mat[1][1], mat[1][2], - mat[2][0], mat[2][1], mat[2][2] ); - fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", - inv[0][0], inv[0][1], inv[0][2], - inv[1][0], inv[1][1], inv[1][2], - inv[2][0], inv[2][1], inv[2][2] ); - fflush( fout ); */ - fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", - data->avcm[0], data->avcm[1], data->avcm[2] ); -#endif -} - - void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, FILE *fout ) { @@ -489,34 +286,51 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, } - -void Compute_Kinetic_Energy( reax_system* system, simulation_data* data ) +void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data) { - int i; - rvec p; - real m; + real *results = (real *) scratch; + cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); + k_Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results); + cudaThreadSynchronize (); + cudaCheckError (); - data->E_Kin = 0.0; + k_Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); +} - for (i=0; i < system->N; i++) { - m = system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_Scale( p, m, system->atoms[i].v ); - data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v ); +GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) +{ + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; - /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n", - i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], - system->reaxprm.sbp[system->atoms[i].type].mass); */ - } + if(i < n) + x = sbp [ atoms[ i ].type ].mass; - data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + sdata[threadIdx.x] = x; + __syncthreads(); - if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */ - data->therm.T = ALMOST_ZERO; + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } -GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, +GLOBAL void k_Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, unsigned int N, real *output) { extern __shared__ real sh_ekin[]; @@ -547,8 +361,8 @@ GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atom } } -GLOBAL void Kinetic_Energy_Reduction (simulation_data *data, - real *input, int n) + +GLOBAL void k_Kinetic_Energy_Reduction (simulation_data *data, real *input, int n) { extern __shared__ real sdata[]; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -582,20 +396,6 @@ GLOBAL void Kinetic_Energy_Reduction (simulation_data *data, } } -void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data) -{ - real *results = (real *) scratch; - cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); - Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results); - cudaThreadSynchronize (); - cudaCheckError (); - - Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); -} /* GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, @@ -660,117 +460,3 @@ data->therm.T = ALMOST_ZERO; */ -/* IMPORTANT: This function assumes that current kinetic energy and - * the center of mass of the system is already computed before. - * - * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs - * to be added when there are long-range interactions or long-range - * corrections to short-range interactions present. - * We may want to add that for more accuracy. - */ -void Compute_Pressure_Isotropic( reax_system* system, control_params *control, - simulation_data* data, - output_controls *out_control ) -{ - int i; - reax_atom *p_atom; - rvec tx; - rvec tmp; - simulation_box *box = &(system->box); - - /* Calculate internal pressure */ - rvec_MakeZero( data->int_press ); - - // 0: both int and ext, 1: ext only, 2: int only - if( control->press_mode == 0 || control->press_mode == 2 ) { - for( i = 0; i < system->N; ++i ) { - p_atom = &( system->atoms[i] ); - - /* transform x into unitbox coordinates */ - Transform_to_UnitBox( p_atom->x, box, 1, tx ); - - /* this atom's contribution to internal pressure */ - rvec_Multiply( tmp, p_atom->f, tx ); - rvec_Add( data->int_press, tmp ); - - if( out_control->debug_level > 0 ) { - fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", - i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] ); - fprintf( out_control->prs, "%8.2f%8.2f%8.2f", - p_atom->f[0], p_atom->f[1], p_atom->f[2] ); - fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", - data->int_press[0],data->int_press[1],data->int_press[2]); - } - } - } - - /* kinetic contribution */ - data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV ); - - /* Calculate total pressure in each direction */ - data->tot_press[0] = data->kin_press - - ((data->int_press[0] + data->ext_press[0]) / - (box->box_norms[1] * box->box_norms[2] * P_CONV)); - - data->tot_press[1] = data->kin_press - - ((data->int_press[1] + data->ext_press[1])/ - (box->box_norms[0] * box->box_norms[2] * P_CONV)); - - data->tot_press[2] = data->kin_press - - ((data->int_press[2] + data->ext_press[2])/ - (box->box_norms[0] * box->box_norms[1] * P_CONV)); - - /* Average pressure for the whole box */ - data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3; -} - - -void Compute_Pressure_Isotropic_Klein( reax_system* system, - simulation_data* data ) -{ - int i; - reax_atom *p_atom; - rvec dx; - - // IMPORTANT: This function assumes that current kinetic energy and - // the center of mass of the system is already computed before. - data->iso_bar.P = 2.0 * data->E_Kin; - - for( i = 0; i < system->N; ++i ) - { - p_atom = &( system->atoms[i] ); - rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm); - data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) ); - } - - data->iso_bar.P /= (3.0 * system->box.volume); - - // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs - // to be added when there are long-range interactions or long-range - // corrections to short-range interactions present. - // We may want to add that for more accuracy. -} - - -void Compute_Pressure( reax_system* system, simulation_data* data, - static_storage *workspace ) -{ - int i; - reax_atom *p_atom; - rtensor temp; - - rtensor_MakeZero( data->flex_bar.P ); - - for( i = 0; i < system->N; ++i ) { - p_atom = &( system->atoms[i] ); - // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx ); - rvec_OuterProduct( temp, p_atom->v, p_atom->v ); - rtensor_ScaledAdd( data->flex_bar.P, - system->reaxprm.sbp[ p_atom->type ].mass, temp ); - // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); - rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); - } - - rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P ); - data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0; -} diff --git a/PuReMD-GPU/src/cuda_system_props.h b/PuReMD-GPU/src/cuda_system_props.h new file mode 100644 index 00000000..28aea013 --- /dev/null +++ b/PuReMD-GPU/src/cuda_system_props.h @@ -0,0 +1,32 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_SYSTEM_PROP_H_ +#define __CUDA_SYSTEM_PROP_H_ + +#include "mytypes.h" + +void prep_dev_system (reax_system *system); + +void Cuda_Compute_Total_Mass( reax_system*, simulation_data* ); +void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); +void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* ); + +#endif diff --git a/PuReMD-GPU/src/forces.cu b/PuReMD-GPU/src/forces.cu index e8e1e291..a2ad8ea7 100644 --- a/PuReMD-GPU/src/forces.cu +++ b/PuReMD-GPU/src/forces.cu @@ -36,7 +36,6 @@ #include "cuda_init.h" #include "reduction.h" //#include "matrix.h" - #include "validation.h" #include "cudaProfiler.h" @@ -2634,7 +2633,7 @@ GLOBAL void Estimate_Storage_Sizes (reax_atom *atoms, if( nbr_pj->d <= control->r_cut ) { //++(*Htop); - atomicAdd (Htop, 1); + atomicAdd(Htop, 1); /* hydrogen bond lists */ //TODO - CHANGE ORIGINAL @@ -2643,11 +2642,11 @@ GLOBAL void Estimate_Storage_Sizes (reax_atom *atoms, jhb = sbp_j->p_hbond; if( ihb == 1 && jhb == 2 ) //++hb_top[i]; - atomicAdd (&hb_top[i], 1); + atomicAdd(&hb_top[i], 1); else if( ihb == 2 && jhb == 1 ) //++hb_top[j]; - //atomicAdd (&hb_top[j], 1); - atomicAdd (&hb_top[i], 1); + //atomicAdd(&hb_top[j], 1); + atomicAdd(&hb_top[i], 1); } //TODO -- CHANGE ORIGINAL @@ -2685,8 +2684,8 @@ GLOBAL void Estimate_Storage_Sizes (reax_atom *atoms, if( BO >= control->bo_cut ) { //++bond_top[i]; //++bond_top[j]; - atomicAdd (&bond_top[i], 1); - atomicAdd (&bond_top[j], 1); + atomicAdd(&bond_top[i], 1); + atomicAdd(&bond_top[j], 1); } } } diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/four_body_interactions.cu index af134bab..35a907f8 100644 --- a/PuReMD-GPU/src/four_body_interactions.cu +++ b/PuReMD-GPU/src/four_body_interactions.cu @@ -889,7 +889,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, //PERFORMANCE IMPACT e_tor = fn10 * sin_ijk * sin_jkl * CV; - //atomicAdd (&data->E_Tor ,e_tor ); + //MYATOMICADD(&data->E_Tor ,e_tor ); E_Tor [j] += e_tor; //sh_tor [threadIdx.x] += e_tor; @@ -933,7 +933,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; //PERFORMANCE IMPACT e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); - //atomicAdd (&data->E_Con ,e_con ); + //MYATOMICADD(&data->E_Con ,e_con ); E_Con [j] += e_con ; //sh_con [threadIdx.x] += e_con; @@ -971,12 +971,12 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, /* forces */ //PERFORMANCE IMPACT /* - atomicAdd ( &bo_jk->Cdbopi, CEtors2 ); - atomicAdd ( &workspace->CdDelta[j], CEtors3 ); - atomicAdd ( &workspace->CdDelta[k], CEtors3 ); - atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) ); - atomicAdd ( &bo_jk->Cdbo, (CEtors5 + CEconj2) ); - atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) ); + MYATOMICADD( &bo_jk->Cdbopi, CEtors2 ); + MYATOMICADD( &workspace->CdDelta[j], CEtors3 ); + MYATOMICADD( &workspace->CdDelta[k], CEtors3 ); + MYATOMICADD( &bo_ij->Cdbo, (CEtors4 + CEconj1) ); + MYATOMICADD( &bo_jk->Cdbo, (CEtors5 + CEconj2) ); + MYATOMICADD( &bo_kl->Cdbo, (CEtors6 + CEconj3) ); */ //PERFORMANCE IMPACT @@ -987,7 +987,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, bo_jk->Cdbo += CEtors5 + CEconj2; //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE - atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 ); + MYATOMICADD(&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 ); //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) { diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu index 87b84a76..92691bbb 100644 --- a/PuReMD-GPU/src/init_md.cu +++ b/PuReMD-GPU/src/init_md.cu @@ -34,17 +34,20 @@ #include "traj.h" #include "vector.h" - #include "cuda_init.h" #include "cuda_copy.h" #include "cuda_utils.h" #include "helpers.h" #include "reduction.h" -#include "index_utils.h" +#include "cuda_allocate.h" +#include "cuda_system_props.h" + +#include "index_utils.h" #include "validation.h" + void Generate_Initial_Velocities(reax_system *system, real T ) { int i; @@ -967,7 +970,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data * fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs); #endif - if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS), TYP_HOST ) { + if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS, TYP_HOST ) ) { fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); exit( INIT_ERR ); } @@ -1036,7 +1039,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data * #endif /* 3bodies list */ - if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES), TYP_HOST) { + if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST )) { fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); exit( INIT_ERR ); } diff --git a/PuReMD-GPU/src/integrate.cu b/PuReMD-GPU/src/integrate.cu index d0790286..c54603b5 100644 --- a/PuReMD-GPU/src/integrate.cu +++ b/PuReMD-GPU/src/integrate.cu @@ -36,6 +36,9 @@ #include "reduction.h" #include "validation.h" +#include "cuda_allocate.h" +#include "cuda_system_props.h" + void Velocity_Verlet_NVE(reax_system* system, control_params* control, simulation_data *data, static_storage *workspace, diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h index 91b9438d..eacde6b9 100644 --- a/PuReMD-GPU/src/mytypes.h +++ b/PuReMD-GPU/src/mytypes.h @@ -31,6 +31,11 @@ #include <cuda.h> #include <cublas_v2.h> #include <cusparse_v2.h> + #if __CUDA_ARCH__ < 600 + #define MYATOMICADD myAtomicAdd + #else + #define MYATOMICADD atomicAdd + #endif #endif #else #ifndef __MYTYPES_H_ diff --git a/PuReMD-GPU/src/neighbors.cu b/PuReMD-GPU/src/neighbors.cu index 90779538..0f902de6 100644 --- a/PuReMD-GPU/src/neighbors.cu +++ b/PuReMD-GPU/src/neighbors.cu @@ -782,7 +782,7 @@ int Estimate_NumNeighbors( reax_system *system, control_params *control, } //__syncthreads (); - //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1); + //MYATOMICADD( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1); //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ; if (nbrgen) diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h index f7edb397..f5685b8b 100644 --- a/PuReMD-GPU/src/random.h +++ b/PuReMD-GPU/src/random.h @@ -23,10 +23,6 @@ #include "mytypes.h" -HOST_DEVICE inline double Random(double); -HOST_DEVICE inline void Randomize(); -HOST_DEVICE inline double GRandom(double , double ); - /* System random number generator used linear congruance method with large periodicity for generation of pseudo random number. function @@ -37,6 +33,7 @@ HOST_DEVICE inline double Random(double range) return (random() * range) / 2147483647L; } + /* This function seeds the system pseudo random number generator with current time. Use this function once in the begining to initialize the system */ @@ -45,6 +42,7 @@ HOST_DEVICE inline void Randomize() srandom(time(NULL)); } + /* GRandom return random number with gaussian distribution with mean and standard deviation "sigma" */ HOST_DEVICE inline double GRandom(double mean, double sigma) diff --git a/PuReMD-GPU/src/single_body_interactions.cu b/PuReMD-GPU/src/single_body_interactions.cu index 3c6c0882..9b8438a6 100644 --- a/PuReMD-GPU/src/single_body_interactions.cu +++ b/PuReMD-GPU/src/single_body_interactions.cu @@ -374,7 +374,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; //PERFORMANCE IMPACT - atomicAdd (&data->E_Lp, e_lp); + MYATOMICADD(&data->E_Lp, e_lp); dElp = p_lp2 * inv_expvd2 + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); @@ -382,7 +382,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT //workspace->CdDelta[i] += CElp; // lp - 1st term - atomicAdd (&workspace->CdDelta[i], CElp); + MYATOMICADD(&workspace->CdDelta[i], CElp); #ifdef TEST_ENERGY @@ -416,7 +416,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT e_lph = p_lp3 * SQR(vov3-3.0); - atomicAdd (&data->E_Lp, e_lph ); + MYATOMICADD(&data->E_Lp, e_lph ); //estrain(i) += e_lph; deahu2dbo = 2.*p_lp3*(vov3 - 3.); @@ -426,7 +426,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[i], deahu2dsbo); + MYATOMICADD(&workspace->CdDelta[i], deahu2dsbo); #ifdef TEST_ENERGY //TODO //fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", @@ -500,7 +500,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT //data->E_Ov += e_ov = sum_ovun1 * CEover1; e_ov = sum_ovun1 * CEover1; - atomicAdd (&data->E_Ov, e_ov ); + MYATOMICADD(&data->E_Ov, e_ov ); CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); @@ -523,7 +523,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - atomicAdd (&data->E_Un, e_un ); + MYATOMICADD(&data->E_Un, e_un ); CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + p_ovun2 * e_un * exp_ovun2n); @@ -537,8 +537,8 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob // forces //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[i] , CEover3); // OvCoor - 2nd term - atomicAdd (&workspace->CdDelta[i], CEunder3); // UnCoor - 1st term + MYATOMICADD(&workspace->CdDelta[i] , CEover3); // OvCoor - 2nd term + MYATOMICADD(&workspace->CdDelta[i], CEunder3); // UnCoor - 1st term #ifdef TEST_FORCES //TODO @@ -559,7 +559,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a + MYATOMICADD(&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a bo_ij->Cdbopi += CEover4 * (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b @@ -568,7 +568,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ); // UnCoor - 2a + MYATOMICADD(&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ); // UnCoor - 2a bo_ij->Cdbopi += CEunder4 * (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b @@ -705,7 +705,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob // calculate the energy e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - //atomicAdd (&data->E_Lp, e_lp ); + //MYATOMICADD(&data->E_Lp, e_lp ); E_Lp [ i ] = e_lp; dElp = p_lp2 * inv_expvd2 + @@ -732,7 +732,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_lph = p_lp3 * SQR(vov3-3.0); E_Lp [i] += e_lph; - //atomicAdd (&data->E_Lp, e_lph ); + //MYATOMICADD(&data->E_Lp, e_lph ); //estrain(i) += e_lph; deahu2dbo = 2.*p_lp3*(vov3 - 3.); @@ -790,7 +790,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_ov = sum_ovun1 * CEover1; E_Ov [ i ] = e_ov; - //atomicAdd ( &data->E_Ov, e_ov ); + //MYATOMICADD( &data->E_Ov, e_ov ); CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); @@ -813,7 +813,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; E_Un [i] = e_un; - //atomicAdd ( &data->E_Un, e_un ); + //MYATOMICADD( &data->E_Un, e_un ); CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + p_ovun2 * e_un * exp_ovun2n); @@ -903,7 +903,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g // calculate the energy e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - //atomicAdd (&data->E_Lp, e_lp ); + //MYATOMICADD(&data->E_Lp, e_lp ); E_Lp [ i ] = e_lp; dElp = p_lp2 * inv_expvd2 + @@ -930,7 +930,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g e_lph = p_lp3 * SQR(vov3-3.0); E_Lp [i] += e_lph; - //atomicAdd (&data->E_Lp, e_lph ); + //MYATOMICADD(&data->E_Lp, e_lph ); //estrain(i) += e_lph; deahu2dbo = 2.*p_lp3*(vov3 - 3.); diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c new file mode 100644 index 00000000..491b0d93 --- /dev/null +++ b/PuReMD-GPU/src/system_props.c @@ -0,0 +1,348 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "system_props.h" + +#include "box.h" +#include "vector.h" + + +real Get_Time( ) +{ + struct timeval tim; + + gettimeofday(&tim, NULL ); + return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); +} + + +real Get_Timing_Info( real t_start ) +{ + struct timeval tim; + real t_end; + + gettimeofday(&tim, NULL ); + t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); + return (t_end - t_start); +} + + +void Temperature_Control( control_params *control, simulation_data *data, + output_controls *out_control ) +{ + real tmp; + + if( control->T_mode == 1 ) { // step-wise temperature control + if( (data->step - data->prev_steps) % + ((int)(control->T_freq / control->dt)) == 0 ) { + if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) ) + control->T += control->T_rate; + else control->T = control->T_final; + } + } + else if( control->T_mode == 2 ) { // constant slope control + tmp = control->T_rate * control->dt / control->T_freq; + + if( fabs( control->T - control->T_final ) >= fabs( tmp ) ) + control->T += tmp; + } +} + + +void Compute_Total_Mass( reax_system *system, simulation_data *data ) +{ + int i; + int blocks; + int block_size; + real *partial_sums = 0; + + data->M = 0; + + for( i = 0; i < system->N; i++ ) + data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass; + + data->inv_M = 1. / data->M; +} + + +void Compute_Center_of_Mass( reax_system *system, simulation_data *data, + FILE *fout ) +{ + int i; + real m, xx, xy, xz, yy, yz, zz, det; + rvec tvec, diff; + rtensor mat, inv; + + int blocks; + int block_size; + rvec *l_xcm, *l_vcm, *l_amcm; + real t_start, t_end; + + rvec_MakeZero( data->xcm ); // position of CoM + rvec_MakeZero( data->vcm ); // velocity of CoM + rvec_MakeZero( data->amcm ); // angular momentum of CoM + rvec_MakeZero( data->avcm ); // angular velocity of CoM + + /* Compute the position, velocity and angular momentum about the CoM */ + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + + rvec_ScaledAdd( data->xcm, m, system->atoms[i].x ); + rvec_ScaledAdd( data->vcm, m, system->atoms[i].v ); + + rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v ); + rvec_ScaledAdd( data->amcm, m, tvec ); + + /*fprintf( fout,"%3d %g %g %g\n", + i+1, + system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); + fprintf( fout, "vcm: %g %g %g\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + */ + } + + rvec_Scale( data->xcm, data->inv_M, data->xcm ); + rvec_Scale( data->vcm, data->inv_M, data->vcm ); + + rvec_Cross( tvec, data->xcm, data->vcm ); + rvec_ScaledAdd( data->amcm, -data->M, tvec ); + + data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); + + /* Calculate and then invert the inertial tensor */ + xx = xy = xz = yy = yz = zz = 0; + + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + + rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); + xx += diff[0] * diff[0] * m; + xy += diff[0] * diff[1] * m; + xz += diff[0] * diff[2] * m; + yy += diff[1] * diff[1] * m; + yz += diff[1] * diff[2] * m; + zz += diff[2] * diff[2] * m; + } + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " xx: %f \n", xx); + fprintf (stderr, " xy: %f \n", xy); + fprintf (stderr, " xz: %f \n", xz); + fprintf (stderr, " yy: %f \n", yy); + fprintf (stderr, " yz: %f \n", yz); + fprintf (stderr, " zz: %f \n", zz); +#endif + + mat[0][0] = yy + zz; + mat[0][1] = mat[1][0] = -xy; + mat[0][2] = mat[2][0] = -xz; + mat[1][1] = xx + zz; + mat[2][1] = mat[1][2] = -yz; + mat[2][2] = xx + yy; + + /* invert the inertial tensor */ + det = ( mat[0][0] * mat[1][1] * mat[2][2] + + mat[0][1] * mat[1][2] * mat[2][0] + + mat[0][2] * mat[1][0] * mat[2][1] ) - + ( mat[0][0] * mat[1][2] * mat[2][1] + + mat[0][1] * mat[1][0] * mat[2][2] + + mat[0][2] * mat[1][1] * mat[2][0] ); + + inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; + inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; + inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; + inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; + inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; + inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; + inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; + inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; + inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; + + if( fabs(det) > ALMOST_ZERO ) + rtensor_Scale( inv, 1./det, inv ); + else + rtensor_MakeZero( inv ); + + /* Compute the angular velocity about the centre of mass */ + rtensor_MatVec( data->avcm, inv, data->amcm ); + data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); + +#if defined(DEBUG) + fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", + data->xcm[0], data->xcm[1], data->xcm[2] ); + fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", + data->amcm[0], data->amcm[1], data->amcm[2] ); + /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", + mat[0][0], mat[0][1], mat[0][2], + mat[1][0], mat[1][1], mat[1][2], + mat[2][0], mat[2][1], mat[2][2] ); + fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", + inv[0][0], inv[0][1], inv[0][2], + inv[1][0], inv[1][1], inv[1][2], + inv[2][0], inv[2][1], inv[2][2] ); + fflush( fout ); */ + fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", + data->avcm[0], data->avcm[1], data->avcm[2] ); +#endif +} + + +void Compute_Kinetic_Energy( reax_system* system, simulation_data* data ) +{ + int i; + rvec p; + real m; + + data->E_Kin = 0.0; + + for (i=0; i < system->N; i++) { + m = system->reaxprm.sbp[system->atoms[i].type].mass; + + rvec_Scale( p, m, system->atoms[i].v ); + data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v ); + + /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n", + i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], + system->reaxprm.sbp[system->atoms[i].type].mass); */ + } + + data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + + if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */ + data->therm.T = ALMOST_ZERO; +} + + +/* IMPORTANT: This function assumes that current kinetic energy and + * the center of mass of the system is already computed before. + * + * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs + * to be added when there are long-range interactions or long-range + * corrections to short-range interactions present. + * We may want to add that for more accuracy. + */ +void Compute_Pressure_Isotropic( reax_system* system, control_params *control, + simulation_data* data, + output_controls *out_control ) +{ + int i; + reax_atom *p_atom; + rvec tx; + rvec tmp; + simulation_box *box = &(system->box); + + /* Calculate internal pressure */ + rvec_MakeZero( data->int_press ); + + // 0: both int and ext, 1: ext only, 2: int only + if( control->press_mode == 0 || control->press_mode == 2 ) { + for( i = 0; i < system->N; ++i ) { + p_atom = &( system->atoms[i] ); + + /* transform x into unitbox coordinates */ + Transform_to_UnitBox( p_atom->x, box, 1, tx ); + + /* this atom's contribution to internal pressure */ + rvec_Multiply( tmp, p_atom->f, tx ); + rvec_Add( data->int_press, tmp ); + + if( out_control->debug_level > 0 ) { + fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", + i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] ); + fprintf( out_control->prs, "%8.2f%8.2f%8.2f", + p_atom->f[0], p_atom->f[1], p_atom->f[2] ); + fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", + data->int_press[0],data->int_press[1],data->int_press[2]); + } + } + } + + /* kinetic contribution */ + data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV ); + + /* Calculate total pressure in each direction */ + data->tot_press[0] = data->kin_press - + ((data->int_press[0] + data->ext_press[0]) / + (box->box_norms[1] * box->box_norms[2] * P_CONV)); + + data->tot_press[1] = data->kin_press - + ((data->int_press[1] + data->ext_press[1])/ + (box->box_norms[0] * box->box_norms[2] * P_CONV)); + + data->tot_press[2] = data->kin_press - + ((data->int_press[2] + data->ext_press[2])/ + (box->box_norms[0] * box->box_norms[1] * P_CONV)); + + /* Average pressure for the whole box */ + data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3; +} + + +void Compute_Pressure_Isotropic_Klein( reax_system* system, + simulation_data* data ) +{ + int i; + reax_atom *p_atom; + rvec dx; + + // IMPORTANT: This function assumes that current kinetic energy and + // the center of mass of the system is already computed before. + data->iso_bar.P = 2.0 * data->E_Kin; + + for( i = 0; i < system->N; ++i ) + { + p_atom = &( system->atoms[i] ); + rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm); + data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) ); + } + + data->iso_bar.P /= (3.0 * system->box.volume); + + // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs + // to be added when there are long-range interactions or long-range + // corrections to short-range interactions present. + // We may want to add that for more accuracy. +} + + +void Compute_Pressure( reax_system* system, simulation_data* data, + static_storage *workspace ) +{ + int i; + reax_atom *p_atom; + rtensor temp; + + rtensor_MakeZero( data->flex_bar.P ); + + for( i = 0; i < system->N; ++i ) { + p_atom = &( system->atoms[i] ); + // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx ); + rvec_OuterProduct( temp, p_atom->v, p_atom->v ); + rtensor_ScaledAdd( data->flex_bar.P, + system->reaxprm.sbp[ p_atom->type ].mass, temp ); + // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); + rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); + } + + rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P ); + data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0; +} diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h index d152c254..90f43d76 100644 --- a/PuReMD-GPU/src/system_props.h +++ b/PuReMD-GPU/src/system_props.h @@ -30,21 +30,14 @@ real Get_Timing_Info( real ); void Temperature_Control( control_params*, simulation_data*, output_controls* ); void Compute_Total_Mass( reax_system*, simulation_data* ); -void Cuda_Compute_Total_Mass( reax_system*, simulation_data* ); void Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); -void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); void Compute_Kinetic_Energy( reax_system*, simulation_data* ); -void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* ); void Compute_Pressure( reax_system*, simulation_data*, static_storage* ); void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* ); -void prep_dev_system (reax_system *system); -GLOBAL void k_Compute_Total_Mass (single_body_parameters *, reax_atom *, real *, size_t ); -//GLOBAL void Compute_Kinetic_Energy (single_body_parameters *, reax_atom *, unsigned int , simulation_data *, real *); - #endif diff --git a/PuReMD-GPU/src/testmd.cu b/PuReMD-GPU/src/testmd.c similarity index 99% rename from PuReMD-GPU/src/testmd.cu rename to PuReMD-GPU/src/testmd.c index 93f286cc..f2f27d19 100644 --- a/PuReMD-GPU/src/testmd.cu +++ b/PuReMD-GPU/src/testmd.c @@ -39,6 +39,7 @@ #include "cuda_copy.h" #include "validation.h" +#include "cuda_system_props.h" interaction_function Interaction_Functions[NO_OF_INTERACTIONS]; diff --git a/PuReMD-GPU/src/three_body_interactions.cu b/PuReMD-GPU/src/three_body_interactions.cu index c2eed63b..1edd41ae 100644 --- a/PuReMD-GPU/src/three_body_interactions.cu +++ b/PuReMD-GPU/src/three_body_interactions.cu @@ -871,7 +871,7 @@ where i < k */ e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; //PERFORMANCE IMPACT - //atomicAdd (&data->E_Ang, e_ang); + //MYATOMICADD(&data->E_Ang, e_ang); E_Ang [j] += e_ang; /* END ANGLE ENERGY*/ @@ -895,7 +895,7 @@ where i < k */ e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; //PERFORMANCE IMPACT - //atomicAdd (&data->E_Pen, e_pen); + //MYATOMICADD(&data->E_Pen, e_pen); E_Pen [j] += e_pen; @@ -921,7 +921,7 @@ where i < k */ EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); //PERFORMANCE IMPACT - //atomicAdd (&data->E_Coa, e_coa); + //MYATOMICADD(&data->E_Coa, e_coa); E_Coa [j] += e_coa; CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; @@ -933,19 +933,19 @@ where i < k */ /* FORCES */ /* - atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ); - atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ); - atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) ); - atomicAdd (&workspace->CdDelta[i], CEcoa4 ); - atomicAdd (&workspace->CdDelta[k], CEcoa5 ); + MYATOMICADD(&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ); + MYATOMICADD(&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ); + MYATOMICADD(&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) ); + MYATOMICADD(&workspace->CdDelta[i], CEcoa4 ); + MYATOMICADD(&workspace->CdDelta[k], CEcoa5 ); */ bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ; bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ; workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ; - //atomicAdd (&workspace->CdDelta[i], CEcoa4 ); + //MYATOMICADD(&workspace->CdDelta[i], CEcoa4 ); pbond_ij->CdDelta_ij += CEcoa4 ; - //atomicAdd (&workspace->CdDelta[k], CEcoa5 ); + //MYATOMICADD(&workspace->CdDelta[k], CEcoa5 ); pbond_jk->CdDelta_ij += CEcoa5; for( t = start_j; t < end_j; ++t ) { @@ -960,9 +960,9 @@ where i < k */ // (CEval6 * pBOjt7) ); /* - atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) ); - atomicAdd (&bo_jt->Cdbopi, CEval5 ); - atomicAdd (&bo_jt->Cdbopi2, CEval5 ); + MYATOMICADD(&bo_jt->Cdbo, (CEval6 * pBOjt7) ); + MYATOMICADD(&bo_jt->Cdbopi, CEval5 ); + MYATOMICADD(&bo_jt->Cdbopi2, CEval5 ); */ bo_jt->Cdbo += (CEval6 * pBOjt7) ; bo_jt->Cdbopi += CEval5 ; @@ -1670,7 +1670,7 @@ GLOBAL void Hydrogen_Bonds ( reax_atom *atoms, //PERFORMANCE IMPACT e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //atomicAdd ( &data->E_HB, e_hb ); + //MYATOMICADD( &data->E_HB, e_hb ); //E_HB [j] += e_hb; sh_hb [threadIdx.x] += e_hb; @@ -2050,7 +2050,7 @@ GLOBAL void Hydrogen_Bonds ( reax_atom *atoms, //PERFORMANCE IMPACT e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //atomicAdd ( &data->E_HB, e_hb ); + //MYATOMICADD( &data->E_HB, e_hb ); //E_HB [j] += e_hb; sh_hb [threadIdx.x] += e_hb; diff --git a/PuReMD-GPU/src/traj.cu b/PuReMD-GPU/src/traj.c similarity index 99% rename from PuReMD-GPU/src/traj.cu rename to PuReMD-GPU/src/traj.c index 97496e7f..afe3b1d2 100644 --- a/PuReMD-GPU/src/traj.cu +++ b/PuReMD-GPU/src/traj.c @@ -20,7 +20,10 @@ #include "traj.h" #include "list.h" -#include "cuda_copy.h" + +#ifdef __PRINT_CPU_RESULTS__ + #include "cuda_copy.h" +#endif /************************************************/ /* CUSTOM FORMAT ROUTINES */ @@ -207,7 +210,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control, if( write_bonds ) { -#ifndef __PRINT_CPU_RESULTS__ +#ifdef __PRINT_CPU_RESULTS__ //fprintf (stderr, "Synching bonds from device for printing ....\n"); Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); #endif @@ -239,7 +242,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control, num_thb_intrs = 0; if( write_angles ) { -#ifndef __PRINT_CPU_RESULTS__ +#ifdef __PRINT_CPU_RESULTS__ //fprintf (stderr, "Synching three bodies from deivce for printing ... \n"); Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY ); if ( !write_bonds) { diff --git a/PuReMD-GPU/src/traj.h b/PuReMD-GPU/src/traj.h index d8c1792d..35d92602 100644 --- a/PuReMD-GPU/src/traj.h +++ b/PuReMD-GPU/src/traj.h @@ -22,8 +22,8 @@ #define __TRAJ_H__ #include "mytypes.h" -#include "zlib.h" +#include <zlib.h> #define BLOCK_MARK "REAX_BLOCK_MARK " #define BLOCK_MARK_LEN 16 @@ -73,12 +73,14 @@ #define SIZE_INFO_LINE3 "%-10d %-10d %-10d\n" #define SIZE_INFO_LEN3 33 + enum ATOM_LINE_OPTS {OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5, OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7 }; enum BOND_LINE_OPTS {OPT_NOBOND, OPT_BOND_BASIC, OPT_BOND_FULL}; enum ANGLE_LINE_OPTS {OPT_NOANGLE, OPT_ANGLE_BASIC}; + struct { int no_of_sub_blocks; @@ -89,11 +91,11 @@ struct typedef struct __block block; + int Write_Block( gzFile, block* ); int Read_Next_Block( gzFile, block*, int* ); int Skip_Next_Block( gzFile, int*); - /* Format for trajectory file @@ -141,8 +143,6 @@ int Skip_Next_Block( gzFile, int*); No. of torsion entries (int) Torsion info lines as per torsion format. */ - - int Write_Custom_Header( reax_system*, control_params*, static_storage*, output_controls* ); int Write_xyz_Header ( reax_system*, control_params*, diff --git a/PuReMD-GPU/src/two_body_interactions.cu b/PuReMD-GPU/src/two_body_interactions.cu index f53b0cfb..60cfbd0b 100644 --- a/PuReMD-GPU/src/two_body_interactions.cu +++ b/PuReMD-GPU/src/two_body_interactions.cu @@ -221,7 +221,7 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, -twbp->De_pp * bo_ij->BO_pi2; //PERFORMANCE IMAPCT - //atomicAdd (&data->E_BE, ebond); + //MYATOMICADD(&data->E_BE, ebond); //TODO //E_BE [ i ] += ebond/2.0; E_BE [ i ] += ebond; @@ -274,7 +274,7 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, //estrain(j2) = estrain(j2) + 0.50*estriph; //PERFORMANCE IMPACT - //atomicAdd (&data->E_BE, estriph); + //MYATOMICADD(&data->E_BE, estriph); E_BE [ i] += estriph; //data->E_BE += estriph; @@ -289,7 +289,7 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, //PERFORMANCE IMAPCT workspace->CdDelta[i] += decobdboua; - //atomicAdd (&workspace->CdDelta[j], decobdboub); + //MYATOMICADD(&workspace->CdDelta[j], decobdboub); //CdDelta [ i * N + i ] += decobdboua; //CdDelta [ i * N + j ] += decobdboua; //workspace->CdDelta [i] += decobdboua; diff --git a/PuReMD-GPU/src/vector.cu b/PuReMD-GPU/src/vector.c similarity index 100% rename from PuReMD-GPU/src/vector.cu rename to PuReMD-GPU/src/vector.c diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h index 33653478..ee1375ee 100644 --- a/PuReMD-GPU/src/vector.h +++ b/PuReMD-GPU/src/vector.h @@ -22,8 +22,10 @@ #define __VECTOR_H_ #include "mytypes.h" + #include "random.h" + int Vector_isZero( real*, int ); void Vector_MakeZero( real*, int ); void Vector_Copy( real*, real*, int ); @@ -33,8 +35,6 @@ void Vector_Copy( real*, real*, int ); void Vector_Print( FILE*, char*, real*, int ); real Norm( real*, int ); -HOST_DEVICE inline real Dot( real*, real*, int ); - void rvec_Sum( rvec, rvec, rvec ); real rvec_ScaledDot( real, rvec, real, rvec ); void rvec_Multiply( rvec, rvec, rvec ); @@ -44,19 +44,6 @@ void rvec_Invert( rvec, rvec ); void rvec_OuterProduct( rtensor, rvec, rvec ); int rvec_isZero( rvec ); -HOST_DEVICE inline real rvec_Dot( rvec, rvec ); -HOST_DEVICE inline void rvec_Scale( rvec, real, rvec ); -HOST_DEVICE inline real rvec_Norm_Sqr( rvec ); -HOST_DEVICE inline void rvec_Random( rvec ); -HOST_DEVICE inline void rvec_MakeZero( rvec ); -HOST_DEVICE inline void rvec_Add( rvec, rvec ); -HOST_DEVICE inline void rvec_Copy( rvec, rvec ); -HOST_DEVICE inline void rvec_Cross( rvec, rvec, rvec ); -HOST_DEVICE inline void rvec_ScaledAdd( rvec, real, rvec ); -HOST_DEVICE inline void rvec_ScaledSum( rvec, real, rvec, real, rvec ); -HOST_DEVICE inline void rvec_iMultiply( rvec, ivec, rvec ); -HOST_DEVICE inline real rvec_Norm( rvec ); - void rtensor_MakeZero( rtensor ); void rtensor_Multiply( rtensor, rtensor, rtensor ); void rtensor_MatVec( rvec, rtensor, rvec ); @@ -80,15 +67,6 @@ void ivec_MakeZero( ivec ); void ivec_rScale( ivec, real, rvec ); -HOST_DEVICE inline void ivec_Copy( ivec, ivec ); -HOST_DEVICE inline void ivec_Scale( ivec, real, ivec ); -HOST_DEVICE inline void ivec_Sum( ivec, ivec, ivec ); - -/* - * Code which is common to multiple HOST and DEVICE - * - */ - HOST_DEVICE inline real Dot( real* v1, real* v2, int k ) { real ret = 0; @@ -100,17 +78,15 @@ HOST_DEVICE inline real Dot( real* v1, real* v2, int k ) } - - ///////////////////////////// //rvec functions ///////////////////////////// - HOST_DEVICE inline void rvec_MakeZero( rvec v ) { v[0] = v[1] = v[2] = ZERO; } + HOST_DEVICE inline void rvec_Add( rvec ret, rvec v ) { ret[0] += v[0]; @@ -118,11 +94,13 @@ HOST_DEVICE inline void rvec_Add( rvec ret, rvec v ) ret[2] += v[2]; } + HOST_DEVICE inline void rvec_Copy( rvec dest, rvec src ) { dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } + HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 ) { ret[0] = v1[1] * v2[2] - v1[2] * v2[1]; @@ -130,11 +108,13 @@ HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 ) ret[2] = v1[0] * v2[1] - v1[1] * v2[0]; } + HOST_DEVICE inline void rvec_ScaledAdd( rvec ret, real c, rvec v ) { ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2]; } + HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 ) { ret[0] = c1 * v1[0] + c2 * v2[0]; @@ -142,6 +122,7 @@ HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rv ret[2] = c1 * v1[2] + c2 * v2[2]; } + HOST_DEVICE inline void rvec_Random( rvec v ) { v[0] = Random(2.0) - 1.0; @@ -149,21 +130,25 @@ HOST_DEVICE inline void rvec_Random( rvec v ) v[2] = Random(2.0) - 1.0; } + HOST_DEVICE inline real rvec_Norm_Sqr( rvec v ) { return SQR(v[0]) + SQR(v[1]) + SQR(v[2]); } + HOST_DEVICE inline void rvec_Scale( rvec ret, real c, rvec v ) { ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2]; } + HOST_DEVICE inline real rvec_Dot( rvec v1, rvec v2 ) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } + HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) { r[0] = v1[0] * v2[0]; @@ -171,23 +156,22 @@ HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) r[2] = v1[2] * v2[2]; } + HOST_DEVICE inline real rvec_Norm( rvec v ) { return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) ); } - ///////////////// //ivec functions ///////////////// - - HOST_DEVICE inline void ivec_Copy( ivec dest , ivec src ) { dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } + HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src ) { dest[0] = C * src[0]; @@ -195,6 +179,7 @@ HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src ) dest[2] = C * src[2]; } + HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 ) { dest[0] = v1[0] + v2[0]; @@ -203,7 +188,6 @@ HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 ) } - ///////////////// //vector functions ///////////////// @@ -213,12 +197,14 @@ HOST_DEVICE inline void Vector_Sum( real* dest, real c, real* v, real d, real* y dest[k] = c * v[k] + d * y[k]; } + HOST_DEVICE inline void Vector_Scale( real* dest, real c, real* v, int k ) { for (k--; k >= 0; k--) dest[k] = c * v[k]; } + HOST_DEVICE inline void Vector_Add( real* dest, real c, real* v, int k ) { for (k--; k >= 0; k--) -- GitLab