From bb006b442445353331dbd0968bf83fe7bf0ae746 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu> Date: Mon, 20 Jun 2016 20:59:17 -0400 Subject: [PATCH] More code formatting. --- PG-PuReMD/src/center_mass.cu | 1000 ++-- PG-PuReMD/src/cuda_bond_orders.cu | 1564 +++--- PG-PuReMD/src/cuda_bonds.cu | 220 +- PG-PuReMD/src/cuda_copy.cu | 316 +- PG-PuReMD/src/cuda_environment.cu | 64 +- PG-PuReMD/src/cuda_forces.cu | 3113 ++++++------ PG-PuReMD/src/cuda_hydrogen_bonds.cu | 1212 ++--- PG-PuReMD/src/cuda_init_md.cu | 4 +- PG-PuReMD/src/cuda_integrate.cu | 122 +- PG-PuReMD/src/cuda_linear_solvers.cu | 360 +- PG-PuReMD/src/cuda_lookup.cu | 132 +- PG-PuReMD/src/cuda_multi_body.cu | 518 +- PG-PuReMD/src/cuda_neighbors.cu | 1318 ++--- PG-PuReMD/src/cuda_nonbonded.cu | 1000 ++-- PG-PuReMD/src/cuda_post_evolve.cu | 38 +- PG-PuReMD/src/cuda_qEq.cu | 190 +- PG-PuReMD/src/cuda_reset_tools.cu | 286 +- PG-PuReMD/src/cuda_torsion_angles.cu | 1160 ++--- PG-PuReMD/src/cuda_utils.cu | 144 +- PG-PuReMD/src/cuda_valence_angles.cu | 1094 ++--- PG-PuReMD/src/dev_alloc.cu | 794 +-- PG-PuReMD/src/dev_list.cu | 138 +- PG-PuReMD/src/dev_system_props.cu | 444 +- PG-PuReMD/src/dual_matvec.cu | 208 +- PG-PuReMD/src/matvec.cu | 100 +- PG-PuReMD/src/reduction.cu | 756 +-- PG-PuReMD/src/validation.cu | 3104 ++++++------ PG-PuReMD/src/vector.cu | 772 +-- PuReMD-GPU/src/GMRES.cu | 1940 ++++---- PuReMD-GPU/src/QEq.cu | 1446 +++--- PuReMD-GPU/src/allocate.cu | 898 ++-- PuReMD-GPU/src/bond_orders.cu | 3104 ++++++------ PuReMD-GPU/src/box.cu | 746 +-- PuReMD-GPU/src/center_mass.cu | 436 +- PuReMD-GPU/src/cuda_copy.cu | 224 +- PuReMD-GPU/src/cuda_init.cu | 412 +- PuReMD-GPU/src/cuda_utils.cu | 156 +- PuReMD-GPU/src/forces.cu | 5168 ++++++++++---------- PuReMD-GPU/src/four_body_interactions.cu | 2538 +++++----- PuReMD-GPU/src/grid.cu | 698 +-- PuReMD-GPU/src/helpers.cu | 14 +- PuReMD-GPU/src/init_md.cu | 2202 ++++----- PuReMD-GPU/src/integrate.cu | 1804 +++---- PuReMD-GPU/src/list.cu | 404 
+- PuReMD-GPU/src/lookup.cu | 1424 +++--- PuReMD-GPU/src/matvec.cu | 102 +- PuReMD-GPU/src/neighbors.cu | 2602 +++++----- PuReMD-GPU/src/reduction.cu | 338 +- PuReMD-GPU/src/reset_utils.cu | 290 +- PuReMD-GPU/src/single_body_interactions.cu | 1524 +++--- PuReMD-GPU/src/system_props.cu | 1130 ++--- PuReMD-GPU/src/testmd.cu | 570 +-- PuReMD-GPU/src/three_body_interactions.cu | 4340 ++++++++-------- PuReMD-GPU/src/traj.cu | 818 ++-- PuReMD-GPU/src/two_body_interactions.cu | 2700 +++++----- PuReMD-GPU/src/validation.cu | 3704 +++++++------- PuReMD-GPU/src/vector.cu | 294 +- 57 files changed, 31106 insertions(+), 31091 deletions(-) diff --git a/PG-PuReMD/src/center_mass.cu b/PG-PuReMD/src/center_mass.cu index 16d34141..725cafbb 100644 --- a/PG-PuReMD/src/center_mass.cu +++ b/PG-PuReMD/src/center_mass.cu @@ -3,549 +3,549 @@ #include "cuda_shuffle.h" CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) + rvec *res_xcm, + rvec *res_vcm, + rvec *res_amcm, + size_t n) { - extern __shared__ rvec xcm[]; - extern __shared__ rvec vcm[]; - extern __shared__ rvec amcm[]; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - //unsigned int xcm_id = threadIdx.x; - unsigned int vcm_id = blockDim.x; - unsigned int amcm_id = 2 *(blockDim.x); - - unsigned int index = 0; - rvec tmp; - real m; - - rvec_MakeZero (xcm [threadIdx.x]); - rvec_MakeZero (vcm [vcm_id + threadIdx.x]); - rvec_MakeZero (amcm[amcm_id + threadIdx.x]); - rvec_MakeZero (tmp); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); - rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); - rvec_Cross (tmp, atoms[i].x, atoms [i].v); - rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); - } - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (xcm 
[threadIdx.x], xcm[index]); - rvec_Add (vcm [vcm_id + threadIdx.x], vcm[vcm_id + index]); - rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)){ - rvec_Copy (res_xcm[blockIdx.x], xcm[0]); - rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); - rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); - } + extern __shared__ rvec xcm[]; + extern __shared__ rvec vcm[]; + extern __shared__ rvec amcm[]; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + //unsigned int xcm_id = threadIdx.x; + unsigned int vcm_id = blockDim.x; + unsigned int amcm_id = 2 *(blockDim.x); + + unsigned int index = 0; + rvec tmp; + real m; + + rvec_MakeZero (xcm [threadIdx.x]); + rvec_MakeZero (vcm [vcm_id + threadIdx.x]); + rvec_MakeZero (amcm[amcm_id + threadIdx.x]); + rvec_MakeZero (tmp); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); + rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); + rvec_Cross (tmp, atoms[i].x, atoms [i].v); + rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); + } + __syncthreads (); + + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (xcm [threadIdx.x], xcm[index]); + rvec_Add (vcm [vcm_id + threadIdx.x], vcm[vcm_id + index]); + rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)){ + rvec_Copy (res_xcm[blockIdx.x], xcm[0]); + rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); + rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); + } } #if defined( __SM_35__) CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_xcm, - size_t n) + rvec *res_xcm, + size_t n) { - extern __shared__ rvec my_xcm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int xcm_id = threadIdx.x; - unsigned int index = 0; - rvec xcm; - real m; - - 
rvec_MakeZero (xcm); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (xcm , m, atoms [i].x); - } - __syncthreads (); - - for (int z = 16; z >= 1; z /= 2){ - xcm[0] += shfl( xcm[0], z); - xcm[1] += shfl( xcm[1], z); - xcm[2] += shfl( xcm[2], z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0) - rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm ); - __syncthreads (); - - for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (my_xcm [threadIdx.x], my_xcm[index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)) - rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]); + extern __shared__ rvec my_xcm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int xcm_id = threadIdx.x; + unsigned int index = 0; + rvec xcm; + real m; + + rvec_MakeZero (xcm); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (xcm , m, atoms [i].x); + } + __syncthreads (); + + for (int z = 16; z >= 1; z /= 2){ + xcm[0] += shfl( xcm[0], z); + xcm[1] += shfl( xcm[1], z); + xcm[2] += shfl( xcm[2], z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0) + rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm ); + __syncthreads (); + + for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (my_xcm [threadIdx.x], my_xcm[index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)) + rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]); } CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_vcm, - size_t n) + rvec *res_vcm, + size_t n) { - extern __shared__ rvec my_vcm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - rvec vcm; - real m; - - rvec_MakeZero (vcm); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (vcm , m, atoms [i].v); - } - __syncthreads (); - - for (int z = 16; z >= 1; z /= 2){ - 
vcm[0] += shfl( vcm[0], z); - vcm[1] += shfl( vcm[1], z); - vcm[2] += shfl( vcm[2], z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0) - rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm ); - __syncthreads (); - - for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (my_vcm [threadIdx.x], my_vcm[index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)) - rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]); + extern __shared__ rvec my_vcm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + rvec vcm; + real m; + + rvec_MakeZero (vcm); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (vcm , m, atoms [i].v); + } + __syncthreads (); + + for (int z = 16; z >= 1; z /= 2){ + vcm[0] += shfl( vcm[0], z); + vcm[1] += shfl( vcm[1], z); + vcm[2] += shfl( vcm[2], z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0) + rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm ); + __syncthreads (); + + for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (my_vcm [threadIdx.x], my_vcm[index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)) + rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]); } CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_amcm, - size_t n) + rvec *res_amcm, + size_t n) { - extern __shared__ rvec my_amcm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - rvec amcm; - real m; - rvec tmp; - - rvec_MakeZero (amcm); - rvec_MakeZero( tmp ); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_Cross (tmp, atoms[i].x, atoms [i].v); - rvec_ScaledAdd (amcm, m, tmp); - } - __syncthreads (); - - for (int z = 16; z >= 1; z /= 2){ - amcm[0] += shfl( amcm[0], z); - amcm[1] += shfl( amcm[1], z); - amcm[2] += shfl( amcm[2], z); - } - __syncthreads (); - - if 
(threadIdx.x % 32 == 0) - rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm ); - __syncthreads (); - - - for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (my_amcm[threadIdx.x], my_amcm[index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)){ - rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]); - } + extern __shared__ rvec my_amcm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + rvec amcm; + real m; + rvec tmp; + + rvec_MakeZero (amcm); + rvec_MakeZero( tmp ); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_Cross (tmp, atoms[i].x, atoms [i].v); + rvec_ScaledAdd (amcm, m, tmp); + } + __syncthreads (); + + for (int z = 16; z >= 1; z /= 2){ + amcm[0] += shfl( amcm[0], z); + amcm[1] += shfl( amcm[1], z); + amcm[2] += shfl( amcm[2], z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0) + rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm ); + __syncthreads (); + + + for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (my_amcm[threadIdx.x], my_amcm[index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)){ + rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]); + } } #endif CUDA_GLOBAL void center_of_mass (rvec *xcm, - rvec *vcm, - rvec *amcm, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) + rvec *vcm, + rvec *amcm, + rvec *res_xcm, + rvec *res_vcm, + rvec *res_amcm, + size_t n) { - extern __shared__ rvec sh_xcm[]; - extern __shared__ rvec sh_vcm[]; - extern __shared__ rvec sh_amcm[]; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - unsigned int xcm_id = threadIdx.x; - unsigned int vcm_id = blockDim.x; - unsigned int amcm_id = 2 * (blockDim.x); - - unsigned int index = 0; - rvec t_xcm, t_vcm, t_amcm; - - rvec_MakeZero (t_xcm); - rvec_MakeZero (t_vcm); - rvec_MakeZero (t_amcm); - - if (i < n){ - rvec_Copy ( t_xcm, 
xcm[threadIdx.x]); - rvec_Copy ( t_vcm, vcm[threadIdx.x]); - rvec_Copy ( t_amcm, amcm[threadIdx.x]); - } - - rvec_Copy (sh_xcm[xcm_id], t_xcm); - rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm); - rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm); - - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { - - if (threadIdx.x < offset) { - index = threadIdx.x + offset; - rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]); - rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]); - rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]); - } - __syncthreads (); - } - - if (threadIdx.x == 0){ - rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]); - rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]); - rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]); - } + extern __shared__ rvec sh_xcm[]; + extern __shared__ rvec sh_vcm[]; + extern __shared__ rvec sh_amcm[]; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned int xcm_id = threadIdx.x; + unsigned int vcm_id = blockDim.x; + unsigned int amcm_id = 2 * (blockDim.x); + + unsigned int index = 0; + rvec t_xcm, t_vcm, t_amcm; + + rvec_MakeZero (t_xcm); + rvec_MakeZero (t_vcm); + rvec_MakeZero (t_amcm); + + if (i < n){ + rvec_Copy ( t_xcm, xcm[threadIdx.x]); + rvec_Copy ( t_vcm, vcm[threadIdx.x]); + rvec_Copy ( t_amcm, amcm[threadIdx.x]); + } + + rvec_Copy (sh_xcm[xcm_id], t_xcm); + rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm); + rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm); + + __syncthreads (); + + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + + if (threadIdx.x < offset) { + index = threadIdx.x + offset; + rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]); + rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]); + rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]); + } + __syncthreads (); + } + + if (threadIdx.x == 0){ + rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]); + rvec_Copy (res_vcm[blockIdx.x], 
sh_vcm[vcm_id]); + rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]); + } } CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) + reax_atom *atoms, + real *results, + real xcm0, real xcm1, real xcm2, + size_t n) { - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - - unsigned int xx_i = threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int xz_i = 2 * blockDim.x; - unsigned int yy_i = 3 * blockDim.x; - unsigned int yz_i = 4 * blockDim.x; - unsigned int zz_i = 5 * blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = - yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xx[ xx_i ] = diff[0] * diff[0] * m; - xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m; - xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m; - yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m; - yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m; - zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m; - } - __syncthreads (); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - xx[ threadIdx.x ] += xx[ index ]; - xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; - xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ]; - yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ]; - yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ]; - zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 ] = xx [ 0 ]; 
- results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ]; - results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ]; - results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ]; - results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ]; - results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ]; - } + extern __shared__ real xx[]; + extern __shared__ real xy[]; + extern __shared__ real xz[]; + extern __shared__ real yy[]; + extern __shared__ real yz[]; + extern __shared__ real zz[]; + + unsigned int xx_i = threadIdx.x; + unsigned int xy_i = blockDim.x; + unsigned int xz_i = 2 * blockDim.x; + unsigned int yy_i = 3 * blockDim.x; + unsigned int yz_i = 4 * blockDim.x; + unsigned int zz_i = 5 * blockDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + + rvec diff, xcm; + real m = 0; + rvec_MakeZero (diff); + xcm[0] = xcm0; + xcm[1] = xcm1; + xcm[2] = xcm2; + + + xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = + yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; + + if (i < n){ + m = sbp[ atoms[i].type ].mass; + rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); + xx[ xx_i ] = diff[0] * diff[0] * m; + xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m; + xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m; + yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m; + yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m; + zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m; + } + __syncthreads (); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ + if (threadIdx.x < offset){ + index = threadIdx.x + offset; + xx[ threadIdx.x ] += xx[ index ]; + xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; + xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ]; + yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ]; + yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ]; + zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) { + results [ blockIdx.x*6 ] = xx [ 0 ]; + results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ]; + results 
[ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ]; + results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ]; + results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ]; + results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ]; + } } CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n) { - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - - unsigned int xx_i = threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int xz_i = 2 * blockDim.x; - unsigned int yy_i = 3 * blockDim.x; - unsigned int yz_i = 4 * blockDim.x; - unsigned int zz_i = 5 * blockDim.x; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = - yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - - if (i < n) - { - xx [ xx_i ] = input [ threadIdx.x*6 + 0 ]; - xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ]; - xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ]; - yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ]; - yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ]; - zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ]; - } - __syncthreads (); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if (threadIdx.x < offset ) - { - index = threadIdx.x + offset; - xx [ threadIdx.x ] += xx [ index ]; - xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ]; - xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ]; - yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ]; - yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ]; - zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) - { - output[0] = xx[0]; - output[1] = xy[xy_i]; - output[2] = xz[xz_i]; - output[3] = xz[yy_i]; - output[4] = xz[yz_i]; - output[5] = xz[zz_i]; - } + extern __shared__ real xx[]; + extern __shared__ real xy[]; + 
extern __shared__ real xz[]; + extern __shared__ real yy[]; + extern __shared__ real yz[]; + extern __shared__ real zz[]; + + unsigned int xx_i = threadIdx.x; + unsigned int xy_i = blockDim.x; + unsigned int xz_i = 2 * blockDim.x; + unsigned int yy_i = 3 * blockDim.x; + unsigned int yz_i = 4 * blockDim.x; + unsigned int zz_i = 5 * blockDim.x; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + + xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = + yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; + + if (i < n) + { + xx [ xx_i ] = input [ threadIdx.x*6 + 0 ]; + xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ]; + xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ]; + yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ]; + yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ]; + zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ]; + } + __syncthreads (); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset ) + { + index = threadIdx.x + offset; + xx [ threadIdx.x ] += xx [ index ]; + xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ]; + xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ]; + yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ]; + yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ]; + zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) + { + output[0] = xx[0]; + output[1] = xy[xy_i]; + output[2] = xz[xz_i]; + output[3] = xz[yy_i]; + output[4] = xz[yz_i]; + output[5] = xz[zz_i]; + } } #if defined( __SM_35__) CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) + reax_atom *atoms, + real *results, + real xcm0, real xcm1, real xcm2, + size_t n) { - extern __shared__ real my_results_xx[]; - extern __shared__ real my_results_xy[]; - - unsigned int xx_i = threadIdx.x; - unsigned int 
xy_i = blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - real xx = 0; - real xy = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xx = diff[0] * diff[0] * m; - xy = diff[0] * diff[1] * m; - } - __syncthreads (); - - for (int z = 16; z <= 1; z++){ - xx += shfl( xx, z); - xy += shfl( xy, z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0){ - my_results_xx[threadIdx.x >> 5] = xx; - my_results_xy[threadIdx.x >> 5] = xy; - } - __syncthreads (); - - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - my_results_xx[ threadIdx.x ] += my_results_xx[ index ]; - my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 ] = my_results_xx [ 0 ]; - results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ]; - } + extern __shared__ real my_results_xx[]; + extern __shared__ real my_results_xy[]; + + unsigned int xx_i = threadIdx.x; + unsigned int xy_i = blockDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + real xx = 0; + real xy = 0; + + rvec diff, xcm; + real m = 0; + rvec_MakeZero (diff); + xcm[0] = xcm0; + xcm[1] = xcm1; + xcm[2] = xcm2; + + + if (i < n){ + m = sbp[ atoms[i].type ].mass; + rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); + xx = diff[0] * diff[0] * m; + xy = diff[0] * diff[1] * m; + } + __syncthreads (); + + for (int z = 16; z <= 1; z++){ + xx += shfl( xx, z); + xy += shfl( xy, z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0){ + my_results_xx[threadIdx.x >> 5] = xx; + my_results_xy[threadIdx.x >> 5] = xy; + } + __syncthreads (); + + for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ + if (threadIdx.x < offset){ + 
index = threadIdx.x + offset; + my_results_xx[ threadIdx.x ] += my_results_xx[ index ]; + my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) { + results [ blockIdx.x*6 ] = my_results_xx [ 0 ]; + results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ]; + } } CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) + reax_atom *atoms, + real *results, + real xcm0, real xcm1, real xcm2, + size_t n) { - extern __shared__ real my_results_xz[]; - extern __shared__ real my_results_yy[]; - - unsigned int yy_i = blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - real xz = 0; - real yy = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xz = diff[0] * diff[2] * m; - yy = diff[1] * diff[1] * m; - } - __syncthreads (); - - for (int z = 16; z <= 1; z++){ - xz += shfl( xz, z); - yy += shfl( yy, z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0){ - my_results_xz[threadIdx.x >> 5] = xz; - my_results_yy[threadIdx.x >> 5] = yy; - } - __syncthreads (); - - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - my_results_xz[ threadIdx.x ] += my_results_xz [ index ]; - my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ]; - results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ]; - } + extern __shared__ real my_results_xz[]; + extern __shared__ real my_results_yy[]; + + unsigned int yy_i = blockDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + real xz = 0; + 
real yy = 0; + + rvec diff, xcm; + real m = 0; + rvec_MakeZero (diff); + xcm[0] = xcm0; + xcm[1] = xcm1; + xcm[2] = xcm2; + + if (i < n){ + m = sbp[ atoms[i].type ].mass; + rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); + xz = diff[0] * diff[2] * m; + yy = diff[1] * diff[1] * m; + } + __syncthreads (); + + for (int z = 16; z <= 1; z++){ + xz += shfl( xz, z); + yy += shfl( yy, z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0){ + my_results_xz[threadIdx.x >> 5] = xz; + my_results_yy[threadIdx.x >> 5] = yy; + } + __syncthreads (); + + for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ + if (threadIdx.x < offset){ + index = threadIdx.x + offset; + my_results_xz[ threadIdx.x ] += my_results_xz [ index ]; + my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) { + results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ]; + results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ]; + } } CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) + reax_atom *atoms, + real *results, + real xcm0, real xcm1, real xcm2, + size_t n) { - extern __shared__ real my_results_yz[]; - extern __shared__ real my_results_zz[]; - - unsigned int zz_i = blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - real yz = 0; - real zz = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - yz = diff[1] * diff[2] * m; - zz = diff[2] * diff[2] * m; - } - __syncthreads (); - - for (int z = 16; z <= 1; z++){ - yz += shfl( yz, z); - zz += shfl( zz, z); - } - __syncthreads (); - - if (threadIdx.x % 32 == 0){ - my_results_yz[threadIdx.x >> 5] = yz; - my_results_zz[threadIdx.x >> 5] = zz; - } - __syncthreads (); - - 
for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - my_results_yz[ threadIdx.x ] += my_results_yz [ index ]; - my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ]; - results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ]; - } + extern __shared__ real my_results_yz[]; + extern __shared__ real my_results_zz[]; + + unsigned int zz_i = blockDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + real yz = 0; + real zz = 0; + + rvec diff, xcm; + real m = 0; + rvec_MakeZero (diff); + xcm[0] = xcm0; + xcm[1] = xcm1; + xcm[2] = xcm2; + + + if (i < n){ + m = sbp[ atoms[i].type ].mass; + rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); + yz = diff[1] * diff[2] * m; + zz = diff[2] * diff[2] * m; + } + __syncthreads (); + + for (int z = 16; z <= 1; z++){ + yz += shfl( yz, z); + zz += shfl( zz, z); + } + __syncthreads (); + + if (threadIdx.x % 32 == 0){ + my_results_yz[threadIdx.x >> 5] = yz; + my_results_zz[threadIdx.x >> 5] = zz; + } + __syncthreads (); + + for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ + if (threadIdx.x < offset){ + index = threadIdx.x + offset; + my_results_yz[ threadIdx.x ] += my_results_yz [ index ]; + my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) { + results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ]; + results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ]; + } } #endif diff --git a/PG-PuReMD/src/cuda_bond_orders.cu b/PG-PuReMD/src/cuda_bond_orders.cu index 3a208f44..05257c94 100644 --- a/PG-PuReMD/src/cuda_bond_orders.cu +++ b/PG-PuReMD/src/cuda_bond_orders.cu @@ -8,813 +8,813 @@ #include "reduction.h" CUDA_GLOBAL void Cuda_Calculate_BO_init ( reax_atom *my_atoms, - single_body_parameters *sbp, - storage p_workspace, - 
int N ) + single_body_parameters *sbp, + storage p_workspace, + int N ) { - int i, type_i; - single_body_parameters *sbp_i; + int i, type_i; + single_body_parameters *sbp_i; - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; - storage *workspace = & (p_workspace); + storage *workspace = & (p_workspace); - /* Calculate Deltaprime, Deltaprime_boc values */ - type_i = my_atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; - workspace->Deltap_boc[i] = - workspace->total_bond_order[i] - sbp_i->valency_val; - workspace->total_bond_order[i] = 0; + /* Calculate Deltaprime, Deltaprime_boc values */ + type_i = my_atoms[i].type; + sbp_i = &(sbp[type_i]); + workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; + workspace->Deltap_boc[i] = + workspace->total_bond_order[i] - sbp_i->valency_val; + workspace->total_bond_order[i] = 0; } CUDA_GLOBAL void Cuda_Calculate_BO ( reax_atom *my_atoms, global_parameters gp, - single_body_parameters *sbp, two_body_parameters *tbp, - storage p_workspace, reax_list p_bonds, - int num_atom_types, int N ) + single_body_parameters *sbp, two_body_parameters *tbp, + storage p_workspace, reax_list p_bonds, + int num_atom_types, int N ) { - int i, j, pj, type_i, type_j; - int start_i, end_i, sym_index, num_bonds; - real val_i, Deltap_i, Deltap_boc_i; - real val_j, Deltap_j, Deltap_boc_j; - real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; - real exp_p1i, exp_p2i, exp_p1j, exp_p2j; - real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; - real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji - real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; - real explp1, p_boc1, p_boc2; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_order_data *bo_ij, *bo_ji; - - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - storage *workspace = & (p_workspace); - reax_list 
*bonds = &(p_bonds); - - num_bonds = 0; - p_boc1 = gp.l[0]; - p_boc2 = gp.l[1]; - - /* Calculate Deltaprime, Deltaprime_boc values */ - /* - //for( i = 0; i < system->N; ++i ) { - type_i = my_atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; - workspace->Deltap_boc[i] = - workspace->total_bond_order[i] - sbp_i->valency_val; - - //fprintf( stdout, "%d(%d) %24.15f\n", - // i, workspace->bond_mark[i], workspace->total_bond_order[i] ); - workspace->total_bond_order[i] = 0; - //} - */ - - /* Corrected Bond Order calculations */ - //for( i = 0; i < system->N; ++i ) { - type_i = my_atoms[i].type; - sbp_i = &(sbp[type_i]); - val_i = sbp_i->valency; - Deltap_i = workspace->Deltap[i]; - Deltap_boc_i = workspace->Deltap_boc[i]; - start_i = Dev_Start_Index(i, bonds); - end_i = Dev_End_Index(i, bonds); - // fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", - // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); - for( pj = start_i; pj < end_i; ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = my_atoms[j].type; - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); - - //TODO - //TODO - //TODO - //TODO - //TODO - //if( i < j || workspace->bond_mark[j] > 3 ) { - if( i < j ) { - twbp = &( tbp[ index_tbp (type_i, type_j, num_atom_types)] ); + int i, j, pj, type_i, type_j; + int start_i, end_i, sym_index, num_bonds; + real val_i, Deltap_i, Deltap_boc_i; + real val_j, Deltap_j, Deltap_boc_j; + real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; + real exp_p1i, exp_p2i, exp_p1j, exp_p2j; + real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; + real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji + real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; + real explp1, p_boc1, p_boc2; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_order_data *bo_ij, *bo_ji; + + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + storage 
*workspace = & (p_workspace); + reax_list *bonds = &(p_bonds); + + num_bonds = 0; + p_boc1 = gp.l[0]; + p_boc2 = gp.l[1]; + + /* Calculate Deltaprime, Deltaprime_boc values */ + /* + //for( i = 0; i < system->N; ++i ) { + type_i = my_atoms[i].type; + sbp_i = &(sbp[type_i]); + workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; + workspace->Deltap_boc[i] = + workspace->total_bond_order[i] - sbp_i->valency_val; + + //fprintf( stdout, "%d(%d) %24.15f\n", + // i, workspace->bond_mark[i], workspace->total_bond_order[i] ); + workspace->total_bond_order[i] = 0; + //} + */ + + /* Corrected Bond Order calculations */ + //for( i = 0; i < system->N; ++i ) { + type_i = my_atoms[i].type; + sbp_i = &(sbp[type_i]); + val_i = sbp_i->valency; + Deltap_i = workspace->Deltap[i]; + Deltap_boc_i = workspace->Deltap_boc[i]; + start_i = Dev_Start_Index(i, bonds); + end_i = Dev_End_Index(i, bonds); + // fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", + // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = my_atoms[j].type; + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); + + //TODO + //TODO + //TODO + //TODO + //TODO + //if( i < j || workspace->bond_mark[j] > 3 ) { + if( i < j ) { + twbp = &( tbp[ index_tbp (type_i, type_j, num_atom_types)] ); #ifdef TEST_FORCES - Set_Start_Index( pj, top_dbo, dBOs ); - /* fprintf( stderr, "%6d%6d%12.6f%12.6f%12.6f\n", - workspace->reverse_map[i], workspace->reverse_map[j], - twbp->ovc, twbp->v13cor, bo_ij->BO ); */ + Set_Start_Index( pj, top_dbo, dBOs ); + /* fprintf( stderr, "%6d%6d%12.6f%12.6f%12.6f\n", + workspace->reverse_map[i], workspace->reverse_map[j], + twbp->ovc, twbp->v13cor, bo_ij->BO ); */ #endif - if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { - /* There is no correction to bond orders nor to derivatives - of bond order prime! 
So we leave bond orders unchanged and - set derivative of bond order coefficients such that - dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ - bo_ij->C1dbo = 1.000000; - bo_ij->C2dbo = 0.000000; - bo_ij->C3dbo = 0.000000; - - bo_ij->C1dbopi = bo_ij->BO_pi; - bo_ij->C2dbopi = 0.000000; - bo_ij->C3dbopi = 0.000000; - bo_ij->C4dbopi = 0.000000; - - bo_ij->C1dbopi2 = bo_ij->BO_pi2; - bo_ij->C2dbopi2 = 0.000000; - bo_ij->C3dbopi2 = 0.000000; - bo_ij->C4dbopi2 = 0.000000; + if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { + /* There is no correction to bond orders nor to derivatives + of bond order prime! So we leave bond orders unchanged and + set derivative of bond order coefficients such that + dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ + bo_ij->C1dbo = 1.000000; + bo_ij->C2dbo = 0.000000; + bo_ij->C3dbo = 0.000000; + + bo_ij->C1dbopi = bo_ij->BO_pi; + bo_ij->C2dbopi = 0.000000; + bo_ij->C3dbopi = 0.000000; + bo_ij->C4dbopi = 0.000000; + + bo_ij->C1dbopi2 = bo_ij->BO_pi2; + bo_ij->C2dbopi2 = 0.000000; + bo_ij->C3dbopi2 = 0.000000; + bo_ij->C4dbopi2 = 0.000000; #ifdef TEST_FORCES - pdbo = &(dBOs->select.dbo_list[ top_dbo ]); - - // compute dBO_ij/dr_i - pdbo->wrt = i; - rvec_Copy( pdbo->dBO, bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2); - - // compute dBO_ij/dr_j - pdbo++; - pdbo->wrt = j; - rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi ); - rvec_Scale(pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2); - - top_dbo += 2; + pdbo = &(dBOs->select.dbo_list[ top_dbo ]); + + // compute dBO_ij/dr_i + pdbo->wrt = i; + rvec_Copy( pdbo->dBO, bo_ij->dBOp ); + rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); + rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2); + + // compute dBO_ij/dr_j + pdbo++; + pdbo->wrt = j; + rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp ); + rvec_Scale( pdbo->dBOpi, 
-bo_ij->BO_pi, bo_ij->dln_BOp_pi ); + rvec_Scale(pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2); + + top_dbo += 2; #endif - } - else { - val_j = sbp[type_j].valency; - Deltap_j = workspace->Deltap[j]; - Deltap_boc_j = workspace->Deltap_boc[j]; - - /* on page 1 */ - if( twbp->ovc >= 0.001 ) { - /* Correction for overcoordination */ - exp_p1i = EXP( -p_boc1 * Deltap_i ); - exp_p2i = EXP( -p_boc2 * Deltap_i ); - exp_p1j = EXP( -p_boc1 * Deltap_j ); - exp_p2j = EXP( -p_boc2 * Deltap_j ); - - f2 = exp_p1i + exp_p1j; - f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); - f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + - ( val_j + f2 )/( val_j + f2 + f3 ) ); - - - /*fprintf( stderr,"%d %d\t%g %g j:%g %g p_boc:%g %g\n" - "\tf:%g %g %g, exp:%g %g %g %g\n", - i+1, j+1, - val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2, - f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ - - /* Now come the derivates */ - /* Bond Order pages 5-7, derivative of f1 */ - temp = f2 + f3; - u1_ij = val_i + temp; - u1_ji = val_j + temp; - Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + - 1.0 / SQR( u1_ji )); - Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + - ( u1_ji - f3 ) / SQR( u1_ji )); - - //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + - // Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j ); - Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - - ((val_i+f2) / SQR(u1_ij)) * - ( -p_boc1 * exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) ) + - -p_boc1 * exp_p1i / u1_ji - - ((val_j+f2) / SQR(u1_ji)) * - ( -p_boc1 * exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) )); - - - Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + - Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); - - //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); - } - else { - /* No overcoordination correction! 
*/ - f1 = 1.0; - Cf1_ij = Cf1_ji = 0.0; - } - - if( twbp->v13cor >= 0.001 ) { - /* Correction for 1-3 bond orders */ - exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); - exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); - - f4 = 1. / (1. + exp_f4); - f5 = 1. / (1. + exp_f5); - f4f5 = f4 * f5; - - /* Bond Order pages 8-9, derivative of f4 and f5 */ - /*temp = twbp->p_boc5 - - twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); - u_ij = temp + twbp->p_boc3 * Deltap_boc_i; - u_ji = temp + twbp->p_boc3 * Deltap_boc_j; - Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; - Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ - Cf45_ij = -f4 * exp_f4; - Cf45_ji = -f5 * exp_f5; - } - else { - f4 = f5 = f4f5 = 1.0; - Cf45_ij = Cf45_ji = 0.0; - } - - /* Bond Order page 10, derivative of total bond order */ - A0_ij = f1 * f4f5; - A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * - (Cf45_ij + Cf45_ji); - A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; - A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; - A3_ij = A2_ij + Cf1_ij / f1; - A3_ji = A2_ji + Cf1_ji / f1; - - /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f" - "A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n", - bo_ij->BO, - A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/ - - - /* find corrected bond orders and their derivative coef */ - bo_ij->BO = bo_ij->BO * A0_ij; - bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; - bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; - bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - - bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; - bo_ij->C2dbo = bo_ij->BO * A2_ij; - bo_ij->C3dbo = bo_ij->BO * A2_ji; - - bo_ij->C1dbopi = f1*f1*f4*f5; - bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; - bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; - bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; - - bo_ij->C1dbopi2 = f1*f1*f4*f5; - bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; - bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; - bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji; - - //CHANGE ORIGINAL - } 
- //CHANGE ORIGINAL - - /* neglect bonds that are < 1e-10 */ - if( bo_ij->BO < 1e-10 ) - bo_ij->BO = 0.0; - if( bo_ij->BO_s < 1e-10 ) - bo_ij->BO_s = 0.0; - if( bo_ij->BO_pi < 1e-10 ) - bo_ij->BO_pi = 0.0; - if( bo_ij->BO_pi2 < 1e-10 ) - bo_ij->BO_pi2 = 0.0; - - workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO - - - /* fprintf( stderr, "%d %d\t%g %g %g %g\n" - "Cdbo:\t%g %g %g\n" - "Cdbopi:\t%g %g %g %g\n" - "Cdbopi2:%g %g %g %g\n\n", - i+1, j+1, - bonds->select.bond_list[ pj ].d, - bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, - bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, - bo_ij->C1dbopi, bo_ij->C2dbopi, - bo_ij->C3dbopi, bo_ij->C4dbopi, - bo_ij->C1dbopi2,bo_ij->C2dbopi2, - bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */ - - /* fprintf( stderr, "%d %d BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", - i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 );*/ + } + else { + val_j = sbp[type_j].valency; + Deltap_j = workspace->Deltap[j]; + Deltap_boc_j = workspace->Deltap_boc[j]; + + /* on page 1 */ + if( twbp->ovc >= 0.001 ) { + /* Correction for overcoordination */ + exp_p1i = EXP( -p_boc1 * Deltap_i ); + exp_p2i = EXP( -p_boc2 * Deltap_i ); + exp_p1j = EXP( -p_boc1 * Deltap_j ); + exp_p2j = EXP( -p_boc2 * Deltap_j ); + + f2 = exp_p1i + exp_p1j; + f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); + f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + + ( val_j + f2 )/( val_j + f2 + f3 ) ); + + + /*fprintf( stderr,"%d %d\t%g %g j:%g %g p_boc:%g %g\n" + "\tf:%g %g %g, exp:%g %g %g %g\n", + i+1, j+1, + val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2, + f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ + + /* Now come the derivates */ + /* Bond Order pages 5-7, derivative of f1 */ + temp = f2 + f3; + u1_ij = val_i + temp; + u1_ji = val_j + temp; + Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + + 1.0 / SQR( u1_ji )); + Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + + ( u1_ji - f3 ) / SQR( u1_ji )); + + //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + + // Cf1B_ij * exp_p2i / ( 
exp_p2i + exp_p2j ); + Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - + ((val_i+f2) / SQR(u1_ij)) * + ( -p_boc1 * exp_p1i + + exp_p2i / ( exp_p2i + exp_p2j ) ) + + -p_boc1 * exp_p1i / u1_ji - + ((val_j+f2) / SQR(u1_ji)) * + ( -p_boc1 * exp_p1i + + exp_p2i / ( exp_p2i + exp_p2j ) )); + + + Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + + Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); + + //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); + } + else { + /* No overcoordination correction! */ + f1 = 1.0; + Cf1_ij = Cf1_ji = 0.0; + } + + if( twbp->v13cor >= 0.001 ) { + /* Correction for 1-3 bond orders */ + exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - + Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); + exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - + Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); + + f4 = 1. / (1. + exp_f4); + f5 = 1. / (1. + exp_f5); + f4f5 = f4 * f5; + + /* Bond Order pages 8-9, derivative of f4 and f5 */ + /*temp = twbp->p_boc5 - + twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); + u_ij = temp + twbp->p_boc3 * Deltap_boc_i; + u_ji = temp + twbp->p_boc3 * Deltap_boc_j; + Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; + Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ + Cf45_ij = -f4 * exp_f4; + Cf45_ji = -f5 * exp_f5; + } + else { + f4 = f5 = f4f5 = 1.0; + Cf45_ij = Cf45_ji = 0.0; + } + + /* Bond Order page 10, derivative of total bond order */ + A0_ij = f1 * f4f5; + A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * + (Cf45_ij + Cf45_ji); + A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; + A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; + A3_ij = A2_ij + Cf1_ij / f1; + A3_ji = A2_ji + Cf1_ji / f1; + + /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f" + "A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n", + bo_ij->BO, + A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/ + + + /* find corrected bond orders and their derivative coef */ + bo_ij->BO = bo_ij->BO * A0_ij; + bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; + bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; + bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + 
bo_ij->BO_pi2 ); + + bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; + bo_ij->C2dbo = bo_ij->BO * A2_ij; + bo_ij->C3dbo = bo_ij->BO * A2_ji; + + bo_ij->C1dbopi = f1*f1*f4*f5; + bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; + bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; + bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; + + bo_ij->C1dbopi2 = f1*f1*f4*f5; + bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; + bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; + bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji; + + //CHANGE ORIGINAL + } + //CHANGE ORIGINAL + + /* neglect bonds that are < 1e-10 */ + if( bo_ij->BO < 1e-10 ) + bo_ij->BO = 0.0; + if( bo_ij->BO_s < 1e-10 ) + bo_ij->BO_s = 0.0; + if( bo_ij->BO_pi < 1e-10 ) + bo_ij->BO_pi = 0.0; + if( bo_ij->BO_pi2 < 1e-10 ) + bo_ij->BO_pi2 = 0.0; + + workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO + + + /* fprintf( stderr, "%d %d\t%g %g %g %g\n" + "Cdbo:\t%g %g %g\n" + "Cdbopi:\t%g %g %g %g\n" + "Cdbopi2:%g %g %g %g\n\n", + i+1, j+1, + bonds->select.bond_list[ pj ].d, + bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, + bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, + bo_ij->C1dbopi, bo_ij->C2dbopi, + bo_ij->C3dbopi, bo_ij->C4dbopi, + bo_ij->C1dbopi2,bo_ij->C2dbopi2, + bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */ + + /* fprintf( stderr, "%d %d BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", + i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 );*/ #ifdef TEST_FORCES - Set_End_Index( pj, top_dbo, dBOs ); - Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); + Set_End_Index( pj, top_dbo, dBOs ); + Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); #endif - //CHANGE ORIGINAL - //} - //CHANGE ORIGINAL - /* - else { - // We only need to update bond orders from bo_ji - // everything else is set in uncorrected_bo calculations - sym_index = bonds->select.bond_list[pj].sym_index; - bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; - bo_ij->BO_pi2 = bo_ji->BO_pi2; - - 
workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO + //CHANGE ORIGINAL + //} + //CHANGE ORIGINAL + /* + else { + // We only need to update bond orders from bo_ji + // everything else is set in uncorrected_bo calculations + sym_index = bonds->select.bond_list[pj].sym_index; + bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data); + bo_ij->BO = bo_ji->BO; + bo_ij->BO_s = bo_ji->BO_s; + bo_ij->BO_pi = bo_ji->BO_pi; + bo_ij->BO_pi2 = bo_ji->BO_pi2; + + workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO #ifdef TEST_FORCES Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta ); #endif } - */ - } + */ + } } //} COMMENTED FOR CUDA KERNEL } CUDA_GLOBAL void Cuda_Update_Uncorrected_BO ( storage p_workspace, reax_list p_bonds, int N ) { - int i, j, pj; - int start_i, end_i; - int sym_index; - storage *workspace = &( p_workspace ); - reax_list *bonds = &( p_bonds ); - - bond_order_data *bo_ij, *bo_ji; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - start_i = Dev_Start_Index(i, bonds); - end_i = Dev_End_Index(i, bonds); - - for( pj = start_i; pj < end_i; ++pj ) { - - j = bonds->select.bond_list[pj].nbr; - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - - //if( (i >= j) || (workspace->bond_mark [i] <= 3)) { - if( (i >= j) ) { - - /* We only need to update bond orders from bo_ji - everything else is set in uncorrected_bo calculations */ - sym_index = bonds->select.bond_list[pj].sym_index; - bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; - bo_ij->BO_pi2 = bo_ji->BO_pi2; - - workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO - } - } - } - - CUDA_GLOBAL void Cuda_Update_Workspace_After_BO ( reax_atom *my_atoms, global_parameters gp, - single_body_parameters *sbp, storage p_workspace, - int N) - { - int j, type_j; - real explp1; - real p_lp1; - single_body_parameters *sbp_i, *sbp_j; - storage *workspace = &( 
p_workspace ); - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - p_lp1 = gp.l[15]; - /* Calculate some helper variables that are used at many places - throughout force calculations */ - //for( j = 0; j < system->N; ++j ){ - type_j = my_atoms[j].type; - sbp_j = &(sbp[ type_j ]); - - workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency; - workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e; - workspace->Delta_boc[j] = workspace->total_bond_order[j] - - sbp_j->valency_boc; - - workspace->vlpex[j] = workspace->Delta_e[j] - - 2.0 * (int)(workspace->Delta_e[j]/2.0); - explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j])); - workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0); - workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j]; - workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]); - /* Adri uses different dDelta_lp values than the ones in notes... */ - workspace->dDelta_lp[j] = workspace->Clp[j]; - //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * - //((fabs(workspace->Delta_e[j]/2.0 - - // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 
1 : 0 ); - - if( sbp_j->mass > 21.0 ) { - workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); - workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; - workspace->dDelta_lp_temp[j] = 0.; - } - else { - workspace->nlp_temp[j] = workspace->nlp[j]; - workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; - workspace->dDelta_lp_temp[j] = workspace->Clp[j]; - } - //} Commented for Cuda - } - - - CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data, - storage *workspace, reax_list *bonds, rvec data_ext_press) - { - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - rvec temp, ext_press; - ivec rel_box; - int pk, k, j; - rvec tf_f; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - - //bo_ij = &(nbr_j->bo_data); - //bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - if (i < j) { - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } else { - bo_ji = &(nbr_j->bo_data); - bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = 
bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - - /************************************ - * forces related to atom i * - * first neighbors of atom i * - ************************************/ - if (i < j) { - for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_MakeZero (nbr_k->tf_f); - - rvec_Scale(temp, -coef.C2dbo, nbr_k->bo_data.dBOp); /*2nd, dBO*/ - rvec_ScaledAdd(temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);/*dDelta*/ - rvec_ScaledAdd(temp, -coef.C3dbopi, nbr_k->bo_data.dBOp); /*3rd, dBOpi*/ - rvec_ScaledAdd(temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);/*3rd, dBOpi2*/ - - /* force */ - rvec_Add( nbr_k->tf_f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); - rvec_Add( data_ext_press, ext_press ); - - /* if( !ivec_isZero( nbr_k->rel_box ) ) - fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" - "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", - i+1, system->my_atoms[i].x[0], - system->my_atoms[i].x[1], system->my_atoms[i].x[2], - j+1, k+1, system->my_atoms[k].x[0], - system->my_atoms[k].x[1], system->my_atoms[k].x[2], - nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], - nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2], - temp[0], temp[1], temp[2] ); */ - } - - /* then atom i itself */ - rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st,dBO*/ - rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd,dBO*/ - rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st,dBO*/ - rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/ - rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/ - - rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); 
/*1st,dBO_pi2*/ - rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp ); /*2nd,dBO_pi2*/ - rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/ - - /* force */ - rvec_Add( workspace->f[i], temp ); - /* ext pressure due to i is dropped, counting force on j will be enough */ - } - else { - - /****************************************************** - * forces and pressure related to atom j * - * first neighbors of atom j * - ******************************************************/ - for( pk = Dev_Start_Index(j, bonds); pk < Dev_End_Index(j, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_MakeZero (nbr_k->tf_f); - - rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ - rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/ - - /* force */ - rvec_Add( nbr_k->tf_f, temp ); - /* pressure */ - if( k != i ) { - ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i) - rvec_iMultiply( ext_press, rel_box, temp ); - rvec_Add( data_ext_press, ext_press ); - - /* if( !ivec_isZero( rel_box ) ) - fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" - "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", - i+1, j+1, system->my_atoms[j].x[0], - system->my_atoms[j].x[1], system->my_atoms[j].x[2], - k+1, system->my_atoms[k].x[0], - system->my_atoms[k].x[1], system->my_atoms[k].x[2], - nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], - rel_box[0], rel_box[1], rel_box[2], - temp[0], temp[1], temp[2] ); */ - } - } - - /* then atom j itself */ - rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ - rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/ - - rvec_ScaledAdd( 
temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/ - - rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); /*1st,dBOpi2*/ - rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp ); /*2nd,dBOpi2*/ - rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( workspace->f[j], temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); - rvec_Add( data->my_ext_press, ext_press ); - - /* if( !ivec_isZero( nbr_j->rel_box ) ) - fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" - "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", - i+1, system->my_atoms[i].x[0], system->my_atoms[i].x[1], - system->my_atoms[i].x[2], - j+1,system->my_atoms[j].x[0], system->my_atoms[j].x[1], - system->my_atoms[j].x[2], - j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2], - nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2], - temp[0], temp[1], temp[2] ); */ - } - } - - CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, - storage *workspace, reax_list *bonds ) - { - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - int pk, k, j; - - rvec tf_f; - rvec_MakeZero (tf_f); - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - //bo_ij = &(nbr_j->bo_data); - //bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - - if (i < j) { - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } else { - bo_ji = &(nbr_j->bo_data); - bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - 
coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - if (i < j) { - for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - rvec_MakeZero (tf_f); - - /*2nd,dBO*/ - rvec_ScaledAdd( tf_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( tf_f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi*/ - rvec_ScaledAdd( tf_f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi2*/ - rvec_ScaledAdd( tf_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); - - //Temp storage - rvec_Add (nbr_k->tf_f, tf_f); - } - /*1st, dBO*/ - rvec_ScaledAdd( workspace->f[i], coef.C1dbo, bo_ij->dBOp ); - /*2nd, dBO*/ - rvec_ScaledAdd( workspace->f[i], coef.C2dbo, workspace->dDeltap_self[i] ); - - /*1st, dBO*/ - rvec_ScaledAdd( workspace->f[i], coef.C1dDelta, bo_ij->dBOp ); - /*2nd, dBO*/ - rvec_ScaledAdd( workspace->f[i], coef.C2dDelta, workspace->dDeltap_self[i] ); - - /*1st, dBOpi*/ - rvec_ScaledAdd( workspace->f[i], coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*2nd, dBOpi*/ - rvec_ScaledAdd( workspace->f[i], coef.C2dbopi, bo_ij->dBOp ); - /*3rd, dBOpi*/ - rvec_ScaledAdd( workspace->f[i], coef.C3dbopi, workspace->dDeltap_self[i] ); - - /*1st, dBO_pi2*/ - rvec_ScaledAdd( workspace->f[i], coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*2nd, dBO_pi2*/ - 
rvec_ScaledAdd( workspace->f[i], coef.C2dbopi2, bo_ij->dBOp ); - /*3rd, dBO_pi2*/ - rvec_ScaledAdd( workspace->f[i], coef.C3dbopi2, workspace->dDeltap_self[i] ); - - } else { - - for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - rvec_MakeZero (tf_f); - - /*3rd, dBO*/ - rvec_ScaledAdd( tf_f, -coef.C3dbo, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( tf_f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); - /*4th, dBOpi*/ - rvec_ScaledAdd( tf_f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); - /*4th, dBOpi2*/ - rvec_ScaledAdd( tf_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); - - //Temp Storage - rvec_Add (nbr_k->tf_f, tf_f); - } - - /*1st,dBO*/ - rvec_ScaledAdd( workspace->f[i], -coef.C1dbo, bo_ij->dBOp ); - /*2nd,dBO*/ - rvec_ScaledAdd( workspace->f[i], coef.C3dbo, workspace->dDeltap_self[i] ); - - /*1st, dBO*/ - rvec_ScaledAdd( workspace->f[i], -coef.C1dDelta, bo_ij->dBOp ); - /*2nd, dBO*/ - rvec_ScaledAdd( workspace->f[i], coef.C3dDelta, workspace->dDeltap_self[i] ); - - /*1st, dBOpi*/ - rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*2nd, dBOpi*/ - rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi, bo_ij->dBOp ); - /*3rd, dBOpi*/ - rvec_ScaledAdd( workspace->f[i], coef.C4dbopi, workspace->dDeltap_self[i] ); - - /*1st, dBOpi2*/ - rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*2nd, dBOpi2*/ - rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi2, bo_ij->dBOp ); - /*3rd, dBOpi2*/ - rvec_ScaledAdd( workspace->f[i], coef.C4dbopi2, workspace->dDeltap_self[i] ); - } - } - - CUDA_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, reax_list *bonds, storage *workspace) - { - int pk; - bond_data *nbr_k, *nbr_k_sym; - - for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] ); - - //rvec_Add (atoms[i].f, 
nbr_k_sym->tf_f); - rvec_Add (workspace->f[i], nbr_k_sym->tf_f); - } - } - - CUDA_GLOBAL void ker_total_forces_postprocess (reax_atom *my_atoms, reax_list p_bonds, storage p_workspace, int N) - { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - reax_list *bonds = &( p_bonds ); - storage *workspace = &( p_workspace ); - Cuda_dbond_to_Forces_postprocess (i, my_atoms, bonds, workspace ); - } - - CUDA_GLOBAL void ker_total_forces (storage p_workspace, reax_list p_bonds, - control_params *control, - simulation_data *data, - rvec *data_ext_press, - int N ) - { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - int pj; - reax_list *bonds = &( p_bonds ); - storage *workspace = &( p_workspace ); - - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) - //if( i < bonds->select.bond_list[pj].nbr ) { - if( control->virial == 0 ) - Cuda_Add_dBond_to_Forces( i, pj, workspace, bonds); - else - Cuda_Add_dBond_to_Forces_NPT( i, pj, data, workspace, bonds, data_ext_press [i]); - //} - } - - void Cuda_Total_Forces (reax_system *system, control_params *control, - simulation_data *data, storage *workspace) - { - int blocks; - rvec *spad_rvec = (rvec *) scratch; - cuda_memset (spad_rvec, 0, system->N * 2 * sizeof (rvec), "total_forces:ext_press"); - - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - ker_total_forces <<< blocks, DEF_BLOCK_SIZE >>> - ( *dev_workspace, *(*dev_lists + BONDS), - (control_params *) control->d_control_params, - (simulation_data *)data->d_simulation_data, - spad_rvec, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - if (control->virial != 0) - { - //do the reduction here for ext press - k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> - ( spad_rvec, spad_rvec + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>> - ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks); - cudaThreadSynchronize (); - cudaCheckError (); - } - - //do the post processing for the atomic forces here - ker_total_forces_postprocess <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, *(*dev_lists + BONDS), *dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - } - - CUDA_GLOBAL void ker_total_forces_pure (reax_atom *my_atoms, int n, - storage p_workspace) - { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - storage *workspace = &( p_workspace ); - - rvec_Copy (my_atoms[i].f, workspace->f[i]); - } - - void Cuda_Total_Forces_PURE (reax_system *system, storage *workspace) - { - int blocks; - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - ker_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->n, *dev_workspace); - cudaThreadSynchronize (); - cudaCheckError (); - } + int i, j, pj; + int start_i, end_i; + int sym_index; + storage *workspace = &( p_workspace ); + reax_list *bonds = &( p_bonds ); + + bond_order_data *bo_ij, *bo_ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + start_i = Dev_Start_Index(i, bonds); + end_i = Dev_End_Index(i, bonds); + + for( pj = start_i; pj < end_i; ++pj ) { + + j = bonds->select.bond_list[pj].nbr; + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + + //if( (i >= j) || (workspace->bond_mark [i] <= 3)) { + if( (i >= j) ) { + + /* We only need to update bond orders from bo_ji + everything else is set in uncorrected_bo calculations */ + sym_index = bonds->select.bond_list[pj].sym_index; + bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data); + bo_ij->BO = bo_ji->BO; + bo_ij->BO_s = bo_ji->BO_s; + bo_ij->BO_pi = bo_ji->BO_pi; + bo_ij->BO_pi2 = bo_ji->BO_pi2; + + workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO + } + } + } + + CUDA_GLOBAL void Cuda_Update_Workspace_After_BO ( reax_atom *my_atoms, global_parameters gp, + single_body_parameters *sbp, storage p_workspace, + int N) + { + int j, type_j; + real explp1; + real p_lp1; + single_body_parameters *sbp_i, *sbp_j; + storage *workspace = &( p_workspace ); + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + p_lp1 = gp.l[15]; + /* Calculate some helper variables that are used at many places + throughout force calculations */ + //for( j = 0; j < system->N; ++j ){ + type_j = my_atoms[j].type; + sbp_j = &(sbp[ type_j ]); + + workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency; + workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e; + workspace->Delta_boc[j] = workspace->total_bond_order[j] - + sbp_j->valency_boc; + + workspace->vlpex[j] = workspace->Delta_e[j] - + 2.0 * 
(int)(workspace->Delta_e[j]/2.0); + explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j])); + workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0); + workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j]; + workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]); + /* Adri uses different dDelta_lp values than the ones in notes... */ + workspace->dDelta_lp[j] = workspace->Clp[j]; + //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * + //((fabs(workspace->Delta_e[j]/2.0 - + // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 ); + + if( sbp_j->mass > 21.0 ) { + workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); + workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; + workspace->dDelta_lp_temp[j] = 0.; + } + else { + workspace->nlp_temp[j] = workspace->nlp[j]; + workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; + workspace->dDelta_lp_temp[j] = workspace->Clp[j]; + } + //} Commented for Cuda + } + + + CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data, + storage *workspace, reax_list *bonds, rvec data_ext_press) + { + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + rvec temp, ext_press; + ivec rel_box; + int pk, k, j; + rvec tf_f; + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + + //bo_ij = &(nbr_j->bo_data); + //bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + if (i < j) { + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } else { + bo_ji = &(nbr_j->bo_data); + bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = 
bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + + /************************************ + * forces related to atom i * + * first neighbors of atom i * + ************************************/ + if (i < j) { + for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_MakeZero (nbr_k->tf_f); + + rvec_Scale(temp, -coef.C2dbo, nbr_k->bo_data.dBOp); /*2nd, dBO*/ + rvec_ScaledAdd(temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);/*dDelta*/ + rvec_ScaledAdd(temp, -coef.C3dbopi, nbr_k->bo_data.dBOp); /*3rd, dBOpi*/ + rvec_ScaledAdd(temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);/*3rd, dBOpi2*/ + + /* force */ + rvec_Add( nbr_k->tf_f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); + rvec_Add( data_ext_press, ext_press ); + + /* if( !ivec_isZero( nbr_k->rel_box ) ) + fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" + "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", + i+1, system->my_atoms[i].x[0], + system->my_atoms[i].x[1], system->my_atoms[i].x[2], + j+1, k+1, system->my_atoms[k].x[0], + system->my_atoms[k].x[1], system->my_atoms[k].x[2], + nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], + nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2], + temp[0], temp[1], temp[2] ); */ + } + + /* then atom i 
itself */ + rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st,dBO*/ + rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd,dBO*/ + rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st,dBO*/ + rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/ + rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/ + + rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); /*1st,dBO_pi2*/ + rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp ); /*2nd,dBO_pi2*/ + rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/ + + /* force */ + rvec_Add( workspace->f[i], temp ); + /* ext pressure due to i is dropped, counting force on j will be enough */ + } + else { + + /****************************************************** + * forces and pressure related to atom j * + * first neighbors of atom j * + ******************************************************/ + for( pk = Dev_Start_Index(j, bonds); pk < Dev_End_Index(j, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_MakeZero (nbr_k->tf_f); + + rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ + rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/ + + /* force */ + rvec_Add( nbr_k->tf_f, temp ); + /* pressure */ + if( k != i ) { + ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i) + rvec_iMultiply( ext_press, rel_box, temp ); + rvec_Add( data_ext_press, ext_press ); + + /* if( !ivec_isZero( rel_box ) ) + fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" + "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", + i+1, j+1, system->my_atoms[j].x[0], + system->my_atoms[j].x[1], 
system->my_atoms[j].x[2], + k+1, system->my_atoms[k].x[0], + system->my_atoms[k].x[1], system->my_atoms[k].x[2], + nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], + rel_box[0], rel_box[1], rel_box[2], + temp[0], temp[1], temp[2] ); */ + } + } + + /* then atom j itself */ + rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ + rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/ + + rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); /*1st,dBOpi2*/ + rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp ); /*2nd,dBOpi2*/ + rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( workspace->f[j], temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); + rvec_Add( data->my_ext_press, ext_press ); + + /* if( !ivec_isZero( nbr_j->rel_box ) ) + fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" + "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", + i+1, system->my_atoms[i].x[0], system->my_atoms[i].x[1], + system->my_atoms[i].x[2], + j+1,system->my_atoms[j].x[0], system->my_atoms[j].x[1], + system->my_atoms[j].x[2], + j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2], + nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2], + temp[0], temp[1], temp[2] ); */ + } + } + + CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj, + storage *workspace, reax_list *bonds ) + { + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + int pk, k, j; + + rvec tf_f; + rvec_MakeZero (tf_f); + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; 
+ //bo_ij = &(nbr_j->bo_data); + //bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + + if (i < j) { + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } else { + bo_ji = &(nbr_j->bo_data); + bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + if (i < j) { + for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + rvec_MakeZero (tf_f); + + /*2nd,dBO*/ + rvec_ScaledAdd( tf_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( tf_f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi*/ + rvec_ScaledAdd( tf_f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi2*/ + rvec_ScaledAdd( tf_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); + + //Temp storage + rvec_Add (nbr_k->tf_f, tf_f); + } + /*1st, dBO*/ + rvec_ScaledAdd( workspace->f[i], coef.C1dbo, bo_ij->dBOp ); + /*2nd, dBO*/ + rvec_ScaledAdd( workspace->f[i], coef.C2dbo, 
workspace->dDeltap_self[i] ); + + /*1st, dBO*/ + rvec_ScaledAdd( workspace->f[i], coef.C1dDelta, bo_ij->dBOp ); + /*2nd, dBO*/ + rvec_ScaledAdd( workspace->f[i], coef.C2dDelta, workspace->dDeltap_self[i] ); + + /*1st, dBOpi*/ + rvec_ScaledAdd( workspace->f[i], coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*2nd, dBOpi*/ + rvec_ScaledAdd( workspace->f[i], coef.C2dbopi, bo_ij->dBOp ); + /*3rd, dBOpi*/ + rvec_ScaledAdd( workspace->f[i], coef.C3dbopi, workspace->dDeltap_self[i] ); + + /*1st, dBO_pi2*/ + rvec_ScaledAdd( workspace->f[i], coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*2nd, dBO_pi2*/ + rvec_ScaledAdd( workspace->f[i], coef.C2dbopi2, bo_ij->dBOp ); + /*3rd, dBO_pi2*/ + rvec_ScaledAdd( workspace->f[i], coef.C3dbopi2, workspace->dDeltap_self[i] ); + + } else { + + for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + rvec_MakeZero (tf_f); + + /*3rd, dBO*/ + rvec_ScaledAdd( tf_f, -coef.C3dbo, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( tf_f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); + /*4th, dBOpi*/ + rvec_ScaledAdd( tf_f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); + /*4th, dBOpi2*/ + rvec_ScaledAdd( tf_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); + + //Temp Storage + rvec_Add (nbr_k->tf_f, tf_f); + } + + /*1st,dBO*/ + rvec_ScaledAdd( workspace->f[i], -coef.C1dbo, bo_ij->dBOp ); + /*2nd,dBO*/ + rvec_ScaledAdd( workspace->f[i], coef.C3dbo, workspace->dDeltap_self[i] ); + + /*1st, dBO*/ + rvec_ScaledAdd( workspace->f[i], -coef.C1dDelta, bo_ij->dBOp ); + /*2nd, dBO*/ + rvec_ScaledAdd( workspace->f[i], coef.C3dDelta, workspace->dDeltap_self[i] ); + + /*1st, dBOpi*/ + rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*2nd, dBOpi*/ + rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi, bo_ij->dBOp ); + /*3rd, dBOpi*/ + rvec_ScaledAdd( workspace->f[i], coef.C4dbopi, workspace->dDeltap_self[i] ); + + /*1st, dBOpi2*/ + rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi2, bo_ij->dln_BOp_pi2 
); + /*2nd, dBOpi2*/ + rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi2, bo_ij->dBOp ); + /*3rd, dBOpi2*/ + rvec_ScaledAdd( workspace->f[i], coef.C4dbopi2, workspace->dDeltap_self[i] ); + } + } + + CUDA_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, reax_list *bonds, storage *workspace) + { + int pk; + bond_data *nbr_k, *nbr_k_sym; + + for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] ); + + //rvec_Add (atoms[i].f, nbr_k_sym->tf_f); + rvec_Add (workspace->f[i], nbr_k_sym->tf_f); + } + } + + CUDA_GLOBAL void ker_total_forces_postprocess (reax_atom *my_atoms, reax_list p_bonds, storage p_workspace, int N) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + reax_list *bonds = &( p_bonds ); + storage *workspace = &( p_workspace ); + Cuda_dbond_to_Forces_postprocess (i, my_atoms, bonds, workspace ); + } + + CUDA_GLOBAL void ker_total_forces (storage p_workspace, reax_list p_bonds, + control_params *control, + simulation_data *data, + rvec *data_ext_press, + int N ) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + int pj; + reax_list *bonds = &( p_bonds ); + storage *workspace = &( p_workspace ); + + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) + //if( i < bonds->select.bond_list[pj].nbr ) { + if( control->virial == 0 ) + Cuda_Add_dBond_to_Forces( i, pj, workspace, bonds); + else + Cuda_Add_dBond_to_Forces_NPT( i, pj, data, workspace, bonds, data_ext_press [i]); + //} + } + + void Cuda_Total_Forces (reax_system *system, control_params *control, + simulation_data *data, storage *workspace) + { + int blocks; + rvec *spad_rvec = (rvec *) scratch; + cuda_memset (spad_rvec, 0, system->N * 2 * sizeof (rvec), "total_forces:ext_press"); + + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + ker_total_forces <<< blocks, DEF_BLOCK_SIZE >>> + ( *dev_workspace, *(*dev_lists + BONDS), + (control_params *) control->d_control_params, + (simulation_data *)data->d_simulation_data, + spad_rvec, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + if (control->virial != 0) + { + //do the reduction here for ext press + k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> + ( spad_rvec, spad_rvec + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>> + ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks); + cudaThreadSynchronize (); + cudaCheckError (); + } + + //do the post processing for the atomic forces here + ker_total_forces_postprocess <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, *(*dev_lists + BONDS), *dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + } + + CUDA_GLOBAL void ker_total_forces_pure (reax_atom *my_atoms, int n, + storage p_workspace) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + storage *workspace = &( p_workspace ); + + rvec_Copy (my_atoms[i].f, workspace->f[i]); + } + + void Cuda_Total_Forces_PURE (reax_system *system, storage *workspace) + { + int blocks; + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + ker_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->n, *dev_workspace); + cudaThreadSynchronize (); + cudaCheckError (); + } diff --git a/PG-PuReMD/src/cuda_bonds.cu b/PG-PuReMD/src/cuda_bonds.cu index bce63602..90f1480b 100644 --- a/PG-PuReMD/src/cuda_bonds.cu +++ b/PG-PuReMD/src/cuda_bonds.cu @@ -26,124 +26,124 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, - global_parameters gp, - single_body_parameters *sbp, - two_body_parameters *tbp, - storage p_workspace, - reax_list p_bonds, - int n, int num_atom_types, - real *e_bond - ) + global_parameters gp, + single_body_parameters *sbp, + two_body_parameters *tbp, + storage p_workspace, + reax_list p_bonds, + int n, int num_atom_types, + real *e_bond + ) { - int i, j, pj, natoms; - int start_i, end_i; - int type_i, type_j; - real ebond, pow_BOs_be2, exp_be12, CEbo; - real gp3, gp4, gp7, gp10, gp37; - real exphu, exphua1, exphub1, exphuov, hulpov, estriph; - real decobdbo, decobdboua, decobdboub; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_order_data *bo_ij; - reax_list *bonds; - storage *workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - bonds = &( p_bonds); - workspace = &( p_workspace ); - gp3 = gp.l[3]; - gp4 = gp.l[4]; - gp7 = gp.l[7]; - gp10 = gp.l[10]; - gp37 = (int) gp.l[37]; - - //for( i = 0; i < natoms; ++i ) { - start_i = Dev_Start_Index(i, bonds); - end_i = Dev_End_Index(i, bonds); - - for( pj = start_i; pj < end_i; ++pj ) { - j = bonds->select.bond_list[pj].nbr; - - if( my_atoms[i].orig_id <= my_atoms[j].orig_id ) { - /* set the pointers */ - type_i = my_atoms[i].type; - type_j = my_atoms[j].type; - sbp_i = &( sbp[type_i] ); - sbp_j = &( sbp[type_j] ); - - twbp = &( tbp[ index_tbp (type_i,type_j, num_atom_types) ] ); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - - /* calculate the constants */ - pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); - exp_be12 = EXP( twbp->p_be1 * ( 1.0 - 
pow_BOs_be2 ) ); - CEbo = -twbp->De_s * exp_be12 * - ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); - - /* calculate the Bond Energy */ - e_bond[ i ] += ebond = - -twbp->De_s * bo_ij->BO_s * exp_be12 - -twbp->De_p * bo_ij->BO_pi - -twbp->De_pp * bo_ij->BO_pi2; - - /* calculate derivatives of Bond Orders */ - bo_ij->Cdbo += CEbo; - bo_ij->Cdbopi -= (CEbo + twbp->De_p); - bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); + int i, j, pj, natoms; + int start_i, end_i; + int type_i, type_j; + real ebond, pow_BOs_be2, exp_be12, CEbo; + real gp3, gp4, gp7, gp10, gp37; + real exphu, exphua1, exphub1, exphuov, hulpov, estriph; + real decobdbo, decobdboua, decobdboub; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_order_data *bo_ij; + reax_list *bonds; + storage *workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + bonds = &( p_bonds); + workspace = &( p_workspace ); + gp3 = gp.l[3]; + gp4 = gp.l[4]; + gp7 = gp.l[7]; + gp10 = gp.l[10]; + gp37 = (int) gp.l[37]; + + //for( i = 0; i < natoms; ++i ) { + start_i = Dev_Start_Index(i, bonds); + end_i = Dev_End_Index(i, bonds); + + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds->select.bond_list[pj].nbr; + + if( my_atoms[i].orig_id <= my_atoms[j].orig_id ) { + /* set the pointers */ + type_i = my_atoms[i].type; + type_j = my_atoms[j].type; + sbp_i = &( sbp[type_i] ); + sbp_j = &( sbp[type_j] ); + + twbp = &( tbp[ index_tbp (type_i,type_j, num_atom_types) ] ); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + + /* calculate the constants */ + pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); + exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); + CEbo = -twbp->De_s * exp_be12 * + ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); + + /* calculate the Bond Energy */ + e_bond[ i ] += ebond = + -twbp->De_s * bo_ij->BO_s * exp_be12 + -twbp->De_p * bo_ij->BO_pi + -twbp->De_pp * bo_ij->BO_pi2; + + /* calculate derivatives of Bond Orders */ + bo_ij->Cdbo += CEbo; + 
bo_ij->Cdbopi -= (CEbo + twbp->De_p); + bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); #ifdef TEST_ENERGY - //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->ebond, "%6d%6d%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - bo_ij->BO, ebond, data->my_en.e_bond ); + //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e%24.15e\n", + fprintf( out_control->ebond, "%6d%6d%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, + system->my_atoms[j].orig_id, + bo_ij->BO, ebond, data->my_en.e_bond ); #endif #ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); - Add_dBOpinpi2( system, lists, i, pj, - -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), - workspace->f_be, workspace->f_be ); + Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); + Add_dBOpinpi2( system, lists, i, pj, + -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), + workspace->f_be, workspace->f_be ); #endif - /* Stabilisation terminal triple bond */ - if( bo_ij->BO >= 1.00 ) { - if( gp37 == 2 || - (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || - (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { - exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); - exphua1 = EXP(-gp3 * (workspace->total_bond_order[i]-bo_ij->BO)); - exphub1 = EXP(-gp3 * (workspace->total_bond_order[j]-bo_ij->BO)); - exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j])); - hulpov = 1.0 / (1.0 + 25.0 * exphuov); - - estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); - e_bond [i] += estriph; - - decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * - ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); - decobdboua = -gp10 * exphu * hulpov * - (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - decobdboub = -gp10 * exphu * hulpov * - (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - - bo_ij->Cdbo += decobdbo; - workspace->CdDelta[i] += decobdboua; - workspace->CdDelta[j] += decobdboub; + /* Stabilisation terminal triple bond */ + if( 
bo_ij->BO >= 1.00 ) { + if( gp37 == 2 || + (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || + (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { + exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); + exphua1 = EXP(-gp3 * (workspace->total_bond_order[i]-bo_ij->BO)); + exphub1 = EXP(-gp3 * (workspace->total_bond_order[j]-bo_ij->BO)); + exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j])); + hulpov = 1.0 / (1.0 + 25.0 * exphuov); + + estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); + e_bond [i] += estriph; + + decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * + ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); + decobdboua = -gp10 * exphu * hulpov * + (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + decobdboub = -gp10 * exphu * hulpov * + (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + + bo_ij->Cdbo += decobdbo; + workspace->CdDelta[i] += decobdboua; + workspace->CdDelta[j] += decobdboub; #ifdef TEST_ENERGY - //fprintf( out_control->ebond, - // "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - // system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - // estriph, decobdbo, decobdboua, decobdboub ); + //fprintf( out_control->ebond, + // "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + // system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + // estriph, decobdbo, decobdboua, decobdboub ); #endif #ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); - Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); - Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); + Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); + Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); + Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); #endif - } - } - } - } - // } + } + } + } + } + // } } diff --git a/PG-PuReMD/src/cuda_copy.cu b/PG-PuReMD/src/cuda_copy.cu index 4172fc35..a40a5b90 100644 --- a/PG-PuReMD/src/cuda_copy.cu +++ b/PG-PuReMD/src/cuda_copy.cu @@ -13,151 +13,151 @@ extern "C" 
void Delete_List( reax_list*); void Sync_Grid (grid *host, grid *device) { - int total; - grid_cell local_cell; - total = host->ncells[0] * host->ncells[1] * host->ncells[2]; - - ivec_Copy (device->ncells, host->ncells); - rvec_Copy (device->cell_len, host->cell_len); - rvec_Copy (device->inv_len, host->inv_len); - - ivec_Copy (device->bond_span, host->bond_span ); - ivec_Copy (device->nonb_span, host->nonb_span ); - ivec_Copy (device->vlist_span, host->vlist_span ); - - ivec_Copy (device->native_cells, host->native_cells ); - ivec_Copy (device->native_str, host->native_str ); - ivec_Copy (device->native_end, host->native_end ); - - device->ghost_cut = host->ghost_cut; - ivec_Copy (device->ghost_span, host->ghost_span ); - ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span ); - ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span ); - ivec_Copy (device->ghost_bond_span, host->ghost_bond_span ); - - copy_host_device (host->str, device->str, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:str"); - copy_host_device (host->end, device->end, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:end"); - copy_host_device (host->cutoff, device->cutoff, sizeof (real) * total, cudaMemcpyHostToDevice, "grid:cutoff"); - copy_host_device (host->nbrs_x, device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x"); - copy_host_device (host->nbrs_cp, device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp"); - - copy_host_device (host->rel_box, device->rel_box, sizeof (ivec) * total, cudaMemcpyHostToDevice, "grid:rel_box"); - - device->max_nbrs = host->max_nbrs; - - /* - for (int i = 0; i < total; i++) { - - copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-cuda_copy"); - - //fprintf (stderr, " Atoms address %ld (%d) \n", local_cell.atoms, host->max_atoms ); - //cuda_memset (local_cell.atoms, 0, sizeof (int) * host->max_atoms, 
"grid:cell:atoms-memset"); - //fprintf (stderr, "host native atoms -> %d %d \n", host->native_str[0], host->native_end[0]); - //fprintf (stderr, "host atoms -> %d \n", host->cells[i].atoms[i]); - //fprintf (stderr, "Host Max atoms : %d \n", host->max_atoms ); - //copy_host_device (host->cells[i].atoms, - // (local_cell.atoms), sizeof (int) * host->max_atoms, cudaMemcpyHostToDevice, "grid:cell:atoms"); - - //////////////////////////////////////////// - //No need to copy atoms from the cells from host to device. - // str and end has positions in the d_my_atoms list, which are just indexes into this list - // this index is used in the cuda_neighbors to compute the neighbors. - // This is the only place where atoms is used. - //////////////////////////////////////////////// - - //fprintf (stderr, " cells:nbrs_x %ld \n", local_cell.nbrs_x); - copy_host_device (host->cells[i].nbrs_x, - local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x"); - - //fprintf (stderr, " Atoms address %ld \n", local_cell.nbrs_cp); - copy_host_device (host->cells[i].nbrs_cp, - local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp"); - - //no need to copy pointers for device->cells[i].nbrs. - // we can extract the pointer by nbrs_x (ivec) into the cells array. 
- // This makes nbrs member redundant on the device - - local_cell.cutoff = host->cells[i].cutoff; - rvec_Copy (local_cell.min, host->cells[i].min); - rvec_Copy (local_cell.max, host->cells[i].max); - ivec_Copy (local_cell.rel_box, host->cells[i].rel_box); - - local_cell.mark = host->cells[i].mark; - local_cell.type = host->cells[i].type; - local_cell.str = host->cells[i].str; - local_cell.end = host->cells[i].end; - local_cell.top = host->cells[i].top; - - copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), - cudaMemcpyHostToDevice, "grid:cell-cuda_copy"); - } - */ + int total; + grid_cell local_cell; + total = host->ncells[0] * host->ncells[1] * host->ncells[2]; + + ivec_Copy (device->ncells, host->ncells); + rvec_Copy (device->cell_len, host->cell_len); + rvec_Copy (device->inv_len, host->inv_len); + + ivec_Copy (device->bond_span, host->bond_span ); + ivec_Copy (device->nonb_span, host->nonb_span ); + ivec_Copy (device->vlist_span, host->vlist_span ); + + ivec_Copy (device->native_cells, host->native_cells ); + ivec_Copy (device->native_str, host->native_str ); + ivec_Copy (device->native_end, host->native_end ); + + device->ghost_cut = host->ghost_cut; + ivec_Copy (device->ghost_span, host->ghost_span ); + ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span ); + ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span ); + ivec_Copy (device->ghost_bond_span, host->ghost_bond_span ); + + copy_host_device (host->str, device->str, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:str"); + copy_host_device (host->end, device->end, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:end"); + copy_host_device (host->cutoff, device->cutoff, sizeof (real) * total, cudaMemcpyHostToDevice, "grid:cutoff"); + copy_host_device (host->nbrs_x, device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x"); + copy_host_device (host->nbrs_cp, device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, 
cudaMemcpyHostToDevice, "grid:nbrs_cp"); + + copy_host_device (host->rel_box, device->rel_box, sizeof (ivec) * total, cudaMemcpyHostToDevice, "grid:rel_box"); + + device->max_nbrs = host->max_nbrs; + + /* + for (int i = 0; i < total; i++) { + + copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-cuda_copy"); + + //fprintf (stderr, " Atoms address %ld (%d) \n", local_cell.atoms, host->max_atoms ); + //cuda_memset (local_cell.atoms, 0, sizeof (int) * host->max_atoms, "grid:cell:atoms-memset"); + //fprintf (stderr, "host native atoms -> %d %d \n", host->native_str[0], host->native_end[0]); + //fprintf (stderr, "host atoms -> %d \n", host->cells[i].atoms[i]); + //fprintf (stderr, "Host Max atoms : %d \n", host->max_atoms ); + //copy_host_device (host->cells[i].atoms, + // (local_cell.atoms), sizeof (int) * host->max_atoms, cudaMemcpyHostToDevice, "grid:cell:atoms"); + + //////////////////////////////////////////// + //No need to copy atoms from the cells from host to device. + // str and end has positions in the d_my_atoms list, which are just indexes into this list + // this index is used in the cuda_neighbors to compute the neighbors. + // This is the only place where atoms is used. + //////////////////////////////////////////////// + + //fprintf (stderr, " cells:nbrs_x %ld \n", local_cell.nbrs_x); + copy_host_device (host->cells[i].nbrs_x, + local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x"); + + //fprintf (stderr, " Atoms address %ld \n", local_cell.nbrs_cp); + copy_host_device (host->cells[i].nbrs_cp, + local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp"); + + //no need to copy pointers for device->cells[i].nbrs. + // we can extract the pointer by nbrs_x (ivec) into the cells array. 
+ // This makes nbrs member redundant on the device + + local_cell.cutoff = host->cells[i].cutoff; + rvec_Copy (local_cell.min, host->cells[i].min); + rvec_Copy (local_cell.max, host->cells[i].max); + ivec_Copy (local_cell.rel_box, host->cells[i].rel_box); + + local_cell.mark = host->cells[i].mark; + local_cell.type = host->cells[i].type; + local_cell.str = host->cells[i].str; + local_cell.end = host->cells[i].end; + local_cell.top = host->cells[i].top; + + copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), + cudaMemcpyHostToDevice, "grid:cell-cuda_copy"); + } + */ } void Sync_Atoms (reax_system *sys) { - //TODO - //TODO METIN FIX, coredump on his machine - //TODO - //TODO - //copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->total_cap, cudaMemcpyHostToDevice, "system:my_atoms"); + //TODO + //TODO METIN FIX, coredump on his machine + //TODO + //TODO + //copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->total_cap, cudaMemcpyHostToDevice, "system:my_atoms"); #if defined(__CUDA_DEBUG_LOG__) - fprintf (stderr, "p:%d - Synching atoms: n: %d N: %d, total_cap: %d \n", - sys->my_rank, sys->n, sys->N, sys->total_cap); + fprintf (stderr, "p:%d - Synching atoms: n: %d N: %d, total_cap: %d \n", + sys->my_rank, sys->n, sys->N, sys->total_cap); #endif - copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->N, cudaMemcpyHostToDevice, "system:my_atoms"); - //TODO - //TODO METIN FIX, coredump on his machine - //TODO - //TODO + copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->N, cudaMemcpyHostToDevice, "system:my_atoms"); + //TODO + //TODO METIN FIX, coredump on his machine + //TODO + //TODO } void Sync_System (reax_system *sys) { - //fprintf (stderr, "p:%d - trying to copy atoms : %d \n", sys->my_rank, sys->local_cap); - Sync_Atoms (sys); - - copy_host_device (&(sys->my_box), sys->d_my_box, - sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_box"); - - 
copy_host_device (&(sys->my_ext_box), sys->d_my_ext_box, - sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_ext_box"); - - copy_host_device (sys->reax_param.sbp, sys->reax_param.d_sbp, - sizeof (single_body_parameters) * sys->reax_param.num_atom_types, cudaMemcpyHostToDevice, "system:sbp"); - copy_host_device (sys->reax_param.tbp, sys->reax_param.d_tbp, - sizeof (two_body_parameters) * pow (sys->reax_param.num_atom_types, 2), cudaMemcpyHostToDevice, "system:tbp"); - copy_host_device (sys->reax_param.thbp, sys->reax_param.d_thbp, - sizeof (three_body_header) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:thbh"); - copy_host_device (sys->reax_param.hbp, sys->reax_param.d_hbp, - sizeof (hbond_parameters) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:hbond"); - copy_host_device (sys->reax_param.fbp, sys->reax_param.d_fbp, - sizeof (four_body_header) * pow (sys->reax_param.num_atom_types, 4), cudaMemcpyHostToDevice, "system:four_header"); - - copy_host_device (sys->reax_param.gp.l, sys->reax_param.d_gp.l, - sizeof (real) * sys->reax_param.gp.n_global, cudaMemcpyHostToDevice, "system:global_parameters"); - - sys->reax_param.d_gp.n_global = sys->reax_param.gp.n_global; - sys->reax_param.d_gp.vdw_type = sys->reax_param.gp.vdw_type; + //fprintf (stderr, "p:%d - trying to copy atoms : %d \n", sys->my_rank, sys->local_cap); + Sync_Atoms (sys); + + copy_host_device (&(sys->my_box), sys->d_my_box, + sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_box"); + + copy_host_device (&(sys->my_ext_box), sys->d_my_ext_box, + sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_ext_box"); + + copy_host_device (sys->reax_param.sbp, sys->reax_param.d_sbp, + sizeof (single_body_parameters) * sys->reax_param.num_atom_types, cudaMemcpyHostToDevice, "system:sbp"); + copy_host_device (sys->reax_param.tbp, sys->reax_param.d_tbp, + sizeof (two_body_parameters) * pow (sys->reax_param.num_atom_types, 2), 
cudaMemcpyHostToDevice, "system:tbp"); + copy_host_device (sys->reax_param.thbp, sys->reax_param.d_thbp, + sizeof (three_body_header) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:thbh"); + copy_host_device (sys->reax_param.hbp, sys->reax_param.d_hbp, + sizeof (hbond_parameters) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:hbond"); + copy_host_device (sys->reax_param.fbp, sys->reax_param.d_fbp, + sizeof (four_body_header) * pow (sys->reax_param.num_atom_types, 4), cudaMemcpyHostToDevice, "system:four_header"); + + copy_host_device (sys->reax_param.gp.l, sys->reax_param.d_gp.l, + sizeof (real) * sys->reax_param.gp.n_global, cudaMemcpyHostToDevice, "system:global_parameters"); + + sys->reax_param.d_gp.n_global = sys->reax_param.gp.n_global; + sys->reax_param.d_gp.vdw_type = sys->reax_param.gp.vdw_type; } void Output_Sync_Atoms (reax_system *sys) { - //TODO changed this from sys->n to sys->N - copy_host_device (sys->my_atoms, sys->d_my_atoms, - sizeof (reax_atom) * sys->total_cap, cudaMemcpyDeviceToHost, "system:my_atoms"); + //TODO changed this from sys->n to sys->N + copy_host_device (sys->my_atoms, sys->d_my_atoms, + sizeof (reax_atom) * sys->total_cap, cudaMemcpyDeviceToHost, "system:my_atoms"); } void Output_Sync_Simulation_Data (simulation_data *host, simulation_data *dev) { - copy_host_device (&host->my_en, &dev->my_en, sizeof (energy_data), - cudaMemcpyDeviceToHost, "simulation_data:energy_data"); - copy_host_device (&host->kin_press, &dev->kin_press, sizeof (real), - cudaMemcpyDeviceToHost, "simulation_data:kin_press"); - copy_host_device (host->int_press, dev->int_press, sizeof (rvec), - cudaMemcpyDeviceToHost, "simulation_data:int_press"); - copy_host_device (host->ext_press, dev->ext_press, sizeof (rvec), - cudaMemcpyDeviceToHost, "simulation_data:ext_press"); + copy_host_device (&host->my_en, &dev->my_en, sizeof (energy_data), + cudaMemcpyDeviceToHost, "simulation_data:energy_data"); + 
copy_host_device (&host->kin_press, &dev->kin_press, sizeof (real), + cudaMemcpyDeviceToHost, "simulation_data:kin_press"); + copy_host_device (host->int_press, dev->int_press, sizeof (rvec), + cudaMemcpyDeviceToHost, "simulation_data:int_press"); + copy_host_device (host->ext_press, dev->ext_press, sizeof (rvec), + cudaMemcpyDeviceToHost, "simulation_data:ext_press"); } void Sync_Workspace (storage *workspace, enum cudaMemcpyKind dir) @@ -183,37 +183,37 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data ) void Output_Sync_Lists (reax_list *host, reax_list *device, int type) { - //fprintf (stderr, " Trying to copy *%d* list from device to host \n", type); - - //list is already allocated -- discard it first - //if (host->n > 0) - //if (host->allocated > 0) - // Delete_List (host); - - //memory is allocated on the host - //Make_List(device->n, device->num_intrs, type, host); - - //memcpy the entries from device to host - copy_host_device (host->index, device->index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync_list:list:index"); - copy_host_device (host->end_index, device->end_index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync:list:end_index"); - - switch (type) - { - case TYP_BOND: - copy_host_device (host->select.bond_list, device->select.bond_list, - sizeof (bond_data) * device->num_intrs, cudaMemcpyDeviceToHost, "bond_list"); - break; - - case TYP_THREE_BODY: - copy_host_device (host->select.three_body_list, device->select.three_body_list, - sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, "three_body_list"); - break; - - default: - fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type ); - exit (1); - break; - } + //fprintf (stderr, " Trying to copy *%d* list from device to host \n", type); + + //list is already allocated -- discard it first + //if (host->n > 0) + //if (host->allocated > 0) + // Delete_List (host); + + //memory is allocated on 
the host + //Make_List(device->n, device->num_intrs, type, host); + + //memcpy the entries from device to host + copy_host_device (host->index, device->index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync_list:list:index"); + copy_host_device (host->end_index, device->end_index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync:list:end_index"); + + switch (type) + { + case TYP_BOND: + copy_host_device (host->select.bond_list, device->select.bond_list, + sizeof (bond_data) * device->num_intrs, cudaMemcpyDeviceToHost, "bond_list"); + break; + + case TYP_THREE_BODY: + copy_host_device (host->select.three_body_list, device->select.three_body_list, + sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, "three_body_list"); + break; + + default: + fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type ); + exit (1); + break; + } } //#ifdef __cplusplus diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda_environment.cu index 2583a97b..dbbaba9b 100644 --- a/PG-PuReMD/src/cuda_environment.cu +++ b/PG-PuReMD/src/cuda_environment.cu @@ -6,41 +6,41 @@ extern "C" void Setup_Cuda_Environment (int rank, int nprocs, int gpus_per_node) { - int deviceCount = 0; - cudaGetDeviceCount (&deviceCount); - - //Calculate the # of GPUs per processor - //and assign the GPU for each process - - //hpcc changes - //if (gpus_per_node == 2) { - cudaSetDevice ( (rank % (deviceCount)) ); - //cudaSetDevice( 1 ); - fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount)); - //} else { - // cudaSetDevice ( 0 ); - //} - - /////////////////////////////////////////////// - /////////////////////////////////////////////// - /////////////////////////////////////////////// - // CHANGE ORIGINAL///////////////////////////// - /////////////////////////////////////////////// - /////////////////////////////////////////////// - /////////////////////////////////////////////// - //cudaDeviceSetLimit ( 
cudaLimitStackSize, 8192 ); - //cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 ); - //cudaCheckError (); - /////////////////////////////////////////////// - /////////////////////////////////////////////// - /////////////////////////////////////////////// - /////////////////////////////////////////////// - /////////////////////////////////////////////// + int deviceCount = 0; + cudaGetDeviceCount (&deviceCount); + + //Calculate the # of GPUs per processor + //and assign the GPU for each process + + //hpcc changes + //if (gpus_per_node == 2) { + cudaSetDevice ( (rank % (deviceCount)) ); + //cudaSetDevice( 1 ); + fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount)); + //} else { + // cudaSetDevice ( 0 ); + //} + + /////////////////////////////////////////////// + /////////////////////////////////////////////// + /////////////////////////////////////////////// + // CHANGE ORIGINAL///////////////////////////// + /////////////////////////////////////////////// + /////////////////////////////////////////////// + /////////////////////////////////////////////// + //cudaDeviceSetLimit ( cudaLimitStackSize, 8192 ); + //cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 ); + //cudaCheckError (); + /////////////////////////////////////////////// + /////////////////////////////////////////////// + /////////////////////////////////////////////// + /////////////////////////////////////////////// + /////////////////////////////////////////////// } extern "C" void Cleanup_Cuda_Environment () { - cudaDeviceReset (); - cudaDeviceSynchronize (); + cudaDeviceReset (); + cudaDeviceSynchronize (); } diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu index 0e40cc2f..063554bf 100644 --- a/PG-PuReMD/src/cuda_forces.cu +++ b/PG-PuReMD/src/cuda_forces.cu @@ -31,1580 +31,1595 @@ extern "C" void Delete_List( reax_list*); CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms, - single_body_parameters *sbp, - two_body_parameters *tbp, - 
control_params *control, - reax_list far_nbrs, - int num_atom_types, - int n, int N, - int Hcap, int total_cap, - int *Htop, int *num_3body, - int *bond_top, int *hb_top - ) + single_body_parameters *sbp, + two_body_parameters *tbp, + control_params *control, + reax_list far_nbrs, + int num_atom_types, + int n, int N, + int Hcap, int total_cap, + int *Htop, int *num_3body, + int *bond_top, int *hb_top + ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int ihb, jhb; - int local; - real cutoff; - real r_ij, r2; - real C12, C34, C56; - real BO, BO_s, BO_pi, BO_pi2; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - - //Commented in CUDA_KERNEL - //for( i = 0; i < N; ++i ) { - atom_i = &(my_atoms[i]); - type_i = atom_i->type; - start_i = Dev_Start_Index(i, &far_nbrs); - end_i = Dev_End_Index(i, &far_nbrs); - sbp_i = &(sbp[type_i]); - - if( i < n ) { - local = 1; - cutoff = control->nonb_cut; - //++(*Htop); - atomicAdd (Htop, 1); - ihb = sbp_i->p_hbond; - } - else { - local = 0; - cutoff = control->bond_cut; - ihb = -1; - } - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(my_atoms[j]); - - if (nbr_pj->d <= control->nonb_cut) { - type_j = my_atoms[j].type; - sbp_j = &(sbp[type_j]); - ihb = sbp_i->p_hbond; - jhb = sbp_j->p_hbond; - if ((control->hbond_cut > 0.1) - && (nbr_pj->d <= control->hbond_cut) - && (ihb == 2) - && (jhb == 1) - && (j < n) - && (i > n) - ) - atomicAdd (&hb_top [i], 1); - - if (i >= n) ihb = -1; - } - - - - if(nbr_pj->d <= cutoff) { - type_j = my_atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(sbp[type_j]); - twbp = &(tbp[index_tbp (type_i,type_j,num_atom_types)]); - - if( local ) { - //if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1 - if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1 - 
//++(*Htop); - atomicAdd (Htop, 1); - else if( j < n || atom_i->orig_id > atom_j->orig_id ) //tryQEq ||1 - //++(*Htop); - atomicAdd (Htop, 1); - - if( control->hbond_cut > 0.1 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hbond_cut - ) { - jhb = sbp_j->p_hbond; - if( (ihb == 1) && (jhb == 2)) - //++hb_top[i]; - atomicAdd (&hb_top[i], 1); - //else if( j < n && ihb == 2 && jhb == 1 ) - //else if( ihb == 2 && jhb == 1 && j < n) - else if( ihb == 2 && jhb == 1 && j < n) - //++hb_top[j]; - atomicAdd (&hb_top[i], 1); - } - } - - // uncorrected bond orders - if( nbr_pj->d <= control->bond_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - // Initially BO values are the uncorrected ones, page 1 - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - //++bond_top[i]; - //++bond_top[j]; - atomicAdd (&bond_top [i], 1); - //atomicAdd (&bond_top [j], 1); - } - } - } - } - //} -- Commented in CUDA_KERNEL + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int ihb, jhb; + int local; + real cutoff; + real r_ij, r2; + real C12, C34, C56; + real BO, BO_s, BO_pi, BO_pi2; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + + //Commented in CUDA_KERNEL + //for( i = 0; i < N; ++i ) { + atom_i = &(my_atoms[i]); + type_i = atom_i->type; + start_i = Dev_Start_Index(i, &far_nbrs); + end_i = Dev_End_Index(i, &far_nbrs); + sbp_i = 
&(sbp[type_i]); + + if( i < n ) { + local = 1; + cutoff = control->nonb_cut; + //++(*Htop); + atomicAdd (Htop, 1); + ihb = sbp_i->p_hbond; + } + else { + local = 0; + cutoff = control->bond_cut; + ihb = -1; + } + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(my_atoms[j]); + + if (nbr_pj->d <= control->nonb_cut) { + type_j = my_atoms[j].type; + sbp_j = &(sbp[type_j]); + ihb = sbp_i->p_hbond; + jhb = sbp_j->p_hbond; + if ((control->hbond_cut > 0.1) + && (nbr_pj->d <= control->hbond_cut) + && (ihb == 2) + && (jhb == 1) + && (j < n) + && (i > n) + ) + atomicAdd (&hb_top [i], 1); + + if (i >= n) ihb = -1; + } + + + + if(nbr_pj->d <= cutoff) { + type_j = my_atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(sbp[type_j]); + twbp = &(tbp[index_tbp (type_i,type_j,num_atom_types)]); + + if( local ) { + //if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1 + if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1 + //++(*Htop); + atomicAdd (Htop, 1); + else if( j < n || atom_i->orig_id > atom_j->orig_id ) //tryQEq ||1 + //++(*Htop); + atomicAdd (Htop, 1); + + if( control->hbond_cut > 0.1 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hbond_cut + ) { + jhb = sbp_j->p_hbond; + if( (ihb == 1) && (jhb == 2)) + //++hb_top[i]; + atomicAdd (&hb_top[i], 1); + //else if( j < n && ihb == 2 && jhb == 1 ) + //else if( ihb == 2 && jhb == 1 && j < n) + else if( ihb == 2 && jhb == 1 && j < n) + //++hb_top[j]; + atomicAdd (&hb_top[i], 1); + } + } + + // uncorrected bond orders + if( nbr_pj->d <= control->bond_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 
0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + // Initially BO values are the uncorrected ones, page 1 + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + //++bond_top[i]; + //++bond_top[j]; + atomicAdd (&bond_top [i], 1); + //atomicAdd (&bond_top [j], 1); + } + } + } + } + //} -- Commented in CUDA_KERNEL } + CUDA_GLOBAL void ker_init_system_atoms(reax_atom *my_atoms, int N, - int *hb_top, int *bond_top) + int *hb_top, int *bond_top) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; - my_atoms[i].num_bonds = bond_top [i]; - my_atoms[i].num_hbonds = hb_top [i]; + my_atoms[i].num_bonds = bond_top [i]; + my_atoms[i].num_hbonds = hb_top [i]; } + void Cuda_Estimate_Storages(reax_system *system, control_params *control, - reax_list **lists, int local_cap, int total_cap, - int *Htop, int *hb_top, - int *bond_top, int *num_3body) + reax_list **lists, int local_cap, int total_cap, + int *Htop, int *hb_top, + int *bond_top, int *num_3body) { - int blocks = 0; - int *l_Htop, *l_hb_top, *l_bond_top, *l_num_3body; - int *tmp = (int *)scratch; - - *Htop = 0; - //memset( hb_top, 0, sizeof(int) * local_cap); - memset( hb_top, 0, sizeof(int) * total_cap); - memset( bond_top, 0, sizeof(int) * total_cap ); - *num_3body = 0; - - //cuda_memset (tmp, 0, 1 + 1 + sizeof (int) * (local_cap+ total_cap), "Cuda_Estimate_Storages"); - cuda_memset (tmp, 0, sizeof (int) * (1 + 1 + total_cap+ total_cap), "Cuda_Estimate_Storages"); - - l_Htop = tmp; - l_num_3body = l_Htop + 1; - l_hb_top = l_num_3body + 1; - //l_bond_top = l_hb_top + local_cap; - l_bond_top = l_hb_top + total_cap; - - blocks = system->N / ST_BLOCK_SIZE + - ((system->N % ST_BLOCK_SIZE == 0) ? 
0 : 1); - - ker_estimate_storages <<< blocks, ST_BLOCK_SIZE>>> - (system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, - (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types, - system->n, system->N, system->Hcap, system->total_cap, - l_Htop, l_num_3body, l_bond_top, l_hb_top ); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device( Htop, l_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop"); - copy_host_device( num_3body, l_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body"); - //copy_host_device( hb_top, l_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top"); - copy_host_device( hb_top, l_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top"); - copy_host_device( bond_top, l_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top"); - - - int bond_count = 0; - int hbond_count = 0; - int max_bonds = 0, min_bonds = 999999; - int max_hbonds = 0, min_hbonds = 999999; - - for (int i = 0; i < system->N; i++) { - if (bond_top[i] >= max_bonds) max_bonds = bond_top[i]; - if (bond_top[i] <= min_bonds) min_bonds = bond_top[i]; - bond_count += bond_top[i]; - } - system->max_bonds = max_bonds * SAFER_ZONE; - //for (int i = 0; i < system->n; i++) - for (int i = 0; i < system->N; i++){ - if (hb_top[i] >= max_hbonds) max_hbonds = hb_top[i]; - if (hb_top[i] <= min_hbonds) min_hbonds = hb_top[i]; - hbond_count += hb_top [i]; - } - system->max_hbonds = max_hbonds * SAFER_ZONE; - //fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count); - //fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count); - //fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop); - fprintf (stderr, "p:%d --> Bonds(%d, %d) HBonds (%d, %d) *******\n", - system->my_rank, min_bonds, max_bonds, min_hbonds, max_hbonds); - - ker_init_system_atoms <<<blocks, ST_BLOCK_SIZE>>> - (system->d_my_atoms, system->N, l_hb_top, l_bond_top ); - cudaThreadSynchronize 
(); - cudaCheckError (); + int blocks = 0; + int *l_Htop, *l_hb_top, *l_bond_top, *l_num_3body; + int *tmp = (int *)scratch; + + *Htop = 0; + //memset( hb_top, 0, sizeof(int) * local_cap); + memset( hb_top, 0, sizeof(int) * total_cap); + memset( bond_top, 0, sizeof(int) * total_cap ); + *num_3body = 0; + + //cuda_memset (tmp, 0, 1 + 1 + sizeof (int) * (local_cap+ total_cap), "Cuda_Estimate_Storages"); + cuda_memset (tmp, 0, sizeof (int) * (1 + 1 + total_cap+ total_cap), "Cuda_Estimate_Storages"); + + l_Htop = tmp; + l_num_3body = l_Htop + 1; + l_hb_top = l_num_3body + 1; + //l_bond_top = l_hb_top + local_cap; + l_bond_top = l_hb_top + total_cap; + + blocks = system->N / ST_BLOCK_SIZE + + ((system->N % ST_BLOCK_SIZE == 0) ? 0 : 1); + + ker_estimate_storages <<< blocks, ST_BLOCK_SIZE>>> + (system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, + (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types, + system->n, system->N, system->Hcap, system->total_cap, + l_Htop, l_num_3body, l_bond_top, l_hb_top ); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device( Htop, l_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop"); + copy_host_device( num_3body, l_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body"); + //copy_host_device( hb_top, l_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top"); + copy_host_device( hb_top, l_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top"); + copy_host_device( bond_top, l_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top"); + + + int bond_count = 0; + int hbond_count = 0; + int max_bonds = 0, min_bonds = 999999; + int max_hbonds = 0, min_hbonds = 999999; + + for (int i = 0; i < system->N; i++) { + if (bond_top[i] >= max_bonds) max_bonds = bond_top[i]; + if (bond_top[i] <= min_bonds) min_bonds = bond_top[i]; + bond_count += bond_top[i]; + } + system->max_bonds = max_bonds * SAFER_ZONE; + //for (int i 
= 0; i < system->n; i++) + for (int i = 0; i < system->N; i++){ + if (hb_top[i] >= max_hbonds) max_hbonds = hb_top[i]; + if (hb_top[i] <= min_hbonds) min_hbonds = hb_top[i]; + hbond_count += hb_top [i]; + } + system->max_hbonds = max_hbonds * SAFER_ZONE; + //fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count); + //fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count); + //fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop); + fprintf (stderr, "p:%d --> Bonds(%d, %d) HBonds (%d, %d) *******\n", + system->my_rank, min_bonds, max_bonds, min_hbonds, max_hbonds); + + ker_init_system_atoms <<<blocks, ST_BLOCK_SIZE>>> + (system->d_my_atoms, system->N, l_hb_top, l_bond_top ); + cudaThreadSynchronize (); + cudaCheckError (); } + CUDA_DEVICE real Compute_H( real r, real gamma, real *ctap ) { - real taper, dr3gamij_1, dr3gamij_3; - - taper = ctap[7] * r + ctap[6]; - taper = taper * r + ctap[5]; - taper = taper * r + ctap[4]; - taper = taper * r + ctap[3]; - taper = taper * r + ctap[2]; - taper = taper * r + ctap[1]; - taper = taper * r + ctap[0]; - - dr3gamij_1 = ( r*r*r + gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - return taper * EV_to_KCALpMOL / dr3gamij_3; + real taper, dr3gamij_1, dr3gamij_3; + + taper = ctap[7] * r + ctap[6]; + taper = taper * r + ctap[5]; + taper = taper * r + ctap[4]; + taper = taper * r + ctap[3]; + taper = taper * r + ctap[2]; + taper = taper * r + ctap[1]; + taper = taper * r + ctap[0]; + + dr3gamij_1 = ( r*r*r + gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + return taper * EV_to_KCALpMOL / dr3gamij_3; } CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj, int num_atom_types ) { - int r, tmin, tmax; - real val, dif, base; - LR_lookup_table *t; - - tmin = MIN( ti, tj ); - tmax = MAX( ti, tj ); - t = &( t_LR[index_lr (tmin,tmax, num_atom_types)] ); - - /* cubic spline interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) 
* t->dx; - dif = r_ij - base; - val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - val *= EV_to_KCALpMOL / C_ele; - - return val; + int r, tmin, tmax; + real val, dif, base; + LR_lookup_table *t; + + tmin = MIN( ti, tj ); + tmax = MAX( ti, tj ); + t = &( t_LR[index_lr (tmin,tmax, num_atom_types)] ); + + /* cubic spline interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + val *= EV_to_KCALpMOL / C_ele; + + return val; } + CUDA_GLOBAL void ker_estimate_sparse_matrix (reax_atom *my_atoms, control_params *control, - reax_list p_far_nbrs, int n, int N, int renbr, int *indices) + reax_list p_far_nbrs, int n, int N, int renbr, int *indices) +{ + int i, j, pj; + int start_i, end_i; + int flag; + real cutoff; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + reax_list *far_nbrs = &( p_far_nbrs ); + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + atom_i = &(my_atoms[i]); + start_i = Dev_Start_Index(i, far_nbrs); + end_i = Dev_End_Index(i, far_nbrs); + + cutoff = control->nonb_cut; + + //++Htop; + if ( i < n) + indices [i] ++; + + /* update i-j distance - check if j is within cutoff */ + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(my_atoms[j]); + if( renbr ) { + if(nbr_pj->d <= cutoff) + flag = 1; + else flag = 0; + } + else { + if (i < j) { + nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0]; + nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1]; + nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2]; + } else { + nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0]; + nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1]; + nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2]; + } + nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec ); + //TODO + //TODO + //TODO + //if( nbr_pj->d <= (cutoff) ) { + if( nbr_pj->d <= SQR(cutoff) ) + { + 
nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + else + { + flag = 0; + } + } + + if( flag ) + { + /* H matrix entry */ + //if( j < n || atom_i->orig_id < atom_j->orig_id ) + //++Htop; + // indices [i] ++; + //else if (j < n || atom_i->orig_id > atom_j->orig_id ) + // indices [i] ++; + + //if ((i < n) || (j < n)) + // indices [i] ++; + //if ((i < n) && (i < j) && ((j < n) || atom_i->orig_id < atom_j->orig_id)) + // indices [i] ++; + //if ( i >= n && j < n && atom_i->orig_id > atom_j->orig_id) + // indices [i] ++; + //else if ((i >=n) && (i > j) && ((j < n) || (atom_i->orig_id > atom_j->orig_id))) + // indices [i] ++; + //THIS IS THE HOST CONDITION + //if (i < n && i < j && ( j < n || atom_i->orig_id < atom_j->orig_id )) + //if (i < n && i < j && atom_i->orig_id < atom_j->orig_id && j >=n) + // indices [i] ++; + //THIS IS THE DEVICE CONDITION + //if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id) + // indices [i] ++; + + //this is the working condition + if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id)) + indices [i]++; + else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id) + indices [i] ++; + else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id )) + indices [i] ++; + } + } +} + + +int Cuda_Estimate_Sparse_Matrix (reax_system *system, control_params *control, + simulation_data *data, reax_list **lists) +{ + int blocks, max_sp_entries; + int *indices = (int *) scratch; + int *h_indices = (int *) host_scratch; + int total_sparse = 0; + + cuda_memset (indices, 0, sizeof (int) * system->N, "sp_matrix:indices"); + + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + + //TODO + //TODO + //TODO + //TODO + ker_estimate_sparse_matrix <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, (control_params *)control->d_control_params, + *(*dev_lists + FAR_NBRS), system->n, system->N, + (((data->step-data->prev_steps) % control->reneighbor) == 0), indices); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (h_indices, indices, sizeof (int) * system->N, + cudaMemcpyDeviceToHost, "sp_matrix:indices"); + max_sp_entries = 0; + for (int i = 0; i < system->N; i++){ + total_sparse += h_indices [i]; + if (max_sp_entries < h_indices[i]) + max_sp_entries = h_indices[i]; + } + + //fprintf (stderr, " TOTAL DEVICE SPARSE ENTRIES: %d \n", total_sparse ); + //fprintf (stderr, "p%d: Max sparse entries -> %d \n", system->my_rank, max_sp_entries ); + system->max_sparse_entries = max_sp_entries * SAFE_ZONE; + + return SUCCESS; +} + + +CUDA_GLOBAL void ker_init_forces (reax_atom *my_atoms, single_body_parameters *sbp, + two_body_parameters *tbp, storage workspace, + control_params *control, + reax_list far_nbrs, reax_list bonds, reax_list hbonds, + LR_lookup_table *t_LR, + int n, int N, int num_atom_types, + int max_sparse_entries, int renbr, + int max_bonds, int max_hbonds) +{ + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop; + int btop_i, ihb, jhb, ihb_top; + //int btop_j, jhb, jhb_top; + int local, flag, flag2, flag3; + real r_ij, cutoff; + //reax_list *far_nbrs, *bonds, *hbonds; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + sparse_matrix *H = &(workspace.H); + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + Htop = i * max_sparse_entries; + btop_i = 0; + + //Commented for CUDA KERNEL + //for( i = 0; i < system->N; ++i ) { + atom_i = &(my_atoms[i]); + type_i = atom_i->type; + start_i = Dev_Start_Index(i, &far_nbrs); + end_i = Dev_End_Index(i, &far_nbrs); + //CHANGE ORIGINAL + //btop_i = 
Dev_Start_Index( i, &bonds ); + btop_i = i * max_bonds; + Dev_Set_Start_Index (i, btop_i, &bonds); + //CHANGE ORIGINAL + + sbp_i = &(sbp[type_i]); + + if( i < n ) { + local = 1; + cutoff = control->nonb_cut; + + //update bond mark here + workspace.bond_mark [i] = 0; + + } + else { + local = 0; + cutoff = control->bond_cut; + + //update bond mark here + workspace.bond_mark [i] = 1000; + } + + ihb = -1; + ihb_top = -1; + //CHANGE ORIGINAL + H->start[i] = Htop; + + if( local ) { + H->entries[Htop].j = i; + H->entries[Htop].val = sbp_i->eta; + ++Htop; + } + //CHANGE ORIGINAL + + if( control->hbond_cut > 0 ) { + ihb = sbp_i->p_hbond; + //CHANGE ORIGINAL + if( ihb == 1 || ihb == 2) { + //CHANGE ORIGINAL + //ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds ); + ihb_top = i * max_hbonds; + Dev_Set_Start_Index (atom_i->Hindex, ihb_top, &hbonds ); + } + else ihb_top = -1; + } + + /* update i-j distance - check if j is within cutoff */ + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(my_atoms[j]); + if( renbr ) { + if(nbr_pj->d <= cutoff) + flag = 1; + else flag = 0; + + if(nbr_pj->d <= control->nonb_cut) + flag2 = 1; + else flag2 = 0; + + } + else{ + if (i < j) { + nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0]; + nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1]; + nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2]; + nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec ); + } else { + nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0]; + nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1]; + nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2]; + nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec ); + } + + if(nbr_pj->d <= SQR (control->nonb_cut)) + flag2 = 1; + else flag2 = 0; + + //if( nbr_pj->d <= SQR(cutoff) ) { + if( nbr_pj->d <= SQR(control->nonb_cut) ) { + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + else { + flag = 0; + } + } + if (flag2) { + ihb = sbp_i->p_hbond; + type_j = atom_j->type; + sbp_j = &(sbp[type_j]); + jhb = sbp_j->p_hbond; 
+ if( control->hbond_cut > 0 + && nbr_pj->d <= control->hbond_cut + && (ihb == 2) + && (jhb == 1) + && (i >= n) + && (j < n) + ) + { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //CUDA SPECIFIC + hbonds.select.hbond_list[ihb_top].sym_index = -1; + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f); + + ++ihb_top; + } + + //if ((i < n) || (j < n)) + //if (local || ((i >= n) &&(j < n))) + + flag3 = false; + if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id)) + flag3 = true; + else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id) + flag3 = true; + else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id )) + flag3 = true; + + if (flag3) + { + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]); + r_ij = nbr_pj->d; + + //if (renbr) { + H->entries[Htop].j = j; + if( control->tabulate == 0 ) + H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap); + else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types); + //} + ++Htop; + } + } + + if( flag ){ + type_j = atom_j->type; + r_ij = nbr_pj->d; + sbp_j = &(sbp[type_j]); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]); + + if( local ) { + /* H matrix entry */ + /* + if( j < n || atom_i->orig_id < atom_j->orig_id ) {//tryQEq||1 + H->entries[Htop].j = j; + if( control->tabulate == 0 ) + H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap); + else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types); + ++Htop; + } + else if( j < n || atom_i->orig_id > atom_j->orig_id ) {//tryQEq||1 + H->entries[Htop].j = j; + if( control->tabulate == 0 ) + H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap); + else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types); + ++Htop; + } + */ + + //bool condition = !((i >= n) && (j >= n)); + /* hydrogen bond lists */ + if( 
control->hbond_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hbond_cut // && i < j + ) { + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) { + hbonds.select.hbond_list[ihb_top].nbr = j; + if (i < j) + hbonds.select.hbond_list[ihb_top].scl = 1; + else + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //CUDA SPECIFIC + hbonds.select.hbond_list[ihb_top].sym_index = -1; + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f); + + + ++ihb_top; + } + //else if( j < n && ihb == 2 && jhb == 1 ) + else if( ihb == 2 && jhb == 1 && j < n) { + //jhb_top = End_Index( atom_j->Hindex, hbonds ); + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //CUDA SPECIFIC + hbonds.select.hbond_list[ihb_top].sym_index = -1; + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f); + + ++ihb_top; + + //Set_End_Index( atom_j->Hindex, jhb_top+1, hbonds ); + //++num_hbonds; + } + } + } + + + + /* uncorrected bond orders */ + if( nbr_pj->d <= control->bond_cut + && Dev_BOp( bonds, control->bo_cut, + i , btop_i, nbr_pj, sbp_i, sbp_j, twbp, + workspace.dDeltap_self, workspace.total_bond_order) + ) { + //num_bonds += 2; + ++btop_i; + + /* Need to do later... 
since i and j are parallel + if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 ) + workspace->bond_mark[j] = workspace->bond_mark[i] + 1; + else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) { + workspace->bond_mark[i] = workspace->bond_mark[j] + 1; + } + */ + } + } + } + + Dev_Set_End_Index( i, btop_i, &bonds ); + // if( local ) { + H->end[i] = Htop; + // } + //CHANGE ORIGINAL + if(( ihb == 1 || ihb == 2 ) && (ihb_top > 0) && (control->hbond_cut > 0)) + Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds ); + //} Commented for cuda kernel +} + + +CUDA_GLOBAL void ker_init_bond_mark (int offset, int n, int *bond_mark) +{ + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + bond_mark [offset + threadIdx.x] = 1000; +} + + +CUDA_GLOBAL void New_fix_sym_dbond_indices (reax_list pbonds, int N) +{ + int i, nbr; + bond_data *ibond, *jbond; + int atom_j; + + reax_list *bonds = &pbonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + for (int j = Dev_Start_Index (i, bonds); j < Dev_End_Index (i, bonds); j++) + { + ibond = &( bonds->select.bond_list [j] ); + nbr = ibond->nbr; + + for (int k = Dev_Start_Index (nbr, bonds); k < Dev_End_Index (nbr, bonds); k ++) + { + jbond = &( bonds->select.bond_list[ k ] ); + atom_j = jbond->nbr; + + if ( (atom_j == i) ) + { + if (i > nbr) { + ibond->dbond_index = j; + jbond->dbond_index = j; + + ibond->sym_index = k; + jbond->sym_index = j; + } + } + } + } +} + + +CUDA_GLOBAL void New_fix_sym_hbond_indices (reax_atom *my_atoms, reax_list hbonds, int N ) +{ + + hbond_data *ihbond, *jhbond; + + int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id > N) return; + + int i = warp_id; + int nbr; + int k; + int start = Dev_Start_Index 
(my_atoms[i].Hindex, &hbonds); + int end = Dev_End_Index (my_atoms[i].Hindex, &hbonds); + int j = start + lane_id; + while (j < end) + { + ihbond = &( hbonds.select.hbond_list [j] ); + nbr = ihbond->nbr; + + int nbrstart = Dev_Start_Index (my_atoms[nbr].Hindex, &hbonds); + int nbrend = Dev_End_Index (my_atoms[nbr].Hindex, &hbonds); + + for (k = nbrstart; k < nbrend; k++) + { + jhbond = &( hbonds.select.hbond_list [k] ); + + if (jhbond->nbr == i){ + ihbond->sym_index = k; + jhbond->sym_index = j; + break; + } + } + + j += __THREADS_PER_ATOM__; + } +} + + +//////////////////////// +// HBOND ISSUE +CUDA_GLOBAL void ker_update_bonds (reax_atom *my_atoms, + reax_list bonds, + int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + my_atoms [i].num_bonds = + MAX(Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS); +} + + +CUDA_GLOBAL void ker_update_hbonds (reax_atom *my_atoms, + reax_list hbonds, + int n) { - int i, j, pj; - int start_i, end_i; - int flag; - real cutoff; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - reax_list *far_nbrs = &( p_far_nbrs ); - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - atom_i = &(my_atoms[i]); - start_i = Dev_Start_Index(i, far_nbrs); - end_i = Dev_End_Index(i, far_nbrs); - - cutoff = control->nonb_cut; - - //++Htop; - if ( i < n) - indices [i] ++; - - /* update i-j distance - check if j is within cutoff */ - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(my_atoms[j]); - if( renbr ) { - if(nbr_pj->d <= cutoff) - flag = 1; - else flag = 0; - } - else { - if (i < j) { - nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0]; - nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1]; - nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2]; - } else { - nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0]; - nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1]; - nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2]; - } - nbr_pj->d = rvec_Norm_Sqr( 
nbr_pj->dvec ); - //TODO - //TODO - //TODO - //if( nbr_pj->d <= (cutoff) ) { - if( nbr_pj->d <= SQR(cutoff) ) { - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - else { - flag = 0; - } - } - - if( flag ){ - /* H matrix entry */ - //if( j < n || atom_i->orig_id < atom_j->orig_id ) - //++Htop; - // indices [i] ++; - //else if (j < n || atom_i->orig_id > atom_j->orig_id ) - // indices [i] ++; - - //if ((i < n) || (j < n)) - // indices [i] ++; - //if ((i < n) && (i < j) && ((j < n) || atom_i->orig_id < atom_j->orig_id)) - // indices [i] ++; - //if ( i >= n && j < n && atom_i->orig_id > atom_j->orig_id) - // indices [i] ++; - //else if ((i >=n) && (i > j) && ((j < n) || (atom_i->orig_id > atom_j->orig_id))) - // indices [i] ++; - //THIS IS THE HOST CONDITION - //if (i < n && i < j && ( j < n || atom_i->orig_id < atom_j->orig_id )) - //if (i < n && i < j && atom_i->orig_id < atom_j->orig_id && j >=n) - // indices [i] ++; - //THIS IS THE DEVICE CONDITION - //if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id) - // indices [i] ++; - - //this is the working condition - if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id)) - indices [i]++; - else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id) - indices [i] ++; - else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id )) - indices [i] ++; - } - } - } - - int Cuda_Estimate_Sparse_Matrix (reax_system *system, control_params *control, - simulation_data *data, reax_list **lists) - { - int blocks, max_sp_entries; - int *indices = (int *) scratch; - int *h_indices = (int *) host_scratch; - int total_sparse = 0; - - cuda_memset (indices, 0, sizeof (int) * system->N, "sp_matrix:indices"); - - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - - //TODO - //TODO - //TODO - //TODO - ker_estimate_sparse_matrix <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, (control_params *)control->d_control_params, - *(*dev_lists + FAR_NBRS), system->n, system->N, - (((data->step-data->prev_steps) % control->reneighbor) == 0), indices); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (h_indices, indices, sizeof (int) * system->N, - cudaMemcpyDeviceToHost, "sp_matrix:indices"); - max_sp_entries = 0; - for (int i = 0; i < system->N; i++){ - total_sparse += h_indices [i]; - if (max_sp_entries < h_indices[i]) - max_sp_entries = h_indices[i]; - } - - //fprintf (stderr, " TOTAL DEVICE SPARSE ENTRIES: %d \n", total_sparse ); - //fprintf (stderr, "p%d: Max sparse entries -> %d \n", system->my_rank, max_sp_entries ); - system->max_sparse_entries = max_sp_entries * SAFE_ZONE; - - return SUCCESS; - } - - - CUDA_GLOBAL void ker_init_forces (reax_atom *my_atoms, single_body_parameters *sbp, - two_body_parameters *tbp, storage workspace, - control_params *control, - reax_list far_nbrs, reax_list bonds, reax_list hbonds, - LR_lookup_table *t_LR, - int n, int N, int num_atom_types, - int max_sparse_entries, int renbr, - int max_bonds, int max_hbonds) - { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop; - int btop_i, ihb, jhb, ihb_top; - //int btop_j, jhb, jhb_top; - int local, flag, flag2, flag3; - real r_ij, cutoff; - //reax_list *far_nbrs, *bonds, *hbonds; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - sparse_matrix *H = &(workspace.H); - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - Htop = i * max_sparse_entries; - btop_i = 0; - - //Commented for CUDA KERNEL - //for( i = 0; i < system->N; ++i ) { - atom_i = &(my_atoms[i]); - type_i = atom_i->type; - start_i = Dev_Start_Index(i, &far_nbrs); - end_i = Dev_End_Index(i, &far_nbrs); - //CHANGE ORIGINAL - //btop_i = 
Dev_Start_Index( i, &bonds ); - btop_i = i * max_bonds; - Dev_Set_Start_Index (i, btop_i, &bonds); - //CHANGE ORIGINAL - - sbp_i = &(sbp[type_i]); - - if( i < n ) { - local = 1; - cutoff = control->nonb_cut; - - //update bond mark here - workspace.bond_mark [i] = 0; - - } - else { - local = 0; - cutoff = control->bond_cut; - - //update bond mark here - workspace.bond_mark [i] = 1000; - } - - ihb = -1; - ihb_top = -1; - //CHANGE ORIGINAL - H->start[i] = Htop; - - if( local ) { - H->entries[Htop].j = i; - H->entries[Htop].val = sbp_i->eta; - ++Htop; - } - //CHANGE ORIGINAL - - if( control->hbond_cut > 0 ) { - ihb = sbp_i->p_hbond; - //CHANGE ORIGINAL - if( ihb == 1 || ihb == 2) { - //CHANGE ORIGINAL - //ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds ); - ihb_top = i * max_hbonds; - Dev_Set_Start_Index (atom_i->Hindex, ihb_top, &hbonds ); - } - else ihb_top = -1; - } - - /* update i-j distance - check if j is within cutoff */ - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(my_atoms[j]); - if( renbr ) { - if(nbr_pj->d <= cutoff) - flag = 1; - else flag = 0; - - if(nbr_pj->d <= control->nonb_cut) - flag2 = 1; - else flag2 = 0; - - } - else{ - if (i < j) { - nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0]; - nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1]; - nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2]; - nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec ); - } else { - nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0]; - nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1]; - nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2]; - nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec ); - } - - if(nbr_pj->d <= SQR (control->nonb_cut)) - flag2 = 1; - else flag2 = 0; - - //if( nbr_pj->d <= SQR(cutoff) ) { - if( nbr_pj->d <= SQR(control->nonb_cut) ) { - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - else { - flag = 0; - } - } - if (flag2) { - ihb = sbp_i->p_hbond; - type_j = atom_j->type; - sbp_j = &(sbp[type_j]); - jhb = sbp_j->p_hbond; 
- if( control->hbond_cut > 0 - && nbr_pj->d <= control->hbond_cut - && (ihb == 2) - && (jhb == 1) - && (i >= n) - && (j < n) - ) - { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //CUDA SPECIFIC - hbonds.select.hbond_list[ihb_top].sym_index = -1; - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f); - - ++ihb_top; - } - - //if ((i < n) || (j < n)) - //if (local || ((i >= n) &&(j < n))) - - flag3 = false; - if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id)) - flag3 = true; - else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id) - flag3 = true; - else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id )) - flag3 = true; - - if (flag3) - { - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]); - r_ij = nbr_pj->d; - - //if (renbr) { - H->entries[Htop].j = j; - if( control->tabulate == 0 ) - H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap); - else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types); - //} - ++Htop; - } - } - - if( flag ){ - type_j = atom_j->type; - r_ij = nbr_pj->d; - sbp_j = &(sbp[type_j]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]); - - if( local ) { - /* H matrix entry */ - /* - if( j < n || atom_i->orig_id < atom_j->orig_id ) {//tryQEq||1 - H->entries[Htop].j = j; - if( control->tabulate == 0 ) - H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap); - else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types); - ++Htop; - } - else if( j < n || atom_i->orig_id > atom_j->orig_id ) {//tryQEq||1 - H->entries[Htop].j = j; - if( control->tabulate == 0 ) - H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap); - else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types); - ++Htop; - } - */ - - //bool condition = !((i >= n) && (j >= n)); - /* hydrogen bond lists */ - if( 
control->hbond_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hbond_cut // && i < j - ) { - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) { - hbonds.select.hbond_list[ihb_top].nbr = j; - if (i < j) - hbonds.select.hbond_list[ihb_top].scl = 1; - else - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //CUDA SPECIFIC - hbonds.select.hbond_list[ihb_top].sym_index = -1; - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f); - - - ++ihb_top; - } - //else if( j < n && ihb == 2 && jhb == 1 ) - else if( ihb == 2 && jhb == 1 && j < n) { - //jhb_top = End_Index( atom_j->Hindex, hbonds ); - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //CUDA SPECIFIC - hbonds.select.hbond_list[ihb_top].sym_index = -1; - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f); - - ++ihb_top; - - //Set_End_Index( atom_j->Hindex, jhb_top+1, hbonds ); - //++num_hbonds; - } - } - } - - - - /* uncorrected bond orders */ - if( nbr_pj->d <= control->bond_cut - && Dev_BOp( bonds, control->bo_cut, - i , btop_i, nbr_pj, sbp_i, sbp_j, twbp, - workspace.dDeltap_self, workspace.total_bond_order) - ) { - //num_bonds += 2; - ++btop_i; - - /* Need to do later... 
since i and j are parallel - if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 ) - workspace->bond_mark[j] = workspace->bond_mark[i] + 1; - else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) { - workspace->bond_mark[i] = workspace->bond_mark[j] + 1; - } - */ - } - } - } - - Dev_Set_End_Index( i, btop_i, &bonds ); - // if( local ) { - H->end[i] = Htop; - // } - //CHANGE ORIGINAL - if(( ihb == 1 || ihb == 2 ) && (ihb_top > 0) && (control->hbond_cut > 0)) - Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds ); - //} Commented for cuda kernel - } - - - - CUDA_GLOBAL void ker_init_bond_mark (int offset, int n, int *bond_mark) - { - int i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - bond_mark [offset + threadIdx.x] = 1000; - } - - CUDA_GLOBAL void New_fix_sym_dbond_indices (reax_list pbonds, int N) - { - int i, nbr; - bond_data *ibond, *jbond; - int atom_j; - - reax_list *bonds = &pbonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - for (int j = Dev_Start_Index (i, bonds); j < Dev_End_Index (i, bonds); j++) - { - ibond = &( bonds->select.bond_list [j] ); - nbr = ibond->nbr; - - for (int k = Dev_Start_Index (nbr, bonds); k < Dev_End_Index (nbr, bonds); k ++) - { - jbond = &( bonds->select.bond_list[ k ] ); - atom_j = jbond->nbr; - - if ( (atom_j == i) ) - { - if (i > nbr) { - ibond->dbond_index = j; - jbond->dbond_index = j; - - ibond->sym_index = k; - jbond->sym_index = j; - } - } - } - } - } - - - CUDA_GLOBAL void New_fix_sym_hbond_indices (reax_atom *my_atoms, reax_list hbonds, int N ) - { - - hbond_data *ihbond, *jhbond; - - int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id > N) return; - - int i = warp_id; - int nbr; - int k; - int start = 
Dev_Start_Index (my_atoms[i].Hindex, &hbonds); - int end = Dev_End_Index (my_atoms[i].Hindex, &hbonds); - int j = start + lane_id; - while (j < end) - { - ihbond = &( hbonds.select.hbond_list [j] ); - nbr = ihbond->nbr; - - int nbrstart = Dev_Start_Index (my_atoms[nbr].Hindex, &hbonds); - int nbrend = Dev_End_Index (my_atoms[nbr].Hindex, &hbonds); - - for (k = nbrstart; k < nbrend; k++) - { - jhbond = &( hbonds.select.hbond_list [k] ); - - if (jhbond->nbr == i){ - ihbond->sym_index = k; - jhbond->sym_index = j; - break; - } - } - - j += __THREADS_PER_ATOM__; - } - } - - //////////////////////// - // HBOND ISSUE - CUDA_GLOBAL void ker_update_bonds (reax_atom *my_atoms, - reax_list bonds, - int n) - { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - my_atoms [i].num_bonds = - MAX(Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS); - } - - CUDA_GLOBAL void ker_update_hbonds (reax_atom *my_atoms, - reax_list hbonds, - int n) - { - int Hindex; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - Hindex = my_atoms[i].Hindex; - my_atoms [i].num_hbonds = - MAX(Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS); - } - //////////////////////// - //////////////////////// - //////////////////////// - - int Cuda_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control, - int step, int n, int N, int numH ) - { - int blocks; - int i, comp, Hindex; - int *index, *end_index; - reax_list *bonds, *hbonds; - reax_atom *my_atoms; - reallocate_data *realloc; - realloc = &( dev_workspace->realloc); - - int max_sp_entries, num_hbonds, num_bonds; - int total_sp_entries; - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - - ker_update_bonds <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, *(*lists + BONDS), - system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - //////////////////////// - // HBOND ISSUE - //FIX - 4 - Added this check for hydrogen bond issue - if ((control->hbond_cut > 0) && (system->numH > 0)){ - ker_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, *(*lists + HBONDS), - system->n); - cudaThreadSynchronize (); - cudaCheckError (); - } - - //validate sparse matrix entries. - memset (host_scratch, 0, 2 * system->N * sizeof (int)); - index = (int *) host_scratch; - end_index = index + system->N; - copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int), - cudaMemcpyDeviceToHost, "sparse_matrix:start" ); - copy_host_device (end_index, dev_workspace->H.end, system->N * sizeof (int), - cudaMemcpyDeviceToHost, "sparse_matrix:end" ); - max_sp_entries = total_sp_entries = 0; - for (i = 0; i < N; i++ ){ - //if (i < N-1) - // comp = index [i+1]; - //else - // comp = dev_workspace->H.m; - - total_sp_entries += end_index [i] - index[i]; - if (end_index [i] - index[i] > system->max_sparse_entries) { - fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n", - step, i, index[i], end_index[i] ); - return FAILURE; - } else if (end_index[i] >= dev_workspace->H.m) { - //SUDHIR_FIX_SPARSE_MATRIX - //TODO move this carver - //TODO move this carver - //TODO move this carver - fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", - system->my_rank, step, i, index[i], end_index[i]); - //TODO move this carver - //TODO move this carver - //TODO move this carver - return FAILURE; - } else { - if (max_sp_entries <= end_index[i] - index [i]) - max_sp_entries = end_index[i] - index [i]; - } - } - //if (max_sp_entries <= end_index[i] - index [i]) - // max_sp_entries = end_index[i] - index [i]; - - //update the current step max_sp_entries; - realloc->Htop = 
max_sp_entries; - fprintf (stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", - system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries); - - if (total_sp_entries >= dev_workspace->H.m) { - fprintf (stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", - system->my_rank, step, dev_workspace->H.m, total_sp_entries); - - return FAILURE; - } - - - //validate Bond list - if (N > 0) { - num_bonds = 0; - - bonds = *lists + BONDS; - memset (host_scratch, 0, 2 * bonds->n * sizeof (int)); - index = (int *) host_scratch; - end_index = index + bonds->n; - - copy_host_device (index, bonds->index, bonds->n * sizeof (int), - cudaMemcpyDeviceToHost, "bonds:index"); - copy_host_device (end_index, bonds->end_index, bonds->n * sizeof (int), - cudaMemcpyDeviceToHost, "bonds:end_index"); - - /* - for (i = 0; i < N; i++) { - if (i < N-1) - comp = index [i+1]; - else - comp = bonds->num_intrs; - - if (end_index [i] > comp) { - fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d str(i+1)=%d\n", - step, i, index[i], end_index[i], comp ); - return FAILURE; - } - - num_bonds += MAX( (end_index[i] - index[i]) * 4, MIN_BONDS); - } - - if (end_index[N-1] >= bonds->num_intrs) { - fprintf( stderr, "step%d-bondchk failed(end): i=N-1 start(i)=%d end(i)=%d num_intrs=%d\n", - step, index[N-1], end_index[N-1], bonds->num_intrs); - return FAILURE; - } - num_bonds = MAX( num_bonds, MIN_CAP*MIN_BONDS ); - //check the condition for reallocation - realloc->num_bonds = num_bonds; - */ - - int max_bonds = 0; - for (i = 0; i < N; i++) { - if (end_index[i] - index[i] >= system->max_bonds) { - fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n", - step, i, index[i], end_index[i], system->max_bonds); - return FAILURE; - } - if (end_index[i] - index[i] >= max_bonds) - max_bonds = index[i] - index[i]; - } - realloc->num_bonds = max_bonds; - - } - - //validate Hbonds list - 
num_hbonds = 0; - // FIX - 4 - added additional check here - if ((numH > 0) && (control->hbond_cut > 0)) { - hbonds = *lists + HBONDS; - memset (host_scratch, 0, 2 * hbonds->n * sizeof (int) + sizeof (reax_atom) * system->N); - index = (int *) host_scratch; - end_index = index + hbonds->n; - my_atoms = (reax_atom *)(end_index + hbonds->n); - - copy_host_device (index, hbonds->index, hbonds->n * sizeof (int), - cudaMemcpyDeviceToHost, "hbonds:index"); - copy_host_device (end_index, hbonds->end_index, hbonds->n * sizeof (int), - cudaMemcpyDeviceToHost, "hbonds:end_index"); - copy_host_device (my_atoms, system->d_my_atoms, system->N * sizeof (reax_atom), - cudaMemcpyDeviceToHost, "system:d_my_atoms"); - - //fprintf (stderr, " Total local atoms: %d \n", n); - - /* - for (i = 0; i < N-1; i++) { - Hindex = my_atoms [i].Hindex; - if (Hindex > -1) - comp = index [Hindex + 1]; - else - comp = hbonds->num_intrs; - - if (end_index [Hindex] > comp) { - fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d str(H+1)=%d\n", - step, i, Hindex, index[Hindex], end_index[Hindex], comp ); - return FAILURE; - } - - num_hbonds += MAX( (end_index [Hindex] - index [Hindex]) * 2, MIN_HBONDS * 2); - } - if (end_index [my_atoms[i].Hindex] > hbonds->num_intrs) { - fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d num_intrs=%d\n", - step, i, Hindex, index[Hindex], end_index[Hindex], hbonds->num_intrs); - return FAILURE; - } - - num_hbonds += MIN( (end_index [my_atoms[i].Hindex] - index [my_atoms[i].Hindex]) * 2, - 2 * MIN_HBONDS); - num_hbonds = MAX( num_hbonds, MIN_CAP*MIN_HBONDS ); - realloc->num_hbonds = num_hbonds; - */ - - int max_hbonds = 0; - for (i = 0; i < N; i++) { - if (end_index[i] - index[i] >= system->max_hbonds) { - fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d end(i)=%d max_hbonds=%d\n", - step, i, index[i], end_index[i], system->max_hbonds); - return FAILURE; - } - if (end_index[i] - index[i] >= max_hbonds) - 
max_hbonds = end_index[i] - index[i]; - } - realloc->num_hbonds = max_hbonds; - } - - return SUCCESS; - } - - CUDA_GLOBAL void ker_init_bond_orders (reax_atom *my_atoms, - reax_list far_nbrs, - reax_list bonds, - real *total_bond_order, - int N) - { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - atom_i = &(my_atoms[i]); - start_i = Dev_Start_Index(i, &far_nbrs); - end_i = Dev_End_Index(i, &far_nbrs); - - for( pj = start_i; pj < end_i; ++pj ) { - // nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - // j = nbr_pj->nbr; - // atom_j = &(my_atoms[j]); - - //total_bond_order [i] ++; - //atom_i->Hindex ++; - } - } - - CUDA_GLOBAL void ker_bond_mark (reax_list p_bonds, storage p_workspace, int N) - { - reax_list *bonds = &( p_bonds ); - storage *workspace = &( p_workspace ); - int j; - - //int i = blockIdx.x * blockDim.x + threadIdx.x; - //if (i >= N) return; - - for (int i = 0; i < N; i++) - for (int k = Dev_Start_Index (i, bonds); k < Dev_End_Index (i, bonds); k++) - { - bond_data *bdata = &( bonds->select.bond_list [k] ); - j = bdata->nbr; - - if (i < j ) { - if ( workspace->bond_mark [j] > (workspace->bond_mark [i] + 1) ) - workspace->bond_mark [j] = workspace->bond_mark [i] + 1; - else if ( workspace->bond_mark [i] > (workspace->bond_mark [j] + 1) ) - workspace->bond_mark [i] = workspace->bond_mark [j] + 1; - } - } - } - - - int Cuda_Init_Forces( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control ) - { - int init_blocks; - int hblocks; - - //init the workspace (bond_mark) - /* - int blocks; - cuda_memset (dev_workspace->bond_mark, 0, sizeof (int) * system->n, "bond_mark"); - - blocks = (system->N - system->n) / DEF_BLOCK_SIZE + - (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - ker_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>> - (system->n, (system->N - system->n), dev_workspace->bond_mark); - cudaThreadSynchronize (); - cudaCheckError (); - */ - //validate total_bond_orders - - //main kernel - init_blocks = (system->N) / DEF_BLOCK_SIZE + - (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1); - //fprintf (stderr, " Total atoms: %d, blocks: %d \n", system->N, init_blocks ); - - // ker_init_bond_orders <<<init_blocks, DEF_BLOCK_SIZE >>> - // ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), - // dev_workspace->total_bond_order, system->N); - // cudaThreadSynchronize (); - // cudaCheckError (); - // fprintf (stderr, " DONE WITH VALIDATION \n"); - - ker_init_forces <<<init_blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, system->reax_param.d_sbp, - system->reax_param.d_tbp, *dev_workspace, - (control_params *)control->d_control_params, - *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), *(*dev_lists + HBONDS), - d_LR, system->n, system->N, system->reax_param.num_atom_types, - //system->max_sparse_entries, ((data->step-data->prev_steps) % control->reneighbor)); - system->max_sparse_entries, (((data->step-data->prev_steps) % control->reneighbor) == 0), - system->max_bonds, system->max_hbonds); - cudaThreadSynchronize (); - cudaCheckError (); - - - //fix - sym_index and dbond_index - New_fix_sym_dbond_indices <<<init_blocks, BLOCK_SIZE>>> - (*(*dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - /////////////////////// - /////////////////////// - // FIX - 4 - HBOND ISSUE - if ((control->hbond_cut > 0 ) && (system->numH > 0)) - { - //make hbond_list symmetric - hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM) / HB_SYM_BLOCK_SIZE + - ((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 
0 : 1); - //New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> - New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> - (system->d_my_atoms, *(*dev_lists + HBONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - } - - //update bond_mark - //ker_bond_mark <<< init_blocks, DEF_BLOCK_SIZE>>> - /* - ker_bond_mark <<< 1, 1>>> - ( *(*dev_lists + BONDS), *dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - //TODO - //1. update the sparse matrix count for reallocation - //2. update the bonds count for reallocation - //3. update the hydrogen bonds count for reallocation - - //Validate lists here. - return Cuda_Validate_Lists (system, workspace, dev_lists, control, - data->step, system->n, system->N, system->numH ); - } - - int Cuda_Init_Forces_noQEq( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control ) - { - //TODO Implement later - // when you figure out the bond_mark usage. - - return FAILURE; - } - - int Cuda_Compute_Bonded_Forces (reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control ) - { - real t_start, t_elapsed; - real *spad = (real *) scratch; - rvec *rvec_spad; - - //1. Bond Order Interactions. - bond_orders.c - t_start = Get_Time( ); - //fprintf (stderr, " Begin Bonded Forces ... 
%d x %d\n", BLOCKS_N, BLOCK_SIZE); - Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, - *dev_workspace, - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, - system->reax_param.d_tbp, *dev_workspace, - *(*dev_lists + BONDS), - system->reax_param.num_atom_types, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - - Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>> - (*dev_workspace, *(*dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>> - (system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, - *dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info( t_start ); - //fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - //fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n"); - - //2. Bond Energy Interactions. 
- bonds.c - t_start = Get_Time( ); - cuda_memset (spad, 0, system->N * ( 2 * sizeof (real)) , "scratch"); - - Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof (real)* BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, - *dev_workspace, *(*dev_lists + BONDS), - system->n, system->reax_param.num_atom_types, spad ); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_BE - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad, spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> - (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info( t_start ); - //fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - //fprintf (stderr, "Cuda_Bond_Energy Done... \n"); - - - //3. Atom Energy Interactions. 
- t_start = Get_Time( ); - cuda_memset (spad, 0, ( 6 * sizeof (real) * system->n ), "scratch"); - - Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, - system->reax_param.d_sbp, system->reax_param.d_tbp, - *dev_workspace, - *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, - spad, spad + 2 * system->n, spad + 4*system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - //CHANGE ORIGINAL - //Cuda_Atom_Energy_PostProcess <<<BLOCKS, BLOCK_SIZE >>> - // ( *(*dev_lists + BONDS), *dev_workspace, system->n ); - Cuda_Atom_Energy_PostProcess <<<BLOCKS_N, BLOCK_SIZE >>> - ( *(*dev_lists + BONDS), *dev_workspace, system->N ); - //CHANGE ORIGINAL - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Lp - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad, spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Ov - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad + 2*system->n, spad + 3*system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_ov, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Un - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad + 4*system->n, spad + 5*system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + 5*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_un, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info( t_start ); 
- //fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - //fprintf (stderr, "test_LonePair_postprocess Done... \n"); - - - //4. Valence Angles Interactions. - t_start = Get_Time( ); - - //THREE BODY CHANGES HERE - cuda_memset(spad, 0, (*dev_lists + BONDS)->num_intrs * sizeof (int), "scratch"); - Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>> - (system->d_my_atoms, - (control_params *)control->d_control_params, - *(*dev_lists + BONDS), - system->n, system->N, (int *)spad); - cudaThreadSynchronize (); - cudaCheckError (); - - - int *thbody = (int *) host_scratch; - memset (thbody, 0, sizeof (int) * (*dev_lists + BONDS)->num_intrs); - copy_host_device (thbody, spad, (*dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, "thb:offsets"); - - int total_3body = thbody [0] * SAFE_ZONE; - for (int x = 1; x < (*dev_lists + BONDS)->num_intrs; x++) { - total_3body += thbody [x]*SAFE_ZONE; - thbody [x] += thbody [x-1]; - } - - system->num_thbodies = thbody [(*dev_lists+BONDS)->num_intrs-1]; - if (!system->init_thblist) - { - system->init_thblist = true; - if(!Dev_Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*dev_lists + THREE_BODIES))) { - fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); - MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); - } - if(!Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*lists + THREE_BODIES))) { - fprintf( stderr, "Problem in initializing three-body list on host. 
Terminating!\n" ); - MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); - } + int Hindex; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + Hindex = my_atoms[i].Hindex; + my_atoms [i].num_hbonds = + MAX(Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS); +} +//////////////////////// +//////////////////////// +//////////////////////// + + +int Cuda_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control, + int step, int n, int N, int numH ) +{ + int blocks; + int i, comp, Hindex; + int *index, *end_index; + reax_list *bonds, *hbonds; + reax_atom *my_atoms; + reallocate_data *realloc; + realloc = &( dev_workspace->realloc); + + int max_sp_entries, num_hbonds, num_bonds; + int total_sp_entries; + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + ker_update_bonds <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, *(*lists + BONDS), + system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + //////////////////////// + // HBOND ISSUE + //FIX - 4 - Added this check for hydrogen bond issue + if ((control->hbond_cut > 0) && (system->numH > 0)){ + ker_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, *(*lists + HBONDS), + system->n); + cudaThreadSynchronize (); + cudaCheckError (); + } + + //validate sparse matrix entries. 
+ memset (host_scratch, 0, 2 * system->N * sizeof (int)); + index = (int *) host_scratch; + end_index = index + system->N; + copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int), + cudaMemcpyDeviceToHost, "sparse_matrix:start" ); + copy_host_device (end_index, dev_workspace->H.end, system->N * sizeof (int), + cudaMemcpyDeviceToHost, "sparse_matrix:end" ); + max_sp_entries = total_sp_entries = 0; + for (i = 0; i < N; i++ ){ + //if (i < N-1) + // comp = index [i+1]; + //else + // comp = dev_workspace->H.m; + + total_sp_entries += end_index [i] - index[i]; + if (end_index [i] - index[i] > system->max_sparse_entries) { + fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n", + step, i, index[i], end_index[i] ); + return FAILURE; + } else if (end_index[i] >= dev_workspace->H.m) { + //SUDHIR_FIX_SPARSE_MATRIX + //TODO move this carver + //TODO move this carver + //TODO move this carver + fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", + system->my_rank, step, i, index[i], end_index[i]); + //TODO move this carver + //TODO move this carver + //TODO move this carver + return FAILURE; + } else { + if (max_sp_entries <= end_index[i] - index [i]) + max_sp_entries = end_index[i] - index [i]; + } + } + //if (max_sp_entries <= end_index[i] - index [i]) + // max_sp_entries = end_index[i] - index [i]; + + //update the current step max_sp_entries; + realloc->Htop = max_sp_entries; + fprintf (stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", + system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries); + + if (total_sp_entries >= dev_workspace->H.m) { + fprintf (stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", + system->my_rank, step, dev_workspace->H.m, total_sp_entries); + + return FAILURE; + } + + + //validate Bond list + if (N > 0) { + num_bonds = 0; + + bonds = *lists + BONDS; + memset 
(host_scratch, 0, 2 * bonds->n * sizeof (int)); + index = (int *) host_scratch; + end_index = index + bonds->n; + + copy_host_device (index, bonds->index, bonds->n * sizeof (int), + cudaMemcpyDeviceToHost, "bonds:index"); + copy_host_device (end_index, bonds->end_index, bonds->n * sizeof (int), + cudaMemcpyDeviceToHost, "bonds:end_index"); + + /* + for (i = 0; i < N; i++) { + if (i < N-1) + comp = index [i+1]; + else + comp = bonds->num_intrs; + + if (end_index [i] > comp) { + fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d str(i+1)=%d\n", + step, i, index[i], end_index[i], comp ); + return FAILURE; + } + + num_bonds += MAX( (end_index[i] - index[i]) * 4, MIN_BONDS); + } + + if (end_index[N-1] >= bonds->num_intrs) { + fprintf( stderr, "step%d-bondchk failed(end): i=N-1 start(i)=%d end(i)=%d num_intrs=%d\n", + step, index[N-1], end_index[N-1], bonds->num_intrs); + return FAILURE; + } + num_bonds = MAX( num_bonds, MIN_CAP*MIN_BONDS ); + //check the condition for reallocation + realloc->num_bonds = num_bonds; + */ + + int max_bonds = 0; + for (i = 0; i < N; i++) { + if (end_index[i] - index[i] >= system->max_bonds) { + fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n", + step, i, index[i], end_index[i], system->max_bonds); + return FAILURE; + } + if (end_index[i] - index[i] >= max_bonds) + max_bonds = end_index[i] - index[i]; + } + realloc->num_bonds = max_bonds; + + } + + //validate Hbonds list + num_hbonds = 0; + // FIX - 4 - added additional check here + if ((numH > 0) && (control->hbond_cut > 0)) { + hbonds = *lists + HBONDS; + memset (host_scratch, 0, 2 * hbonds->n * sizeof (int) + sizeof (reax_atom) * system->N); + index = (int *) host_scratch; + end_index = index + hbonds->n; + my_atoms = (reax_atom *)(end_index + hbonds->n); + + copy_host_device (index, hbonds->index, hbonds->n * sizeof (int), + cudaMemcpyDeviceToHost, "hbonds:index"); + copy_host_device (end_index, hbonds->end_index, hbonds->n * sizeof (int), 
+ cudaMemcpyDeviceToHost, "hbonds:end_index"); + copy_host_device (my_atoms, system->d_my_atoms, system->N * sizeof (reax_atom), + cudaMemcpyDeviceToHost, "system:d_my_atoms"); + + //fprintf (stderr, " Total local atoms: %d \n", n); + + /* + for (i = 0; i < N-1; i++) { + Hindex = my_atoms [i].Hindex; + if (Hindex > -1) + comp = index [Hindex + 1]; + else + comp = hbonds->num_intrs; + + if (end_index [Hindex] > comp) { + fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d str(H+1)=%d\n", + step, i, Hindex, index[Hindex], end_index[Hindex], comp ); + return FAILURE; + } + + num_hbonds += MAX( (end_index [Hindex] - index [Hindex]) * 2, MIN_HBONDS * 2); + } + if (end_index [my_atoms[i].Hindex] > hbonds->num_intrs) { + fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d num_intrs=%d\n", + step, i, Hindex, index[Hindex], end_index[Hindex], hbonds->num_intrs); + return FAILURE; + } + + num_hbonds += MIN( (end_index [my_atoms[i].Hindex] - index [my_atoms[i].Hindex]) * 2, + 2 * MIN_HBONDS); + num_hbonds = MAX( num_hbonds, MIN_CAP*MIN_HBONDS ); + realloc->num_hbonds = num_hbonds; + */ + + int max_hbonds = 0; + for (i = 0; i < N; i++) { + if (end_index[i] - index[i] >= system->max_hbonds) { + fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d end(i)=%d max_hbonds=%d\n", + step, i, index[i], end_index[i], system->max_hbonds); + return FAILURE; + } + if (end_index[i] - index[i] >= max_hbonds) + max_hbonds = end_index[i] - index[i]; + } + realloc->num_hbonds = max_hbonds; + } + + return SUCCESS; +} + + +CUDA_GLOBAL void ker_init_bond_orders (reax_atom *my_atoms, + reax_list far_nbrs, + reax_list bonds, + real *total_bond_order, + int N) +{ + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + atom_i = &(my_atoms[i]); + start_i = Dev_Start_Index(i, &far_nbrs); + end_i = Dev_End_Index(i, 
&far_nbrs); + + for( pj = start_i; pj < end_i; ++pj ) { + // nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + // j = nbr_pj->nbr; + // atom_j = &(my_atoms[j]); + + //total_bond_order [i] ++; + //atom_i->Hindex ++; + } +} + + +CUDA_GLOBAL void ker_bond_mark (reax_list p_bonds, storage p_workspace, int N) +{ + reax_list *bonds = &( p_bonds ); + storage *workspace = &( p_workspace ); + int j; + + //int i = blockIdx.x * blockDim.x + threadIdx.x; + //if (i >= N) return; + + for (int i = 0; i < N; i++) + for (int k = Dev_Start_Index (i, bonds); k < Dev_End_Index (i, bonds); k++) + { + bond_data *bdata = &( bonds->select.bond_list [k] ); + j = bdata->nbr; + + if (i < j ) { + if ( workspace->bond_mark [j] > (workspace->bond_mark [i] + 1) ) + workspace->bond_mark [j] = workspace->bond_mark [i] + 1; + else if ( workspace->bond_mark [i] > (workspace->bond_mark [j] + 1) ) + workspace->bond_mark [i] = workspace->bond_mark [j] + 1; + } + } +} + + +int Cuda_Init_Forces( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + reax_list **lists, output_controls *out_control ) +{ + int init_blocks; + int hblocks; + + //init the workspace (bond_mark) + /* + int blocks; + cuda_memset (dev_workspace->bond_mark, 0, sizeof (int) * system->n, "bond_mark"); + + blocks = (system->N - system->n) / DEF_BLOCK_SIZE + + (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1); + ker_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>> + (system->n, (system->N - system->n), dev_workspace->bond_mark); + cudaThreadSynchronize (); + cudaCheckError (); + */ + //validate total_bond_orders + + //main kernel + init_blocks = (system->N) / DEF_BLOCK_SIZE + + (((system->N % DEF_BLOCK_SIZE) == 0) ? 
0 : 1); + //fprintf (stderr, " Total atoms: %d, blocks: %d \n", system->N, init_blocks ); + + // ker_init_bond_orders <<<init_blocks, DEF_BLOCK_SIZE >>> + // ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), + // dev_workspace->total_bond_order, system->N); + // cudaThreadSynchronize (); + // cudaCheckError (); + // fprintf (stderr, " DONE WITH VALIDATION \n"); + + ker_init_forces <<<init_blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, system->reax_param.d_sbp, + system->reax_param.d_tbp, *dev_workspace, + (control_params *)control->d_control_params, + *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), *(*dev_lists + HBONDS), + d_LR, system->n, system->N, system->reax_param.num_atom_types, + //system->max_sparse_entries, ((data->step-data->prev_steps) % control->reneighbor)); + system->max_sparse_entries, (((data->step-data->prev_steps) % control->reneighbor) == 0), + system->max_bonds, system->max_hbonds); + cudaThreadSynchronize (); + cudaCheckError (); + + + //fix - sym_index and dbond_index + New_fix_sym_dbond_indices <<<init_blocks, BLOCK_SIZE>>> + (*(*dev_lists + BONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + /////////////////////// + /////////////////////// + // FIX - 4 - HBOND ISSUE + if ((control->hbond_cut > 0 ) && (system->numH > 0)) + { + //make hbond_list symmetric + hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM) / HB_SYM_BLOCK_SIZE + + ((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1); + //New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> + New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> + (system->d_my_atoms, *(*dev_lists + HBONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + } + + //update bond_mark + //ker_bond_mark <<< init_blocks, DEF_BLOCK_SIZE>>> + /* + ker_bond_mark <<< 1, 1>>> + ( *(*dev_lists + BONDS), *dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + //TODO + //1. 
update the sparse matrix count for reallocation + //2. update the bonds count for reallocation + //3. update the hydrogen bonds count for reallocation + + //Validate lists here. + return Cuda_Validate_Lists (system, workspace, dev_lists, control, + data->step, system->n, system->N, system->numH ); +} + + +int Cuda_Init_Forces_noQEq( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + reax_list **lists, output_controls *out_control ) +{ + //TODO Implement later + // when you figure out the bond_mark usage. + + return FAILURE; +} + + +int Cuda_Compute_Bonded_Forces (reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + reax_list **lists, output_controls *out_control ) +{ + real t_start, t_elapsed; + real *spad = (real *) scratch; + rvec *rvec_spad; + + //1. Bond Order Interactions. - bond_orders.c + t_start = Get_Time( ); + //fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS_N, BLOCK_SIZE); + Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, + *dev_workspace, + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, + system->reax_param.d_tbp, *dev_workspace, + *(*dev_lists + BONDS), + system->reax_param.num_atom_types, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + + Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>> + (*dev_workspace, *(*dev_lists + BONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>> + (system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, + *dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info( t_start ); + //fprintf (stderr, "Bond Orders... 
return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + //fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n"); + + //2. Bond Energy Interactions. - bonds.c + t_start = Get_Time( ); + cuda_memset (spad, 0, system->N * ( 2 * sizeof (real)) , "scratch"); + + Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof (real)* BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp, + *dev_workspace, *(*dev_lists + BONDS), + system->n, system->reax_param.num_atom_types, spad ); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_BE + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad, spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> + (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info( t_start ); + //fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + //fprintf (stderr, "Cuda_Bond_Energy Done... \n"); + + + //3. Atom Energy Interactions. 
+ t_start = Get_Time( ); + cuda_memset (spad, 0, ( 6 * sizeof (real) * system->n ), "scratch"); + + Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, + system->reax_param.d_sbp, system->reax_param.d_tbp, + *dev_workspace, + *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, + spad, spad + 2 * system->n, spad + 4*system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + //CHANGE ORIGINAL + //Cuda_Atom_Energy_PostProcess <<<BLOCKS, BLOCK_SIZE >>> + // ( *(*dev_lists + BONDS), *dev_workspace, system->n ); + Cuda_Atom_Energy_PostProcess <<<BLOCKS_N, BLOCK_SIZE >>> + ( *(*dev_lists + BONDS), *dev_workspace, system->N ); + //CHANGE ORIGINAL + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Lp + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad, spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Ov + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad + 2*system->n, spad + 3*system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_ov, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Un + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad + 4*system->n, spad + 5*system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (spad + 5*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_un, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info( t_start ); 
+ //fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + //fprintf (stderr, "test_LonePair_postprocess Done... \n"); + + + //4. Valence Angles Interactions. + t_start = Get_Time( ); + + //THREE BODY CHANGES HERE + cuda_memset(spad, 0, (*dev_lists + BONDS)->num_intrs * sizeof (int), "scratch"); + Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>> + (system->d_my_atoms, + (control_params *)control->d_control_params, + *(*dev_lists + BONDS), + system->n, system->N, (int *)spad); + cudaThreadSynchronize (); + cudaCheckError (); + + + int *thbody = (int *) host_scratch; + memset (thbody, 0, sizeof (int) * (*dev_lists + BONDS)->num_intrs); + copy_host_device (thbody, spad, (*dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, "thb:offsets"); + + int total_3body = thbody [0] * SAFE_ZONE; + for (int x = 1; x < (*dev_lists + BONDS)->num_intrs; x++) { + total_3body += thbody [x]*SAFE_ZONE; + thbody [x] += thbody [x-1]; + } + + system->num_thbodies = thbody [(*dev_lists+BONDS)->num_intrs-1]; + if (!system->init_thblist) + { + system->init_thblist = true; + if(!Dev_Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*dev_lists + THREE_BODIES))) { + fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); + MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); + } + if(!Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*lists + THREE_BODIES))) { + fprintf( stderr, "Problem in initializing three-body list on host. 
Terminating!\n" ); + MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); + } #ifdef __CUDA_MEM__ - fprintf (stderr, "Device memory allocated: three body list = %d MB\n", - sizeof (three_body_interaction_data) * total_3body / (1024*1024)); + fprintf (stderr, "Device memory allocated: three body list = %d MB\n", + sizeof (three_body_interaction_data) * total_3body / (1024*1024)); #endif - } else { - //if (((dev_workspace->realloc.num_bonds * DANGER_ZONE) >= (*dev_lists+BONDS)->num_intrs) || - // (system->num_thbodies > (*dev_lists+THREE_BODIES)->num_intrs )) { - //int size = dev_workspace->realloc.num_bonds; - if ((system->num_thbodies >= (*dev_lists+THREE_BODIES)->num_intrs ) || - ((*dev_lists+THREE_BODIES)->n < (*dev_lists+BONDS)->num_intrs) ) { - - int size = (*dev_lists + BONDS)->num_intrs; - - /*Delete Three-body list*/ - Dev_Delete_List( *dev_lists + THREE_BODIES ); - Delete_List ( *lists + THREE_BODIES ); - - fprintf (stderr, "p%d ***** Reallocating the Three-body list threebody.n: %d, bonds.num_intrs: %d, num_thb: %d, thb_entries: %d \n", - system->my_rank, (*dev_lists+THREE_BODIES)->n, (*dev_lists+BONDS)->num_intrs, - system->num_thbodies, (*dev_lists+THREE_BODIES)->num_intrs); + } else { + //if (((dev_workspace->realloc.num_bonds * DANGER_ZONE) >= (*dev_lists+BONDS)->num_intrs) || + // (system->num_thbodies > (*dev_lists+THREE_BODIES)->num_intrs )) { + //int size = dev_workspace->realloc.num_bonds; + if ((system->num_thbodies >= (*dev_lists+THREE_BODIES)->num_intrs ) || + ((*dev_lists+THREE_BODIES)->n < (*dev_lists+BONDS)->num_intrs) ) { + + int size = (*dev_lists + BONDS)->num_intrs; + + /*Delete Three-body list*/ + Dev_Delete_List( *dev_lists + THREE_BODIES ); + Delete_List ( *lists + THREE_BODIES ); + + fprintf (stderr, "p%d ***** Reallocating the Three-body list threebody.n: %d, bonds.num_intrs: %d, num_thb: %d, thb_entries: %d \n", + system->my_rank, (*dev_lists+THREE_BODIES)->n, (*dev_lists+BONDS)->num_intrs, + system->num_thbodies, 
(*dev_lists+THREE_BODIES)->num_intrs); #ifdef __CUDA_MEM__ - fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", - data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies); + fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", + data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies); #endif - /*Recreate Three-body list */ - if(!Dev_Make_List(size, total_3body, TYP_THREE_BODY, *dev_lists + THREE_BODIES )) { - fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); - MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); - } - if(!Make_List(size, total_3body, TYP_THREE_BODY, *lists + THREE_BODIES )) { - fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" ); - MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); - } - } - } - - //copy the indexes into the thb list; - copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1), - cudaMemcpyHostToDevice, "thb:index"); - copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1), - cudaMemcpyHostToDevice, "thb:end_index"); - //THREE_BODY CHANGES HERE - - - cuda_memset (spad, 0, ( 6 * sizeof (real) * system->N + sizeof (rvec) * system->N * 2), "scratch"); - Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>> - ( system->d_my_atoms, - system->reax_param.d_gp, - system->reax_param.d_sbp, system->reax_param.d_thbp, - (control_params *)control->d_control_params, - *dev_workspace, - *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES), - system->n, system->N, system->reax_param.num_atom_types, - spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Ang - k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad, spad + 
system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, BLOCKS_N); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Pen - k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_pen, BLOCKS_N); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Coa - k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad + 4*system->N, spad + 5*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>> - (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_coa, BLOCKS_N); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 6*system->N); - k_reduction_rvec <<<BLOCKS_N, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<<1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>> - ( system->d_my_atoms, - (control_params *)control->d_control_params, - *dev_workspace, - *(*dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info( t_start ); - //fprintf (stderr, "Three_Body_Interactions ... 
Timing %lf \n", t_elapsed ); - //fprintf (stderr, "Three_Body_Interactions Done... \n"); - - - //5. Torsion Angles Interactions. - t_start = Get_Time( ); - cuda_memset (spad, 0, ( 4 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2), "scratch"); - Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_my_atoms, - system->reax_param.d_gp, - system->reax_param.d_fbp, - (control_params *)control->d_control_params, - *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES), - *dev_workspace, - system->n, system->reax_param.num_atom_types, - spad, spad + 2*system->n, (rvec *) (spad + 4*system->n)); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Tor - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad, spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Con - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad + 2*system->n, spad + 3*system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_con, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 4*system->n); - k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>> - (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - //Post process here - Cuda_Torsion_Angles_PostProcess <<< BLOCKS_N, 
BLOCK_SIZE >>> - ( system->d_my_atoms, - *dev_workspace, - *(*dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info( t_start ); - //fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed ); - //fprintf (stderr, " Four_Body_ Done... \n"); - - - //6. Hydrogen Bonds Interactions. - // FIX - 4 - Added additional check here - if ((control->hbond_cut > 0) && (system->numH > 0)) { - - t_start = Get_Time( ); - cuda_memset (spad, 0, ( 2 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2 ), "scratch"); - - - int hbs = ((system->n * HB_KER_THREADS_PER_ATOM)/ HB_BLOCK_SIZE) + - (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); - Cuda_Hydrogen_Bonds_MT <<<hbs, HB_BLOCK_SIZE, - HB_BLOCK_SIZE * (2 * sizeof (real) + 2 * sizeof (rvec)) >>> - //Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE>>> - ( system->d_my_atoms, - system->reax_param.d_sbp, - system->reax_param.d_hbp, - system->reax_param.d_gp, - (control_params *)control->d_control_params, - *dev_workspace, - *(*dev_lists + BONDS), *(*dev_lists + HBONDS), - system->n, system->reax_param.num_atom_types, - spad, (rvec *) (spad + 2*system->n)); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_HB - k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (spad, spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_hb, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 2*system->n); - k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * 
BLOCKS_POW_2 >>> - (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS); - cudaThreadSynchronize (); - cudaCheckError (); - - ////post process step1: - Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof (rvec) >>> - ( system->d_my_atoms, - *dev_workspace, - *(*dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - ////post process step2: - /* - Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof (rvec)>>> - ( system->d_my_atoms, - *dev_workspace, - *(*dev_lists + HBONDS)); - */ - int hnbrs_bl = ((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM)/ HB_POST_PROC_BLOCK_SIZE) + - (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1); - Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_bl, HB_POST_PROC_BLOCK_SIZE, - HB_POST_PROC_BLOCK_SIZE * sizeof (rvec)>>> - ( system->d_my_atoms, - *dev_workspace, - *(*dev_lists + HBONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info( t_start ); - //fprintf (stderr, "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed ); - //fprintf (stderr, "Hydrogen_Bond Done... \n"); - } - - return SUCCESS; - } - - void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - reax_list **lists, output_controls *out_control, - mpi_datatypes *mpi_data ) - { - /* van der Waals and Coulomb interactions */ - Cuda_NonBonded_Energy( system, control, workspace, data, - lists, out_control, (control->tabulate == 0) ? false: true); - } + /*Recreate Three-body list */ + if(!Dev_Make_List(size, total_3body, TYP_THREE_BODY, *dev_lists + THREE_BODIES )) { + fprintf( stderr, "Problem in initializing three-body list. 
Terminating!\n" ); + MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); + } + if(!Make_List(size, total_3body, TYP_THREE_BODY, *lists + THREE_BODIES )) { + fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" ); + MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); + } + } + } + + //copy the indexes into the thb list; + copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1), + cudaMemcpyHostToDevice, "thb:index"); + copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1), + cudaMemcpyHostToDevice, "thb:end_index"); + //THREE_BODY CHANGES HERE + + + cuda_memset (spad, 0, ( 6 * sizeof (real) * system->N + sizeof (rvec) * system->N * 2), "scratch"); + Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>> + ( system->d_my_atoms, + system->reax_param.d_gp, + system->reax_param.d_sbp, system->reax_param.d_thbp, + (control_params *)control->d_control_params, + *dev_workspace, + *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES), + system->n, system->N, system->reax_param.num_atom_types, + spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Ang + k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, BLOCKS_N); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Pen + k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>> + (spad + 3*system->N, &((simulation_data 
*)data->d_simulation_data)->my_en.e_pen, BLOCKS_N); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Coa + k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad + 4*system->N, spad + 5*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>> + (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_coa, BLOCKS_N); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 6*system->N); + k_reduction_rvec <<<BLOCKS_N, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<<1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N >>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>> + ( system->d_my_atoms, + (control_params *)control->d_control_params, + *dev_workspace, + *(*dev_lists + BONDS), + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info( t_start ); + //fprintf (stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed ); + //fprintf (stderr, "Three_Body_Interactions Done... \n"); + + + //5. Torsion Angles Interactions. 
+ t_start = Get_Time( ); + cuda_memset (spad, 0, ( 4 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2), "scratch"); + Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_my_atoms, + system->reax_param.d_gp, + system->reax_param.d_fbp, + (control_params *)control->d_control_params, + *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES), + *dev_workspace, + system->n, system->reax_param.num_atom_types, + spad, spad + 2*system->n, (rvec *) (spad + 4*system->n)); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Tor + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad, spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Con + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad + 2*system->n, spad + 3*system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_con, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 4*system->n); + k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>> + (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + //Post process here + Cuda_Torsion_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>> + ( system->d_my_atoms, + *dev_workspace, + *(*dev_lists + BONDS), + system->N ); + cudaThreadSynchronize (); + 
cudaCheckError (); + + t_elapsed = Get_Timing_Info( t_start ); + //fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed ); + //fprintf (stderr, " Four_Body_ Done... \n"); + + + //6. Hydrogen Bonds Interactions. + // FIX - 4 - Added additional check here + if ((control->hbond_cut > 0) && (system->numH > 0)) { + + t_start = Get_Time( ); + cuda_memset (spad, 0, ( 2 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2 ), "scratch"); + + + int hbs = ((system->n * HB_KER_THREADS_PER_ATOM)/ HB_BLOCK_SIZE) + + (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); + Cuda_Hydrogen_Bonds_MT <<<hbs, HB_BLOCK_SIZE, + HB_BLOCK_SIZE * (2 * sizeof (real) + 2 * sizeof (rvec)) >>> + //Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE>>> + ( system->d_my_atoms, + system->reax_param.d_sbp, + system->reax_param.d_hbp, + system->reax_param.d_gp, + (control_params *)control->d_control_params, + *dev_workspace, + *(*dev_lists + BONDS), *(*dev_lists + HBONDS), + system->n, system->reax_param.num_atom_types, + spad, (rvec *) (spad + 2*system->n)); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_HB + k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (spad, spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_hb, BLOCKS); + cudaThreadSynchronize (); + cudaCheckError (); + + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 2*system->n); + k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>> + (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS); + 
cudaThreadSynchronize (); + cudaCheckError (); + + ////post process step1: + Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof (rvec) >>> + ( system->d_my_atoms, + *dev_workspace, + *(*dev_lists + BONDS), + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + ////post process step2: + /* + Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof (rvec)>>> + ( system->d_my_atoms, + *dev_workspace, + *(*dev_lists + HBONDS)); + */ + int hnbrs_bl = ((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM)/ HB_POST_PROC_BLOCK_SIZE) + + (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1); + Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_bl, HB_POST_PROC_BLOCK_SIZE, + HB_POST_PROC_BLOCK_SIZE * sizeof (rvec)>>> + ( system->d_my_atoms, + *dev_workspace, + *(*dev_lists + HBONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info( t_start ); + //fprintf (stderr, "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed ); + //fprintf (stderr, "Hydrogen_Bond Done... \n"); + } + + return SUCCESS; +} + +void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, + simulation_data *data, storage *workspace, + reax_list **lists, output_controls *out_control, + mpi_datatypes *mpi_data ) +{ + /* van der Waals and Coulomb interactions */ + Cuda_NonBonded_Energy( system, control, workspace, data, + lists, out_control, (control->tabulate == 0) ? 
false: true); +} diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda_hydrogen_bonds.cu index db34b1b8..358c5073 100644 --- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda_hydrogen_bonds.cu @@ -32,731 +32,731 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, - single_body_parameters *sbp, - hbond_parameters *d_hbp, - global_parameters gp, - control_params *control, - storage p_workspace, - reax_list p_bonds, - reax_list p_hbonds, - int n, - int num_atom_types, - real *data_e_hb, - rvec *data_ext_press) + single_body_parameters *sbp, + hbond_parameters *d_hbp, + global_parameters gp, + control_params *control, + storage p_workspace, + reax_list p_bonds, + reax_list p_hbonds, + int n, + int num_atom_types, + real *data_e_hb, + rvec *data_ext_press) { - int i, j, k, pi, pk; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int itr, top; - int num_hb_intrs = 0; - ivec rel_jk; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - reax_list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list, *hbond_jk; - storage *workspace = &( p_workspace ); - - bonds = &( p_bonds ); - bond_list = bonds->select.bond_list; - hbonds = & ( p_hbonds ); - hbond_list = hbonds->select.hbond_list; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - - /* loops below discover the Hydrogen bonds between i-j-k triplets. - here j is H atom and there has to be some bond between i and j. - Hydrogen bond is between j and k. 
- so in this function i->X, j->H, k->Z when we map - variables onto the ones in the handout.*/ - //for( j = 0; j < system->n; ++j ) - /* j has to be of type H */ - if( sbp[ my_atoms[j].type ].p_hbond == 1 ) { - /*set j's variables */ - type_j = my_atoms[j].type; - start_j = Dev_Start_Index(j, bonds); - end_j = Dev_End_Index(j, bonds); - hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds ); - hb_end_j = Dev_End_Index( my_atoms[j].Hindex, hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = my_atoms[i].type; - - if( sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) - hblist[top++] = pi; - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( pk = hb_start_j; pk < hb_end_j; ++pk ) { - /* set k's varibles */ - k = hbond_list[pk].nbr; - type_k = my_atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; - rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - - hbond_jk = &( hbond_list [pk] ); - rvec_MakeZero (hbond_jk->hb_f); - - for( itr = 0; itr < top; ++itr ) { - pi = hblist[itr]; - pbond_ij = &( bonds->select.bond_list[pi] ); - i = pbond_ij->nbr; - - if( my_atoms[i].orig_id != my_atoms[k].orig_id ) { - bo_ij = &(pbond_ij->bo_data); - type_i = my_atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - /* the derivative of cos(theta) */ - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - /* hyrogen bond energy*/ - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / 
hbp->r0_hb - 2.0 ) ); - - //data_e_hb [j] += - e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - data_e_hb [j] += e_hb; - - CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * - (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb; - - /*fprintf( stdout, - "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, - exp_hb3, sin_xhz4, e_hb ); */ - - /* hydrogen bond forces */ - bo_ij->Cdbo += CEhb1; // dbo term - - if( control->virial == 0 ) { - // dcos terms - //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); - //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di ); - rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); - - rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); - - //rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); - //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); - rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk ); - - // dr terms - rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); - - //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); - //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk ); - } - else { - /* for pressure coupling, terms that are not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - //rvec_Add( workspace->f[i], force ); - rvec_Add( pbond_ij->hb_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press ); - - rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); - - ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - rvec_Scale( force, +CEhb2, 
dcos_theta_dk ); - //rvec_Add( workspace->f[k], force ); - rvec_Add( hbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); - // dr terms - rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); - - rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - //rvec_Add( workspace->f[k], force ); - rvec_Add( hbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); - } + int i, j, k, pi, pk; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int itr, top; + int num_hb_intrs = 0; + ivec rel_jk; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + reax_list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list, *hbond_jk; + storage *workspace = &( p_workspace ); + + bonds = &( p_bonds ); + bond_list = bonds->select.bond_list; + hbonds = & ( p_hbonds ); + hbond_list = hbonds->select.hbond_list; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + + /* loops below discover the Hydrogen bonds between i-j-k triplets. + here j is H atom and there has to be some bond between i and j. + Hydrogen bond is between j and k. 
+ so in this function i->X, j->H, k->Z when we map + variables onto the ones in the handout.*/ + //for( j = 0; j < system->n; ++j ) + /* j has to be of type H */ + if( sbp[ my_atoms[j].type ].p_hbond == 1 ) { + /*set j's variables */ + type_j = my_atoms[j].type; + start_j = Dev_Start_Index(j, bonds); + end_j = Dev_End_Index(j, bonds); + hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds ); + hb_end_j = Dev_End_Index( my_atoms[j].Hindex, hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = my_atoms[i].type; + + if( sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) + hblist[top++] = pi; + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( pk = hb_start_j; pk < hb_end_j; ++pk ) { + /* set k's varibles */ + k = hbond_list[pk].nbr; + type_k = my_atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + + hbond_jk = &( hbond_list [pk] ); + rvec_MakeZero (hbond_jk->hb_f); + + for( itr = 0; itr < top; ++itr ) { + pi = hblist[itr]; + pbond_ij = &( bonds->select.bond_list[pi] ); + i = pbond_ij->nbr; + + if( my_atoms[i].orig_id != my_atoms[k].orig_id ) { + bo_ij = &(pbond_ij->bo_data); + type_i = my_atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]); + ++num_hb_intrs; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + /* the derivative of cos(theta) */ + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + /* hyrogen bond energy*/ + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / 
hbp->r0_hb - 2.0 ) ); + + //data_e_hb [j] += + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + data_e_hb [j] += e_hb; + + CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * + (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb; + + /*fprintf( stdout, + "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, + exp_hb3, sin_xhz4, e_hb ); */ + + /* hydrogen bond forces */ + bo_ij->Cdbo += CEhb1; // dbo term + + if( control->virial == 0 ) { + // dcos terms + //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); + //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di ); + rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); + + rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); + + //rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); + //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); + rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk ); + + // dr terms + rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); + + //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); + //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk ); + } + else { + /* for pressure coupling, terms that are not related to bond order + derivatives are added directly into pressure vector/tensor */ + rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + //rvec_Add( workspace->f[i], force ); + rvec_Add( pbond_ij->hb_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press ); + + rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); + + ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + rvec_Scale( force, +CEhb2, 
dcos_theta_dk ); + //rvec_Add( workspace->f[k], force ); + rvec_Add( hbond_jk->hb_f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + // dr terms + rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); + + rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + //rvec_Add( workspace->f[k], force ); + rvec_Add( hbond_jk->hb_f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + } #ifdef TEST_ENERGY - /* fprintf( out_control->ehb, - "%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n", - dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], - dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], - dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); - fprintf( out_control->ehb, "%24.15e%24.15e%24.15e\n", - CEhb1, CEhb2, CEhb3 ); */ - fprintf( out_control->ehb, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb ); + /* fprintf( out_control->ehb, + "%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n", + dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], + dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], + dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); + fprintf( out_control->ehb, "%24.15e%24.15e%24.15e\n", + CEhb1, CEhb2, CEhb3 ); */ + fprintf( out_control->ehb, + //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb ); #endif #ifdef TEST_FORCES - Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term - // dcos terms - rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); - rvec_ScaledAdd( workspace->f_hb[j], 
+CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk ); - // dr terms - rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk ); + Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term + // dcos terms + rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); + rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk ); + // dr terms + rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk ); #endif - } - } - } - } + } + } + } + } } //CUDA_GLOBAL void __launch_bounds__ (256, 4) Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms, CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms, - single_body_parameters *sbp, - hbond_parameters *d_hbp, - global_parameters gp, - control_params *control, - storage p_workspace, - reax_list p_bonds, - reax_list p_hbonds, - int n, - int num_atom_types, - real *data_e_hb, - rvec *data_ext_press) + single_body_parameters *sbp, + hbond_parameters *d_hbp, + global_parameters gp, + control_params *control, + storage p_workspace, + reax_list p_bonds, + reax_list p_hbonds, + int n, + int num_atom_types, + real *data_e_hb, + rvec *data_ext_press) { #if defined( __SM_35__) - real sh_hb; - real sh_cdbo; - rvec sh_atomf; - rvec sh_hf; + real sh_hb; + real sh_cdbo; + rvec sh_atomf; + rvec sh_hf; #else - extern __shared__ real t_hb[]; - extern __shared__ rvec t__f[]; - extern __shared__ rvec t_cdbo[]; - extern __shared__ rvec t_hf []; + extern __shared__ real t_hb[]; + extern __shared__ rvec t__f[]; + extern __shared__ rvec t_cdbo[]; + extern __shared__ rvec t_hf []; - real *sh_hb = t_hb; - real *sh_cdbo = t_hb + blockDim.x; - rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x); - rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); + real *sh_hb = t_hb; + real *sh_cdbo = t_hb + blockDim.x; + rvec *sh_atomf = 
(rvec *)(sh_cdbo + blockDim.x); + rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); #endif - int __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); - - if (warp_id >= n ) return; - - int i, j, k, pi, pk; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int itr, top; - int num_hb_intrs = 0; - ivec rel_jk; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - reax_list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list, *hbond_jk; - storage *workspace = &( p_workspace ); - - bonds = &( p_bonds ); - bond_list = bonds->select.bond_list; - hbonds = & ( p_hbonds ); - hbond_list = hbonds->select.hbond_list; - - /* - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - */ - j = warp_id; - - /* loops below discover the Hydrogen bonds between i-j-k triplets. - here j is H atom and there has to be some bond between i and j. - Hydrogen bond is between j and k. 
- so in this function i->X, j->H, k->Z when we map - variables onto the ones in the handout.*/ - //for( j = 0; j < system->n; ++j ) + int __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + + if (warp_id >= n ) return; + + int i, j, k, pi, pk; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int itr, top; + int num_hb_intrs = 0; + ivec rel_jk; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + reax_list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list, *hbond_jk; + storage *workspace = &( p_workspace ); + + bonds = &( p_bonds ); + bond_list = bonds->select.bond_list; + hbonds = & ( p_hbonds ); + hbond_list = hbonds->select.hbond_list; + + /* + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + */ + j = warp_id; + + /* loops below discover the Hydrogen bonds between i-j-k triplets. + here j is H atom and there has to be some bond between i and j. + Hydrogen bond is between j and k. 
+ so in this function i->X, j->H, k->Z when we map + variables onto the ones in the handout.*/ + //for( j = 0; j < system->n; ++j ) #if defined( __SM_35__) - sh_hb = 0; - rvec_MakeZero ( sh_atomf ); + sh_hb = 0; + rvec_MakeZero ( sh_atomf ); #else - sh_hb [threadIdx.x] = 0; - rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + sh_hb [threadIdx.x] = 0; + rvec_MakeZero ( sh_atomf[ threadIdx.x] ); #endif - /* j has to be of type H */ - if( sbp[ my_atoms[j].type ].p_hbond == 1 ) { - /*set j's variables */ - type_j = my_atoms[j].type; - start_j = Dev_Start_Index(j, bonds); - end_j = Dev_End_Index(j, bonds); - hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds ); - hb_end_j = Dev_End_Index( my_atoms[j].Hindex, hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = my_atoms[i].type; - - if( sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) - hblist[top++] = pi; - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( itr = 0; itr < top; ++itr ) { - pi = hblist[itr]; - pbond_ij = &( bonds->select.bond_list[pi] ); - i = pbond_ij->nbr; + /* j has to be of type H */ + if( sbp[ my_atoms[j].type ].p_hbond == 1 ) { + /*set j's variables */ + type_j = my_atoms[j].type; + start_j = Dev_Start_Index(j, bonds); + end_j = Dev_End_Index(j, bonds); + hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds ); + hb_end_j = Dev_End_Index( my_atoms[j].Hindex, hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = my_atoms[i].type; + + if( sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) + hblist[top++] = pi; + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( itr = 0; itr < top; ++itr ) { + pi = hblist[itr]; + pbond_ij = &( 
bonds->select.bond_list[pi] ); + i = pbond_ij->nbr; #if defined( __SM_35__) - rvec_MakeZero (sh_hf ); - sh_cdbo = 0; + rvec_MakeZero (sh_hf ); + sh_cdbo = 0; #else - rvec_MakeZero (sh_hf [threadIdx.x]); - sh_cdbo [threadIdx.x] = 0; + rvec_MakeZero (sh_hf [threadIdx.x]); + sh_cdbo [threadIdx.x] = 0; #endif - //for( pk = hb_start_j; pk < hb_end_j; ++pk ) { - int loopcount = (hb_end_j - hb_start_j) / HB_KER_THREADS_PER_ATOM + - (((hb_end_j - hb_start_j) % HB_KER_THREADS_PER_ATOM == 0) ? 0 : 1); - - int count = 0; - pk = hb_start_j + lane_id; - while (count < loopcount) - { - - if (pk < hb_end_j) - { - hbond_jk = &( hbond_list [pk] ); - - /* set k's varibles */ - k = hbond_list[pk].nbr; - type_k = my_atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; - rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - } - else k = -1; - - - if( (my_atoms[i].orig_id != my_atoms[k].orig_id) - && (k != -1) ) { - - bo_ij = &(pbond_ij->bo_data); - type_i = my_atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - /* the derivative of cos(theta) */ - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - /* hyrogen bond energy*/ - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / hbp->r0_hb - 2.0 ) ); - - //data_e_hb [j] += - e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //data_e_hb [j] += e_hb; + //for( pk = hb_start_j; pk < hb_end_j; ++pk ) { + int loopcount = (hb_end_j - hb_start_j) / HB_KER_THREADS_PER_ATOM + + (((hb_end_j - hb_start_j) % HB_KER_THREADS_PER_ATOM == 0) ? 
0 : 1); + + int count = 0; + pk = hb_start_j + lane_id; + while (count < loopcount) + { + + if (pk < hb_end_j) + { + hbond_jk = &( hbond_list [pk] ); + + /* set k's varibles */ + k = hbond_list[pk].nbr; + type_k = my_atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + } + else k = -1; + + + if( (my_atoms[i].orig_id != my_atoms[k].orig_id) + && (k != -1) ) { + + bo_ij = &(pbond_ij->bo_data); + type_i = my_atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]); + ++num_hb_intrs; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + /* the derivative of cos(theta) */ + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + /* hyrogen bond energy*/ + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + //data_e_hb [j] += + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + //data_e_hb [j] += e_hb; #if defined( __SM_35__) - sh_hb += e_hb; + sh_hb += e_hb; #else - sh_hb [threadIdx.x] += e_hb; + sh_hb [threadIdx.x] += e_hb; #endif - CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * - (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb; + CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * + (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb; - /*fprintf( stdout, - "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - r_jk, theta, 
hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, - exp_hb3, sin_xhz4, e_hb ); */ + /*fprintf( stdout, + "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, + exp_hb3, sin_xhz4, e_hb ); */ - /* hydrogen bond forces */ - // bo_ij->Cdbo += CEhb1; // dbo term + /* hydrogen bond forces */ + // bo_ij->Cdbo += CEhb1; // dbo term #if defined( __SM_35__) - sh_cdbo += CEhb1; + sh_cdbo += CEhb1; #else - sh_cdbo[threadIdx.x] += CEhb1; + sh_cdbo[threadIdx.x] += CEhb1; #endif - if( control->virial == 0 ) { - // dcos terms - //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); - //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di ); - //rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); + if( control->virial == 0 ) { + // dcos terms + //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); + //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di ); + //rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); #if defined( __SM_35__) - rvec_ScaledAdd( sh_hf , +CEhb2, dcos_theta_di ); + rvec_ScaledAdd( sh_hf , +CEhb2, dcos_theta_di ); #else - rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); + rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); #endif - //rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); + //rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); #if defined( __SM_35__) - rvec_ScaledAdd( sh_atomf , +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( sh_atomf , +CEhb2, dcos_theta_dj ); #else - rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); #endif - //rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); - //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); - rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk ); + //rvec_ScaledAdd( 
workspace->f[k], +CEhb2, dcos_theta_dk ); + //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk ); + rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk ); - // dr terms - //rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); + // dr terms + //rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); #if defined( __SM_35__) - rvec_ScaledAdd( sh_atomf , -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( sh_atomf , -CEhb3/r_jk, dvec_jk ); #else - rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); #endif - //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); - //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk ); - } - else { - /* for pressure coupling, terms that are not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - //rvec_Add( workspace->f[i], force ); - rvec_Add( pbond_ij->hb_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press ); - - rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); - - ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - rvec_Scale( force, +CEhb2, dcos_theta_dk ); - //rvec_Add( workspace->f[k], force ); - rvec_Add( hbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); - // dr terms - rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); - - rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - //rvec_Add( workspace->f[k], force ); - rvec_Add( hbond_jk->hb_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); - } - - } //orid id end - - pk += __THREADS_PER_ATOM__; - count ++; - - } //for itr loop end - - //Reduction here + //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); + 
//atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk ); + } + else { + /* for pressure coupling, terms that are not related to bond order + derivatives are added directly into pressure vector/tensor */ + rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + //rvec_Add( workspace->f[i], force ); + rvec_Add( pbond_ij->hb_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press ); + + rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj ); + + ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + rvec_Scale( force, +CEhb2, dcos_theta_dk ); + //rvec_Add( workspace->f[k], force ); + rvec_Add( hbond_jk->hb_f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + // dr terms + rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); + + rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + //rvec_Add( workspace->f[k], force ); + rvec_Add( hbond_jk->hb_f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press ); + } + + } //orid id end + + pk += __THREADS_PER_ATOM__; + count ++; + + } //for itr loop end + + //Reduction here #if defined( __SM_35__) - for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){ - sh_cdbo += shfl( sh_cdbo, s); - sh_hf[0] += shfl( sh_hf[0], s); - sh_hf[1] += shfl( sh_hf[1], s); - sh_hf[2] += shfl( sh_hf[2], s); - } - //end of the shuffle - if (lane_id == 0) { - bo_ij->Cdbo += sh_cdbo ; - rvec_Add (pbond_ij->hb_f, sh_hf ); - } + for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){ + sh_cdbo += shfl( sh_cdbo, s); + sh_hf[0] += shfl( sh_hf[0], s); + sh_hf[1] += shfl( sh_hf[1], s); + sh_hf[2] += shfl( sh_hf[2], s); + } + //end of the shuffle + if (lane_id == 0) { + bo_ij->Cdbo += sh_cdbo ; + rvec_Add (pbond_ij->hb_f, sh_hf ); + } #else - if (lane_id < 16) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 
16]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); - } - if (lane_id < 8) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); - } - if (lane_id < 4) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); - } - if (lane_id < 2) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); - } - if (lane_id < 1) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); - - bo_ij->Cdbo += sh_cdbo [threadIdx.x]; - rvec_Add (pbond_ij->hb_f, sh_hf [threadIdx.x]); - } + if (lane_id < 16) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); + } + if (lane_id < 8) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); + } + if (lane_id < 4) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); + } + if (lane_id < 2) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); + } + if (lane_id < 1) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); + + bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + rvec_Add (pbond_ij->hb_f, sh_hf [threadIdx.x]); + } #endif - } // for loop hbonds end - } //if Hbond check end + } // for loop hbonds end + } //if Hbond check end #if defined( __SM_35__) - for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){ - sh_hb += shfl( sh_hb, s); - sh_atomf[0] += shfl( sh_atomf[0], s); - sh_atomf[1] += shfl( sh_atomf[1], s); - sh_atomf[2] += shfl( sh_atomf[2], s); - } - if (lane_id == 0){ - data_e_hb[j] += sh_hb; - rvec_Add (workspace->f[j], sh_atomf); - } + for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){ + sh_hb += 
shfl( sh_hb, s); + sh_atomf[0] += shfl( sh_atomf[0], s); + sh_atomf[1] += shfl( sh_atomf[1], s); + sh_atomf[2] += shfl( sh_atomf[2], s); + } + if (lane_id == 0){ + data_e_hb[j] += sh_hb; + rvec_Add (workspace->f[j], sh_atomf); + } #else - if (lane_id < 16){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); - } - if (lane_id < 8){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); - } - if (lane_id < 4){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); - } - if (lane_id < 2){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); - } - if (lane_id < 1){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); - - data_e_hb[j] += sh_hb [threadIdx.x]; - rvec_Add (workspace->f[j], sh_atomf [threadIdx.x]); - } + if (lane_id < 16){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (lane_id < 8){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (lane_id < 4){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (lane_id < 2){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (lane_id < 1){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + + data_e_hb[j] += sh_hb [threadIdx.x]; + rvec_Add (workspace->f[j], sh_atomf [threadIdx.x]); + } #endif - } + } - CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess ( reax_atom *atoms, - storage p_workspace, - reax_list p_bonds, int N) - { - int i, pj; + 
CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess ( reax_atom *atoms, + storage p_workspace, + reax_list p_bonds, int N) + { + int i, pj; - storage *workspace = &( p_workspace ); - bond_data *pbond; - bond_data *sym_index_bond; - reax_list *bonds = &p_bonds; + storage *workspace = &( p_workspace ); + bond_data *pbond; + bond_data *sym_index_bond; + reax_list *bonds = &p_bonds; - i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N) return; - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + pbond = &(bonds->select.bond_list[pj]); + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - //rvec_Add (atoms[i].f, sym_index_bond->hb_f ); - rvec_Add (workspace->f[i], sym_index_bond->hb_f ); - } - } + //rvec_Add (atoms[i].f, sym_index_bond->hb_f ); + rvec_Add (workspace->f[i], sym_index_bond->hb_f ); + } + } - CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs ( reax_atom *atoms, - storage p_workspace, - reax_list p_hbonds ) - { + CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs ( reax_atom *atoms, + storage p_workspace, + reax_list p_hbonds ) + { #if defined(__SM_35__) - rvec __f; + rvec __f; #else - extern __shared__ rvec __f[]; + extern __shared__ rvec __f[]; #endif - int i, pj,j; - int start, end; + int i, pj,j; + int start, end; - storage *workspace = &( p_workspace ); - hbond_data *nbr_pj, *sym_index_nbr; - reax_list *hbonds = &p_hbonds; + storage *workspace = &( p_workspace ); + hbond_data *nbr_pj, *sym_index_nbr; + reax_list *hbonds = &p_hbonds; - i = blockIdx.x; + i = blockIdx.x; - start = Dev_Start_Index (i, hbonds); - end = Dev_End_Index (i, hbonds); - pj = start + threadIdx.x; + start = Dev_Start_Index (i, hbonds); + end = Dev_End_Index (i, hbonds); + pj = start + threadIdx.x; 
#if defined(__SM_35__) - rvec_MakeZero (__f); + rvec_MakeZero (__f); #else - rvec_MakeZero (__f[threadIdx.x]); + rvec_MakeZero (__f[threadIdx.x]); #endif - while (pj < end) - { - nbr_pj = &( hbonds->select.hbond_list[pj] ); - j = nbr_pj->nbr; + while (pj < end) + { + nbr_pj = &( hbonds->select.hbond_list[pj] ); + j = nbr_pj->nbr; - sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); + sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); #if defined(__SM_35__) - rvec_Add (__f, sym_index_nbr->hb_f ); + rvec_Add (__f, sym_index_nbr->hb_f ); #else - rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f ); + rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f ); #endif - pj += blockDim.x; - } + pj += blockDim.x; + } #if defined(__SM_35__) - for (int s = 16; s >= 1; s/=2){ - __f[0] += shfl( __f[0], s); - __f[1] += shfl( __f[1], s); - __f[2] += shfl( __f[2], s); - } - - if (threadIdx.x == 0) - rvec_Add (workspace->f[i], __f); + for (int s = 16; s >= 1; s/=2){ + __f[0] += shfl( __f[0], s); + __f[1] += shfl( __f[1], s); + __f[2] += shfl( __f[2], s); + } + + if (threadIdx.x == 0) + rvec_Add (workspace->f[i], __f); #else - if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); - if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); - if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); - if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); - if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); - - if (threadIdx.x == 0) - //rvec_Add (atoms[i].f, __f[0]); - rvec_Add (workspace->f[i], __f[0]); + if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); + if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); + if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); + if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); + if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); + + if 
(threadIdx.x == 0) + //rvec_Add (atoms[i].f, __f[0]); + rvec_Add (workspace->f[i], __f[0]); #endif - } + } - CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL ( reax_atom *atoms, - storage p_workspace, - reax_list p_hbonds, int N ) - { + CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL ( reax_atom *atoms, + storage p_workspace, + reax_list p_hbonds, int N ) + { #if defined(__SM_35__) - rvec __f; + rvec __f; #else - extern __shared__ rvec __f[]; + extern __shared__ rvec __f[]; #endif - int i, pj,j; - int start, end; + int i, pj,j; + int start, end; - storage *workspace = &( p_workspace ); - hbond_data *nbr_pj, *sym_index_nbr; - reax_list *hbonds = &p_hbonds; + storage *workspace = &( p_workspace ); + hbond_data *nbr_pj, *sym_index_nbr; + reax_list *hbonds = &p_hbonds; - int __THREADS_PER_ATOM__ = HB_POST_PROC_KER_THREADS_PER_ATOM; + int __THREADS_PER_ATOM__ = HB_POST_PROC_KER_THREADS_PER_ATOM; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); - if (warp_id >= N ) return; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + if (warp_id >= N ) return; - i = warp_id; + i = warp_id; - start = Dev_Start_Index (i, hbonds); - end = Dev_End_Index (i, hbonds); - pj = start + lane_id; + start = Dev_Start_Index (i, hbonds); + end = Dev_End_Index (i, hbonds); + pj = start + lane_id; #if defined(__SM_35__) - rvec_MakeZero (__f); + rvec_MakeZero (__f); #else - rvec_MakeZero (__f[threadIdx.x]); + rvec_MakeZero (__f[threadIdx.x]); #endif - while (pj < end) - { - nbr_pj = &( hbonds->select.hbond_list[pj] ); - j = nbr_pj->nbr; + while (pj < end) + { + nbr_pj = &( hbonds->select.hbond_list[pj] ); + j = nbr_pj->nbr; - sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); + sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); #if 
defined(__SM_35__) - rvec_Add (__f, sym_index_nbr->hb_f ); + rvec_Add (__f, sym_index_nbr->hb_f ); #else - rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f ); + rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f ); #endif - pj += __THREADS_PER_ATOM__; - } + pj += __THREADS_PER_ATOM__; + } #if defined(__SM_35__) - for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){ - __f[0] += shfl( __f[0], s); - __f[1] += shfl( __f[1], s); - __f[2] += shfl( __f[2], s); - } - - if (lane_id == 0) - rvec_Add (workspace->f[i], __f); + for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){ + __f[0] += shfl( __f[0], s); + __f[1] += shfl( __f[1], s); + __f[2] += shfl( __f[2], s); + } + + if (lane_id == 0) + rvec_Add (workspace->f[i], __f); #else - if (lane_id < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); - if (lane_id < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); - if (lane_id < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); - if (lane_id < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); - if (lane_id < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); - - if (lane_id == 0) - rvec_Add (workspace->f[i], __f[threadIdx.x]); + if (lane_id < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); + if (lane_id < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); + if (lane_id < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); + if (lane_id < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); + if (lane_id < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); + + if (lane_id == 0) + rvec_Add (workspace->f[i], __f[threadIdx.x]); #endif - } + } diff --git a/PG-PuReMD/src/cuda_init_md.cu b/PG-PuReMD/src/cuda_init_md.cu index 0bce2d22..827a63a3 100644 --- a/PG-PuReMD/src/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda_init_md.cu @@ -6,7 +6,7 @@ void Cuda_Init_ScratchArea () { - cuda_malloc ((void **)& scratch, SCRATCH_SIZE, 1, "Device:Scratch"); + cuda_malloc ((void **)& scratch, SCRATCH_SIZE, 1, "Device:Scratch"); - host_scratch = (void *)malloc 
(HOST_SCRATCH_SIZE ); + host_scratch = (void *)malloc (HOST_SCRATCH_SIZE ); } diff --git a/PG-PuReMD/src/cuda_integrate.cu b/PG-PuReMD/src/cuda_integrate.cu index 4d2d3d93..7f042ce9 100644 --- a/PG-PuReMD/src/cuda_integrate.cu +++ b/PG-PuReMD/src/cuda_integrate.cu @@ -6,92 +6,92 @@ #include "cuda_utils.h" CUDA_GLOBAL void ker_update_velocity_1 (reax_atom *my_atoms, - single_body_parameters *sbp, - real dt, - int n) + single_body_parameters *sbp, + real dt, + int n) { - real inv_m; - rvec dx; - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; + real inv_m; + rvec dx; + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; - /* velocity verlet, 1st part */ - //for( i = 0; i < system->n; i++ ) { - atom = &(my_atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); - //} + /* velocity verlet, 1st part */ + //for( i = 0; i < system->n; i++ ) { + atom = &(my_atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); + //} } void bNVT_update_velocity_part1 (reax_system *system, real dt) { - int blocks; + int blocks; - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - ker_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); - cudaThreadSynchronize (); - cudaCheckError (); + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + ker_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); + cudaThreadSynchronize (); + cudaCheckError (); } CUDA_GLOBAL void ker_update_velocity_2 (reax_atom *my_atoms, - single_body_parameters *sbp, - real dt, - int n) + single_body_parameters *sbp, + real dt, + int n) { - reax_atom *atom; - real inv_m; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; + reax_atom *atom; + real inv_m; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; - /* velocity verlet, 2nd part */ - //for( i = 0; i < system->n; i++ ) { - atom = &(my_atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); - //} + /* velocity verlet, 2nd part */ + //for( i = 0; i < system->n; i++ ) { + atom = &(my_atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); + //} } void bNVT_update_velocity_part2 (reax_system *system, real dt) { - int blocks; + int blocks; - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - ker_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); - cudaThreadSynchronize (); - cudaCheckError (); + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + ker_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n); + cudaThreadSynchronize (); + cudaCheckError (); } CUDA_GLOBAL void ker_scale_velocities (reax_atom *my_atoms, real lambda, int n) { - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; - /* Scale velocities and positions at t+dt */ - //for( i = 0; i < system->n; ++i ) { - atom = &(my_atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); - //} + /* Scale velocities and positions at t+dt */ + //for( i = 0; i < system->n; ++i ) { + atom = &(my_atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); + //} } void bNVT_scale_velocities (reax_system *system, real lambda) { - int blocks; + int blocks; - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - ker_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, lambda, system->n); - cudaThreadSynchronize (); - cudaCheckError (); + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + ker_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, lambda, system->n); + cudaThreadSynchronize (); + cudaCheckError (); } diff --git a/PG-PuReMD/src/cuda_linear_solvers.cu b/PG-PuReMD/src/cuda_linear_solvers.cu index 7ad92cc1..1b1f510c 100644 --- a/PG-PuReMD/src/cuda_linear_solvers.cu +++ b/PG-PuReMD/src/cuda_linear_solvers.cu @@ -31,263 +31,263 @@ void get_from_device (real *host, real *device, unsigned int bytes, char *msg) { - copy_host_device (host, device, bytes, cudaMemcpyDeviceToHost, msg); + copy_host_device (host, device, bytes, cudaMemcpyDeviceToHost, msg); } void put_on_device (real *host, real *device, unsigned int bytes, char *msg) { - copy_host_device (host, device, bytes, cudaMemcpyHostToDevice, msg); + copy_host_device (host, device, bytes, cudaMemcpyHostToDevice, msg); } void Cuda_Vector_Sum (real *res, real a, real *x, real b, real *y, int count) { - //res = ax + by - //use the cublas here - int blocks; - blocks = (count / DEF_BLOCK_SIZE) + - ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> - ( res, a, x, b, y, count ); - cudaThreadSynchronize (); - cudaCheckError (); + //res = ax + by + //use the cublas here + int blocks; + blocks = (count / DEF_BLOCK_SIZE) + + ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> + ( res, a, x, b, y, count ); + cudaThreadSynchronize (); + cudaCheckError (); } void Cuda_CG_Preconditioner (real *res, real *a, real *b, int count) { - //res = a*b - vector multiplication - //use the cublas here. - int blocks; - blocks = (count / DEF_BLOCK_SIZE) + - ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>> - ( res, a, b, count ); - cudaThreadSynchronize (); + //res = a*b - vector multiplication + //use the cublas here. + int blocks; + blocks = (count / DEF_BLOCK_SIZE) + + ((count % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>> + ( res, a, b, count ); + cudaThreadSynchronize (); } CUDA_GLOBAL void k_diagnol_preconditioner (storage p_workspace, rvec2 *b, int n) { - storage *workspace = &( p_workspace ); - int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - - //for( j = 0; j < system->n; ++j ) { - // residual - workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; - workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; - // apply diagonal pre-conditioner - workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; - workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; - //} + storage *workspace = &( p_workspace ); + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + + //for( j = 0; j < system->n; ++j ) { + // residual + workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; + workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; + // apply diagonal pre-conditioner + workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; + workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; + //} } void Cuda_CG_Diagnol_Preconditioner (storage *workspace, rvec2 *b, int n) { - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> - (*workspace, b, n); - cudaThreadSynchronize (); - cudaCheckError (); + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> + (*workspace, b, n); + cudaThreadSynchronize (); + cudaCheckError (); } CUDA_GLOBAL void k_dual_cg_preconditioner (storage p_workspace, rvec2 *x, - real alpha_0, real alpha_1, int n, rvec2 *my_dot) + real alpha_0, real alpha_1, int n, rvec2 *my_dot) { - storage *workspace = &( p_workspace ); - rvec2 alpha; - alpha[0] = alpha_0; - alpha[1] = alpha_1; - - int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - my_dot[j][0] = my_dot[j][1] = 0.0; - - //for( j = 0; j < system->n; ++j ) { - // update x - x[j][0] += alpha[0] * workspace->d2[j][0]; - x[j][1] += alpha[1] * workspace->d2[j][1]; - // update residual - workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; - workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; - // apply diagonal pre-conditioner - workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; - workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; - // dot product: r.p - my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; - my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; - //} + storage *workspace = &( p_workspace ); + rvec2 alpha; + alpha[0] = alpha_0; + alpha[1] = alpha_1; + + int j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + my_dot[j][0] = my_dot[j][1] = 0.0; + + //for( j = 0; j < system->n; ++j ) { + // update x + x[j][0] += alpha[0] * workspace->d2[j][0]; + x[j][1] += alpha[1] * workspace->d2[j][1]; + // update residual + workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; + workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; + // apply diagonal pre-conditioner + workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; + workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; + // dot product: r.p + my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; + my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; + //} } void Cuda_DualCG_Preconditioer (storage 
*workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result) { - int blocks; - rvec2 *tmp = (rvec2 *) scratch; - cuda_memset (tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner"); - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> - (*workspace, x, alpha[0], alpha[1], n, tmp); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction to calculate my_dot - k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> - ( tmp, tmp + n, n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> - ( tmp + n, tmp + 2*n, blocks); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot"); + int blocks; + rvec2 *tmp = (rvec2 *) scratch; + cuda_memset (tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner"); + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> + (*workspace, x, alpha[0], alpha[1], n, tmp); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction to calculate my_dot + k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> + ( tmp, tmp + n, n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> + ( tmp + n, tmp + 2*n, blocks); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot"); } void Cuda_Norm (rvec2 *arr, int n, rvec2 result) { - int blocks; - rvec2 *tmp = (rvec2 *) scratch; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> - (arr, tmp, n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> - (tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), - cudaMemcpyDeviceToHost, "cuda_norm_rvec2"); + int blocks; + rvec2 *tmp = (rvec2 *) scratch; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> + (arr, tmp, n, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> + (tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), + cudaMemcpyDeviceToHost, "cuda_norm_rvec2"); } void Cuda_Dot (rvec2 *a, rvec2 *b, rvec2 result, int n) { - int blocks; - rvec2 *tmp = (rvec2 *) scratch; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> - ( a, b, tmp, n ); - cudaThreadSynchronize (); - cudaCheckError (); - - k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> - //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> - ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), - cudaMemcpyDeviceToHost, "cuda_dot"); + int blocks; + rvec2 *tmp = (rvec2 *) scratch; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> + ( a, b, tmp, n ); + cudaThreadSynchronize (); + cudaCheckError (); + + k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> + //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> + ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), + cudaMemcpyDeviceToHost, "cuda_dot"); } void Cuda_Vector_Sum_Rvec2 (rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) { - int blocks; - - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> - ( x, a, b[0], b[1], c, n); - cudaThreadSynchronize (); - cudaCheckError (); + int blocks; + + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> + ( x, a, b[0], b[1], c, n); + cudaThreadSynchronize (); + cudaCheckError (); } CUDA_GLOBAL void k_rvec2_to_real_copy ( real *dst, rvec2 *src, int index, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; - dst[i] = src[i][index]; + dst[i] = src[i][index]; } void Cuda_RvecCopy_From (real *dst, rvec2 *src, int index, int n) { - int blocks; - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>> - ( dst, src, index, n); - cudaThreadSynchronize (); - cudaCheckError (); + int blocks; + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>> + ( dst, src, index, n); + cudaThreadSynchronize (); + cudaCheckError (); } CUDA_GLOBAL void k_real_to_rvec2_copy ( rvec2 *dst, real *src, int index, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; - dst[i][index] = src[i]; + dst[i][index] = src[i]; } void Cuda_RvecCopy_To (rvec2 *dst, real *src, int index, int n) { - int blocks; - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>> - ( dst, src, index, n); - cudaThreadSynchronize (); - cudaCheckError (); + int blocks; + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>> + ( dst, src, index, n); + cudaThreadSynchronize (); + cudaCheckError (); } void Cuda_Dual_Matvec (sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size) { - int blocks; - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); + int blocks; + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE) == 0 ? 
0 : 1); - cuda_memset (b, 0, sizeof (rvec2) * size, "dual_matvec:result"); + cuda_memset (b, 0, sizeof (rvec2) * size, "dual_matvec:result"); - //One thread per row implementation - //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> - // (*H, a, b, n); - //cudaThreadSynchronize (); - //cudaCheckError (); + //One thread per row implementation + //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> + // (*H, a, b, n); + //cudaThreadSynchronize (); + //cudaCheckError (); - //One warp per row implementation + //One warp per row implementation #if defined(__SM_35__) - k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> + k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> #else - k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, - sizeof (rvec2) * MATVEC_BLOCK_SIZE >>> + k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, + sizeof (rvec2) * MATVEC_BLOCK_SIZE >>> #endif - (*H, a, b, n); - cudaThreadSynchronize (); - cudaCheckError (); + (*H, a, b, n); + cudaThreadSynchronize (); + cudaCheckError (); } void Cuda_Matvec (sparse_matrix *H, real *a, real *b, int n, int size) { - int blocks; - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); + int blocks; + blocks = (n / DEF_BLOCK_SIZE) + + (( n % DEF_BLOCK_SIZE) == 0 ? 
0 : 1); - cuda_memset (b, 0, sizeof (real) * size, "dual_matvec:result"); + cuda_memset (b, 0, sizeof (real) * size, "dual_matvec:result"); - //one thread per row implementation - //k_matvec <<< blocks, DEF_BLOCK_SIZE >>> - // (*H, a, b, n); - //cudaThreadSynchronize (); - //cudaCheckError (); + //one thread per row implementation + //k_matvec <<< blocks, DEF_BLOCK_SIZE >>> + // (*H, a, b, n); + //cudaThreadSynchronize (); + //cudaCheckError (); #if defined(__SM_35__) - k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> + k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> #else - k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, - sizeof (real) * MATVEC_BLOCK_SIZE>>> + k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, + sizeof (real) * MATVEC_BLOCK_SIZE>>> #endif - (*H, a, b, n); - cudaThreadSynchronize (); - cudaCheckError (); + (*H, a, b, n); + cudaThreadSynchronize (); + cudaCheckError (); } diff --git a/PG-PuReMD/src/cuda_lookup.cu b/PG-PuReMD/src/cuda_lookup.cu index 277a5b5d..bad6af13 100644 --- a/PG-PuReMD/src/cuda_lookup.cu +++ b/PG-PuReMD/src/cuda_lookup.cu @@ -7,71 +7,71 @@ void copy_LR_table_to_device (reax_system *system, control_params *control, int *aggregated) { - int i, j, r; - int num_atom_types; - LR_data *d_y; - cubic_spline_coef *temp; - - num_atom_types = system->reax_param.num_atom_types; - - fprintf (stderr, "Copying the LR Lookyp Table to the device ... 
\n"); - - cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), 0, "LR_lookup:table"); - - /* - for( i = 0; i < MAX_ATOM_TYPES; ++i ) - existing_types[i] = 0; - - for( i = 0; i < system->N; ++i ) - existing_types[ system->atoms[i].type ] = 1; - */ - - copy_host_device ( LR, d_LR, sizeof (LR_lookup_table) * (num_atom_types * num_atom_types), - cudaMemcpyHostToDevice, "LR_lookup:table"); - - for( i = 0; i < num_atom_types; ++i ) - if( aggregated [i] ) - for( j = i; j < num_atom_types; ++j ) - - if( aggregated [j] ) { - - cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), 0, "LR_lookup:d_y"); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, - sizeof (LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y"); - copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, - sizeof (LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y"); - - cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:h"); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, - sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h"); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, - sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h"); - - cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:vdW"); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, - sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW"); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW, - sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW"); - - cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:CEvd"); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, - sizeof 
(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd"); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, - sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd"); - - cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ele"); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp, - sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele"); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, - sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele"); - - cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ceclmb"); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp, - sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb"); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, - sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb"); - } - - fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n"); + int i, j, r; + int num_atom_types; + LR_data *d_y; + cubic_spline_coef *temp; + + num_atom_types = system->reax_param.num_atom_types; + + fprintf (stderr, "Copying the LR Lookyp Table to the device ... 
\n"); + + cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), 0, "LR_lookup:table"); + + /* + for( i = 0; i < MAX_ATOM_TYPES; ++i ) + existing_types[i] = 0; + + for( i = 0; i < system->N; ++i ) + existing_types[ system->atoms[i].type ] = 1; + */ + + copy_host_device ( LR, d_LR, sizeof (LR_lookup_table) * (num_atom_types * num_atom_types), + cudaMemcpyHostToDevice, "LR_lookup:table"); + + for( i = 0; i < num_atom_types; ++i ) + if( aggregated [i] ) + for( j = i; j < num_atom_types; ++j ) + + if( aggregated [j] ) { + + cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), 0, "LR_lookup:d_y"); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, + sizeof (LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y"); + copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, + sizeof (LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y"); + + cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:h"); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, + sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h"); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, + sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h"); + + cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:vdW"); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, + sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW"); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW, + sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW"); + + cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:CEvd"); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, + sizeof 
(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd"); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, + sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd"); + + cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ele"); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp, + sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele"); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, + sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele"); + + cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ceclmb"); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp, + sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb"); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, + sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb"); + } + + fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... 
\n"); } diff --git a/PG-PuReMD/src/cuda_multi_body.cu b/PG-PuReMD/src/cuda_multi_body.cu index e3d7c60a..24a51005 100644 --- a/PG-PuReMD/src/cuda_multi_body.cu +++ b/PG-PuReMD/src/cuda_multi_body.cu @@ -27,322 +27,322 @@ CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, - global_parameters gp, - single_body_parameters *sbp, - two_body_parameters *tbp, - storage p_workspace, - reax_list p_bonds, - int n, - int num_atom_types, - real *data_elp, - real *data_eov, - real *data_eun - ) + global_parameters gp, + single_body_parameters *sbp, + two_body_parameters *tbp, + storage p_workspace, + reax_list p_bonds, + int n, + int num_atom_types, + real *data_elp, + real *data_eov, + real *data_eun + ) { - int i, j, pj, type_i, type_j; - real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; - real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; - real exp_ovun2n, exp_ovun6, exp_ovun8; - real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; - real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; - real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond; - bond_order_data *bo_ij; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - reax_list *bonds = &( p_bonds ); - storage *workspace = &( p_workspace ); - - /* Initialize parameters */ - p_lp1 = gp.l[15]; - p_lp3 = gp.l[5]; - p_ovun3 = gp.l[32]; - p_ovun4 = gp.l[31]; - p_ovun6 = gp.l[6]; - p_ovun7 = gp.l[8]; - p_ovun8 = gp.l[9]; - - //for( i = 0; i < system->n; ++i ) { - /* set the parameter pointer */ - type_i = my_atoms[i].type; - sbp_i = &(sbp[ type_i ]); - - /* lone-pair Energy */ - p_lp2 = sbp_i->p_lp2; - expvd2 = EXP( -75 * workspace->Delta_lp[i] ); - inv_expvd2 = 1. / (1. 
+ expvd2 ); - - /* calculate the energy */ - data_elp [i] += e_lp = - p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - - dElp = p_lp2 * inv_expvd2 + - 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); - CElp = dElp * workspace->dDelta_lp[i]; - - workspace->CdDelta[i] += CElp; // lp - 1st term + int i, j, pj, type_i, type_j; + real Delta_lpcorr, dfvl; + real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; + real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; + real e_ov, CEover1, CEover2, CEover3, CEover4; + real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; + real exp_ovun2n, exp_ovun6, exp_ovun8; + real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; + real e_un, CEunder1, CEunder2, CEunder3, CEunder4; + real p_lp1, p_lp2, p_lp3; + real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; + + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond; + bond_order_data *bo_ij; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + reax_list *bonds = &( p_bonds ); + storage *workspace = &( p_workspace ); + + /* Initialize parameters */ + p_lp1 = gp.l[15]; + p_lp3 = gp.l[5]; + p_ovun3 = gp.l[32]; + p_ovun4 = gp.l[31]; + p_ovun6 = gp.l[6]; + p_ovun7 = gp.l[8]; + p_ovun8 = gp.l[9]; + + //for( i = 0; i < system->n; ++i ) { + /* set the parameter pointer */ + type_i = my_atoms[i].type; + sbp_i = &(sbp[ type_i ]); + + /* lone-pair Energy */ + p_lp2 = sbp_i->p_lp2; + expvd2 = EXP( -75 * workspace->Delta_lp[i] ); + inv_expvd2 = 1. / (1. 
+ expvd2 ); + + /* calculate the energy */ + data_elp [i] += e_lp = + p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + + dElp = p_lp2 * inv_expvd2 + + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); + CElp = dElp * workspace->dDelta_lp[i]; + + workspace->CdDelta[i] += CElp; // lp - 1st term #ifdef TEST_ENERGY - // fprintf( out_control->elp, "%24.15e%24.15e%24.15e%24.15e\n", - // p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); - // fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, workspace->nlp[i], - e_lp, data->my_en.e_lp ); + // fprintf( out_control->elp, "%24.15e%24.15e%24.15e%24.15e\n", + // p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); + // fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n", + fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, workspace->nlp[i], + e_lp, data->my_en.e_lp ); #endif #ifdef TEST_FORCES - Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term + Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term #endif - /* correction for C2 */ - if( gp.l[5] > 0.001 && - !cuda_strcmp( sbp[type_i].name, "C", 1 ) ) - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) - if( my_atoms[i].orig_id < - my_atoms[bonds->select.bond_list[pj].nbr].orig_id ) { - j = bonds->select.bond_list[pj].nbr; - type_j = my_atoms[j].type; - - if( !cuda_strcmp( sbp[type_j].name, "C", 1 ) ) { - twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); - - if( vov3 > 3. ) { - data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0); - - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
- 0.16*POW(Di, 3.)); - - bo_ij->Cdbo += deahu2dbo; - workspace->CdDelta[i] += deahu2dsbo; + /* correction for C2 */ + if( gp.l[5] > 0.001 && + !cuda_strcmp( sbp[type_i].name, "C", 1 ) ) + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) + if( my_atoms[i].orig_id < + my_atoms[bonds->select.bond_list[pj].nbr].orig_id ) { + j = bonds->select.bond_list[pj].nbr; + type_j = my_atoms[j].type; + + if( !cuda_strcmp( sbp[type_j].name, "C", 1 ) ) { + twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + Di = workspace->Delta[i]; + vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + + if( vov3 > 3. ) { + data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0); + + deahu2dbo = 2.*p_lp3*(vov3 - 3.); + deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); + + bo_ij->Cdbo += deahu2dbo; + workspace->CdDelta[i] += deahu2dsbo; #ifdef TEST_ENERGY - fprintf(out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - e_lph, deahu2dbo, deahu2dsbo ); + fprintf(out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + e_lph, deahu2dbo, deahu2dsbo ); #endif #ifdef TEST_FORCES - Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); - Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); + Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); + Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); #endif - } - } - } - //} + } + } + } + //} - //for( i = 0; i < system->n; ++i ) { - type_i = my_atoms[i].type; - sbp_i = &(sbp[ type_i ]); + //for( i = 0; i < system->n; ++i ) { + type_i = my_atoms[i].type; + sbp_i = &(sbp[ type_i ]); - /* over-coordination energy */ - if( sbp_i->mass > 21.0 ) - dfvl = 0.0; - else dfvl = 1.0; // only for 1st-row elements + /* over-coordination energy */ + if( sbp_i->mass > 21.0 ) + dfvl = 0.0; + else dfvl = 1.0; // only for 1st-row elements - p_ovun2 = 
sbp_i->p_ovun2; - sum_ovun1 = sum_ovun2 = 0; - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = my_atoms[j].type; - bo_ij = &(bonds->select.bond_list[pj].bo_data); - sbp_j = &(sbp[ type_j ]); - twbp = &(tbp[ index_tbp (type_i, type_j, num_atom_types )]); + p_ovun2 = sbp_i->p_ovun2; + sum_ovun1 = sum_ovun2 = 0; + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = my_atoms[j].type; + bo_ij = &(bonds->select.bond_list[pj].bo_data); + sbp_j = &(sbp[ type_j ]); + twbp = &(tbp[ index_tbp (type_i, type_j, num_atom_types )]); - sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; - sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; + sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* + ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - /*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n", - i+1, j+1, - dfvl * workspace->Delta_lp_temp[j], - sbp_j->nlp_opt, - workspace->nlp_temp[j] );*/ - } + /*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n", + i+1, j+1, + dfvl * workspace->Delta_lp_temp[j], + sbp_j->nlp_opt, + workspace->nlp_temp[j] );*/ + } - exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); - inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); - Delta_lpcorr = workspace->Delta[i] - - (dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1; + exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); + inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); + Delta_lpcorr = workspace->Delta[i] - + (dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1; - exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); - inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); + exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); + inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); - DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8); - CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; + DlpVi = 1.0 / (Delta_lpcorr + 
sbp_i->valency + 1e-8); + CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - data_eov [i] += e_ov = sum_ovun1 * CEover1; + data_eov [i] += e_ov = sum_ovun1 * CEover1; - CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * - (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 )); + CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * + (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 )); - CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 ); + CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 ); - CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); + CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); - /* under-coordination potential */ - p_ovun2 = sbp_i->p_ovun2; - p_ovun5 = sbp_i->p_ovun5; + /* under-coordination potential */ + p_ovun2 = sbp_i->p_ovun2; + p_ovun5 = sbp_i->p_ovun5; - exp_ovun2n = 1.0 / exp_ovun2; - exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); - exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); - inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); - inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); + exp_ovun2n = 1.0 / exp_ovun2; + exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); + exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); + inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); + inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); - data_eun [i] += e_un = - -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + data_eun [i] += e_un = + -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - CEunder1 = inv_exp_ovun2n * - ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 + - p_ovun2 * e_un * exp_ovun2n ); - CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; - CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); - CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; + CEunder1 = inv_exp_ovun2n * + ( p_ovun5 * p_ovun6 * 
exp_ovun6 * inv_exp_ovun8 + + p_ovun2 * e_un * exp_ovun2n ); + CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; + CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); + CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; - /* forces */ - workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term - workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term + /* forces */ + workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term + workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term #ifdef TEST_FORCES - Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor 2nd - Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor 1st + Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor 2nd + Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor 1st #endif - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) { - pbond = &(bonds->select.bond_list[pj]); - j = pbond->nbr; - bo_ij = &(pbond->bo_data); - twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[pbond->nbr].type, - num_atom_types) ]); + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) { + pbond = &(bonds->select.bond_list[pj]); + j = pbond->nbr; + bo_ij = &(pbond->bo_data); + twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[pbond->nbr].type, + num_atom_types) ]); - bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st - //workspace->CdDelta[j] += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - pbond->ae_CdDelta += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a - bo_ij->Cdbopi += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b - bo_ij->Cdbopi2 += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b + bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st + //workspace->CdDelta[j] += CEover4 * (1.0 
- dfvl*workspace->dDelta_lp[j]) * + pbond->ae_CdDelta += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a + bo_ij->Cdbopi += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b + bo_ij->Cdbopi2 += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b - //workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - pbond->ae_CdDelta += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a - bo_ij->Cdbopi += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // UnCoor-2b - bo_ij->Cdbopi2 += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // UnCoor-2b + //workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + pbond->ae_CdDelta += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a + bo_ij->Cdbopi += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // UnCoor-2b + bo_ij->Cdbopi2 += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // UnCoor-2b #ifdef TEST_ENERGY - /* fprintf( out_control->eov, "%6d%12.6f\n", - workspace->reverse_map[j], - // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, - CEover4 * (1.0 - workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2) - */// /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/); - // fprintf( out_control->eov, "%6d%12.6f\n", - // fprintf( out_control->eov, "%6d%24.15e\n", - // system->my_atoms[j].orig_id, - // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, - // CEover4 * (1.0 - workspace->dDelta_lp[j]) * - // (bo_ij->BO_pi + bo_ij->BO_pi2) - // /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/); - - // CEunder4 * (1.0 - workspace->dDelta_lp[j]) * - // (bo_ij->BO_pi + bo_ij->BO_pi2), - // CEunder4 * (workspace->Delta[j] - workspace->Delta_lp[j]) ); + /* fprintf( out_control->eov, 
"%6d%12.6f\n", + workspace->reverse_map[j], + // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, + CEover4 * (1.0 - workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2) + */// /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/); + // fprintf( out_control->eov, "%6d%12.6f\n", + // fprintf( out_control->eov, "%6d%24.15e\n", + // system->my_atoms[j].orig_id, + // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, + // CEover4 * (1.0 - workspace->dDelta_lp[j]) * + // (bo_ij->BO_pi + bo_ij->BO_pi2) + // /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/); + + // CEunder4 * (1.0 - workspace->dDelta_lp[j]) * + // (bo_ij->BO_pi + bo_ij->BO_pi2), + // CEunder4 * (workspace->Delta[j] - workspace->Delta_lp[j]) ); #endif #ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, - workspace->f_ov ); // OvCoor - 1st term - - Add_dDelta( system, lists, j, - CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2), - workspace->f_ov ); // OvCoor - 3a - - Add_dBOpinpi2( system, lists, i, pj, - CEover4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - CEover4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - workspace->f_ov, workspace->f_ov ); // OvCoor - 3b - - Add_dDelta( system, lists, j, - CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2), - workspace->f_un ); // UnCoor - 2a - - Add_dBOpinpi2( system, lists, i, pj, - CEunder4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - CEunder4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - workspace->f_un, workspace->f_un ); // UnCoor - 2b + Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, + workspace->f_ov ); // OvCoor - 1st term + + Add_dDelta( system, lists, j, + CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2), + workspace->f_ov ); // OvCoor - 3a + + Add_dBOpinpi2( system, lists, i, pj, + CEover4 * 
(workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + CEover4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + workspace->f_ov, workspace->f_ov ); // OvCoor - 3b + + Add_dDelta( system, lists, j, + CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2), + workspace->f_un ); // UnCoor - 2a + + Add_dBOpinpi2( system, lists, i, pj, + CEunder4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + CEunder4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + workspace->f_un, workspace->f_un ); // UnCoor - 2b #endif - } + } #ifdef TEST_ENERGY - //fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n", - //fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n", - // system->my_atoms[i].orig_id, workspace->nlp[i], - // e_lp, data->my_en.e_lp ); - - //fprintf( out_control->eov, "%6d%24.15e%24.15e\n", - fprintf( out_control->eov, "%6d%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - e_ov, data->my_en.e_ov + data->my_en.e_un ); - - //fprintf( out_control->eun, "%6d%24.15e%24.15e\n", - fprintf( out_control->eun, "%6d%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - e_un, data->my_en.e_ov + data->my_en.e_un ); + //fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n", + //fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n", + // system->my_atoms[i].orig_id, workspace->nlp[i], + // e_lp, data->my_en.e_lp ); + + //fprintf( out_control->eov, "%6d%24.15e%24.15e\n", + fprintf( out_control->eov, "%6d%12.4f%12.4f\n", + system->my_atoms[i].orig_id, + e_ov, data->my_en.e_ov + data->my_en.e_un ); + + //fprintf( out_control->eun, "%6d%24.15e%24.15e\n", + fprintf( out_control->eun, "%6d%12.4f%12.4f\n", + system->my_atoms[i].orig_id, + e_un, data->my_en.e_ov + data->my_en.e_un ); #endif - //} + //} } CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess ( reax_list p_bonds, - storage p_workspace, int n ) + storage p_workspace, int n ) { - int i,pj; - bond_data *pbond, *sbond; - bond_data *sym_index_bond; + int i,pj; + 
bond_data *pbond, *sbond; + bond_data *sym_index_bond; - reax_list *bonds = &p_bonds; - storage *workspace = &p_workspace; + reax_list *bonds = &p_bonds; + storage *workspace = &p_workspace; - i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n) return; - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ - /* - pbond = &(bonds->select.bond_list[pj]); - dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] ); - workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta; - */ + /* + pbond = &(bonds->select.bond_list[pj]); + dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] ); + workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta; + */ - sbond = &(bonds->select.bond_list [pj]); - sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); - workspace->CdDelta [i] += sym_index_bond->ae_CdDelta; + sbond = &(bonds->select.bond_list [pj]); + sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); + workspace->CdDelta [i] += sym_index_bond->ae_CdDelta; - } + } } diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda_neighbors.cu index 9072de22..e552ab6b 100644 --- a/PG-PuReMD/src/cuda_neighbors.cu +++ b/PG-PuReMD/src/cuda_neighbors.cu @@ -33,681 +33,681 @@ CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) { - int i; - real d_sqr = 0; + int i; + real d_sqr = 0; - for( i = 0; i < 3; ++i ) - if( cp[i] > NEG_INF ) - d_sqr += SQR( cp[i] - x[i] ); + for( i = 0; i < 3; ++i ) + if( cp[i] > NEG_INF ) + d_sqr += SQR( cp[i] - x[i] ); - return d_sqr; + return d_sqr; } -CUDA_GLOBAL void ker_generate_neighbor_lists ( reax_atom *my_atoms, - simulation_box my_ext_box, - grid g, - reax_list far_nbrs, - int n, int N ) +CUDA_GLOBAL void ker_generate_neighbor_lists ( reax_atom *my_atoms, + simulation_box my_ext_box, + grid g, + reax_list far_nbrs, + 
int n, int N ) { - int i, j, k, l, m, itr, num_far; - real d, cutoff; - ivec c, nbrs_x; - rvec dvec; - grid_cell *gci, *gcj; - far_neighbor_data *nbr_data;//, *my_start; - reax_atom *atom1, *atom2; - - l = blockIdx.x * blockDim.x + threadIdx.x; - if (l >= N) return; - - atom1 = &(my_atoms[l]); - num_far = Dev_Start_Index (l, &far_nbrs); - - //get the coordinates of the atom and - //compute the grid cell - /* - i = (int) (my_atoms[ l ].x[0] * g.inv_len[0]); - j = (int) (my_atoms[ l ].x[1] * g.inv_len[1]); - k = (int) (my_atoms[ l ].x[2] * g.inv_len[2]); - */ - if (l < n) { - for (i = 0; i < 3; i++) - { - c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); - if( c[i] >= g.native_end[i] ) - c[i] = g.native_end[i] - 1; - else if( c[i] < g.native_str[i] ) - c[i] = g.native_str[i]; - } - } else { - for (i = 0; i < 3; i++) - { - c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); - if( c[i] < 0 ) c[i] = 0; - else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1; - } - } - - i = c[0]; - j = c[1]; - k = c[2]; - - //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] ); - cutoff = SQR(g.cutoff[index_grid_3d (i, j, k, &g)]); - - itr = 0; - while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { - - ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); - //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); - - if( g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && - (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) - /* pick up another atom from the neighbor cell */ - for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; - m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) { - if(( l < m )) { // prevent recounting same pairs within a gcell - atom2 = &(my_atoms[m]); - dvec[0] = atom2->x[0] - atom1->x[0]; - dvec[1] = atom2->x[1] - atom1->x[1]; - dvec[2] = atom2->x[2] - 
atom1->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff ) { - nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); - nbr_data->nbr = m; - nbr_data->d = SQRT(d); - rvec_Copy( nbr_data->dvec, dvec ); - //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box ); - ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], - -1, g.rel_box[index_grid_3d (i, j, k, &g)] ); - ++num_far; - } - } - /* - if(( l > m )) { // prevent recounting same pairs within a gcell - atom2 = &(my_atoms[m]); - dvec[0] = atom1->x[0] - atom2->x[0]; - dvec[1] = atom1->x[1] - atom2->x[1]; - dvec[2] = atom1->x[2] - atom2->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff ) { - nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); - nbr_data->nbr = m; - nbr_data->d = SQRT(d); - rvec_Copy( nbr_data->dvec, dvec ); - ivec_ScaledSum( nbr_data->rel_box, - -1, gcj->rel_box, 1, gci->rel_box ); - ++num_far; - } - } - */ - } - ++itr; - } - - itr = 0; - while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { - ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); - //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); - cutoff = SQR(g.cutoff[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]); - - if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && - (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) - for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; - m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) { - if(( l > m )) { - atom2 = &(my_atoms[m]); - dvec[0] = atom1->x[0] - atom2->x[0]; - dvec[1] = atom1->x[1] - atom2->x[1]; - dvec[2] = atom1->x[2] - atom2->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff ) { - nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); - nbr_data->nbr = m; - nbr_data->d = SQRT(d); - rvec_Copy( nbr_data->dvec, dvec ); - 
//ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box ); - ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], - -1, g.rel_box[index_grid_3d (i, j, k, &g)] ); - ++num_far; - } - } - } - ++itr; - } - - Dev_Set_End_Index( l, num_far, &far_nbrs ); + int i, j, k, l, m, itr, num_far; + real d, cutoff; + ivec c, nbrs_x; + rvec dvec; + grid_cell *gci, *gcj; + far_neighbor_data *nbr_data;//, *my_start; + reax_atom *atom1, *atom2; + + l = blockIdx.x * blockDim.x + threadIdx.x; + if (l >= N) return; + + atom1 = &(my_atoms[l]); + num_far = Dev_Start_Index (l, &far_nbrs); + + //get the coordinates of the atom and + //compute the grid cell + /* + i = (int) (my_atoms[ l ].x[0] * g.inv_len[0]); + j = (int) (my_atoms[ l ].x[1] * g.inv_len[1]); + k = (int) (my_atoms[ l ].x[2] * g.inv_len[2]); + */ + if (l < n) { + for (i = 0; i < 3; i++) + { + c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); + if( c[i] >= g.native_end[i] ) + c[i] = g.native_end[i] - 1; + else if( c[i] < g.native_str[i] ) + c[i] = g.native_str[i]; + } + } else { + for (i = 0; i < 3; i++) + { + c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); + if( c[i] < 0 ) c[i] = 0; + else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1; + } + } + + i = c[0]; + j = c[1]; + k = c[2]; + + //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] ); + cutoff = SQR(g.cutoff[index_grid_3d (i, j, k, &g)]); + + itr = 0; + while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { + + ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); + //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); + + if( g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && + (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) + /* pick up another atom from the neighbor cell */ + for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], 
nbrs_x[2], &g)]; + m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) { + if(( l < m )) { // prevent recounting same pairs within a gcell + atom2 = &(my_atoms[m]); + dvec[0] = atom2->x[0] - atom1->x[0]; + dvec[1] = atom2->x[1] - atom1->x[1]; + dvec[2] = atom2->x[2] - atom1->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff ) { + nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); + nbr_data->nbr = m; + nbr_data->d = SQRT(d); + rvec_Copy( nbr_data->dvec, dvec ); + //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box ); + ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], + -1, g.rel_box[index_grid_3d (i, j, k, &g)] ); + ++num_far; + } + } + /* + if(( l > m )) { // prevent recounting same pairs within a gcell + atom2 = &(my_atoms[m]); + dvec[0] = atom1->x[0] - atom2->x[0]; + dvec[1] = atom1->x[1] - atom2->x[1]; + dvec[2] = atom1->x[2] - atom2->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff ) { + nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); + nbr_data->nbr = m; + nbr_data->d = SQRT(d); + rvec_Copy( nbr_data->dvec, dvec ); + ivec_ScaledSum( nbr_data->rel_box, + -1, gcj->rel_box, 1, gci->rel_box ); + ++num_far; + } + } + */ + } + ++itr; + } + + itr = 0; + while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { + ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); + //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); + cutoff = SQR(g.cutoff[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]); + + if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && + (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) + for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; + m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) { + if(( l > m )) { + atom2 = &(my_atoms[m]); + dvec[0] = atom1->x[0] - 
atom2->x[0]; + dvec[1] = atom1->x[1] - atom2->x[1]; + dvec[2] = atom1->x[2] - atom2->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff ) { + nbr_data = &(far_nbrs.select.far_nbr_list[num_far]); + nbr_data->nbr = m; + nbr_data->d = SQRT(d); + rvec_Copy( nbr_data->dvec, dvec ); + //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box ); + ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], + -1, g.rel_box[index_grid_3d (i, j, k, &g)] ); + ++num_far; + } + } + } + ++itr; + } + + Dev_Set_End_Index( l, num_far, &far_nbrs ); } -CUDA_GLOBAL void ker_mt_generate_neighbor_lists ( reax_atom *my_atoms, - //CUDA_GLOBAL void __launch_bounds__ (1024) ker_mt_generate_neighbor_lists ( reax_atom *my_atoms, - simulation_box my_ext_box, - grid g, - reax_list far_nbrs, - int n, int N ) - { - - extern __shared__ int __nbr[]; - extern __shared__ int __sofar []; - bool nbrgen; - - int __THREADS_PER_ATOM__ = NB_KER_THREADS_PER_ATOM; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id >= N ) return; - - int *tnbr = __nbr; - int *nbrssofar = __nbr + blockDim.x; - int max, leader; - - int i, j, k, l, m, itr, num_far, ll; - real d, cutoff, cutoff_ji; - ivec c, nbrs_x; - rvec dvec; - grid_cell *gci, *gcj; - far_neighbor_data *nbr_data, *my_start; - reax_atom *atom1, *atom2; - - //l = blockIdx.x * blockDim.x + threadIdx.x; - //if (l >= N) return; - - l = warp_id; - - atom1 = &(my_atoms[l]); - num_far = Dev_Start_Index (l, &far_nbrs); - - my_start = &( far_nbrs.select.far_nbr_list [num_far] ); - - //get the coordinates of the atom and - //compute the grid cell - if (l < n) { - for (i = 0; i < 3; i++) - { - c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); - if( c[i] >= g.native_end[i] ) - c[i] = g.native_end[i] - 1; - else if( c[i] < 
g.native_str[i] ) - c[i] = g.native_str[i]; - } - } else { - for (i = 0; i < 3; i++) - { - c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); - if( c[i] < 0 ) c[i] = 0; - else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1; - } - } - - i = c[0]; - j = c[1]; - k = c[2]; - - //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] ); - - - tnbr[threadIdx.x] = 0; - if (lane_id == 0) { - nbrssofar [my_bucket] = 0; - } - __syncthreads (); - - itr = 0; - //while( (gci->nbrs_x[itr][0]) >= 0 ) { - while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { - - tnbr[threadIdx.x] = 0; - nbrgen = false; - - //ivec_Copy (nbrs_x, gci->nbrs_x[itr] ); - ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); - //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); - - //cutoff = SQR(gci->cutoff); - cutoff = SQR (g.cutoff [index_grid_3d (i, j, k, &g)]); - //cutoff_ji = SQR(gcj->cutoff); - cutoff_ji = SQR(g.cutoff[ index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); - //if( ((gci->str <= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff)) - // || ((gci->str >= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff_ji))) - if( ((g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) - && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff)) - || ((g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) - && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff_ji))) - { - //max = gcj->end - gcj->str; - max = g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] - g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; - tnbr[threadIdx.x] = 0; - nbrgen = false; - //m = lane_id + gcj->str; //0-31 - m = lane_id + g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; //0-31 - int loopcount = max / 
__THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1); - int iterations = 0; - - // pick up another atom from the neighbor cell - //for( m = gcj->str; m < gcj->end; ++m ) - while (iterations < loopcount) { - tnbr [threadIdx.x] = 0; - nbrgen = false; - - //if(( l < m ) && (m < gcj->end)) { // prevent recounting same pairs within a gcell - if(( l < m ) && (m < g.end [index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { // prevent recounting same pairs within a gcell - atom2 = &(my_atoms[m]); - dvec[0] = atom2->x[0] - atom1->x[0]; - dvec[1] = atom2->x[1] - atom1->x[1]; - dvec[2] = atom2->x[2] - atom1->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff ) { - tnbr [threadIdx.x] = 1; - nbrgen = true; - } - } - - //if(( l > m ) && (m < gcj->end)) { - if(( l > m ) && (m < g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { - atom2 = &(my_atoms[m]); - dvec[0] = atom1->x[0] - atom2->x[0]; - dvec[1] = atom1->x[1] - atom2->x[1]; - dvec[2] = atom1->x[2] - atom2->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff_ji ) { - tnbr [threadIdx.x] = 1; - nbrgen = true; - } - } - - //is neighbor generated - if (nbrgen) - { - //do leader selection here - leader = -1; - for (ll = my_bucket *__THREADS_PER_ATOM__; ll < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; ll++) - if (tnbr[ll]){ - leader = ll; - break; - } - - //do the reduction; - if (threadIdx.x == leader) - for (ll = 1; ll < __THREADS_PER_ATOM__; ll++) - tnbr [my_bucket * __THREADS_PER_ATOM__ + ll] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (ll-1)]; - } - - if (nbrgen) - { - //got the indices - nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; - nbr_data->nbr = m; - if (l < m) { - dvec[0] = atom2->x[0] - atom1->x[0]; - dvec[1] = atom2->x[1] - atom1->x[1]; - dvec[2] = atom2->x[2] - atom1->x[2]; - d = rvec_Norm_Sqr( dvec ); - nbr_data->d = SQRT (d); - rvec_Copy( nbr_data->dvec, dvec ); - //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box ); - 
ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], - -1, g.rel_box[index_grid_3d( i, j, k, &g)] ); - } - else { - dvec[0] = atom1->x[0] - atom2->x[0]; - dvec[1] = atom1->x[1] - atom2->x[1]; - dvec[2] = atom1->x[2] - atom2->x[2]; - d = rvec_Norm_Sqr( dvec ); - nbr_data->d = SQRT(d); - rvec_Copy( nbr_data->dvec, dvec ); - //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box ); - /* - CHANGE ORIGINAL - This is a bug in the original code - ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], - -1, g.rel_box[index_grid_3d( i, j, k, &g)] ); - */ - ivec_ScaledSum( nbr_data->rel_box, -1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], - 1, g.rel_box[index_grid_3d( i, j, k, &g)] ); - } - - if (threadIdx.x == leader) - nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; - } - - m += __THREADS_PER_ATOM__; - iterations ++; - - //cleanup - nbrgen = false; - tnbr [threadIdx.x] = 0; - } - } - ++itr; - } - - if (lane_id == 0) - Dev_Set_End_Index (l, num_far + nbrssofar[my_bucket], &far_nbrs); - //Dev_Set_End_Index( l, num_far, &far_nbrs ); - } - - - - CUDA_GLOBAL void ker_count_total_nbrs (reax_list far_nbrs, int N, int *result) - { - //strided access - extern __shared__ int count[]; - unsigned int i = threadIdx.x; - int my_count = 0; - count[i] = 0; - - for (i = threadIdx.x; i < N; i += threadIdx.x + blockDim.x) - count[threadIdx.x] += Dev_Num_Entries (i, &far_nbrs); - - __syncthreads (); - - for (int offset = blockDim.x/2; offset > 0; offset >>=1 ) - if(threadIdx.x < offset) - count [threadIdx.x] += count [threadIdx.x + offset]; - - __syncthreads (); - - if (threadIdx.x == 0) - *result = count [threadIdx.x]; - } - - extern "C" void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, - storage *workspace, reax_list **lists ) - { - int blocks, num_far; - int *d_num_far = (int *) scratch; 
+CUDA_GLOBAL void ker_mt_generate_neighbor_lists ( reax_atom *my_atoms, + //CUDA_GLOBAL void __launch_bounds__ (1024) ker_mt_generate_neighbor_lists ( reax_atom *my_atoms, + simulation_box my_ext_box, + grid g, + reax_list far_nbrs, + int n, int N ) + { + + extern __shared__ int __nbr[]; + extern __shared__ int __sofar []; + bool nbrgen; + + int __THREADS_PER_ATOM__ = NB_KER_THREADS_PER_ATOM; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N ) return; + + int *tnbr = __nbr; + int *nbrssofar = __nbr + blockDim.x; + int max, leader; + + int i, j, k, l, m, itr, num_far, ll; + real d, cutoff, cutoff_ji; + ivec c, nbrs_x; + rvec dvec; + grid_cell *gci, *gcj; + far_neighbor_data *nbr_data, *my_start; + reax_atom *atom1, *atom2; + + //l = blockIdx.x * blockDim.x + threadIdx.x; + //if (l >= N) return; + + l = warp_id; + + atom1 = &(my_atoms[l]); + num_far = Dev_Start_Index (l, &far_nbrs); + + my_start = &( far_nbrs.select.far_nbr_list [num_far] ); + + //get the coordinates of the atom and + //compute the grid cell + if (l < n) { + for (i = 0; i < 3; i++) + { + c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); + if( c[i] >= g.native_end[i] ) + c[i] = g.native_end[i] - 1; + else if( c[i] < g.native_str[i] ) + c[i] = g.native_str[i]; + } + } else { + for (i = 0; i < 3; i++) + { + c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); + if( c[i] < 0 ) c[i] = 0; + else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1; + } + } + + i = c[0]; + j = c[1]; + k = c[2]; + + //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] ); + + + tnbr[threadIdx.x] = 0; + if (lane_id == 0) { + nbrssofar [my_bucket] = 0; + } + __syncthreads (); + + itr = 0; + //while( (gci->nbrs_x[itr][0]) >= 0 ) { + while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { + + tnbr[threadIdx.x] 
= 0; + nbrgen = false; + + //ivec_Copy (nbrs_x, gci->nbrs_x[itr] ); + ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); + //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); + + //cutoff = SQR(gci->cutoff); + cutoff = SQR (g.cutoff [index_grid_3d (i, j, k, &g)]); + //cutoff_ji = SQR(gcj->cutoff); + cutoff_ji = SQR(g.cutoff[ index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); + //if( ((gci->str <= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff)) + // || ((gci->str >= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff_ji))) + if( ((g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) + && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff)) + || ((g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) + && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff_ji))) + { + //max = gcj->end - gcj->str; + max = g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] - g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; + tnbr[threadIdx.x] = 0; + nbrgen = false; + //m = lane_id + gcj->str; //0-31 + m = lane_id + g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; //0-31 + int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 
0 : 1); + int iterations = 0; + + // pick up another atom from the neighbor cell + //for( m = gcj->str; m < gcj->end; ++m ) + while (iterations < loopcount) { + tnbr [threadIdx.x] = 0; + nbrgen = false; + + //if(( l < m ) && (m < gcj->end)) { // prevent recounting same pairs within a gcell + if(( l < m ) && (m < g.end [index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { // prevent recounting same pairs within a gcell + atom2 = &(my_atoms[m]); + dvec[0] = atom2->x[0] - atom1->x[0]; + dvec[1] = atom2->x[1] - atom1->x[1]; + dvec[2] = atom2->x[2] - atom1->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff ) { + tnbr [threadIdx.x] = 1; + nbrgen = true; + } + } + + //if(( l > m ) && (m < gcj->end)) { + if(( l > m ) && (m < g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { + atom2 = &(my_atoms[m]); + dvec[0] = atom1->x[0] - atom2->x[0]; + dvec[1] = atom1->x[1] - atom2->x[1]; + dvec[2] = atom1->x[2] - atom2->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff_ji ) { + tnbr [threadIdx.x] = 1; + nbrgen = true; + } + } + + //is neighbor generated + if (nbrgen) + { + //do leader selection here + leader = -1; + for (ll = my_bucket *__THREADS_PER_ATOM__; ll < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; ll++) + if (tnbr[ll]){ + leader = ll; + break; + } + + //do the reduction; + if (threadIdx.x == leader) + for (ll = 1; ll < __THREADS_PER_ATOM__; ll++) + tnbr [my_bucket * __THREADS_PER_ATOM__ + ll] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (ll-1)]; + } + + if (nbrgen) + { + //got the indices + nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; + nbr_data->nbr = m; + if (l < m) { + dvec[0] = atom2->x[0] - atom1->x[0]; + dvec[1] = atom2->x[1] - atom1->x[1]; + dvec[2] = atom2->x[2] - atom1->x[2]; + d = rvec_Norm_Sqr( dvec ); + nbr_data->d = SQRT (d); + rvec_Copy( nbr_data->dvec, dvec ); + //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box ); + ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( 
nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], + -1, g.rel_box[index_grid_3d( i, j, k, &g)] ); + } + else { + dvec[0] = atom1->x[0] - atom2->x[0]; + dvec[1] = atom1->x[1] - atom2->x[1]; + dvec[2] = atom1->x[2] - atom2->x[2]; + d = rvec_Norm_Sqr( dvec ); + nbr_data->d = SQRT(d); + rvec_Copy( nbr_data->dvec, dvec ); + //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box ); + /* + CHANGE ORIGINAL + This is a bug in the original code + ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], + -1, g.rel_box[index_grid_3d( i, j, k, &g)] ); + */ + ivec_ScaledSum( nbr_data->rel_box, -1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], + 1, g.rel_box[index_grid_3d( i, j, k, &g)] ); + } + + if (threadIdx.x == leader) + nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; + } + + m += __THREADS_PER_ATOM__; + iterations ++; + + //cleanup + nbrgen = false; + tnbr [threadIdx.x] = 0; + } + } + ++itr; + } + + if (lane_id == 0) + Dev_Set_End_Index (l, num_far + nbrssofar[my_bucket], &far_nbrs); + //Dev_Set_End_Index( l, num_far, &far_nbrs ); + } + + + + CUDA_GLOBAL void ker_count_total_nbrs (reax_list far_nbrs, int N, int *result) + { + //strided access + extern __shared__ int count[]; + unsigned int i = threadIdx.x; + int my_count = 0; + count[i] = 0; + + for (i = threadIdx.x; i < N; i += threadIdx.x + blockDim.x) + count[threadIdx.x] += Dev_Num_Entries (i, &far_nbrs); + + __syncthreads (); + + for (int offset = blockDim.x/2; offset > 0; offset >>=1 ) + if(threadIdx.x < offset) + count [threadIdx.x] += count [threadIdx.x + offset]; + + __syncthreads (); + + if (threadIdx.x == 0) + *result = count [threadIdx.x]; + } + + extern "C" void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, + storage *workspace, reax_list **lists ) + { + int blocks, num_far; + int *d_num_far = (int *) scratch; #if defined(LOG_PERFORMANCE) - real t_start=0, t_elapsed=0; + 
real t_start=0, t_elapsed=0; - if( system->my_rank == MASTER_NODE ) - t_start = Get_Time( ); + if( system->my_rank == MASTER_NODE ) + t_start = Get_Time( ); #endif - cuda_memset (d_num_far, 0, sizeof (int), "num_far"); - - //invoke the kernel here - //one thread per atom implementation - /* - blocks = (system->N / NBRS_BLOCK_SIZE) + - ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); - ker_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE>>> - (system->d_my_atoms, system->my_ext_box, system->d_my_grid, - *(*dev_lists + FAR_NBRS), system->n, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - //Multiple threads per atom implementation - blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + - (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); - ker_mt_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE, - //sizeof (int) * (NBRS_BLOCK_SIZE + (NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM)) >>> - sizeof (int) * 2 * (NBRS_BLOCK_SIZE) >>> - (system->d_my_atoms, system->my_ext_box, system->d_my_grid, - *(*dev_lists + FAR_NBRS), system->n, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* - ker_count_total_nbrs <<<1, NBRS_BLOCK_SIZE, sizeof (int) * NBRS_BLOCK_SIZE>>> - (*(*dev_lists + FAR_NBRS), system->N, d_num_far); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (&num_far, d_num_far, sizeof (int), cudaMemcpyDeviceToHost, "num_far"); - */ - - int *index = (int *) host_scratch; - memset (index , 0, 2 * sizeof (int) * system->N); - int *end_index = index + system->N; - - copy_host_device (index, (*dev_lists + FAR_NBRS)->index, - sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:index"); - copy_host_device (end_index, (*dev_lists + FAR_NBRS)->end_index, - sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:end_index"); - - num_far = 0; - for (int i = 0; i < system->N; i++) - num_far = end_index[i] - index[i]; - - dev_workspace->realloc.num_far = 
num_far; + cuda_memset (d_num_far, 0, sizeof (int), "num_far"); + + //invoke the kernel here + //one thread per atom implementation + /* + blocks = (system->N / NBRS_BLOCK_SIZE) + + ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); + ker_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE>>> + (system->d_my_atoms, system->my_ext_box, system->d_my_grid, + *(*dev_lists + FAR_NBRS), system->n, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + //Multiple threads per atom implementation + blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + + (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); + ker_mt_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE, + //sizeof (int) * (NBRS_BLOCK_SIZE + (NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM)) >>> + sizeof (int) * 2 * (NBRS_BLOCK_SIZE) >>> + (system->d_my_atoms, system->my_ext_box, system->d_my_grid, + *(*dev_lists + FAR_NBRS), system->n, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* + ker_count_total_nbrs <<<1, NBRS_BLOCK_SIZE, sizeof (int) * NBRS_BLOCK_SIZE>>> + (*(*dev_lists + FAR_NBRS), system->N, d_num_far); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (&num_far, d_num_far, sizeof (int), cudaMemcpyDeviceToHost, "num_far"); + */ + + int *index = (int *) host_scratch; + memset (index , 0, 2 * sizeof (int) * system->N); + int *end_index = index + system->N; + + copy_host_device (index, (*dev_lists + FAR_NBRS)->index, + sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:index"); + copy_host_device (end_index, (*dev_lists + FAR_NBRS)->end_index, + sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:end_index"); + + num_far = 0; + for (int i = 0; i < system->N; i++) + num_far = end_index[i] - index[i]; + + dev_workspace->realloc.num_far = num_far; #if defined(LOG_PERFORMANCE) - if( system->my_rank == MASTER_NODE ) { - t_elapsed = Get_Timing_Info( t_start ); - data->timing.nbrs += t_elapsed; 
- } + if( system->my_rank == MASTER_NODE ) { + t_elapsed = Get_Timing_Info( t_start ); + data->timing.nbrs += t_elapsed; + } #endif #if defined(DEBUG_FOCUS) - fprintf( stderr, "p%d @ step%d: nbrs done - num_far=%d\n", - system->my_rank, data->step, num_far ); - MPI_Barrier( MPI_COMM_WORLD ); + fprintf( stderr, "p%d @ step%d: nbrs done - num_far=%d\n", + system->my_rank, data->step, num_far ); + MPI_Barrier( MPI_COMM_WORLD ); #endif - } - - CUDA_GLOBAL void ker_estimate_neighbors ( reax_atom *my_atoms, - simulation_box my_ext_box, - grid g, - int n, - int N, - int *indices) - { - int i, j, k, l, m, itr, num_far; - real d, cutoff; - rvec dvec, c; - ivec nbrs_x; - grid_cell *gci, *gcj; - far_neighbor_data *nbr_data;//, *my_start; - reax_atom *atom1, *atom2; - - l = blockIdx.x * blockDim.x + threadIdx.x; - if (l >= N) return; - - num_far = 0; - atom1 = &(my_atoms[l]); - indices [l] = 0; - - //if (atom1->orig_id < 0) return; - - //get the coordinates of the atom and - //compute the grid cell - if (l < n) { - for (i = 0; i < 3; i++) - { - c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); - if( c[i] >= g.native_end[i] ) - c[i] = g.native_end[i] - 1; - else if( c[i] < g.native_str[i] ) - c[i] = g.native_str[i]; - } - } else { - for (i = 0; i < 3; i++) - { - c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); - if( c[i] < 0 ) c[i] = 0; - else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1; - } - } - - i = c[0]; - j = c[1]; - k = c[2]; - - //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] ); - //cutoff = SQR(gci->cutoff); - cutoff = SQR(g.cutoff [index_grid_3d (i, j, k, &g) ]); - - itr = 0; - while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { - ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); - //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); - - if( //(g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) && - 
(Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) - { - // pick up another atom from the neighbor cell - for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; - m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) - { - if( l < m ) { // prevent recounting same pairs within a gcell - atom2 = &(my_atoms[m]); - dvec[0] = atom2->x[0] - atom1->x[0]; - dvec[1] = atom2->x[1] - atom1->x[1]; - dvec[2] = atom2->x[2] - atom1->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff ) { - num_far ++; - } - } - } - } - ++itr; - - } - - itr = 0; - while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { - ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); - //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); - cutoff = SQR(g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]); - - if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && - (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) - { - // pick up another atom from the neighbor cell - for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; - m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) - { - if( l > m ) { // prevent recounting same pairs within a gcell - atom2 = &(my_atoms[m]); - dvec[0] = atom2->x[0] - atom1->x[0]; - dvec[1] = atom2->x[1] - atom1->x[1]; - dvec[2] = atom2->x[2] - atom1->x[2]; - d = rvec_Norm_Sqr( dvec ); - if( d <= cutoff ) { - num_far ++; - } - } - } - } - ++itr; - } - - indices [l] = num_far;// * SAFE_ZONE; - } - - void Cuda_Estimate_Neighbors( reax_system *system, int *nbr_indices ) - { - int blocks, num_nbrs; - int *indices = (int *) scratch; - reax_list *far_nbrs; - - cuda_memset (indices, 0, sizeof (int) * system->total_cap, - "neighbors:indices"); - - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - ker_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, (system->my_ext_box), system->d_my_grid, - system->n, system->N, indices); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (nbr_indices, indices, sizeof (int) * system->total_cap, - cudaMemcpyDeviceToHost, "nbrs:indices"); - } - - void Cuda_Init_Neighbors_Indices (int *indices, int entries) - { - reax_list *far_nbrs = *dev_lists + FAR_NBRS; - - copy_host_device (indices, (far_nbrs->index + 1), (entries -1) * sizeof (int), - cudaMemcpyHostToDevice, "nbrs:index"); - copy_host_device (indices, (far_nbrs->end_index + 1), (entries-1) * sizeof (int), - cudaMemcpyHostToDevice, "nbrs:end_index"); - } - - void Cuda_Init_HBond_Indices (int *indices, int entries) - { - reax_list *hbonds = *dev_lists + HBONDS; - - for (int i = 1 ; i < entries; i++) - indices [i] += indices [i-1]; - - copy_host_device (indices, hbonds->index + 1, (entries-1) * sizeof (int), - cudaMemcpyHostToDevice, "hbonds:index"); - copy_host_device (indices, hbonds->end_index + 1, (entries-1) * sizeof (int), - cudaMemcpyHostToDevice, "hbonds:end_index"); - } - - void Cuda_Init_Bond_Indices (int *indices, int entries, int num_intrs) - { - reax_list *bonds = *dev_lists + BONDS; - - indices[0] = MAX( indices[0]*2, MIN_BONDS); - for (int i = 1 ; i < entries; i++) { - indices[i] = MAX( indices[i]*2, MIN_BONDS); - } - - for (int i = 1 ; i < entries; i++) { - indices[i] += indices[i-1]; - } - - copy_host_device (indices, (bonds->index + 1), (entries - 1) * sizeof (int), - cudaMemcpyHostToDevice, "bonds:index"); - copy_host_device (indices, (bonds->end_index + 1), (entries - 1) * sizeof (int), - cudaMemcpyHostToDevice, "bonds:end_index"); - - for (int i = 1 ; i < entries; i++) - if (indices [i] > num_intrs) { - fprintf (stderr, "We have a problem here ==> %d index: %d, num_intrs: %d \n", - i, indices[i], num_intrs); - exit (-1); - } - } - - /* - - CUDA_GLOBAL void ker_validate_neighbors (reax_atom *my_atoms, 
- reax_list far_nbrs, - int N) - { - int i, j, pj; - far_neighbor_data *nbr_pj; - reax_atom *atom_i; - int start_i, end_i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - atom_i = &( my_atoms[i] ); - start_i = Dev_Start_Index (i, &far_nbrs ); - end_i = Dev_End_Index (i, &far_nbrs ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - nbr_pj->d = 0; - rvec_MakeZero (nbr_pj->dvec); - } - } - - void validate_neighbors (reax_system *system) - { - int blocks; - blocks = (system->N / NBRS_BLOCK_SIZE) + - ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); - ker_validate_neighbors <<< blocks, NBRS_BLOCK_SIZE>>> - (system->d_my_atoms, *(*dev_lists + FAR_NBRS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - fprintf (stderr, " Neighbors validated and is fine... \n"); - } - - */ + } + + CUDA_GLOBAL void ker_estimate_neighbors ( reax_atom *my_atoms, + simulation_box my_ext_box, + grid g, + int n, + int N, + int *indices) + { + int i, j, k, l, m, itr, num_far; + real d, cutoff; + rvec dvec, c; + ivec nbrs_x; + grid_cell *gci, *gcj; + far_neighbor_data *nbr_data;//, *my_start; + reax_atom *atom1, *atom2; + + l = blockIdx.x * blockDim.x + threadIdx.x; + if (l >= N) return; + + num_far = 0; + atom1 = &(my_atoms[l]); + indices [l] = 0; + + //if (atom1->orig_id < 0) return; + + //get the coordinates of the atom and + //compute the grid cell + if (l < n) { + for (i = 0; i < 3; i++) + { + c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); + if( c[i] >= g.native_end[i] ) + c[i] = g.native_end[i] - 1; + else if( c[i] < g.native_str[i] ) + c[i] = g.native_str[i]; + } + } else { + for (i = 0; i < 3; i++) + { + c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); + if( c[i] < 0 ) c[i] = 0; + else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1; + } + } + + i = c[0]; + j = c[1]; + k = c[2]; + + //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] ); + //cutoff = 
SQR(gci->cutoff); + cutoff = SQR(g.cutoff [index_grid_3d (i, j, k, &g) ]); + + itr = 0; + while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { + ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); + //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); + + if( //(g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) && + (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) + { + // pick up another atom from the neighbor cell + for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; + m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) + { + if( l < m ) { // prevent recounting same pairs within a gcell + atom2 = &(my_atoms[m]); + dvec[0] = atom2->x[0] - atom1->x[0]; + dvec[1] = atom2->x[1] - atom1->x[1]; + dvec[2] = atom2->x[2] - atom1->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff ) { + num_far ++; + } + } + } + } + ++itr; + + } + + itr = 0; + while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { + ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] ); + //gcj = &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]); + cutoff = SQR(g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]); + + if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] && + (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) + { + // pick up another atom from the neighbor cell + for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; + m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) + { + if( l > m ) { // prevent recounting same pairs within a gcell + atom2 = &(my_atoms[m]); + dvec[0] = atom2->x[0] - atom1->x[0]; + dvec[1] = atom2->x[1] - atom1->x[1]; + dvec[2] = atom2->x[2] - atom1->x[2]; + d = rvec_Norm_Sqr( dvec ); + if( d <= cutoff ) { + num_far 
++; + } + } + } + } + ++itr; + } + + indices [l] = num_far;// * SAFE_ZONE; + } + + void Cuda_Estimate_Neighbors( reax_system *system, int *nbr_indices ) + { + int blocks, num_nbrs; + int *indices = (int *) scratch; + reax_list *far_nbrs; + + cuda_memset (indices, 0, sizeof (int) * system->total_cap, + "neighbors:indices"); + + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); + ker_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, (system->my_ext_box), system->d_my_grid, + system->n, system->N, indices); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (nbr_indices, indices, sizeof (int) * system->total_cap, + cudaMemcpyDeviceToHost, "nbrs:indices"); + } + + void Cuda_Init_Neighbors_Indices (int *indices, int entries) + { + reax_list *far_nbrs = *dev_lists + FAR_NBRS; + + copy_host_device (indices, (far_nbrs->index + 1), (entries -1) * sizeof (int), + cudaMemcpyHostToDevice, "nbrs:index"); + copy_host_device (indices, (far_nbrs->end_index + 1), (entries-1) * sizeof (int), + cudaMemcpyHostToDevice, "nbrs:end_index"); + } + + void Cuda_Init_HBond_Indices (int *indices, int entries) + { + reax_list *hbonds = *dev_lists + HBONDS; + + for (int i = 1 ; i < entries; i++) + indices [i] += indices [i-1]; + + copy_host_device (indices, hbonds->index + 1, (entries-1) * sizeof (int), + cudaMemcpyHostToDevice, "hbonds:index"); + copy_host_device (indices, hbonds->end_index + 1, (entries-1) * sizeof (int), + cudaMemcpyHostToDevice, "hbonds:end_index"); + } + + void Cuda_Init_Bond_Indices (int *indices, int entries, int num_intrs) + { + reax_list *bonds = *dev_lists + BONDS; + + indices[0] = MAX( indices[0]*2, MIN_BONDS); + for (int i = 1 ; i < entries; i++) { + indices[i] = MAX( indices[i]*2, MIN_BONDS); + } + + for (int i = 1 ; i < entries; i++) { + indices[i] += indices[i-1]; + } + + copy_host_device (indices, (bonds->index + 1), (entries - 1) * sizeof (int), + cudaMemcpyHostToDevice, 
"bonds:index"); + copy_host_device (indices, (bonds->end_index + 1), (entries - 1) * sizeof (int), + cudaMemcpyHostToDevice, "bonds:end_index"); + + for (int i = 1 ; i < entries; i++) + if (indices [i] > num_intrs) { + fprintf (stderr, "We have a problem here ==> %d index: %d, num_intrs: %d \n", + i, indices[i], num_intrs); + exit (-1); + } + } + + /* + + CUDA_GLOBAL void ker_validate_neighbors (reax_atom *my_atoms, + reax_list far_nbrs, + int N) + { + int i, j, pj; + far_neighbor_data *nbr_pj; + reax_atom *atom_i; + int start_i, end_i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + atom_i = &( my_atoms[i] ); + start_i = Dev_Start_Index (i, &far_nbrs ); + end_i = Dev_End_Index (i, &far_nbrs ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + nbr_pj->d = 0; + rvec_MakeZero (nbr_pj->dvec); + } + } + + void validate_neighbors (reax_system *system) + { + int blocks; + blocks = (system->N / NBRS_BLOCK_SIZE) + + ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); + ker_validate_neighbors <<< blocks, NBRS_BLOCK_SIZE>>> + (system->d_my_atoms, *(*dev_lists + FAR_NBRS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + fprintf (stderr, " Neighbors validated and is fine... 
\n"); + } + + */ diff --git a/PG-PuReMD/src/cuda_nonbonded.cu b/PG-PuReMD/src/cuda_nonbonded.cu index 6dc60b06..15eae7bc 100644 --- a/PG-PuReMD/src/cuda_nonbonded.cu +++ b/PG-PuReMD/src/cuda_nonbonded.cu @@ -30,590 +30,590 @@ #include "cuda_shuffle.h" CUDA_GLOBAL void ker_vdW_coulomb_energy( - //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy( - reax_atom *my_atoms, - two_body_parameters *tbp, - global_parameters gp, - control_params *control, - storage p_workspace, - reax_list p_far_nbrs, - int n, int N, int num_atom_types, - real *data_e_vdW, real *data_e_ele, - rvec *data_ext_press) - { + //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy( + reax_atom *my_atoms, + two_body_parameters *tbp, + global_parameters gp, + control_params *control, + storage p_workspace, + reax_list p_far_nbrs, + int n, int N, int num_atom_types, + real *data_e_vdW, real *data_e_ele, + rvec *data_ext_press) + { #if defined(__SM_35__) - real sh_vdw; - real sh_ele; - rvec sh_force; + real sh_vdw; + real sh_ele; + rvec sh_force; #else - extern __shared__ real _vdw[]; - extern __shared__ real _ele[]; - extern __shared__ rvec _force []; + extern __shared__ real _vdw[]; + extern __shared__ real _ele[]; + extern __shared__ rvec _force []; - real *sh_vdw; - real *sh_ele; - rvec *sh_force; + real *sh_vdw; + real *sh_ele; + rvec *sh_force; #endif - int i, j, pj, natoms; - int start_i, end_i, orig_i, orig_j; - real p_vdW1, p_vdW1i; - real powr_vdW1, powgi_vdW1; - real tmp, r_ij, fn13, exp1, exp2; - real Tap, dTap, dfn13, CEvd, CEclmb, de_core; - real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core; - rvec temp, ext_press; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_list *far_nbrs; - storage *workspace = &( p_workspace ); - // rtensor temp_rtensor, total_rtensor; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warpid = thread_id / VDW_KER_THREADS_PER_ATOM; - int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); + int i, j, 
pj, natoms; + int start_i, end_i, orig_i, orig_j; + real p_vdW1, p_vdW1i; + real powr_vdW1, powgi_vdW1; + real tmp, r_ij, fn13, exp1, exp2; + real Tap, dTap, dfn13, CEvd, CEclmb, de_core; + real dr3gamij_1, dr3gamij_3; + real e_ele, e_vdW, e_core; + rvec temp, ext_press; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_list *far_nbrs; + storage *workspace = &( p_workspace ); + // rtensor temp_rtensor, total_rtensor; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warpid = thread_id / VDW_KER_THREADS_PER_ATOM; + int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); #if defined(__SM_35__) - sh_vdw = 0.0; - sh_ele = 0.0; - rvec_MakeZero ( sh_force ); + sh_vdw = 0.0; + sh_ele = 0.0; + rvec_MakeZero ( sh_force ); #else - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - sh_force = (rvec *)( _vdw + 2*blockDim.x); + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + sh_force = (rvec *)( _vdw + 2*blockDim.x); - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); #endif - //i = blockIdx.x * blockDim.x + threadIdx.x; - //if (i >= N) return; - i = warpid; - - if (i < N) - { - natoms = n; - far_nbrs = &( p_far_nbrs ); - p_vdW1 = gp.l[28]; - p_vdW1i = 1.0 / p_vdW1; - e_core = 0; - e_vdW = 0; - - data_e_vdW [i] = 0; - data_e_ele [i] = 0; - - //for( i = 0; i < natoms; ++i ) { - start_i = Dev_Start_Index(i, far_nbrs); - end_i = Dev_End_Index(i, far_nbrs); - orig_i = my_atoms[i].orig_id; - //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i ); - - //for( pj = start_i; pj < end_i; ++pj ) - pj = start_i + laneid; - while (pj < end_i) - { - - nbr_pj = &(far_nbrs->select.far_nbr_list[pj]); - j = nbr_pj->nbr; - orig_j = my_atoms[j].orig_id; - - if( nbr_pj->d <= control->nonb_cut && - (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j)) - || ((i > j) && (i < natoms) && (j < natoms)) - 
|| (i > j && i >= natoms && j < natoms && orig_j < orig_i))) - { // ji with j >= n - r_ij = nbr_pj->d; - twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[j].type, num_atom_types) ]); - - /* Calculate Taper and its derivative */ - // Tap = nbr_pj->Tap; -- precomputed during compte_H - Tap = workspace->Tap[7] * r_ij + workspace->Tap[6]; - Tap = Tap * r_ij + workspace->Tap[5]; - Tap = Tap * r_ij + workspace->Tap[4]; - Tap = Tap * r_ij + workspace->Tap[3]; - Tap = Tap * r_ij + workspace->Tap[2]; - Tap = Tap * r_ij + workspace->Tap[1]; - Tap = Tap * r_ij + workspace->Tap[0]; - - dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6]; - dTap = dTap * r_ij + 5*workspace->Tap[5]; - dTap = dTap * r_ij + 4*workspace->Tap[4]; - dTap = dTap * r_ij + 3*workspace->Tap[3]; - dTap = dTap * r_ij + 2*workspace->Tap[2]; - dTap += workspace->Tap[1]/r_ij; - - /*vdWaals Calculations*/ - if(gp.vdw_type==1 || gp.vdw_type==3) - { // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - e_vdW = twbp->D * (exp1 - 2.0 * exp2); - - //data_e_vdW [i] += Tap * e_vdW; - // data_e_vdW [i] += Tap * e_vdW / 2.0; + //i = blockIdx.x * blockDim.x + threadIdx.x; + //if (i >= N) return; + i = warpid; + + if (i < N) + { + natoms = n; + far_nbrs = &( p_far_nbrs ); + p_vdW1 = gp.l[28]; + p_vdW1i = 1.0 / p_vdW1; + e_core = 0; + e_vdW = 0; + + data_e_vdW [i] = 0; + data_e_ele [i] = 0; + + //for( i = 0; i < natoms; ++i ) { + start_i = Dev_Start_Index(i, far_nbrs); + end_i = Dev_End_Index(i, far_nbrs); + orig_i = my_atoms[i].orig_id; + //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i ); + + //for( pj = start_i; pj < end_i; ++pj ) + pj = start_i + laneid; + while (pj < end_i) + { + + nbr_pj = &(far_nbrs->select.far_nbr_list[pj]); + j = nbr_pj->nbr; + orig_j = 
my_atoms[j].orig_id; + + if( nbr_pj->d <= control->nonb_cut && + (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j)) + || ((i > j) && (i < natoms) && (j < natoms)) + || (i > j && i >= natoms && j < natoms && orig_j < orig_i))) + { // ji with j >= n + r_ij = nbr_pj->d; + twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[j].type, num_atom_types) ]); + + /* Calculate Taper and its derivative */ + // Tap = nbr_pj->Tap; -- precomputed during compte_H + Tap = workspace->Tap[7] * r_ij + workspace->Tap[6]; + Tap = Tap * r_ij + workspace->Tap[5]; + Tap = Tap * r_ij + workspace->Tap[4]; + Tap = Tap * r_ij + workspace->Tap[3]; + Tap = Tap * r_ij + workspace->Tap[2]; + Tap = Tap * r_ij + workspace->Tap[1]; + Tap = Tap * r_ij + workspace->Tap[0]; + + dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6]; + dTap = dTap * r_ij + 5*workspace->Tap[5]; + dTap = dTap * r_ij + 4*workspace->Tap[4]; + dTap = dTap * r_ij + 3*workspace->Tap[3]; + dTap = dTap * r_ij + 2*workspace->Tap[2]; + dTap += workspace->Tap[1]/r_ij; + + /*vdWaals Calculations*/ + if(gp.vdw_type==1 || gp.vdw_type==3) + { // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + e_vdW = twbp->D * (exp1 - 2.0 * exp2); + + //data_e_vdW [i] += Tap * e_vdW; + // data_e_vdW [i] += Tap * e_vdW / 2.0; #if defined(__SM_35__) - sh_vdw += Tap * e_vdW / 2.0; + sh_vdw += Tap * e_vdW / 2.0; #else - sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0; + sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0; #endif - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 - 2.0); - CEvd = dTap * e_vdW - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 
- r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + CEvd = dTap * e_vdW - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; + } + else{ // no shielding + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - e_vdW = twbp->D * (exp1 - 2.0 * exp2); + e_vdW = twbp->D * (exp1 - 2.0 * exp2); - //data_e_vdW [i] += Tap * e_vdW; - //data_e_vdW [i] += Tap * e_vdW / 2.0; + //data_e_vdW [i] += Tap * e_vdW; + //data_e_vdW [i] += Tap * e_vdW / 2.0; #if defined(__SM_35__) - sh_vdw += Tap * e_vdW / 2.0; + sh_vdw += Tap * e_vdW / 2.0; #else - sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0; + sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0; #endif - CEvd = dTap * e_vdW - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); - } + CEvd = dTap * e_vdW - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); + } - if(gp.vdw_type==2 || gp.vdw_type==3) - { // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); + if(gp.vdw_type==2 || gp.vdw_type==3) + { // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - //data_e_vdW [i] += Tap * e_core; - //data_e_vdW [i] += Tap * e_core / 2.0; + //data_e_vdW [i] += Tap * e_core; + //data_e_vdW [i] += Tap * e_core / 2.0; #if defined(__SM_35__) - sh_vdw += Tap * e_core / 2.0; + sh_vdw += Tap * e_core / 2.0; #else - sh_vdw[ threadIdx.x ] += Tap * e_core / 2.0; + sh_vdw[ threadIdx.x ] += Tap * e_core / 2.0; #endif - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += dTap * e_core + Tap * de_core; - } + de_core = -(twbp->acore/twbp->rcore) * e_core; + CEvd += dTap * e_core + Tap * de_core; + } - /*Coulomb Calculations*/ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + /*Coulomb Calculations*/ + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - 
tmp = Tap / dr3gamij_3; - //data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; - e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; - //data_e_ele [i] += e_ele; - //data_e_ele [i] += e_ele / 2.0; + tmp = Tap / dr3gamij_3; + //data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; + e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp; + //data_e_ele [i] += e_ele; + //data_e_ele [i] += e_ele / 2.0; #if defined(__SM_35__) - sh_ele += e_ele / 2.0; + sh_ele += e_ele / 2.0; #else - sh_ele [ threadIdx.x ] += e_ele / 2.0; + sh_ele [ threadIdx.x ] += e_ele / 2.0; #endif - CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * - ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - // fprintf( fout, "%5d %5d %10.6f %10.6f\n", - // MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), - // MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), - // CEvd, CEclmb ); + CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * + ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; + // fprintf( fout, "%5d %5d %10.6f %10.6f\n", + // MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), + // MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), + // CEvd, CEclmb ); - if( control->virial == 0 ) { - if ( i < j ) - //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); + if( control->virial == 0 ) { + if ( i < j ) + //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); #if defined (__SM_35__) - rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec ); #else - rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec ); #endif - else - //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec ); + else + //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec ); #if defined (__SM_35__) - rvec_ScaledAdd( sh_force , +(CEvd + 
CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec ); #else - rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec ); #endif - //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec ); - } - else { /* NPT, iNPT or sNPT */ - /* for pressure coupling, terms not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - rvec_ScaledAdd( workspace->f[i], -1., temp ); - rvec_Add( workspace->f[j], temp ); - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Add( data_ext_press [i], ext_press ); - - // fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f) - // force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n", - // i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2], - // temp[0], temp[1], temp[2], - // data->ext_press[0], data->ext_press[1], data->ext_press[2] ); - } + //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec ); + } + else { /* NPT, iNPT or sNPT */ + /* for pressure coupling, terms not related to bond order + derivatives are added directly into pressure vector/tensor */ + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + + rvec_ScaledAdd( workspace->f[i], -1., temp ); + rvec_Add( workspace->f[j], temp ); + + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + rvec_Add( data_ext_press [i], ext_press ); + + // fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f) + // force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n", + // i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2], + // temp[0], temp[1], temp[2], + // data->ext_press[0], data->ext_press[1], data->ext_press[2] ); + } #ifdef TEST_ENERGY - // fprintf( out_control->evdw, - // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", - // workspace->Tap[7],workspace->Tap[6],workspace->Tap[5], - // 
workspace->Tap[4],workspace->Tap[3],workspace->Tap[2], - // workspace->Tap[1], Tap ); - //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - r_ij, e_vdW, data->my_en.e_vdW ); - //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - r_ij, system->my_atoms[i].q, system->my_atoms[j].q, - e_ele, data->my_en.e_ele ); + // fprintf( out_control->evdw, + // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", + // workspace->Tap[7],workspace->Tap[6],workspace->Tap[5], + // workspace->Tap[4],workspace->Tap[3],workspace->Tap[2], + // workspace->Tap[1], Tap ); + //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", + fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + r_ij, e_vdW, data->my_en.e_vdW ); + //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + r_ij, system->my_atoms[i].q, system->my_atoms[j].q, + e_ele, data->my_en.e_ele ); #endif #ifdef TEST_FORCES - rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); #endif - } + } - pj += VDW_KER_THREADS_PER_ATOM; + pj += VDW_KER_THREADS_PER_ATOM; - } - // } - } // if i < N + 
} + // } + } // if i < N #if defined( __SM_35__) - for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){ - sh_vdw += shfl( sh_vdw, x); - sh_ele += shfl( sh_ele, x ); - sh_force[0] += shfl( sh_force[0], x ); - sh_force[1] += shfl( sh_force[1], x ); - sh_force[2] += shfl( sh_force[2], x ); - } - - if (laneid == 0) { - data_e_vdW[i] += sh_vdw; - data_e_ele[i] += sh_ele; - rvec_Add (workspace->f[i], sh_force ); - } + for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){ + sh_vdw += shfl( sh_vdw, x); + sh_ele += shfl( sh_ele, x ); + sh_force[0] += shfl( sh_force[0], x ); + sh_force[1] += shfl( sh_force[1], x ); + sh_force[2] += shfl( sh_force[2], x ); + } + + if (laneid == 0) { + data_e_vdW[i] += sh_vdw; + data_e_ele[i] += sh_ele; + rvec_Add (workspace->f[i], sh_force ); + } #else - __syncthreads (); - - if (laneid < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - __syncthreads (); - if (laneid < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - __syncthreads (); - if (laneid < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - __syncthreads (); - if (laneid < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - __syncthreads (); - if (laneid < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - __syncthreads (); - if (laneid == 0) { - data_e_vdW[i] += sh_vdw[threadIdx.x]; - data_e_ele[i] += sh_ele[threadIdx.x]; - rvec_Add 
(workspace->f[i], sh_force [ threadIdx.x ]); - } + __syncthreads (); + + if (laneid < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + __syncthreads (); + if (laneid < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + __syncthreads (); + if (laneid < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + __syncthreads (); + if (laneid < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + __syncthreads (); + if (laneid < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + __syncthreads (); + if (laneid == 0) { + data_e_vdW[i] += sh_vdw[threadIdx.x]; + data_e_ele[i] += sh_ele[threadIdx.x]; + rvec_Add (workspace->f[i], sh_force [ threadIdx.x ]); + } #endif - } + } CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, - global_parameters gp, - control_params *control, - storage p_workspace, - reax_list p_far_nbrs, - LR_lookup_table *t_LR, - int n, int N, int num_atom_types, - int step, int prev_steps, - int energy_update_freq, - real *data_e_vdW, real *data_e_ele, - rvec *data_ext_press) + global_parameters gp, + control_params *control, + storage p_workspace, + reax_list p_far_nbrs, + LR_lookup_table *t_LR, + int n, int N, int num_atom_types, + int step, int prev_steps, + int energy_update_freq, + real *data_e_vdW, real *data_e_ele, + rvec *data_ext_press) { - int i, j, pj, r, natoms, steps, update_freq, update_energies; - int 
type_i, type_j, tmin, tmax; - int start_i, end_i, orig_i, orig_j; - real r_ij, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - reax_list *far_nbrs; - LR_lookup_table *t; - - storage *workspace = &( p_workspace ); - - natoms = n; - far_nbrs = &( p_far_nbrs ); - steps = step - prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - e_ele = e_vdW = 0; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - data_e_vdW [i] = 0; - data_e_ele [i] = 0; - - //for( i = 0; i < natoms; ++i ) { - type_i = my_atoms[i].type; - start_i = Dev_Start_Index(i,far_nbrs); - end_i = Dev_End_Index(i,far_nbrs); - orig_i = my_atoms[i].orig_id; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &(far_nbrs->select.far_nbr_list[pj]); - j = nbr_pj->nbr; - orig_j = my_atoms[j].orig_id; - - //if( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) ) { - if( nbr_pj->d <= control->nonb_cut && - (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j)) - || ((i > j) && (i < natoms) && (j < natoms)) - || (i > j && i >= natoms && j < natoms && orig_j < orig_i))) - { // ji with j >= n - j = nbr_pj->nbr; - type_j = my_atoms[j].type; - r_ij = nbr_pj->d; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - - t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]); - - // table = &( LR[type_i][type_j] ); - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); - - if( update_energies ) { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - e_ele *= my_atoms[i].q * my_atoms[j].q; - - //data_e_vdW [i] += e_vdW; - data_e_vdW [i] += e_vdW / 2.0; - //data_e_ele [i] += e_ele; - 
data_e_ele [i] += e_ele / 2.0; - } - - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= my_atoms[i].q * my_atoms[j].q; - - if( control->virial == 0 ) { - if ( i < j ) - rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); - else - rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec ); - //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); - //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - /* for pressure coupling, terms not related to bond order derivatives - are added directly into pressure vector/tensor */ - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - rvec_ScaledAdd( workspace->f[i], -1., temp ); - rvec_Add( workspace->f[j], temp ); - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Add( data_ext_press [i], ext_press ); - } + int i, j, pj, r, natoms, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i, orig_i, orig_j; + real r_ij, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + reax_list *far_nbrs; + LR_lookup_table *t; + + storage *workspace = &( p_workspace ); + + natoms = n; + far_nbrs = &( p_far_nbrs ); + steps = step - prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + e_ele = e_vdW = 0; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + data_e_vdW [i] = 0; + data_e_ele [i] = 0; + + //for( i = 0; i < natoms; ++i ) { + type_i = my_atoms[i].type; + start_i = Dev_Start_Index(i,far_nbrs); + end_i = Dev_End_Index(i,far_nbrs); + orig_i = my_atoms[i].orig_id; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &(far_nbrs->select.far_nbr_list[pj]); + j = nbr_pj->nbr; + orig_j = my_atoms[j].orig_id; + + //if( nbr_pj->d <= 
control->nonb_cut && (j < natoms || orig_i < orig_j) ) { + if( nbr_pj->d <= control->nonb_cut && + (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j)) + || ((i > j) && (i < natoms) && (j < natoms)) + || (i > j && i >= natoms && j < natoms && orig_j < orig_i))) + { // ji with j >= n + j = nbr_pj->nbr; + type_j = my_atoms[j].type; + r_ij = nbr_pj->d; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + + t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]); + + // table = &( LR[type_i][type_j] ); + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); + + if( update_energies ) { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + e_ele *= my_atoms[i].q * my_atoms[j].q; + + //data_e_vdW [i] += e_vdW; + data_e_vdW [i] += e_vdW / 2.0; + //data_e_ele [i] += e_ele; + data_e_ele [i] += e_ele / 2.0; + } + + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= my_atoms[i].q * my_atoms[j].q; + + if( control->virial == 0 ) { + if ( i < j ) + rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); + else + rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec ); + //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec ); + //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec ); + } + else { // NPT, iNPT or sNPT + /* for pressure coupling, terms not related to bond order derivatives + are added directly into pressure vector/tensor */ + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + + rvec_ScaledAdd( workspace->f[i], -1., temp ); + rvec_Add( workspace->f[j], temp ); + + rvec_iMultiply( ext_press, 
nbr_pj->rel_box, temp ); + rvec_Add( data_ext_press [i], ext_press ); + } #ifdef TEST_ENERGY - //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - r_ij, e_vdW, data->my_en.e_vdW ); - //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, - r_ij, system->my_atoms[i].q, system->my_atoms[j].q, - e_ele, data->my_en.e_ele ); + //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", + fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + r_ij, e_vdW, data->my_en.e_vdW ); + //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, + r_ij, system->my_atoms[i].q, system->my_atoms[j].q, + e_ele, data->my_en.e_ele ); #endif #ifdef TEST_FORCES - rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); #endif - } - } - // } + } + } + // } } CUDA_GLOBAL void ker_pol_energy (reax_atom *my_atoms, - single_body_parameters *sbp, - int n, - real *data_e_pol) + single_body_parameters *sbp, + int n, + real *data_e_pol) { - int type_i; - real q; + int type_i; + real q; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n) return; + int i = 
blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n) return; - data_e_pol [i] = 0; + data_e_pol [i] = 0; - //for( i = 0; i < system->n; i++ ) { - q = my_atoms[i].q; - type_i = my_atoms[i].type; + //for( i = 0; i < system->n; i++ ) { + q = my_atoms[i].q; + type_i = my_atoms[i].type; - data_e_pol[i] += - KCALpMOL_to_EV * (sbp[type_i].chi * q + - (sbp[type_i].eta / 2.) * SQR(q)); - //} + data_e_pol[i] += + KCALpMOL_to_EV * (sbp[type_i].chi * q + + (sbp[type_i].eta / 2.) * SQR(q)); + //} } void Cuda_Compute_Polarization_Energy( reax_system *system, simulation_data *data ) { - int blocks; - real *spad = (real *) scratch; - cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "pol_energy"); - - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1); - ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, - system->n, spad ); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for polarization energy - k_reduction <<< blocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>> - ( spad, spad + system->n, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<< 1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> - ( spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_pol, blocks); - cudaThreadSynchronize (); - cudaCheckError (); + int blocks; + real *spad = (real *) scratch; + cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "pol_energy"); + + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); + ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, + system->n, spad ); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for polarization energy + k_reduction <<< blocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>> + ( spad, spad + system->n, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<< 1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> + ( spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_pol, blocks); + cudaThreadSynchronize (); + cudaCheckError (); } void Cuda_NonBonded_Energy ( reax_system *system, control_params *control, - storage *workspace, simulation_data *data, reax_list **lists, - output_controls *out_control, bool isTabulated ) + storage *workspace, simulation_data *data, reax_list **lists, + output_controls *out_control, bool isTabulated ) { - int blocks; - int rblocks; - int size = (2 * system->N + 2 * system->N ) * sizeof (real) + - 2 * system->N * sizeof (rvec); - - rvec *spad_rvec; - real *spad = (real *) scratch; - cuda_memset (spad, 0, size, "pol_energy"); - - rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); - blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) - + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 
0 : 1); - - if (!isTabulated) { - ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>> - ( system->d_my_atoms, system->reax_param.d_tbp, - system->reax_param.d_gp, (control_params *)control->d_control_params, - *(dev_workspace), *(*dev_lists + FAR_NBRS), - system->n, system->N, system->reax_param.num_atom_types, - spad, spad + 2 * system->N, (rvec *)(spad + 4 * system->N)); - cudaThreadSynchronize (); - cudaCheckError (); - } else { - ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_gp, - (control_params *)control->d_control_params, - *(dev_workspace), *(*dev_lists + FAR_NBRS), - d_LR, system->n, system->N, - system->reax_param.num_atom_types, - data->step, data->prev_steps, - out_control->energy_update_freq, - spad, spad + 2 * system->N, - (rvec *)(spad + 4 * system->N)); - cudaThreadSynchronize (); - cudaCheckError (); - } - - //reduction for vdw - k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>> - ( spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>> - ( spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, rblocks); - cudaThreadSynchronize (); - cudaCheckError (); - - //reduction for ele - k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>> - ( spad + 2 * system->N, spad + 3 * system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>> - ( spad + 3 * system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ele, rblocks); - cudaThreadSynchronize (); - cudaCheckError (); - - //reduction for ext_press - spad_rvec = (rvec *) (spad + 4 * system->N); - k_reduction_rvec <<< rblocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> - ( spad_rvec, spad_rvec + system->N, system->N); - 
cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>> - ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, rblocks); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Compute_Polarization_Energy( system, data ); + int blocks; + int rblocks; + int size = (2 * system->N + 2 * system->N ) * sizeof (real) + + 2 * system->N * sizeof (rvec); + + rvec *spad_rvec; + real *spad = (real *) scratch; + cuda_memset (spad, 0, size, "pol_energy"); + + rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1); + blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) + + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1); + + if (!isTabulated) { + ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>> + ( system->d_my_atoms, system->reax_param.d_tbp, + system->reax_param.d_gp, (control_params *)control->d_control_params, + *(dev_workspace), *(*dev_lists + FAR_NBRS), + system->n, system->N, system->reax_param.num_atom_types, + spad, spad + 2 * system->N, (rvec *)(spad + 4 * system->N)); + cudaThreadSynchronize (); + cudaCheckError (); + } else { + ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_gp, + (control_params *)control->d_control_params, + *(dev_workspace), *(*dev_lists + FAR_NBRS), + d_LR, system->n, system->N, + system->reax_param.num_atom_types, + data->step, data->prev_steps, + out_control->energy_update_freq, + spad, spad + 2 * system->N, + (rvec *)(spad + 4 * system->N)); + cudaThreadSynchronize (); + cudaCheckError (); + } + + //reduction for vdw + k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>> + ( spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>> + ( 
spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, rblocks); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduction for ele + k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>> + ( spad + 2 * system->N, spad + 3 * system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>> + ( spad + 3 * system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ele, rblocks); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduction for ext_press + spad_rvec = (rvec *) (spad + 4 * system->N); + k_reduction_rvec <<< rblocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> + ( spad_rvec, spad_rvec + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>> + ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, rblocks); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Compute_Polarization_Energy( system, data ); } diff --git a/PG-PuReMD/src/cuda_post_evolve.cu b/PG-PuReMD/src/cuda_post_evolve.cu index ebcb22fa..b8008e85 100644 --- a/PG-PuReMD/src/cuda_post_evolve.cu +++ b/PG-PuReMD/src/cuda_post_evolve.cu @@ -5,31 +5,31 @@ #include "cuda_utils.h" CUDA_GLOBAL void ker_post_evolve (reax_atom *my_atoms, - simulation_data *data, int n) + simulation_data *data, int n) { - rvec diff, cross; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + rvec diff, cross; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; - //for( i = 0; i < system->n; i++ ) { - /* remove translational vel */ - rvec_ScaledAdd( my_atoms[i].v, -1., data->vcm ); + //for( i = 0; i < system->n; i++ ) { + /* remove translational vel */ + rvec_ScaledAdd( my_atoms[i].v, -1., data->vcm ); - /* remove rotational */ - rvec_ScaledSum( diff, 1., my_atoms[i].x, -1., data->xcm ); - rvec_Cross( cross, 
data->avcm, diff ); - rvec_ScaledAdd( my_atoms[i].v, -1., cross ); - //} + /* remove rotational */ + rvec_ScaledSum( diff, 1., my_atoms[i].x, -1., data->xcm ); + rvec_Cross( cross, data->avcm, diff ); + rvec_ScaledAdd( my_atoms[i].v, -1., cross ); + //} } void post_evolve_velocities (reax_system *system, simulation_data *data) { - int blocks; + int blocks; - blocks = system->n / DEF_BLOCK_SIZE + - ((system->n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - ker_post_evolve <<< blocks, DEF_BLOCK_SIZE >>> - (system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n); - cudaThreadSynchronize (); - cudaCheckError (); + blocks = system->n / DEF_BLOCK_SIZE + + ((system->n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); + ker_post_evolve <<< blocks, DEF_BLOCK_SIZE >>> + (system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n); + cudaThreadSynchronize (); + cudaCheckError (); } diff --git a/PG-PuReMD/src/cuda_qEq.cu b/PG-PuReMD/src/cuda_qEq.cu index 271a190e..b2094583 100644 --- a/PG-PuReMD/src/cuda_qEq.cu +++ b/PG-PuReMD/src/cuda_qEq.cu @@ -27,95 +27,95 @@ #include "validation.h" -CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, - single_body_parameters *sbp, - storage p_workspace, int n ) +CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, + single_body_parameters *sbp, + storage p_workspace, int n ) { - storage *workspace = &( p_workspace ); - reax_atom *atom; + storage *workspace = &( p_workspace ); + reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; - //for( i = 0; i < system->n; ++i ) { - atom = &( my_atoms[i] ); + //for( i = 0; i < system->n; ++i ) { + atom = &( my_atoms[i] ); - /* init pre-conditioner for H and init solution vectors */ - workspace->Hdia_inv[i] = 1. 
/ sbp[ atom->type ].eta; - workspace->b_s[i] = -sbp[ atom->type ].chi; - workspace->b_t[i] = -1.0; - workspace->b[i][0] = -sbp[ atom->type ].chi; - workspace->b[i][1] = -1.0; + /* init pre-conditioner for H and init solution vectors */ + workspace->Hdia_inv[i] = 1. / sbp[ atom->type ].eta; + workspace->b_s[i] = -sbp[ atom->type ].chi; + workspace->b_t[i] = -1.0; + workspace->b[i][0] = -sbp[ atom->type ].chi; + workspace->b[i][1] = -1.0; - workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] ); + workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] ); - /* cubic extrapolation for s and t */ - workspace->x[i][0] = 4*(atom->s[0]+atom->s[2])-(6*atom->s[1]+atom->s[3]); - //} + /* cubic extrapolation for s and t */ + workspace->x[i][0] = 4*(atom->s[0]+atom->s[2])-(6*atom->s[1]+atom->s[3]); + //} } void Cuda_Init_MatVec ( reax_system *system, storage *workspace ) { - int blocks; + int blocks; - blocks = system->n / DEF_BLOCK_SIZE + - (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); + blocks = system->n / DEF_BLOCK_SIZE + + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, system->reax_param.d_sbp, - *dev_workspace, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); + ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, system->reax_param.d_sbp, + *dev_workspace, system->n ); + cudaThreadSynchronize (); + cudaCheckError (); } void cuda_charges_x (reax_system *system, rvec2 my_sum) { - int blocks; - rvec2 *output = (rvec2 *) scratch; - cuda_memset (output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q"); + int blocks; + rvec2 *output = (rvec2 *) scratch; + cuda_memset (output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q"); - blocks = system->n / DEF_BLOCK_SIZE + - (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); + blocks = system->n / DEF_BLOCK_SIZE + + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 
0 : 1); - k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> - ( dev_workspace->x, output, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); + k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> + ( dev_workspace->x, output, system->n ); + cudaThreadSynchronize (); + cudaCheckError (); - k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> - ( output, output + system->n, blocks ); - cudaThreadSynchronize (); - cudaCheckError (); + k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> + ( output, output + system->n, blocks ); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x"); + copy_host_device (my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x"); } CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, - real u, real *q, int n) + real u, real *q, int n) { - storage *workspace = &( p_workspace ); - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; - - //for( i = 0; i < system->n; ++i ) { - atom = &( my_atoms[i] ); - - //atom->q = workspace->s[i] - u * workspace->t[i]; - q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1]; - - atom->s[3] = atom->s[2]; - atom->s[2] = atom->s[1]; - atom->s[1] = atom->s[0]; - //atom->s[0] = workspace->s[i]; - atom->s[0] = workspace->x[i][0]; - - atom->t[3] = atom->t[2]; - atom->t[2] = atom->t[1]; - atom->t[1] = atom->t[0]; - //atom->t[0] = workspace->t[i]; - atom->t[0] = workspace->x[i][1]; - //} + storage *workspace = &( p_workspace ); + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + //for( i = 0; i < system->n; ++i ) { + atom = &( my_atoms[i] ); + + //atom->q = workspace->s[i] - u * workspace->t[i]; + q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1]; + + atom->s[3] = 
atom->s[2]; + atom->s[2] = atom->s[1]; + atom->s[1] = atom->s[0]; + //atom->s[0] = workspace->s[i]; + atom->s[0] = workspace->x[i][0]; + + atom->t[3] = atom->t[2]; + atom->t[2] = atom->t[1]; + atom->t[1] = atom->t[0]; + //atom->t[0] = workspace->t[i]; + atom->t[0] = workspace->x[i][1]; + //} } //TODO if we use the function argument (output), we are getting @@ -128,22 +128,22 @@ CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *output, real u) { - int blocks; - real *tmp = (real *) scratch; - real *tmp_output = (real *) host_scratch; - - cuda_memset (tmp, 0, sizeof (real) * system->n, "charges:q"); - memset (tmp_output, 0, sizeof (real) * system->n); - - blocks = system->n / DEF_BLOCK_SIZE + - (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, *dev_workspace, u, tmp, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (output, tmp, sizeof (real) * system->n, - cudaMemcpyDeviceToHost, "charges:q"); + int blocks; + real *tmp = (real *) scratch; + real *tmp_output = (real *) host_scratch; + + cuda_memset (tmp, 0, sizeof (real) * system->n, "charges:q"); + memset (tmp_output, 0, sizeof (real) * system->n); + + blocks = system->n / DEF_BLOCK_SIZE + + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 
0 : 1); + ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, *dev_workspace, u, tmp, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (output, tmp, sizeof (real) * system->n, + cudaMemcpyDeviceToHost, "charges:q"); } //TODO //TODO @@ -155,23 +155,23 @@ extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real * CUDA_GLOBAL void ker_update_q (reax_atom *my_atoms, real *q, int n, int N) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= (N-n)) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= (N-n)) return; - //for( i = system->n; i < system->N; ++i ) - my_atoms[i + n].q = q[i + n]; + //for( i = system->n; i < system->N; ++i ) + my_atoms[i + n].q = q[i + n]; } void cuda_charges_updateq (reax_system *system, real *q) { - int blocks; - real *dev_q = (real *) scratch; - copy_host_device (q, dev_q, system->N * sizeof (real), - cudaMemcpyHostToDevice, "charges:q"); - blocks = (system->N - system->n) / DEF_BLOCK_SIZE + - (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - ker_update_q <<< blocks, DEF_BLOCK_SIZE >>> - ( system->d_my_atoms, dev_q, system->n, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + int blocks; + real *dev_q = (real *) scratch; + copy_host_device (q, dev_q, system->N * sizeof (real), + cudaMemcpyHostToDevice, "charges:q"); + blocks = (system->N - system->n) / DEF_BLOCK_SIZE + + (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 
0 : 1); + ker_update_q <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, dev_q, system->n, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda_reset_tools.cu index 084da6b0..850a7c5d 100644 --- a/PG-PuReMD/src/cuda_reset_tools.cu +++ b/PG-PuReMD/src/cuda_reset_tools.cu @@ -4,159 +4,159 @@ #include "dev_list.h" CUDA_GLOBAL void ker_reset_hbond_list (reax_atom *my_atoms, - reax_list hbonds, - int N) + reax_list hbonds, + int N) { - int Hindex = 0; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - Hindex = my_atoms[i].Hindex; - if (Hindex > 1) { - Dev_Set_End_Index ( Hindex, Dev_Start_Index (Hindex, &hbonds), &hbonds); - } + int Hindex = 0; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + Hindex = my_atoms[i].Hindex; + if (Hindex > 1) { + Dev_Set_End_Index ( Hindex, Dev_Start_Index (Hindex, &hbonds), &hbonds); + } } CUDA_GLOBAL void ker_reset_bond_list (reax_atom *my_atoms, - reax_list bonds, - int N) + reax_list bonds, + int N) { - int Hindex = 0; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + int Hindex = 0; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; - Dev_Set_End_Index ( i, Dev_Start_Index (i, &bonds), &bonds); + Dev_Set_End_Index ( i, Dev_Start_Index (i, &bonds), &bonds); } extern "C" { - void Cuda_Reset_Workspace (reax_system *system, storage *workspace) - { - cuda_memset ( dev_workspace->total_bond_order, 0, system->total_cap * sizeof (real), "total_bond_order"); - cuda_memset ( dev_workspace->dDeltap_self, 0, system->total_cap * sizeof (rvec), "dDeltap_self"); - cuda_memset ( dev_workspace->CdDelta, 0, system->total_cap * sizeof (real), "CdDelta"); - cuda_memset ( dev_workspace->f, 0, system->total_cap * sizeof (rvec), "f"); - } - - CUDA_GLOBAL void ker_reset_hindex (reax_atom *my_atoms, int N) - { - int Hindex = 0; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= 
N) return; - - my_atoms[i].Hindex = i; - } - - void Cuda_Reset_Atoms( reax_system* system, control_params *control ) - { - int i; - reax_atom *atom; - int blocks; - - /* - if( control->hbond_cut > 0 ) - //TODO - for( i = 0; i < system->N; ++i ) { - atom = &(system->my_atoms[i]); - //if( system->reax_param.sbp[ atom->type ].p_hbond == 1 ) - atom->Hindex = system->numH++; - //else atom->Hindex = -1; - } - //TODO - */ - //////////////////////////////// - //////////////////////////////// - //////////////////////////////// - //////////////////////////////// - // FIX - 3 - Commented out this line for Hydrogen Bond fix - // FIX - HBOND ISSUE - // FIX - HBOND ISSUE - // FIX - HBOND ISSUE - // COMMENTED OUT THIS LINE BELOW - //system->numH = system->N; - // FIX - HBOND ISSUE - // FIX - HBOND ISSUE - // FIX - HBOND ISSUE - //////////////////////////////// - //////////////////////////////// - //////////////////////////////// - //////////////////////////////// - //////////////////////////////// - - - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - ker_reset_hindex <<<blocks, DEF_BLOCK_SIZE>>> - (system->d_my_atoms, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - } - - int Cuda_Reset_Neighbor_Lists( reax_system *system, control_params *control, - storage *workspace, reax_list **lists ) - { - int i, total_bonds, Hindex, total_hbonds; - reax_list *bonds, *hbonds; - int blocks; - - if (system->N > 0) { - bonds = *dev_lists + BONDS; - total_bonds = 0; - - //cuda_memset (bonds->index, 0, sizeof (int) * system->total_cap, "bonds:index"); - //cuda_memset (bonds->end_index, 0, sizeof (int) * system->total_cap, "bonds:end_index"); - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - ker_reset_bond_list <<<blocks, DEF_BLOCK_SIZE>>> - (system->d_my_atoms, *(*dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - total_bonds = 0;// TODO compute the total bonds here. 
- - /* is reallocation needed? */ - if( total_bonds >= bonds->num_intrs * DANGER_ZONE ) { - workspace->realloc.bonds = 1; - if( total_bonds >= bonds->num_intrs ) { - fprintf(stderr, "p%d: not enough space for bonds! total=%d allocated=%d\n", - system->my_rank, total_bonds, bonds->num_intrs ); - return FAILURE; - } - } - } - - //HBonds processing - //FIX - 4 - Added additional check - if( (control->hbond_cut > 0) && (system->numH > 0)) { - hbonds = (*dev_lists) + HBONDS; - total_hbonds = 0; - - /* reset start-end indexes */ - //TODO - blocks = system->N / DEF_BLOCK_SIZE + - ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); - ker_reset_hbond_list <<<blocks, DEF_BLOCK_SIZE>>> - (system->d_my_atoms, *(*dev_lists + HBONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - //TODO compute the total hbonds here - total_hbonds = 0; - - /* is reallocation needed? */ - if( total_hbonds >= hbonds->num_intrs * 0.90/*DANGER_ZONE*/ ) { - workspace->realloc.hbonds = 1; - if( total_hbonds >= hbonds->num_intrs ) { - fprintf(stderr, "p%d: not enough space for hbonds! 
total=%d allocated=%d\n", - system->my_rank, total_hbonds, hbonds->num_intrs ); - return FAILURE; - } - } - } - - return SUCCESS; - } + void Cuda_Reset_Workspace (reax_system *system, storage *workspace) + { + cuda_memset ( dev_workspace->total_bond_order, 0, system->total_cap * sizeof (real), "total_bond_order"); + cuda_memset ( dev_workspace->dDeltap_self, 0, system->total_cap * sizeof (rvec), "dDeltap_self"); + cuda_memset ( dev_workspace->CdDelta, 0, system->total_cap * sizeof (real), "CdDelta"); + cuda_memset ( dev_workspace->f, 0, system->total_cap * sizeof (rvec), "f"); + } + + CUDA_GLOBAL void ker_reset_hindex (reax_atom *my_atoms, int N) + { + int Hindex = 0; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + my_atoms[i].Hindex = i; + } + + void Cuda_Reset_Atoms( reax_system* system, control_params *control ) + { + int i; + reax_atom *atom; + int blocks; + + /* + if( control->hbond_cut > 0 ) + //TODO + for( i = 0; i < system->N; ++i ) { + atom = &(system->my_atoms[i]); + //if( system->reax_param.sbp[ atom->type ].p_hbond == 1 ) + atom->Hindex = system->numH++; + //else atom->Hindex = -1; + } + //TODO + */ + //////////////////////////////// + //////////////////////////////// + //////////////////////////////// + //////////////////////////////// + // FIX - 3 - Commented out this line for Hydrogen Bond fix + // FIX - HBOND ISSUE + // FIX - HBOND ISSUE + // FIX - HBOND ISSUE + // COMMENTED OUT THIS LINE BELOW + //system->numH = system->N; + // FIX - HBOND ISSUE + // FIX - HBOND ISSUE + // FIX - HBOND ISSUE + //////////////////////////////// + //////////////////////////////// + //////////////////////////////// + //////////////////////////////// + //////////////////////////////// + + + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0 ) ? 
0 : 1); + ker_reset_hindex <<<blocks, DEF_BLOCK_SIZE>>> + (system->d_my_atoms, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + } + + int Cuda_Reset_Neighbor_Lists( reax_system *system, control_params *control, + storage *workspace, reax_list **lists ) + { + int i, total_bonds, Hindex, total_hbonds; + reax_list *bonds, *hbonds; + int blocks; + + if (system->N > 0) { + bonds = *dev_lists + BONDS; + total_bonds = 0; + + //cuda_memset (bonds->index, 0, sizeof (int) * system->total_cap, "bonds:index"); + //cuda_memset (bonds->end_index, 0, sizeof (int) * system->total_cap, "bonds:end_index"); + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); + ker_reset_bond_list <<<blocks, DEF_BLOCK_SIZE>>> + (system->d_my_atoms, *(*dev_lists + BONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + total_bonds = 0;// TODO compute the total bonds here. + + /* is reallocation needed? */ + if( total_bonds >= bonds->num_intrs * DANGER_ZONE ) { + workspace->realloc.bonds = 1; + if( total_bonds >= bonds->num_intrs ) { + fprintf(stderr, "p%d: not enough space for bonds! total=%d allocated=%d\n", + system->my_rank, total_bonds, bonds->num_intrs ); + return FAILURE; + } + } + } + + //HBonds processing + //FIX - 4 - Added additional check + if( (control->hbond_cut > 0) && (system->numH > 0)) { + hbonds = (*dev_lists) + HBONDS; + total_hbonds = 0; + + /* reset start-end indexes */ + //TODO + blocks = system->N / DEF_BLOCK_SIZE + + ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); + ker_reset_hbond_list <<<blocks, DEF_BLOCK_SIZE>>> + (system->d_my_atoms, *(*dev_lists + HBONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + //TODO compute the total hbonds here + total_hbonds = 0; + + /* is reallocation needed? */ + if( total_hbonds >= hbonds->num_intrs * 0.90/*DANGER_ZONE*/ ) { + workspace->realloc.hbonds = 1; + if( total_hbonds >= hbonds->num_intrs ) { + fprintf(stderr, "p%d: not enough space for hbonds! 
total=%d allocated=%d\n", + system->my_rank, total_hbonds, hbonds->num_intrs ); + return FAILURE; + } + } + } + + return SUCCESS; + } } diff --git a/PG-PuReMD/src/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda_torsion_angles.cu index 42ffe859..e9a9b1f0 100644 --- a/PG-PuReMD/src/cuda_torsion_angles.cu +++ b/PG-PuReMD/src/cuda_torsion_angles.cu @@ -29,609 +29,609 @@ #define MIN_SINE 1e-10 CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, - rvec dvec_jk, real r_jk, - rvec dvec_kl, real r_kl, - rvec dvec_li, real r_li, - three_body_interaction_data *p_ijk, - three_body_interaction_data *p_jkl, - rvec dcos_omega_di, rvec dcos_omega_dj, - rvec dcos_omega_dk, rvec dcos_omega_dl, - output_controls *out_control ) + rvec dvec_jk, real r_jk, + rvec dvec_kl, real r_kl, + rvec dvec_li, real r_li, + three_body_interaction_data *p_ijk, + three_body_interaction_data *p_jkl, + rvec dcos_omega_di, rvec dcos_omega_dj, + rvec dcos_omega_dk, rvec dcos_omega_dl, + output_controls *out_control ) { - real unnorm_cos_omega, unnorm_sin_omega, omega; - real sin_ijk, cos_ijk, sin_jkl, cos_jkl; - real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe; - real arg, poem, tel; - rvec cross_jk_kl; - - sin_ijk = SIN( p_ijk->theta ); - cos_ijk = COS( p_ijk->theta ); - sin_jkl = SIN( p_jkl->theta ); - cos_jkl = COS( p_jkl->theta ); - - /* omega */ - unnorm_cos_omega = -rvec_Dot(dvec_ij, dvec_jk) * rvec_Dot(dvec_jk, dvec_kl) + - SQR( r_jk ) * rvec_Dot( dvec_ij, dvec_kl ); - - rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl ); - unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl ); - - omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); - - - /* derivatives */ - /* coef for adjusments to cos_theta's */ - /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li; - coshd = cos_ijk, coshe = cos_jkl; - sinhd = sin_ijk, sinhe = sin_jkl; */ - htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk ); - htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl; - htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk ); - hthd = 
r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl ); - hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk ); - hnra = r_kl * sin_ijk * sin_jkl; - hnrc = r_ij * sin_ijk * sin_jkl; - hnhd = r_ij * r_kl * cos_ijk * sin_jkl; - hnhe = r_ij * r_kl * sin_ijk * cos_jkl; - - - poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl; - if( poem < 1e-20 ) poem = 1e-20; - - tel = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) - - 2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + - r_jk * r_kl * cos_jkl ); - - arg = tel / poem; - if( arg > 1.0 ) arg = 1.0; - if( arg < -1.0 ) arg = -1.0; - - - /* fprintf( out_control->etor, - "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); - fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", - dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij ); - fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", - -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk ); - fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", - -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl ); - fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n", - r_li, dvec_li[0], dvec_li[1], dvec_li[2] ); - fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", - poem, tel, arg ); */ - /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", - -p_ijk->dcos_dk[0]/sin_ijk, -p_ijk->dcos_dk[1]/sin_ijk, - -p_ijk->dcos_dk[2]/sin_ijk ); - fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", - -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl, - -p_jkl->dcos_dk[2]/sin_jkl );*/ - - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE; - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE; - - // dcos_omega_di - rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li ); - rvec_ScaledAdd( dcos_omega_di,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dk ); - 
rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di ); - - // dcos_omega_dj - rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, - -htrb / r_jk, dvec_jk ); - rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dj ); - rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_di ); - rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj ); - - // dcos_omega_dk - rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc)/r_kl, dvec_kl, - htrb / r_jk, dvec_jk ); - rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_di ); - rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dj ); - rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk ); - - // dcos_omega_dl - rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc)/r_kl, dvec_kl, 1., dvec_li ); - rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dk ); - rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl ); - - return omega; + real unnorm_cos_omega, unnorm_sin_omega, omega; + real sin_ijk, cos_ijk, sin_jkl, cos_jkl; + real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe; + real arg, poem, tel; + rvec cross_jk_kl; + + sin_ijk = SIN( p_ijk->theta ); + cos_ijk = COS( p_ijk->theta ); + sin_jkl = SIN( p_jkl->theta ); + cos_jkl = COS( p_jkl->theta ); + + /* omega */ + unnorm_cos_omega = -rvec_Dot(dvec_ij, dvec_jk) * rvec_Dot(dvec_jk, dvec_kl) + + SQR( r_jk ) * rvec_Dot( dvec_ij, dvec_kl ); + + rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl ); + unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl ); + + omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); + + + /* derivatives */ + /* coef for adjusments to cos_theta's */ + /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li; + coshd = cos_ijk, coshe = cos_jkl; + sinhd = sin_ijk, sinhe = sin_jkl; */ + htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk ); + htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl; + htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk ); + hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl ); + 
hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk ); + hnra = r_kl * sin_ijk * sin_jkl; + hnrc = r_ij * sin_ijk * sin_jkl; + hnhd = r_ij * r_kl * cos_ijk * sin_jkl; + hnhe = r_ij * r_kl * sin_ijk * cos_jkl; + + + poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl; + if( poem < 1e-20 ) poem = 1e-20; + + tel = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) - + 2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + + r_jk * r_kl * cos_jkl ); + + arg = tel / poem; + if( arg > 1.0 ) arg = 1.0; + if( arg < -1.0 ) arg = -1.0; + + + /* fprintf( out_control->etor, + "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); + fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", + dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij ); + fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", + -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk ); + fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", + -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl ); + fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n", + r_li, dvec_li[0], dvec_li[1], dvec_li[2] ); + fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", + poem, tel, arg ); */ + /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", + -p_ijk->dcos_dk[0]/sin_ijk, -p_ijk->dcos_dk[1]/sin_ijk, + -p_ijk->dcos_dk[2]/sin_ijk ); + fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n", + -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl, + -p_jkl->dcos_dk[2]/sin_jkl );*/ + + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE; + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE; + + // dcos_omega_di + rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li ); + rvec_ScaledAdd( dcos_omega_di,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dk ); + rvec_Scale( dcos_omega_di, 2.0 / poem, 
dcos_omega_di ); + + // dcos_omega_dj + rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, + -htrb / r_jk, dvec_jk ); + rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dj ); + rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_di ); + rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj ); + + // dcos_omega_dk + rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc)/r_kl, dvec_kl, + htrb / r_jk, dvec_jk ); + rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_di ); + rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dj ); + rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk ); + + // dcos_omega_dl + rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc)/r_kl, dvec_kl, 1., dvec_li ); + rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dk ); + rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl ); + + return omega; } CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, - global_parameters gp, - four_body_header *d_fbp, - control_params *control, - reax_list p_bonds, reax_list p_thb_intrs, - storage p_workspace, - int n, int num_atom_types, - real *data_e_tor, real *data_e_con, - rvec *data_ext_press ) + global_parameters gp, + four_body_header *d_fbp, + control_params *control, + reax_list p_bonds, reax_list p_thb_intrs, + storage p_workspace, + int n, int num_atom_types, + real *data_e_tor, real *data_e_con, + rvec *data_ext_press ) { - int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms; - int type_i, type_j, type_k, type_l; - int start_j, end_j, start_k, end_k; - int start_pj, end_pj, start_pk, end_pk; - int num_frb_intrs = 0; - - real Delta_j, Delta_k; - real r_ij, r_jk, r_kl, r_li; - real BOA_ij, BOA_jk, BOA_kl; - - real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; - real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; - real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; - real fn10, f11_DjDk, dfn11, fn12; - real theta_ijk, theta_jkl; - real sin_ijk, sin_jkl; - real cos_ijk, 
cos_jkl; - real tan_ijk_i, tan_jkl_i; - real omega, cos_omega, cos2omega, cos3omega; - rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; - real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; - real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; - real Cconj, CEconj1, CEconj2, CEconj3; - real CEconj4, CEconj5, CEconj6; - real e_tor, e_con; - rvec dvec_li; - rvec force, ext_press; - ivec rel_box_jl; - // rtensor total_rtensor, temp_rtensor; - four_body_header *fbh; - four_body_parameters *fbp; - bond_data *pbond_ij, *pbond_jk, *pbond_kl; - bond_order_data *bo_ij, *bo_jk, *bo_kl; - three_body_interaction_data *p_ijk, *p_jkl; - - reax_list *bonds = &( p_bonds ); - reax_list *thb_intrs = &( p_thb_intrs ); - storage *workspace = &( p_workspace ); - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; - - real p_tor2 = gp.l[23]; - real p_tor3 = gp.l[24]; - real p_tor4 = gp.l[25]; - real p_cot2 = gp.l[27]; - // char fname[100]; - // FILE *ftor; - - // sprintf( fname, "tor%d.out", system->my_rank ); - // ftor = fopen( fname, "w" ); - - //natoms = system->n; - - //for( j = 0; j < natoms; ++j ) { - type_j = my_atoms[j].type; - Delta_j = workspace->Delta_boc[j]; - start_j = Dev_Start_Index(j, bonds); - end_j = Dev_End_Index(j, bonds); - - for( pk = start_j; pk < end_j; ++pk ) { - pbond_jk = &( bonds->select.bond_list[pk] ); - k = pbond_jk->nbr; - bo_jk = &( pbond_jk->bo_data ); - BOA_jk = bo_jk->BO - control->thb_cut; - - /* see if there are any 3-body interactions involving j&k - where j is the central atom. 
Otherwise there is no point in - trying to form a 4-body interaction out of this neighborhood */ - if( my_atoms[j].orig_id < my_atoms[k].orig_id && - bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) ) { - start_k = Dev_Start_Index(k, bonds); - end_k = Dev_End_Index(k, bonds); - pj = pbond_jk->sym_index; // pj points to j on k's list - - /* do the same check as above: - are there any 3-body interactions involving k&j - where k is the central atom */ - if( Dev_Num_Entries(pj, thb_intrs) ) { - type_k = my_atoms[k].type; - Delta_k = workspace->Delta_boc[k]; - r_jk = pbond_jk->d; - - start_pk = Dev_Start_Index(pk, thb_intrs ); - end_pk = Dev_End_Index(pk, thb_intrs ); - start_pj = Dev_Start_Index(pj, thb_intrs ); - end_pj = Dev_End_Index(pj, thb_intrs ); - - exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); - exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); - exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); - exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); - exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); - f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; - - - /* pick i up from j-k interaction where j is the central atom */ - for( pi = start_pk; pi < end_pk; ++pi ) { - p_ijk = &( thb_intrs->select.three_body_list[pi] ); - pij = p_ijk->pthb; // pij is pointer to i on j's bond_list - pbond_ij = &( bonds->select.bond_list[pij] ); - bo_ij = &( pbond_ij->bo_data ); - - - if( bo_ij->BO > control->thb_cut/*0*/ ) { - i = p_ijk->thb; - type_i = my_atoms[i].type; - r_ij = pbond_ij->d; - BOA_ij = bo_ij->BO - control->thb_cut; - - theta_ijk = p_ijk->theta; - sin_ijk = SIN( theta_ijk ); - cos_ijk = COS( theta_ijk ); - //tan_ijk_i = 1. 
/ TAN( theta_ijk ); - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) - tan_ijk_i = cos_ijk / MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) - tan_ijk_i = cos_ijk / -MIN_SINE; - else tan_ijk_i = cos_ijk / sin_ijk; - - exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); - exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); - - - /* pick l up from j-k interaction where k is the central atom */ - for( pl = start_pj; pl < end_pj; ++pl ) { - p_jkl = &( thb_intrs->select.three_body_list[pl] ); - l = p_jkl->thb; - plk = p_jkl->pthb; //pointer to l on k's bond_list! - pbond_kl = &( bonds->select.bond_list[plk] ); - bo_kl = &( pbond_kl->bo_data ); - type_l = my_atoms[l].type; - fbh = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)]); - fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); - - - if( i != l && fbh->cnt && - bo_kl->BO > control->thb_cut/*0*/ && - bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ - ++num_frb_intrs; - r_kl = pbond_kl->d; - BOA_kl = bo_kl->BO - control->thb_cut; - - theta_jkl = p_jkl->theta; - sin_jkl = SIN( theta_jkl ); - cos_jkl = COS( theta_jkl ); - //tan_jkl_i = 1. / TAN( theta_jkl ); - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) - tan_jkl_i = cos_jkl / MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) - tan_jkl_i = cos_jkl / -MIN_SINE; - else tan_jkl_i = cos_jkl /sin_jkl; - - rvec_ScaledSum( dvec_li, 1., my_atoms[i].x, - -1., my_atoms[l].x ); - r_li = rvec_Norm( dvec_li ); - - - /* omega and its derivative */ - omega = Calculate_Omega( pbond_ij->dvec, r_ij, - pbond_jk->dvec, r_jk, - pbond_kl->dvec, r_kl, - dvec_li, r_li, - p_ijk, p_jkl, - dcos_omega_di, dcos_omega_dj, - dcos_omega_dk, dcos_omega_dl, - NULL); - - cos_omega = COS( omega ); - cos2omega = COS( 2. * omega ); - cos3omega = COS( 3. 
* omega ); - /* end omega calculations */ - - /* torsion energy */ - exp_tor1 = EXP( fbp->p_tor1 * - SQR(2.0 - bo_jk->BO_pi - f11_DjDk) ); - exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); - exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) ); - fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * - (1.0 - exp_tor2_kl); - - CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + - fbp->V2 * exp_tor1 * (1.0 - cos2omega) + - fbp->V3 * (1.0 + cos3omega) ); - - data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV; - - dfn11 = (-p_tor3 * exp_tor3_DjDk + - (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * - (2.0 + exp_tor3_DjDk) * exp_tor34_inv) * - exp_tor34_inv; - - CEtors1 = sin_ijk * sin_jkl * CV; - - CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * - (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * - sin_ijk * sin_jkl; - CEtors3 = CEtors2 * dfn11; - - CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * - (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); - CEtors5 = CEtors1 * p_tor2 * - (1.0 - exp_tor2_ij) * exp_tor2_jk * (1.0 - exp_tor2_kl); - CEtors6 = CEtors1 * p_tor2 * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * exp_tor2_kl; - - cmn = -fn10 * CV; - CEtors7 = cmn * sin_jkl * tan_ijk_i; - CEtors8 = cmn * sin_ijk * tan_jkl_i; - - CEtors9 = fn10 * sin_ijk * sin_jkl * - (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - 1.5 * fbp->V3 * (cos2omega + 2.0 * SQR(cos_omega))); - /* end of torsion energy */ - - - /* 4-body conjugation energy */ - fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; - data_e_con [j] += e_con = - fbp->p_cot1 * fn12 * - (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); - - Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * - (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); - - CEconj1 = Cconj * (BOA_ij - 1.5e0); - CEconj2 = Cconj * (BOA_jk - 1.5e0); - CEconj3 = Cconj * (BOA_kl - 1.5e0); - - CEconj4 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; - CEconj5 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; - 
CEconj6 = 2.0 * fbp->p_cot1 * fn12 * - cos_omega * sin_ijk * sin_jkl; - /* end 4-body conjugation energy */ - - /* forces */ - /* - bo_jk->Cdbopi += CEtors2; - workspace->CdDelta[j] += CEtors3; - workspace->CdDelta[k] += CEtors3; - bo_ij->Cdbo += (CEtors4 + CEconj1); - bo_jk->Cdbo += (CEtors5 + CEconj2); - bo_kl->Cdbo += (CEtors6 + CEconj3); - */ - bo_jk->Cdbopi += CEtors2; - workspace->CdDelta[j] += CEtors3; - pbond_jk->ta_CdDelta += CEtors3; - bo_ij->Cdbo += (CEtors4 + CEconj1); - bo_jk->Cdbo += (CEtors5 + CEconj2); - atomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3)); - - if( control->virial == 0 ) { - /* dcos_theta_ijk */ - //rvec_ScaledAdd( workspace->f[i], - atomic_rvecScaledAdd( pbond_ij->ta_f, - CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( workspace->f[j], - CEtors7 + CEconj4, p_ijk->dcos_dj ); - //rvec_ScaledAdd( workspace->f[k], - atomic_rvecScaledAdd( pbond_jk->ta_f, - CEtors7 + CEconj4, p_ijk->dcos_di ); - - /* dcos_theta_jkl */ - rvec_ScaledAdd( workspace->f[j], - CEtors8 + CEconj5, p_jkl->dcos_di ); - //rvec_ScaledAdd( workspace->f[k], - atomic_rvecScaledAdd( pbond_jk->ta_f, - CEtors8 + CEconj5, p_jkl->dcos_dj ); - //rvec_ScaledAdd( workspace->f[l], - atomic_rvecScaledAdd( pbond_kl->ta_f, - CEtors8 + CEconj5, p_jkl->dcos_dk ); - - /* dcos_omega */ - //rvec_ScaledAdd( workspace->f[i], - atomic_rvecScaledAdd( pbond_ij->ta_f, - CEtors9 + CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace->f[j], - CEtors9 + CEconj6, dcos_omega_dj ); - //rvec_ScaledAdd( workspace->f[k], - atomic_rvecScaledAdd( pbond_jk->ta_f, - CEtors9 + CEconj6, dcos_omega_dk ); - //rvec_ScaledAdd( workspace->f[l], - atomic_rvecScaledAdd( pbond_kl->ta_f, - CEtors9 + CEconj6, dcos_omega_dl ); - } - else { - ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); - - /* dcos_theta_ijk */ - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); - //rvec_Add( workspace->f[i], force ); - atomic_rvecAdd( pbond_ij->ta_f, force ); - rvec_iMultiply( ext_press, 
pbond_ij->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); - - rvec_ScaledAdd( workspace->f[j], - CEtors7 + CEconj4, p_ijk->dcos_dj ); - - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); - //rvec_Add( workspace->f[k], force ); - atomic_rvecAdd( pbond_jk->ta_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press[j], ext_press ); - - - /* dcos_theta_jkl */ - rvec_ScaledAdd( workspace->f[j], - CEtors8 + CEconj5, p_jkl->dcos_di ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); - //rvec_Add( workspace->f[k], force ); - atomic_rvecAdd( pbond_jk->ta_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); - //rvec_Add( workspace->f[l], force ); - rvec_Add( pbond_kl->ta_f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press [j], ext_press ); - - - /* dcos_omega */ - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); - //rvec_Add( workspace->f[i], force ); - atomic_rvecAdd( pbond_ij->ta_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); - - rvec_ScaledAdd( workspace->f[j], - CEtors9 + CEconj6, dcos_omega_dj ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); - //rvec_Add( workspace->f[k], force ); - rvec_Add( pbond_jk->ta_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data_ext_press [j], ext_press ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); - //rvec_Add( workspace->f[l], force ); - rvec_Add( pbond_kl->ta_f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data_ext_press [j], ext_press ); - } + int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms; + int type_i, type_j, type_k, type_l; + int start_j, end_j, start_k, end_k; + int start_pj, end_pj, start_pk, end_pk; + int num_frb_intrs = 0; + + real Delta_j, 
Delta_k; + real r_ij, r_jk, r_kl, r_li; + real BOA_ij, BOA_jk, BOA_kl; + + real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; + real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; + real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; + real fn10, f11_DjDk, dfn11, fn12; + real theta_ijk, theta_jkl; + real sin_ijk, sin_jkl; + real cos_ijk, cos_jkl; + real tan_ijk_i, tan_jkl_i; + real omega, cos_omega, cos2omega, cos3omega; + rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; + real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; + real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; + real Cconj, CEconj1, CEconj2, CEconj3; + real CEconj4, CEconj5, CEconj6; + real e_tor, e_con; + rvec dvec_li; + rvec force, ext_press; + ivec rel_box_jl; + // rtensor total_rtensor, temp_rtensor; + four_body_header *fbh; + four_body_parameters *fbp; + bond_data *pbond_ij, *pbond_jk, *pbond_kl; + bond_order_data *bo_ij, *bo_jk, *bo_kl; + three_body_interaction_data *p_ijk, *p_jkl; + + reax_list *bonds = &( p_bonds ); + reax_list *thb_intrs = &( p_thb_intrs ); + storage *workspace = &( p_workspace ); + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + + real p_tor2 = gp.l[23]; + real p_tor3 = gp.l[24]; + real p_tor4 = gp.l[25]; + real p_cot2 = gp.l[27]; + // char fname[100]; + // FILE *ftor; + + // sprintf( fname, "tor%d.out", system->my_rank ); + // ftor = fopen( fname, "w" ); + + //natoms = system->n; + + //for( j = 0; j < natoms; ++j ) { + type_j = my_atoms[j].type; + Delta_j = workspace->Delta_boc[j]; + start_j = Dev_Start_Index(j, bonds); + end_j = Dev_End_Index(j, bonds); + + for( pk = start_j; pk < end_j; ++pk ) { + pbond_jk = &( bonds->select.bond_list[pk] ); + k = pbond_jk->nbr; + bo_jk = &( pbond_jk->bo_data ); + BOA_jk = bo_jk->BO - control->thb_cut; + + /* see if there are any 3-body interactions involving j&k + where j is the central atom. 
Otherwise there is no point in + trying to form a 4-body interaction out of this neighborhood */ + if( my_atoms[j].orig_id < my_atoms[k].orig_id && + bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) ) { + start_k = Dev_Start_Index(k, bonds); + end_k = Dev_End_Index(k, bonds); + pj = pbond_jk->sym_index; // pj points to j on k's list + + /* do the same check as above: + are there any 3-body interactions involving k&j + where k is the central atom */ + if( Dev_Num_Entries(pj, thb_intrs) ) { + type_k = my_atoms[k].type; + Delta_k = workspace->Delta_boc[k]; + r_jk = pbond_jk->d; + + start_pk = Dev_Start_Index(pk, thb_intrs ); + end_pk = Dev_End_Index(pk, thb_intrs ); + start_pj = Dev_Start_Index(pj, thb_intrs ); + end_pj = Dev_End_Index(pj, thb_intrs ); + + exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); + exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); + exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); + exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); + exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); + f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; + + + /* pick i up from j-k interaction where j is the central atom */ + for( pi = start_pk; pi < end_pk; ++pi ) { + p_ijk = &( thb_intrs->select.three_body_list[pi] ); + pij = p_ijk->pthb; // pij is pointer to i on j's bond_list + pbond_ij = &( bonds->select.bond_list[pij] ); + bo_ij = &( pbond_ij->bo_data ); + + + if( bo_ij->BO > control->thb_cut/*0*/ ) { + i = p_ijk->thb; + type_i = my_atoms[i].type; + r_ij = pbond_ij->d; + BOA_ij = bo_ij->BO - control->thb_cut; + + theta_ijk = p_ijk->theta; + sin_ijk = SIN( theta_ijk ); + cos_ijk = COS( theta_ijk ); + //tan_ijk_i = 1. 
/ TAN( theta_ijk ); + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) + tan_ijk_i = cos_ijk / MIN_SINE; + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) + tan_ijk_i = cos_ijk / -MIN_SINE; + else tan_ijk_i = cos_ijk / sin_ijk; + + exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); + exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); + + + /* pick l up from j-k interaction where k is the central atom */ + for( pl = start_pj; pl < end_pj; ++pl ) { + p_jkl = &( thb_intrs->select.three_body_list[pl] ); + l = p_jkl->thb; + plk = p_jkl->pthb; //pointer to l on k's bond_list! + pbond_kl = &( bonds->select.bond_list[plk] ); + bo_kl = &( pbond_kl->bo_data ); + type_l = my_atoms[l].type; + fbh = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)]); + fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); + + + if( i != l && fbh->cnt && + bo_kl->BO > control->thb_cut/*0*/ && + bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ + ++num_frb_intrs; + r_kl = pbond_kl->d; + BOA_kl = bo_kl->BO - control->thb_cut; + + theta_jkl = p_jkl->theta; + sin_jkl = SIN( theta_jkl ); + cos_jkl = COS( theta_jkl ); + //tan_jkl_i = 1. / TAN( theta_jkl ); + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) + tan_jkl_i = cos_jkl / MIN_SINE; + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) + tan_jkl_i = cos_jkl / -MIN_SINE; + else tan_jkl_i = cos_jkl /sin_jkl; + + rvec_ScaledSum( dvec_li, 1., my_atoms[i].x, + -1., my_atoms[l].x ); + r_li = rvec_Norm( dvec_li ); + + + /* omega and its derivative */ + omega = Calculate_Omega( pbond_ij->dvec, r_ij, + pbond_jk->dvec, r_jk, + pbond_kl->dvec, r_kl, + dvec_li, r_li, + p_ijk, p_jkl, + dcos_omega_di, dcos_omega_dj, + dcos_omega_dk, dcos_omega_dl, + NULL); + + cos_omega = COS( omega ); + cos2omega = COS( 2. * omega ); + cos3omega = COS( 3. 
* omega ); + /* end omega calculations */ + + /* torsion energy */ + exp_tor1 = EXP( fbp->p_tor1 * + SQR(2.0 - bo_jk->BO_pi - f11_DjDk) ); + exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); + exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) ); + fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * + (1.0 - exp_tor2_kl); + + CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + + fbp->V2 * exp_tor1 * (1.0 - cos2omega) + + fbp->V3 * (1.0 + cos3omega) ); + + data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV; + + dfn11 = (-p_tor3 * exp_tor3_DjDk + + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * + (2.0 + exp_tor3_DjDk) * exp_tor34_inv) * + exp_tor34_inv; + + CEtors1 = sin_ijk * sin_jkl * CV; + + CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * + (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * + sin_ijk * sin_jkl; + CEtors3 = CEtors2 * dfn11; + + CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * + (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); + CEtors5 = CEtors1 * p_tor2 * + (1.0 - exp_tor2_ij) * exp_tor2_jk * (1.0 - exp_tor2_kl); + CEtors6 = CEtors1 * p_tor2 * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * exp_tor2_kl; + + cmn = -fn10 * CV; + CEtors7 = cmn * sin_jkl * tan_ijk_i; + CEtors8 = cmn * sin_ijk * tan_jkl_i; + + CEtors9 = fn10 * sin_ijk * sin_jkl * + (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + 1.5 * fbp->V3 * (cos2omega + 2.0 * SQR(cos_omega))); + /* end of torsion energy */ + + + /* 4-body conjugation energy */ + fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; + data_e_con [j] += e_con = + fbp->p_cot1 * fn12 * + (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); + + Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * + (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl); + + CEconj1 = Cconj * (BOA_ij - 1.5e0); + CEconj2 = Cconj * (BOA_jk - 1.5e0); + CEconj3 = Cconj * (BOA_kl - 1.5e0); + + CEconj4 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; + CEconj5 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; + 
CEconj6 = 2.0 * fbp->p_cot1 * fn12 * + cos_omega * sin_ijk * sin_jkl; + /* end 4-body conjugation energy */ + + /* forces */ + /* + bo_jk->Cdbopi += CEtors2; + workspace->CdDelta[j] += CEtors3; + workspace->CdDelta[k] += CEtors3; + bo_ij->Cdbo += (CEtors4 + CEconj1); + bo_jk->Cdbo += (CEtors5 + CEconj2); + bo_kl->Cdbo += (CEtors6 + CEconj3); + */ + bo_jk->Cdbopi += CEtors2; + workspace->CdDelta[j] += CEtors3; + pbond_jk->ta_CdDelta += CEtors3; + bo_ij->Cdbo += (CEtors4 + CEconj1); + bo_jk->Cdbo += (CEtors5 + CEconj2); + atomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3)); + + if( control->virial == 0 ) { + /* dcos_theta_ijk */ + //rvec_ScaledAdd( workspace->f[i], + atomic_rvecScaledAdd( pbond_ij->ta_f, + CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_ScaledAdd( workspace->f[j], + CEtors7 + CEconj4, p_ijk->dcos_dj ); + //rvec_ScaledAdd( workspace->f[k], + atomic_rvecScaledAdd( pbond_jk->ta_f, + CEtors7 + CEconj4, p_ijk->dcos_di ); + + /* dcos_theta_jkl */ + rvec_ScaledAdd( workspace->f[j], + CEtors8 + CEconj5, p_jkl->dcos_di ); + //rvec_ScaledAdd( workspace->f[k], + atomic_rvecScaledAdd( pbond_jk->ta_f, + CEtors8 + CEconj5, p_jkl->dcos_dj ); + //rvec_ScaledAdd( workspace->f[l], + atomic_rvecScaledAdd( pbond_kl->ta_f, + CEtors8 + CEconj5, p_jkl->dcos_dk ); + + /* dcos_omega */ + //rvec_ScaledAdd( workspace->f[i], + atomic_rvecScaledAdd( pbond_ij->ta_f, + CEtors9 + CEconj6, dcos_omega_di ); + rvec_ScaledAdd( workspace->f[j], + CEtors9 + CEconj6, dcos_omega_dj ); + //rvec_ScaledAdd( workspace->f[k], + atomic_rvecScaledAdd( pbond_jk->ta_f, + CEtors9 + CEconj6, dcos_omega_dk ); + //rvec_ScaledAdd( workspace->f[l], + atomic_rvecScaledAdd( pbond_kl->ta_f, + CEtors9 + CEconj6, dcos_omega_dl ); + } + else { + ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); + + /* dcos_theta_ijk */ + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); + //rvec_Add( workspace->f[i], force ); + atomic_rvecAdd( pbond_ij->ta_f, force ); + rvec_iMultiply( ext_press, 
pbond_ij->rel_box, force ); + rvec_Add( data_ext_press [j], ext_press ); + + rvec_ScaledAdd( workspace->f[j], + CEtors7 + CEconj4, p_ijk->dcos_dj ); + + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); + //rvec_Add( workspace->f[k], force ); + atomic_rvecAdd( pbond_jk->ta_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data_ext_press[j], ext_press ); + + + /* dcos_theta_jkl */ + rvec_ScaledAdd( workspace->f[j], + CEtors8 + CEconj5, p_jkl->dcos_di ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); + //rvec_Add( workspace->f[k], force ); + atomic_rvecAdd( pbond_jk->ta_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data_ext_press [j], ext_press ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); + //rvec_Add( workspace->f[l], force ); + rvec_Add( pbond_kl->ta_f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( data_ext_press [j], ext_press ); + + + /* dcos_omega */ + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); + //rvec_Add( workspace->f[i], force ); + atomic_rvecAdd( pbond_ij->ta_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data_ext_press [j], ext_press ); + + rvec_ScaledAdd( workspace->f[j], + CEtors9 + CEconj6, dcos_omega_dj ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); + //rvec_Add( workspace->f[k], force ); + rvec_Add( pbond_jk->ta_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data_ext_press [j], ext_press ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); + //rvec_Add( workspace->f[l], force ); + rvec_Add( pbond_kl->ta_f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( data_ext_press [j], ext_press ); + } #ifdef TEST_ENERGY - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - r_ij, r_jk, r_kl, cos_ijk, cos_jkl, sin_ijk, sin_jkl ); - fprintf( out_control->etor, "%12.8f\n", dfn11 ); */ - 
/* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEtors2, CEtors3, CEtors4, CEtors5, CEtors6, - CEtors7, CEtors8, CEtors9 ); */ - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); */ - - /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n", - fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ - - fprintf(out_control->etor, - //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, - RAD2DEG(omega), BOA_jk, e_tor, data->my_en.e_tor ); - - fprintf(out_control->econ, - //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, - RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, - e_con, data->my_en.e_con ); + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + r_ij, r_jk, r_kl, cos_ijk, cos_jkl, sin_ijk, sin_jkl ); + fprintf( out_control->etor, "%12.8f\n", dfn11 ); */ + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEtors2, CEtors3, CEtors4, CEtors5, CEtors6, + CEtors7, CEtors8, CEtors9 ); */ + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); */ + + /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n", + fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ + + fprintf(out_control->etor, 
+ //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, + RAD2DEG(omega), BOA_jk, e_tor, data->my_en.e_tor ); + + fprintf(out_control->econ, + //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", + "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, + RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, + e_con, data->my_en.e_con ); #endif #ifdef TEST_FORCES - /* Torsion Forces */ - Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.0, - workspace->f_tor, workspace->f_tor ); - Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); - Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); - Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); - Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); - Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); - - rvec_ScaledAdd( workspace->f_tor[i], - CEtors7, p_ijk->dcos_dk ); - rvec_ScaledAdd( workspace->f_tor[j], - CEtors7, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_tor[k], - CEtors7, p_ijk->dcos_di ); - - rvec_ScaledAdd( workspace->f_tor[j], - CEtors8, p_jkl->dcos_di ); - rvec_ScaledAdd( workspace->f_tor[k], - CEtors8, p_jkl->dcos_dj ); - rvec_ScaledAdd( workspace->f_tor[l], - CEtors8, p_jkl->dcos_dk ); - - rvec_ScaledAdd( workspace->f_tor[i], - CEtors9, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_tor[j], - CEtors9, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_tor[k], - CEtors9, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_tor[l], - CEtors9, dcos_omega_dl ); - - /* Conjugation Forces */ - Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); - Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); - Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); - - rvec_ScaledAdd( workspace->f_con[i], 
- CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( workspace->f_con[j], - CEconj4, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_con[k], - CEconj4, p_ijk->dcos_di ); - - rvec_ScaledAdd( workspace->f_con[j], - CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( workspace->f_con[k], - CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( workspace->f_con[l], - CEconj5, p_jkl->dcos_dk ); - - rvec_ScaledAdd( workspace->f_con[i], - CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_con[j], - CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_con[k], - CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_con[l], - CEconj6, dcos_omega_dl ); + /* Torsion Forces */ + Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.0, + workspace->f_tor, workspace->f_tor ); + Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); + Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); + Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); + Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); + Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); + + rvec_ScaledAdd( workspace->f_tor[i], + CEtors7, p_ijk->dcos_dk ); + rvec_ScaledAdd( workspace->f_tor[j], + CEtors7, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_tor[k], + CEtors7, p_ijk->dcos_di ); + + rvec_ScaledAdd( workspace->f_tor[j], + CEtors8, p_jkl->dcos_di ); + rvec_ScaledAdd( workspace->f_tor[k], + CEtors8, p_jkl->dcos_dj ); + rvec_ScaledAdd( workspace->f_tor[l], + CEtors8, p_jkl->dcos_dk ); + + rvec_ScaledAdd( workspace->f_tor[i], + CEtors9, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_tor[j], + CEtors9, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_tor[k], + CEtors9, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_tor[l], + CEtors9, dcos_omega_dl ); + + /* Conjugation Forces */ + Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); + Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); + Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); + + rvec_ScaledAdd( 
workspace->f_con[i], + CEconj4, p_ijk->dcos_dk ); + rvec_ScaledAdd( workspace->f_con[j], + CEconj4, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_con[k], + CEconj4, p_ijk->dcos_di ); + + rvec_ScaledAdd( workspace->f_con[j], + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( workspace->f_con[k], + CEconj5, p_jkl->dcos_dj ); + rvec_ScaledAdd( workspace->f_con[l], + CEconj5, p_jkl->dcos_dk ); + + rvec_ScaledAdd( workspace->f_con[i], + CEconj6, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_con[j], + CEconj6, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_con[k], + CEconj6, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_con[l], + CEconj6, dcos_omega_dl ); #endif - } // pl check ends - } // pl loop ends - } // pi check ends - } // pi loop ends - } // k-j neighbor check ends - } // j<k && j-k neighbor check ends - } // pk loop ends - // } // j loop + } // pl check ends + } // pl loop ends + } // pi check ends + } // pi loop ends + } // k-j neighbor check ends + } // j<k && j-k neighbor check ends + } // pk loop ends + // } // j loop } CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *my_atoms, - storage p_workspace, - reax_list p_bonds, int N ) + storage p_workspace, + reax_list p_bonds, int N ) { - int i, pj; + int i, pj; - bond_data *pbond; - bond_data *sym_index_bond; - bond_order_data *bo_data; + bond_data *pbond; + bond_data *sym_index_bond; + bond_order_data *bo_data; - reax_list *bonds = &p_bonds; - storage *workspace = &p_workspace; + reax_list *bonds = &p_bonds; + storage *workspace = &p_workspace; - i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N) return; - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - bo_data = &pbond->bo_data; - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + pbond = 
&(bonds->select.bond_list[pj]); + bo_data = &pbond->bo_data; + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - workspace->CdDelta [i] += sym_index_bond->ta_CdDelta; + workspace->CdDelta [i] += sym_index_bond->ta_CdDelta; - bo_data->Cdbo += pbond->ta_Cdbo; + bo_data->Cdbo += pbond->ta_Cdbo; - //update f vector - //rvec_Add (my_atoms [i].f, sym_index_bond->ta_f ); - rvec_Add (workspace->f[i], sym_index_bond->ta_f ); - } + //update f vector + //rvec_Add (my_atoms [i].f, sym_index_bond->ta_f ); + rvec_Add (workspace->f[i], sym_index_bond->ta_f ); + } } diff --git a/PG-PuReMD/src/cuda_utils.cu b/PG-PuReMD/src/cuda_utils.cu index 3f304e59..dcd8d61f 100644 --- a/PG-PuReMD/src/cuda_utils.cu +++ b/PG-PuReMD/src/cuda_utils.cu @@ -2,114 +2,114 @@ extern "C" void cuda_malloc (void **ptr, int size, int memset, char *msg) { - cudaError_t retVal = cudaSuccess; - - retVal = cudaMalloc (ptr, size); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to allocate memory on device for the res: %s... exiting with code: %d size: %d \n", - msg, retVal, size); - exit (-1); - } - - if (memset) { - retVal = cudaMemset (*ptr, 0, size); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to memset memory on device for resource %s\n", - msg); - exit (-1); - } - } + cudaError_t retVal = cudaSuccess; + + retVal = cudaMalloc (ptr, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to allocate memory on device for the res: %s... 
exiting with code: %d size: %d \n", + msg, retVal, size); + exit (-1); + } + + if (memset) { + retVal = cudaMemset (*ptr, 0, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device for resource %s\n", + msg); + exit (-1); + } + } } extern "C" void cuda_free (void *ptr, char *msg) { - cudaError_t retVal = cudaSuccess; - if (!ptr) return; + cudaError_t retVal = cudaSuccess; + if (!ptr) return; - retVal = cudaFree (ptr); + retVal = cudaFree (ptr); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to release memory on device for res %s... exiting with code %d -- Address %ld\n", - msg, retVal, (long int) ptr); - return; - } + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to release memory on device for res %s... exiting with code %d -- Address %ld\n", + msg, retVal, (long int) ptr); + return; + } } extern "C" void cuda_memset (void *ptr, int data, size_t count, char *msg){ - cudaError_t retVal = cudaSuccess; - - retVal = cudaMemset (ptr, data, count); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to memset memory on device for %s, cuda code %d\n", - msg, retVal); - exit (-1); - } + cudaError_t retVal = cudaSuccess; + + retVal = cudaMemset (ptr, data, count); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device for %s, cuda code %d\n", + msg, retVal); + exit (-1); + } } extern "C" void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, char *msg) { - cudaError_t retVal = cudaErrorNotReady; - - if (dir == cudaMemcpyHostToDevice) - retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice); - else - retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost); - - if (retVal != cudaSuccess) { - fprintf (stderr, "could not copy resource %s from host to device: reason %d \n", - msg, retVal); - exit (-1); - } + cudaError_t retVal = cudaErrorNotReady; + + if (dir == cudaMemcpyHostToDevice) + retVal = cudaMemcpy (dev, host, size, 
cudaMemcpyHostToDevice); + else + retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost); + + if (retVal != cudaSuccess) { + fprintf (stderr, "could not copy resource %s from host to device: reason %d \n", + msg, retVal); + exit (-1); + } } extern "C" void copy_device (void *dest, void *src, int size, char *msg) { - cudaError_t retVal = cudaErrorNotReady; - - retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice); - if (retVal != cudaSuccess) { - fprintf (stderr, "could not copy resource %s from device to device: reason %d \n", - msg, retVal); - exit (-1); - } + cudaError_t retVal = cudaErrorNotReady; + + retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice); + if (retVal != cudaSuccess) { + fprintf (stderr, "could not copy resource %s from device to device: reason %d \n", + msg, retVal); + exit (-1); + } } extern "C" void compute_blocks ( int *blocks, int *block_size, int count ) { - *block_size = CUDA_BLOCK_SIZE; - *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1); + *block_size = CUDA_BLOCK_SIZE; + *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1); } extern "C" void compute_matvec_blocks ( int *blocks, int count ) { - *blocks = ((count * MATVEC_KER_THREADS_PER_ROW) / MATVEC_BLOCK_SIZE) + - (((count * MATVEC_KER_THREADS_PER_ROW) % MATVEC_BLOCK_SIZE) == 0 ? 0 : 1); + *blocks = ((count * MATVEC_KER_THREADS_PER_ROW) / MATVEC_BLOCK_SIZE) + + (((count * MATVEC_KER_THREADS_PER_ROW) % MATVEC_BLOCK_SIZE) == 0 ? 
0 : 1); } extern "C" void compute_nearest_pow_2 (int blocks, int *result) { - int power = 1; - while (power < blocks) power *= 2; + int power = 1; + while (power < blocks) power *= 2; - *result = power; + *result = power; } void print_info () { - size_t total, free; - cudaMemGetInfo (&free, &total); - if (cudaGetLastError () != cudaSuccess ) - { - fprintf (stderr, "Error on the memory call \n"); - return; - } - - fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", - total, total/(1024*1024), total/ (1024*1024*1024), - free, free/(1024*1024), free/ (1024*1024*1024) ); + size_t total, free; + cudaMemGetInfo (&free, &total); + if (cudaGetLastError () != cudaSuccess ) + { + fprintf (stderr, "Error on the memory call \n"); + return; + } + + fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", + total, total/(1024*1024), total/ (1024*1024*1024), + free, free/(1024*1024), free/ (1024*1024*1024) ); } extern "C" void print_device_mem_usage () { - print_info (); + print_info (); } diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda_valence_angles.cu index 18dfb16c..b7e62c90 100644 --- a/PG-PuReMD/src/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda_valence_angles.cu @@ -29,586 +29,586 @@ /* this is a 3-body interaction in which the main role is played by j which sits in the middle of the other two. 
*/ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, - global_parameters gp, - single_body_parameters *sbp, - three_body_header *d_thbh, - control_params *control, - storage p_workspace, - reax_list p_bonds, reax_list p_thb_intrs, - int n, int N, int num_atom_types, - real *data_e_ang, real *data_e_pen, real *data_e_coa, - rvec *my_ext_press - ) + global_parameters gp, + single_body_parameters *sbp, + three_body_header *d_thbh, + control_params *control, + storage p_workspace, + reax_list p_bonds, reax_list p_thb_intrs, + int n, int N, int num_atom_types, + real *data_e_ang, real *data_e_pen, real *data_e_coa, + rvec *my_ext_press + ) { - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j, start_pk, end_pk; - int cnt, num_thb_intrs; - - real temp, temp_bo_jt, pBOjt7; - real p_val1, p_val2, p_val3, p_val4, p_val5; - real p_val6, p_val7, p_val8, p_val9, p_val10; - real p_pen1, p_pen2, p_pen3, p_pen4; - real p_coa1, p_coa2, p_coa3, p_coa4; - real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; - real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; - real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj; - real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; - real CEpen1, CEpen2, CEpen3; - real e_ang, e_coa, e_pen; - real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; - real Cf7ij, Cf7jk, Cf8j, Cf9j; - real f7_ij, f7_jk, f8_Dj, f9_Dj; - real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; - real r_ij, r_jk; - real BOA_ij, BOA_jk; - rvec force, ext_press; - // rtensor temp_rtensor, total_rtensor; - - three_body_header *thbh; - three_body_parameters *thbp; - three_body_interaction_data *p_ijk, *p_kji; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - bond_order_data *bo_ij, *bo_jk, *bo_jt; - - reax_list *bonds = &( p_bonds ); - reax_list *thb_intrs = &( p_thb_intrs ); - storage *workspace = &( p_workspace ); - - /* global parameters used in these calculations */ - p_val6 = 
gp.l[14]; - p_val8 = gp.l[33]; - p_val9 = gp.l[16]; - p_val10 = gp.l[17]; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - //num_thb_intrs = j * THREE_BODY_OFFSET; - - //for( j = 0; j < system->N; ++j ) { - // fprintf( out_control->eval, "j: %d\n", j ); - type_j = my_atoms[j].type; - start_j = Dev_Start_Index(j, bonds); - end_j = Dev_End_Index(j, bonds); - - p_val3 = sbp[ type_j ].p_val3; - p_val5 = sbp[ type_j ].p_val5; - - SBOp = 0, prod_SBO = 1; - for( t = start_j; t < end_j; ++t ) { - bo_jt = &(bonds->select.bond_list[t].bo_data); - SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); - temp = SQR( bo_jt->BO ); - temp *= temp; - temp *= temp; - prod_SBO *= EXP( -temp ); - } - - /* modifications to match Adri's code - 09/01/09 */ - if( workspace->vlpex[j] >= 0 ){ - vlpadj = 0; - dSBO2 = prod_SBO - 1; - } - else{ - vlpadj = workspace->nlp[j]; - dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); - } - - SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); - dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); - - if( SBO <= 0 ) - SBO2 = 0, CSBO2 = 0; - else if( SBO > 0 && SBO <= 1 ) { - SBO2 = POW( SBO, p_val9 ); - CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); - } - else if( SBO > 1 && SBO < 2 ) { - SBO2 = 2 - POW( 2-SBO, p_val9 ); - CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); - } - else - SBO2 = 2, CSBO2 = 0; - - expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); - - for( pi = start_j; pi < end_j; ++pi ) { - - //num_thb_intrs = pi * THREE_BODY_OFFSET; - //Dev_Set_Start_Index( pi, num_thb_intrs, thb_intrs ); - num_thb_intrs = Dev_Start_Index (pi, thb_intrs); - - pbond_ij = &(bonds->select.bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; - - //TODO REMOVE THIS - //TODO REMOVE THIS - //TODO REMOVE THIS - //TODO REMOVE THIS - //TODO REMOVE THIS - - if( BOA_ij/*bo_ij->BO*/ > 0.0 && - ( j < n || pbond_ij->nbr < n ) ) { - //if( BOA_ij/*bo_ij->BO*/ > 0.0) { - i = 
pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = my_atoms[i].type; - // fprintf( out_control->eval, "i: %d\n", i ); - - - /* first copy 3-body intrs from previously computed ones where i>k. - in the second for-loop below, - we compute only new 3-body intrs where i < k */ - - for( pk = start_j; pk < pi; ++pk ) { - // fprintf( out_control->eval, "pk: %d\n", pk ); - start_pk = Dev_Start_Index( pk, thb_intrs ); - end_pk = Dev_End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_intrs->select.three_body_list[t].thb == i ) { - p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] ); - p_kji = &(thb_intrs->select.three_body_list[t]); - - p_ijk->thb = bonds->select.bond_list[pk].nbr; - p_ijk->pthb = pk; - p_ijk->theta = p_kji->theta; - rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); - rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); - rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); - - ++num_thb_intrs; - break; - } - } - - - - /* and this is the second for loop mentioned above */ - for( pk = pi+1; pk < end_j; ++pk ) { - //for( pk = start_j; pk < end_j; ++pk ) { - if (pk == pi) continue; - pbond_jk = &(bonds->select.bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; - k = pbond_jk->nbr; - type_k = my_atoms[k].type; - p_ijk = &( thb_intrs->select.three_body_list[num_thb_intrs] ); - - //CHANGE ORIGINAL - if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue; - //if ((BOA_jk <= 0) ) continue; - //CHANGE ORIGINAL - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &theta, &cos_theta ); - - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &(p_ijk->dcos_di), &(p_ijk->dcos_dj), - &(p_ijk->dcos_dk) ); - p_ijk->thb = k; - p_ijk->pthb = pk; - p_ijk->theta = theta; - - sin_theta = SIN( theta ); - if( sin_theta < 1.0e-5 ) - sin_theta = 1.0e-5; - - ++num_thb_intrs; - - - if( (j < n) && (BOA_jk > 0.0) && - (bo_ij->BO * bo_jk->BO > SQR(control->thb_cut)/*0*/) ) { - 
r_jk = pbond_jk->d; - thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] ); - - /* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id ) - fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - bo_ij->BO, bo_jk->BO, p_ijk->theta ); - else - fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - system->my_atoms[k].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[i].orig_id, - bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ - - for( cnt = 0; cnt < thbh->cnt; ++cnt ) { - // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n", - // i+1, j+1, k+1 ); - - if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { - thbp = &( thbh->prm[cnt] ); - - /* ANGLE ENERGY */ - p_val1 = thbp->p_val1; - p_val2 = thbp->p_val2; - p_val4 = thbp->p_val4; - p_val7 = thbp->p_val7; - theta_00 = thbp->theta_00; - - exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); - f7_ij = 1.0 - exp3ij; - Cf7ij = p_val3 * p_val4 * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; - - exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); - f7_jk = 1.0 - exp3jk; - Cf7jk = p_val3 * p_val4 * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; - - expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); - trm8 = 1.0 + expval6 + expval7; - f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); - Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * - ( p_val6 * expval6 * trm8 - - (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) ); - - theta_0 = 180.0 - theta_00 * (1.0 - - EXP(-p_val10 * (2.0 - SBO2))); - theta_0 = DEG2RAD( theta_0 ); - - expval2theta = EXP( -p_val2 * SQR(theta_0 - theta) ); - if( p_val1 >= 0 ) - expval12theta = p_val1 * (1.0 - expval2theta); - else // To avoid linear Me-H-Me angles (6/6/06) - expval12theta = p_val1 * -expval2theta; - - CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; - CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; - CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; - CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * 
f7_jk * f8_Dj * - expval2theta * (theta_0 - theta); - - Ctheta_0 = p_val10 * DEG2RAD(theta_00) * - exp( -p_val10 * (2.0 - SBO2) ); - - CEval5 = -CEval4 * Ctheta_0 * CSBO2; - CEval6 = CEval5 * dSBO1; - CEval7 = CEval5 * dSBO2; - CEval8 = -CEval4 / sin_theta; - - data_e_ang [j] += e_ang = - f7_ij * f7_jk * f8_Dj * expval12theta; - /* END ANGLE ENERGY*/ - - - /* PENALTY ENERGY */ - p_pen1 = thbp->p_pen1; - p_pen2 = gp.l[19]; - p_pen3 = gp.l[20]; - p_pen4 = gp.l[21]; - - exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); - exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); - exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); - exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); - trm_pen34 = 1.0 + exp_pen3 + exp_pen4; - f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; - Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 - - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + - p_pen4 * exp_pen4 ) ) / - SQR( trm_pen34 ); - - data_e_pen [j] += e_pen = - p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; - - CEpen1 = e_pen * Cf9j / f9_Dj; - temp = -2.0 * p_pen2 * e_pen; - CEpen2 = temp * (BOA_ij - 2.0); - CEpen3 = temp * (BOA_jk - 2.0); - /* END PENALTY ENERGY */ - - - /* COALITION ENERGY */ - p_coa1 = thbp->p_coa1; - p_coa2 = gp.l[2]; - p_coa3 = gp.l[38]; - p_coa4 = gp.l[30]; - - exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); - data_e_coa [j] += e_coa = - p_coa1 / (1. 
+ exp_coa2) * - EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) * - EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) * - EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * - EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); - - CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; - CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; - CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2); - CEcoa4 = -2 * p_coa3 * - (workspace->total_bond_order[i]-BOA_ij) * e_coa; - CEcoa5 = -2 * p_coa3 * - (workspace->total_bond_order[k]-BOA_jk) * e_coa; - /* END COALITION ENERGY */ - - /* FORCES */ - /* - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); - workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); - workspace->CdDelta[i] += CEcoa4; - workspace->CdDelta[k] += CEcoa5; - */ - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); - workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); - pbond_ij->va_CdDelta += CEcoa4; - pbond_jk->va_CdDelta += CEcoa5; - - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bonds->select.bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - // fprintf( out_control->eval, "%6d%12.8f\n", - // workspace->reverse_map[bonds->select.bond_list[t].nbr], - // (CEval6 * pBOjt7) ); - - bo_jt->Cdbo += (CEval6 * pBOjt7); - bo_jt->Cdbopi += CEval5; - bo_jt->Cdbopi2 += CEval5; - } - - - if( control->virial == 0 ) { - /* - rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk ); - */ - - rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk ); - } - else { - /* terms not related to bond order derivatives 
are - added directly into forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - //rvec_Add( workspace->f[i], force ); - rvec_Add( pbond_ij->va_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //rvec_Add( data->my_ext_press, ext_press ); - rvec_Add( my_ext_press [j], ext_press ); - - rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, CEval8, p_ijk->dcos_dk ); - //rvec_Add( workspace->f[k], force ); - rvec_Add( pbond_jk->va_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( my_ext_press [j], ext_press ); - } + int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j, start_pk, end_pk; + int cnt, num_thb_intrs; + + real temp, temp_bo_jt, pBOjt7; + real p_val1, p_val2, p_val3, p_val4, p_val5; + real p_val6, p_val7, p_val8, p_val9, p_val10; + real p_pen1, p_pen2, p_pen3, p_pen4; + real p_coa1, p_coa2, p_coa3, p_coa4; + real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; + real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; + real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj; + real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; + real CEpen1, CEpen2, CEpen3; + real e_ang, e_coa, e_pen; + real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; + real Cf7ij, Cf7jk, Cf8j, Cf9j; + real f7_ij, f7_jk, f8_Dj, f9_Dj; + real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; + real r_ij, r_jk; + real BOA_ij, BOA_jk; + rvec force, ext_press; + // rtensor temp_rtensor, total_rtensor; + + three_body_header *thbh; + three_body_parameters *thbp; + three_body_interaction_data *p_ijk, *p_kji; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; + + reax_list *bonds = &( p_bonds ); + reax_list *thb_intrs = &( p_thb_intrs ); + storage *workspace = &( p_workspace ); + + /* global parameters used in these calculations */ + p_val6 = gp.l[14]; + p_val8 = gp.l[33]; + 
p_val9 = gp.l[16]; + p_val10 = gp.l[17]; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + //num_thb_intrs = j * THREE_BODY_OFFSET; + + //for( j = 0; j < system->N; ++j ) { + // fprintf( out_control->eval, "j: %d\n", j ); + type_j = my_atoms[j].type; + start_j = Dev_Start_Index(j, bonds); + end_j = Dev_End_Index(j, bonds); + + p_val3 = sbp[ type_j ].p_val3; + p_val5 = sbp[ type_j ].p_val5; + + SBOp = 0, prod_SBO = 1; + for( t = start_j; t < end_j; ++t ) { + bo_jt = &(bonds->select.bond_list[t].bo_data); + SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); + temp = SQR( bo_jt->BO ); + temp *= temp; + temp *= temp; + prod_SBO *= EXP( -temp ); + } + + /* modifications to match Adri's code - 09/01/09 */ + if( workspace->vlpex[j] >= 0 ){ + vlpadj = 0; + dSBO2 = prod_SBO - 1; + } + else{ + vlpadj = workspace->nlp[j]; + dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); + } + + SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); + dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); + + if( SBO <= 0 ) + SBO2 = 0, CSBO2 = 0; + else if( SBO > 0 && SBO <= 1 ) { + SBO2 = POW( SBO, p_val9 ); + CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); + } + else if( SBO > 1 && SBO < 2 ) { + SBO2 = 2 - POW( 2-SBO, p_val9 ); + CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); + } + else + SBO2 = 2, CSBO2 = 0; + + expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); + + for( pi = start_j; pi < end_j; ++pi ) { + + //num_thb_intrs = pi * THREE_BODY_OFFSET; + //Dev_Set_Start_Index( pi, num_thb_intrs, thb_intrs ); + num_thb_intrs = Dev_Start_Index (pi, thb_intrs); + + pbond_ij = &(bonds->select.bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; + + //TODO REMOVE THIS + //TODO REMOVE THIS + //TODO REMOVE THIS + //TODO REMOVE THIS + //TODO REMOVE THIS + + if( BOA_ij/*bo_ij->BO*/ > 0.0 && + ( j < n || pbond_ij->nbr < n ) ) { + //if( BOA_ij/*bo_ij->BO*/ > 0.0) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + 
type_i = my_atoms[i].type; + // fprintf( out_control->eval, "i: %d\n", i ); + + + /* first copy 3-body intrs from previously computed ones where i>k. + in the second for-loop below, + we compute only new 3-body intrs where i < k */ + + for( pk = start_j; pk < pi; ++pk ) { + // fprintf( out_control->eval, "pk: %d\n", pk ); + start_pk = Dev_Start_Index( pk, thb_intrs ); + end_pk = Dev_End_Index( pk, thb_intrs ); + + for( t = start_pk; t < end_pk; ++t ) + if( thb_intrs->select.three_body_list[t].thb == i ) { + p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] ); + p_kji = &(thb_intrs->select.three_body_list[t]); + + p_ijk->thb = bonds->select.bond_list[pk].nbr; + p_ijk->pthb = pk; + p_ijk->theta = p_kji->theta; + rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); + rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); + rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); + + ++num_thb_intrs; + break; + } + } + + + + /* and this is the second for loop mentioned above */ + for( pk = pi+1; pk < end_j; ++pk ) { + //for( pk = start_j; pk < end_j; ++pk ) { + if (pk == pi) continue; + pbond_jk = &(bonds->select.bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; + k = pbond_jk->nbr; + type_k = my_atoms[k].type; + p_ijk = &( thb_intrs->select.three_body_list[num_thb_intrs] ); + + //CHANGE ORIGINAL + if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue; + //if ((BOA_jk <= 0) ) continue; + //CHANGE ORIGINAL + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &theta, &cos_theta ); + + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &(p_ijk->dcos_di), &(p_ijk->dcos_dj), + &(p_ijk->dcos_dk) ); + p_ijk->thb = k; + p_ijk->pthb = pk; + p_ijk->theta = theta; + + sin_theta = SIN( theta ); + if( sin_theta < 1.0e-5 ) + sin_theta = 1.0e-5; + + ++num_thb_intrs; + + + if( (j < n) && (BOA_jk > 0.0) && + (bo_ij->BO * bo_jk->BO > SQR(control->thb_cut)/*0*/) ) { + r_jk = pbond_jk->d; + thbh = &( d_thbh[ 
index_thbp (type_i,type_j,type_k,num_atom_types) ] ); + + /* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id ) + fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + system->my_atoms[i].orig_id, + system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + bo_ij->BO, bo_jk->BO, p_ijk->theta ); + else + fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + system->my_atoms[k].orig_id, + system->my_atoms[j].orig_id, + system->my_atoms[i].orig_id, + bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ + + for( cnt = 0; cnt < thbh->cnt; ++cnt ) { + // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n", + // i+1, j+1, k+1 ); + + if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { + thbp = &( thbh->prm[cnt] ); + + /* ANGLE ENERGY */ + p_val1 = thbp->p_val1; + p_val2 = thbp->p_val2; + p_val4 = thbp->p_val4; + p_val7 = thbp->p_val7; + theta_00 = thbp->theta_00; + + exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); + f7_ij = 1.0 - exp3ij; + Cf7ij = p_val3 * p_val4 * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; + + exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); + f7_jk = 1.0 - exp3jk; + Cf7jk = p_val3 * p_val4 * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; + + expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); + trm8 = 1.0 + expval6 + expval7; + f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); + Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * + ( p_val6 * expval6 * trm8 - + (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) ); + + theta_0 = 180.0 - theta_00 * (1.0 - + EXP(-p_val10 * (2.0 - SBO2))); + theta_0 = DEG2RAD( theta_0 ); + + expval2theta = EXP( -p_val2 * SQR(theta_0 - theta) ); + if( p_val1 >= 0 ) + expval12theta = p_val1 * (1.0 - expval2theta); + else // To avoid linear Me-H-Me angles (6/6/06) + expval12theta = p_val1 * -expval2theta; + + CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; + CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; + CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; + CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * + expval2theta * (theta_0 
- theta); + + Ctheta_0 = p_val10 * DEG2RAD(theta_00) * + exp( -p_val10 * (2.0 - SBO2) ); + + CEval5 = -CEval4 * Ctheta_0 * CSBO2; + CEval6 = CEval5 * dSBO1; + CEval7 = CEval5 * dSBO2; + CEval8 = -CEval4 / sin_theta; + + data_e_ang [j] += e_ang = + f7_ij * f7_jk * f8_Dj * expval12theta; + /* END ANGLE ENERGY*/ + + + /* PENALTY ENERGY */ + p_pen1 = thbp->p_pen1; + p_pen2 = gp.l[19]; + p_pen3 = gp.l[20]; + p_pen4 = gp.l[21]; + + exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); + exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); + exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); + exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); + trm_pen34 = 1.0 + exp_pen3 + exp_pen4; + f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; + Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 - + (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + + p_pen4 * exp_pen4 ) ) / + SQR( trm_pen34 ); + + data_e_pen [j] += e_pen = + p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; + + CEpen1 = e_pen * Cf9j / f9_Dj; + temp = -2.0 * p_pen2 * e_pen; + CEpen2 = temp * (BOA_ij - 2.0); + CEpen3 = temp * (BOA_jk - 2.0); + /* END PENALTY ENERGY */ + + + /* COALITION ENERGY */ + p_coa1 = thbp->p_coa1; + p_coa2 = gp.l[2]; + p_coa3 = gp.l[38]; + p_coa4 = gp.l[30]; + + exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); + data_e_coa [j] += e_coa = + p_coa1 / (1. 
+ exp_coa2) * + EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) * + EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) * + EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * + EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); + + CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; + CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; + CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2); + CEcoa4 = -2 * p_coa3 * + (workspace->total_bond_order[i]-BOA_ij) * e_coa; + CEcoa5 = -2 * p_coa3 * + (workspace->total_bond_order[k]-BOA_jk) * e_coa; + /* END COALITION ENERGY */ + + /* FORCES */ + /* + bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); + bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); + workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); + workspace->CdDelta[i] += CEcoa4; + workspace->CdDelta[k] += CEcoa5; + */ + bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); + bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); + workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); + pbond_ij->va_CdDelta += CEcoa4; + pbond_jk->va_CdDelta += CEcoa5; + + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bonds->select.bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + // fprintf( out_control->eval, "%6d%12.8f\n", + // workspace->reverse_map[bonds->select.bond_list[t].nbr], + // (CEval6 * pBOjt7) ); + + bo_jt->Cdbo += (CEval6 * pBOjt7); + bo_jt->Cdbopi += CEval5; + bo_jt->Cdbopi2 += CEval5; + } + + + if( control->virial == 0 ) { + /* + rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk ); + */ + + rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk ); + } + else { + /* terms not related to bond order derivatives 
are + added directly into forces and pressure vector/tensor */ + rvec_Scale( force, CEval8, p_ijk->dcos_di ); + //rvec_Add( workspace->f[i], force ); + rvec_Add( pbond_ij->va_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //rvec_Add( data->my_ext_press, ext_press ); + rvec_Add( my_ext_press [j], ext_press ); + + rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); + + rvec_Scale( force, CEval8, p_ijk->dcos_dk ); + //rvec_Add( workspace->f[k], force ); + rvec_Add( pbond_jk->va_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( my_ext_press [j], ext_press ); + } #ifdef TEST_ENERGY - /*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n", - p_val3, p_val4, BOA_ij, BOA_jk ); - fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n", - workspace->Delta_e[j], workspace->vlpex[j], - dSBO1, dSBO2, vlpadj ); - fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n", - f7_ij, f7_jk, f8_Dj, expval12theta ); - fprintf( out_control->eval, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEval1, CEval2, CEval3, CEval4, - CEval5, CEval6, CEval7, CEval8 ); - - fprintf( out_control->eval, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta, - p_ijk->dcos_di[2]/sin_theta, - p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta, - p_ijk->dcos_dj[2]/sin_theta, - p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta, - p_ijk->dcos_dk[2]/sin_theta); - - fprintf( out_control->eval, - "%6d%6d%6d%15.8f%15.8f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), e_ang );*/ - - fprintf( out_control->eval, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), theta_0, BOA_ij, BOA_jk, - e_ang, 
data->my_en.e_ang ); - - fprintf( out_control->epen, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), BOA_ij, BOA_jk, e_pen, - data->my_en.e_pen ); - - fprintf( out_control->ecoa, - //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - RAD2DEG(theta), BOA_ij, BOA_jk, - e_coa, data->my_en.e_coa ); + /*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n", + p_val3, p_val4, BOA_ij, BOA_jk ); + fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n", + workspace->Delta_e[j], workspace->vlpex[j], + dSBO1, dSBO2, vlpadj ); + fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n", + f7_ij, f7_jk, f8_Dj, expval12theta ); + fprintf( out_control->eval, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEval1, CEval2, CEval3, CEval4, + CEval5, CEval6, CEval7, CEval8 ); + + fprintf( out_control->eval, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta, + p_ijk->dcos_di[2]/sin_theta, + p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta, + p_ijk->dcos_dj[2]/sin_theta, + p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta, + p_ijk->dcos_dk[2]/sin_theta); + + fprintf( out_control->eval, + "%6d%6d%6d%15.8f%15.8f\n", + system->my_atoms[i].orig_id, + system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + RAD2DEG(theta), e_ang );*/ + + fprintf( out_control->eval, + //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", + "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, + system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + RAD2DEG(theta), theta_0, BOA_ij, BOA_jk, + e_ang, data->my_en.e_ang ); + + fprintf( out_control->epen, + 
//"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, + system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + RAD2DEG(theta), BOA_ij, BOA_jk, e_pen, + data->my_en.e_pen ); + + fprintf( out_control->ecoa, + //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n", + system->my_atoms[i].orig_id, + system->my_atoms[j].orig_id, + system->my_atoms[k].orig_id, + RAD2DEG(theta), BOA_ij, BOA_jk, + e_coa, data->my_en.e_coa ); #endif #ifdef TEST_FORCES /* angle forces */ - Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); - Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); - Add_dDelta( system, lists, j, - CEval3 + CEval7, workspace->f_ang ); - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bonds->select.bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - Add_dBO( system, lists, j, t, pBOjt7 * CEval6, - workspace->f_ang ); - Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5, - workspace->f_ang, workspace->f_ang ); - } - - rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); - /* end angle forces */ - - /* penalty forces */ - Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); - Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); - Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); - /* end penalty forces */ - - /* coalition forces */ - Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4, - workspace->f_coa ); - Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5, - workspace->f_coa ); - Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); - Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); - Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); - /* end coalition forces 
*/ + Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); + Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); + Add_dDelta( system, lists, j, + CEval3 + CEval7, workspace->f_ang ); + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bonds->select.bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + Add_dBO( system, lists, j, t, pBOjt7 * CEval6, + workspace->f_ang ); + Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5, + workspace->f_ang, workspace->f_ang ); + } + + rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); + /* end angle forces */ + + /* penalty forces */ + Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); + Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); + Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); + /* end penalty forces */ + + /* coalition forces */ + Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4, + workspace->f_coa ); + Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5, + workspace->f_coa ); + Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); + Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); + Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); + /* end coalition forces */ #endif - } - } - } - } - } + } + } + } + } + } - Dev_Set_End_Index(pi, num_thb_intrs, thb_intrs ); - } - // } CUDA Commented - } + Dev_Set_End_Index(pi, num_thb_intrs, thb_intrs ); + } + // } CUDA Commented + } - CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess ( reax_atom *atoms, control_params *control, - storage p_workspace, - reax_list p_bonds, int N ) - { - int i, pj; + CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess ( reax_atom *atoms, control_params *control, + storage p_workspace, + reax_list p_bonds, int N ) + { + int i, pj; - bond_data *pbond; - bond_data 
*sym_index_bond; - reax_list *bonds = &p_bonds; - storage *workspace = &p_workspace; + bond_data *pbond; + bond_data *sym_index_bond; + reax_list *bonds = &p_bonds; + storage *workspace = &p_workspace; - i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N) return; - for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ + for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + pbond = &(bonds->select.bond_list[pj]); + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - workspace->CdDelta [i] += sym_index_bond->va_CdDelta; + workspace->CdDelta [i] += sym_index_bond->va_CdDelta; - //rvec_Add (atoms[i].f, sym_index_bond->va_f ); - rvec_Add (workspace->f[i], sym_index_bond->va_f ); - } - } + //rvec_Add (atoms[i].f, sym_index_bond->va_f ); + rvec_Add (workspace->f[i], sym_index_bond->va_f ); + } + } - // THREE BODY ESTIMATION HERE - CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, - control_params *control, - reax_list p_bonds, - int n, int N, - int *count - ) - { - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j; - int cnt, num_thb_intrs; + // THREE BODY ESTIMATION HERE + CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, + control_params *control, + reax_list p_bonds, + int n, int N, + int *count + ) + { + int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j; + int cnt, num_thb_intrs; - real r_ij, r_jk; - real BOA_ij, BOA_jk; + real r_ij, r_jk; + real BOA_ij, BOA_jk; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - bond_order_data *bo_ij, *bo_jk, *bo_jt; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; - reax_list *bonds = &( p_bonds ); + reax_list *bonds = &( p_bonds ); - j = blockIdx.x * blockDim.x + 
threadIdx.x; - if (j >= N) return; + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; - type_j = my_atoms[j].type; - start_j = Dev_Start_Index(j, bonds); - end_j = Dev_End_Index(j, bonds); + type_j = my_atoms[j].type; + start_j = Dev_Start_Index(j, bonds); + end_j = Dev_End_Index(j, bonds); - for( pi = start_j; pi < end_j; ++pi ) { + for( pi = start_j; pi < end_j; ++pi ) { - num_thb_intrs = 0; - count[ pi ] = 0; + num_thb_intrs = 0; + count[ pi ] = 0; - pbond_ij = &(bonds->select.bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; + pbond_ij = &(bonds->select.bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; - if( BOA_ij/*bo_ij->BO*/ > 0.0 && - ( j < n || pbond_ij->nbr < n ) ) { - //if( BOA_ij/*bo_ij->BO*/ > 0.0) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = my_atoms[i].type; + if( BOA_ij/*bo_ij->BO*/ > 0.0 && + ( j < n || pbond_ij->nbr < n ) ) { + //if( BOA_ij/*bo_ij->BO*/ > 0.0) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = my_atoms[i].type; - for( pk = start_j; pk < end_j; ++pk ) { - if (pk == pi) continue; + for( pk = start_j; pk < end_j; ++pk ) { + if (pk == pi) continue; - pbond_jk = &(bonds->select.bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; + pbond_jk = &(bonds->select.bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; - //CHANGE ORIGINAL - //if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue; - if ((BOA_jk <= 0) ) continue; - //CHANGE ORIGINAL + //CHANGE ORIGINAL + //if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue; + if ((BOA_jk <= 0) ) continue; + //CHANGE ORIGINAL - ++num_thb_intrs; - } + ++num_thb_intrs; + } - } + } - count[ pi ] = num_thb_intrs; - } - } + count[ pi ] = num_thb_intrs; + } + } diff --git a/PG-PuReMD/src/dev_alloc.cu b/PG-PuReMD/src/dev_alloc.cu index 72ae58e7..b0a76a21 100644 --- a/PG-PuReMD/src/dev_alloc.cu +++ 
b/PG-PuReMD/src/dev_alloc.cu @@ -7,403 +7,403 @@ extern "C" { - int dev_alloc_control (control_params *control) - { - cuda_malloc ((void **)&control->d_control_params, sizeof (control_params), 1, "control_params"); - copy_host_device (control, control->d_control_params, sizeof (control_params), cudaMemcpyHostToDevice, "control_params"); - } - - CUDA_GLOBAL void Init_Nbrs(ivec *nbrs, int N) - { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= N) return; - - nbrs[index][0] = -1; - nbrs[index][1] = -1; - nbrs[index][2] = -1; - } - - - int dev_alloc_grid (reax_system *system) - { - int total; - grid_cell local_cell; - grid *host = &system->my_grid; - grid *device = &system->d_my_grid; - ivec *nbrs_x = (ivec *) scratch; - - total = host->ncells[0] * host->ncells[1] * host->ncells[2]; - ivec_Copy (device->ncells, host->ncells); - rvec_Copy (device->cell_len, host->cell_len); - rvec_Copy (device->inv_len, host->inv_len); - - ivec_Copy (device->bond_span, host->bond_span ); - ivec_Copy (device->nonb_span, host->nonb_span ); - ivec_Copy (device->vlist_span, host->vlist_span ); - - ivec_Copy (device->native_cells, host->native_cells ); - ivec_Copy (device->native_str, host->native_str ); - ivec_Copy (device->native_end, host->native_end ); - - device->ghost_cut = host->ghost_cut; - ivec_Copy (device->ghost_span, host->ghost_span ); - ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span ); - ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span ); - ivec_Copy (device->ghost_bond_span, host->ghost_bond_span ); - - cuda_malloc ((void **) &device->str, sizeof (int) * total, 1, "grid:str"); - cuda_malloc ((void **) &device->end, sizeof (int) * total, 1, "grid:end"); - cuda_malloc ((void **) &device->cutoff, sizeof (real) * total, 1, "grid:cutoff"); - cuda_malloc ((void **) &device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, 1, "grid:nbrs_x"); - cuda_malloc ((void **) &device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, 1, "grid:nbrs_cp"); - 
cuda_malloc ((void **) &device->rel_box, sizeof (ivec) * total, 1, "grid:rel_box"); - - /* - int block_size = 512; - int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); - - Init_Nbrs <<<blocks, block_size>>> - (nbrs_x, host->max_nbrs ); - cudaThreadSynchronize (); - cudaCheckError (); - - cuda_malloc ((void **)& device->cells, - sizeof (grid_cell) * total, - 1, "grid:cells"); - fprintf (stderr, " Device cells address --> %ld \n", device->cells ); - cuda_malloc ((void **) &device->order, sizeof (ivec) * (host->total + 1), 1, "grid:order"); - - local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0; - fprintf (stderr, "Total cells to be allocated -- > %d \n", total ); - for (int i = 0; i < total; i++) { - //fprintf (stderr, "Address of the local atom -> %ld \n", &local_cell); - - cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * host->max_atoms, - 1, "alloc:grid:cells:atoms"); - //fprintf (stderr, "Allocated address of the atoms --> %ld (%d)\n", local_cell.atoms, host->max_atoms ); - - cuda_malloc ((void **) &local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, - 1, "alloc:grid:cells:nbrs_x" ); - copy_device (local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof (ivec), "grid:nbrs_x"); - //fprintf (stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x); - - cuda_malloc ((void **) &local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, - 1, "alloc:grid:cells:nbrs_cp" ); - //fprintf (stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp); - - //cuda_malloc ((void **) &local_cell.nbrs, sizeof (grid_cell *) * host->max_nbrs , - // 1, "alloc:grid:cells:nbrs" ); - //fprintf (stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs); - - copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-alloc"); - } - */ - - return SUCCESS; - } - - int dev_dealloc_grid_cell_atoms (reax_system *system) - { - int total; - grid_cell 
local_cell; - grid *host = &system->my_grid; - grid *device = &system->d_my_grid; - - total = host->ncells[0] * host->ncells[1] * host->ncells[2]; - - - for (int i = 0; i < total; i++) { - copy_host_device (&local_cell, &device->cells[i], - sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc"); - cuda_free (local_cell.atoms, "grid_cell:atoms" ); - } - } - - int dev_alloc_grid_cell_atoms (reax_system *system, int cap) - { - int total; - grid_cell local_cell; - grid *host = &system->my_grid; - grid *device = &system->d_my_grid; - - total = host->ncells[0] * host->ncells[1] * host->ncells[2]; - - for (int i = 0; i < total; i++) { - copy_host_device (&local_cell, &device->cells[i], - sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc"); - cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * cap, - 1, "realloc:grid:cells:atoms"); - copy_host_device (&local_cell, &device->cells[i], - sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc"); - } - } - - - int dev_alloc_system (reax_system *system) - { - cuda_malloc ( (void **) &system->d_my_atoms, system->total_cap * sizeof (reax_atom), 1, "system:d_my_atoms"); - //fprintf (stderr, "p:%d - allocated atoms : %d (%ld, %ld) \n", system->my_rank, system->total_cap, - // system->my_atoms, system->d_my_atoms); - - //simulation boxes - cuda_malloc ( (void **) &system->d_big_box, sizeof (simulation_box), 1, "system:d_big_box"); - cuda_malloc ( (void **) &system->d_my_box, sizeof (simulation_box), 1, "system:d_my_box"); - cuda_malloc ( (void **) &system->d_my_ext_box, sizeof (simulation_box), 1, "d_my_ext_box"); - - //interaction parameters - cuda_malloc ((void **) &system->reax_param.d_sbp, system->reax_param.num_atom_types * sizeof (single_body_parameters), - 1, "system:d_sbp"); - - cuda_malloc ((void **) &system->reax_param.d_tbp, pow (system->reax_param.num_atom_types, 2) * sizeof (two_body_parameters), - 1, "system:d_tbp"); - - cuda_malloc ((void **) &system->reax_param.d_thbp, pow 
(system->reax_param.num_atom_types, 3) * sizeof (three_body_header), - 1, "system:d_thbp"); - - cuda_malloc ((void **) &system->reax_param.d_hbp, pow (system->reax_param.num_atom_types, 3) * sizeof (hbond_parameters), - 1, "system:d_hbp"); - - cuda_malloc ((void **) &system->reax_param.d_fbp, pow (system->reax_param.num_atom_types, 4) * sizeof (four_body_header), - 1, "system:d_fbp"); - - cuda_malloc ((void **) &system->reax_param.d_gp.l, system->reax_param.gp.n_global * sizeof (real), 1, "system:d_gp.l"); - - system->reax_param.d_gp.n_global = 0; - system->reax_param.d_gp.vdw_type = 0; - - return SUCCESS; - } - - int dev_realloc_system (reax_system *system, int local_cap, int total_cap, char *msg) - { - //free the existing storage for atoms - cuda_free (system->d_my_atoms, "system:d_my_atoms"); - - cuda_malloc ((void **) &system->d_my_atoms, sizeof (reax_atom) * total_cap, - 1, "system:d_my_atoms"); - return FAILURE; - } - - - int dev_alloc_simulation_data(simulation_data *data) - { - cuda_malloc ((void **) &(data->d_simulation_data), sizeof (simulation_data), 1, "simulation_data"); - return SUCCESS; - } - - int dev_alloc_workspace (reax_system *system, control_params *control, - storage *workspace, int local_cap, int total_cap, - char *msg) - { - int i, total_real, total_rvec, local_int, local_real, local_rvec; - - workspace->allocated = 1; - total_real = total_cap * sizeof(real); - total_rvec = total_cap * sizeof(rvec); - local_int = local_cap * sizeof(int); - local_real = local_cap * sizeof(real); - local_rvec = local_cap * sizeof(rvec); - - /* communication storage */ - /* - workspace->tmp_dbl = NULL; - workspace->tmp_rvec = NULL; - workspace->tmp_rvec2 = NULL; - */ - - //fprintf (stderr, "Deltap and TOTAL BOND ORDER size --> %d \n", total_cap ); - - /* bond order related storage */ - cuda_malloc ((void **) &workspace->within_bond_box, total_cap * sizeof (int), 1, "skin"); - cuda_malloc ((void **) &workspace->total_bond_order, total_real, 1, "total_bo"); - 
cuda_malloc ((void **) &workspace->Deltap, total_real, 1, "Deltap"); - cuda_malloc ((void **) &workspace->Deltap_boc, total_real, 1, "Deltap_boc"); - cuda_malloc ((void **) &workspace->dDeltap_self, total_rvec, 1, "dDeltap_self"); - cuda_malloc ((void **) &workspace->Delta, total_real, 1, "Delta" ); - cuda_malloc ((void **) &workspace->Delta_lp, total_real, 1, "Delta_lp" ); - cuda_malloc ((void **) &workspace->Delta_lp_temp, total_real, 1, "Delta_lp_temp" ); - cuda_malloc ((void **) &workspace->dDelta_lp, total_real, 1, "Delta_lp_temp" ); - cuda_malloc ((void **) &workspace->dDelta_lp_temp, total_real, 1, "dDelta_lp_temp" ); - cuda_malloc ((void **) &workspace->Delta_e, total_real, 1, "Delta_e" ); - cuda_malloc ((void **) &workspace->Delta_boc, total_real, 1, "Delta_boc"); - cuda_malloc ((void **) &workspace->nlp, total_real, 1, "nlp"); - cuda_malloc ((void **) &workspace->nlp_temp, total_real, 1, "nlp_temp"); - cuda_malloc ((void **) &workspace->Clp, total_real, 1, "Clp"); - cuda_malloc ((void **) &workspace->vlpex, total_real, 1, "vlpex"); - cuda_malloc ((void **) &workspace->bond_mark, total_real, 1, "bond_mark"); - cuda_malloc ((void **) &workspace->done_after, total_real, 1, "done_after"); - - - /* QEq storage */ - cuda_malloc ((void **) &workspace->Hdia_inv, total_cap * sizeof (real), 1, "Hdia_inv"); - cuda_malloc ((void **) &workspace->b_s, total_cap * sizeof (real), 1, "b_s"); - cuda_malloc ((void **) &workspace->b_t, total_cap * sizeof (real), 1, "b_t"); - cuda_malloc ((void **) &workspace->b_prc, total_cap * sizeof (real), 1, "b_prc"); - cuda_malloc ((void **) &workspace->b_prm, total_cap * sizeof (real), 1, "b_prm"); - cuda_malloc ((void **) &workspace->s, total_cap * sizeof (real), 1, "s"); - cuda_malloc ((void **) &workspace->t, total_cap * sizeof (real), 1, "t"); - cuda_malloc ((void **) &workspace->droptol, total_cap * sizeof (real), 1, "droptol"); - cuda_malloc ((void **) &workspace->b, total_cap * sizeof (rvec2), 1, "b"); - cuda_malloc ((void **) 
&workspace->x, total_cap * sizeof (rvec2), 1, "x"); - - /* GMRES storage */ - cuda_malloc ((void **) &workspace->y, (RESTART+1)*sizeof (real), 1, "y"); - cuda_malloc ((void **) &workspace->z, (RESTART+1)*sizeof (real), 1, "z"); - cuda_malloc ((void **) &workspace->g, (RESTART+1)*sizeof (real), 1, "g"); - cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "h"); - cuda_malloc ((void **) &workspace->hs, (RESTART+1)*sizeof (real), 1, "hs"); - cuda_malloc ((void **) &workspace->hc, (RESTART+1)*sizeof (real), 1, "hc"); - cuda_malloc ((void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "v"); - - /* CG storage */ - cuda_malloc ((void **) &workspace->r, total_cap * sizeof (real), 1, "r"); - cuda_malloc ((void **) &workspace->d, total_cap * sizeof (real), 1, "d"); - cuda_malloc ((void **) &workspace->q, total_cap * sizeof (real), 1, "q"); - cuda_malloc ((void **) &workspace->p, total_cap * sizeof (real), 1, "p"); - cuda_malloc ((void **) &workspace->r2, total_cap * sizeof (rvec2), 1, "r2"); - cuda_malloc ((void **) &workspace->d2, total_cap * sizeof (rvec2), 1, "d2"); - cuda_malloc ((void **) &workspace->q2, total_cap * sizeof (rvec2), 1, "q2"); - cuda_malloc ((void **) &workspace->p2, total_cap * sizeof (rvec2), 1, "p2"); - - /* integrator storage */ - cuda_malloc ((void **) &workspace->v_const, local_rvec, 1, "v_const"); - - /* storage for analysis */ - if( control->molecular_analysis || control->diffusion_coef ) { - cuda_malloc ((void **) &workspace->mark, local_cap * sizeof (int), 1, "mark"); - cuda_malloc ((void **) &workspace->old_mark, local_cap * sizeof (int), 1, "old_mark"); - } - else - workspace->mark = workspace->old_mark = NULL; - - if( control->diffusion_coef ) - cuda_malloc ((void **) &workspace->x_old, local_cap * sizeof (rvec), 1, "x_old"); - else - workspace->x_old = NULL; - - /* force related storage */ - cuda_malloc ((void **) &workspace->f, total_cap * sizeof (rvec), 1, "f"); - cuda_malloc ((void **) 
&workspace->CdDelta, total_cap * sizeof (rvec), 1, "CdDelta"); - - /* Taper params */ - cuda_malloc ((void **) &workspace->Tap, 8 * sizeof (real), 1, "Tap"); - - return SUCCESS; - } - - int dev_dealloc_workspace (reax_system *system, control_params *control, - storage *workspace, int local_cap, int total_cap, - char *msg) - { - /* communication storage */ - /* - workspace->tmp_dbl = NULL; - workspace->tmp_rvec = NULL; - workspace->tmp_rvec2 = NULL; - */ - - /* bond order related storage */ - cuda_free (workspace->within_bond_box, "skin"); - cuda_free (workspace->total_bond_order, "total_bo"); - cuda_free (workspace->Deltap, "Deltap"); - cuda_free (workspace->Deltap_boc, "Deltap_boc"); - cuda_free (workspace->dDeltap_self, "dDeltap_self"); - cuda_free (workspace->Delta, "Delta" ); - cuda_free (workspace->Delta_lp, "Delta_lp" ); - cuda_free (workspace->Delta_lp_temp, "Delta_lp_temp" ); - cuda_free (workspace->dDelta_lp, "Delta_lp_temp" ); - cuda_free (workspace->dDelta_lp_temp, "dDelta_lp_temp" ); - cuda_free (workspace->Delta_e, "Delta_e" ); - cuda_free (workspace->Delta_boc, "Delta_boc"); - cuda_free (workspace->nlp, "nlp"); - cuda_free (workspace->nlp_temp, "nlp_temp"); - cuda_free (workspace->Clp, "Clp"); - cuda_free (workspace->vlpex, "vlpex"); - cuda_free (workspace->bond_mark, "bond_mark"); - cuda_free (workspace->done_after, "done_after"); - - /* QEq storage */ - cuda_free (workspace->Hdia_inv, "Hdia_inv"); - cuda_free (workspace->b_s, "b_s"); - cuda_free (workspace->b_t, "b_t"); - cuda_free (workspace->b_prc, "b_prc"); - cuda_free (workspace->b_prm, "b_prm"); - cuda_free (workspace->s, "s"); - cuda_free (workspace->t, "t"); - cuda_free (workspace->droptol, "droptol"); - cuda_free (workspace->b, "b"); - cuda_free (workspace->x, "x"); - - /* GMRES storage */ - cuda_free (workspace->y, "y"); - cuda_free (workspace->z, "z"); - cuda_free (workspace->g, "g"); - cuda_free (workspace->h, "h"); - cuda_free (workspace->hs, "hs"); - cuda_free (workspace->hc, "hc"); - 
cuda_free (workspace->v, "v"); - - /* CG storage */ - cuda_free (workspace->r, "r"); - cuda_free (workspace->d, "d"); - cuda_free (workspace->q, "q"); - cuda_free (workspace->p, "p"); - cuda_free (workspace->r2, "r2"); - cuda_free (workspace->d2, "d2"); - cuda_free (workspace->q2, "q2"); - cuda_free (workspace->p2, "p2"); - - /* integrator storage */ - cuda_free (workspace->v_const, "v_const"); - - /* storage for analysis */ - if( control->molecular_analysis || control->diffusion_coef ) { - cuda_free (workspace->mark, "mark"); - cuda_free (workspace->old_mark, "old_mark"); - } - else - workspace->mark = workspace->old_mark = NULL; - - if( control->diffusion_coef ) - cuda_free (workspace->x_old, "x_old"); - else - workspace->x_old = NULL; - - /* force related storage */ - cuda_free (workspace->f, "f"); - cuda_free (workspace->CdDelta, "CdDelta"); - - /* Taper params */ - cuda_free (workspace->Tap, "Tap"); - - return FAILURE; - } - - - - - int dev_alloc_matrix (sparse_matrix *H, int cap, int m) - { - //sparse_matrix *H; - //H = *pH; - - H->cap = cap; - H->m = m; - cuda_malloc ((void **) &H->start, sizeof (int) * cap, 1, "matrix_start"); - cuda_malloc ((void **) &H->end, sizeof (int) * cap, 1, "matrix_end"); - cuda_malloc ((void **) &H->entries, sizeof (sparse_matrix_entry) * m, 1, "matrix_entries"); - - return SUCCESS; - } - - int dev_dealloc_matrix (sparse_matrix *H) - { - cuda_free (H->start, "matrix_start"); - cuda_free (H->end, "matrix_end"); - cuda_free (H->entries, "matrix_entries"); - - return SUCCESS; - } + int dev_alloc_control (control_params *control) + { + cuda_malloc ((void **)&control->d_control_params, sizeof (control_params), 1, "control_params"); + copy_host_device (control, control->d_control_params, sizeof (control_params), cudaMemcpyHostToDevice, "control_params"); + } + + CUDA_GLOBAL void Init_Nbrs(ivec *nbrs, int N) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= N) return; + + nbrs[index][0] = -1; + nbrs[index][1] = -1; 
+ nbrs[index][2] = -1; + } + + + int dev_alloc_grid (reax_system *system) + { + int total; + grid_cell local_cell; + grid *host = &system->my_grid; + grid *device = &system->d_my_grid; + ivec *nbrs_x = (ivec *) scratch; + + total = host->ncells[0] * host->ncells[1] * host->ncells[2]; + ivec_Copy (device->ncells, host->ncells); + rvec_Copy (device->cell_len, host->cell_len); + rvec_Copy (device->inv_len, host->inv_len); + + ivec_Copy (device->bond_span, host->bond_span ); + ivec_Copy (device->nonb_span, host->nonb_span ); + ivec_Copy (device->vlist_span, host->vlist_span ); + + ivec_Copy (device->native_cells, host->native_cells ); + ivec_Copy (device->native_str, host->native_str ); + ivec_Copy (device->native_end, host->native_end ); + + device->ghost_cut = host->ghost_cut; + ivec_Copy (device->ghost_span, host->ghost_span ); + ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span ); + ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span ); + ivec_Copy (device->ghost_bond_span, host->ghost_bond_span ); + + cuda_malloc ((void **) &device->str, sizeof (int) * total, 1, "grid:str"); + cuda_malloc ((void **) &device->end, sizeof (int) * total, 1, "grid:end"); + cuda_malloc ((void **) &device->cutoff, sizeof (real) * total, 1, "grid:cutoff"); + cuda_malloc ((void **) &device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, 1, "grid:nbrs_x"); + cuda_malloc ((void **) &device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, 1, "grid:nbrs_cp"); + cuda_malloc ((void **) &device->rel_box, sizeof (ivec) * total, 1, "grid:rel_box"); + + /* + int block_size = 512; + int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 
0 : 1); + + Init_Nbrs <<<blocks, block_size>>> + (nbrs_x, host->max_nbrs ); + cudaThreadSynchronize (); + cudaCheckError (); + + cuda_malloc ((void **)& device->cells, + sizeof (grid_cell) * total, + 1, "grid:cells"); + fprintf (stderr, " Device cells address --> %ld \n", device->cells ); + cuda_malloc ((void **) &device->order, sizeof (ivec) * (host->total + 1), 1, "grid:order"); + + local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0; + fprintf (stderr, "Total cells to be allocated -- > %d \n", total ); + for (int i = 0; i < total; i++) { + //fprintf (stderr, "Address of the local atom -> %ld \n", &local_cell); + + cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * host->max_atoms, + 1, "alloc:grid:cells:atoms"); + //fprintf (stderr, "Allocated address of the atoms --> %ld (%d)\n", local_cell.atoms, host->max_atoms ); + + cuda_malloc ((void **) &local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, + 1, "alloc:grid:cells:nbrs_x" ); + copy_device (local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof (ivec), "grid:nbrs_x"); + //fprintf (stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x); + + cuda_malloc ((void **) &local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, + 1, "alloc:grid:cells:nbrs_cp" ); + //fprintf (stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp); + + //cuda_malloc ((void **) &local_cell.nbrs, sizeof (grid_cell *) * host->max_nbrs , + // 1, "alloc:grid:cells:nbrs" ); + //fprintf (stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs); + + copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-alloc"); + } + */ + + return SUCCESS; + } + + int dev_dealloc_grid_cell_atoms (reax_system *system) + { + int total; + grid_cell local_cell; + grid *host = &system->my_grid; + grid *device = &system->d_my_grid; + + total = host->ncells[0] * host->ncells[1] * host->ncells[2]; + + + for (int i = 0; i < total; i++) { + copy_host_device 
(&local_cell, &device->cells[i], + sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc"); + cuda_free (local_cell.atoms, "grid_cell:atoms" ); + } + } + + int dev_alloc_grid_cell_atoms (reax_system *system, int cap) + { + int total; + grid_cell local_cell; + grid *host = &system->my_grid; + grid *device = &system->d_my_grid; + + total = host->ncells[0] * host->ncells[1] * host->ncells[2]; + + for (int i = 0; i < total; i++) { + copy_host_device (&local_cell, &device->cells[i], + sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc"); + cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * cap, + 1, "realloc:grid:cells:atoms"); + copy_host_device (&local_cell, &device->cells[i], + sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc"); + } + } + + + int dev_alloc_system (reax_system *system) + { + cuda_malloc ( (void **) &system->d_my_atoms, system->total_cap * sizeof (reax_atom), 1, "system:d_my_atoms"); + //fprintf (stderr, "p:%d - allocated atoms : %d (%ld, %ld) \n", system->my_rank, system->total_cap, + // system->my_atoms, system->d_my_atoms); + + //simulation boxes + cuda_malloc ( (void **) &system->d_big_box, sizeof (simulation_box), 1, "system:d_big_box"); + cuda_malloc ( (void **) &system->d_my_box, sizeof (simulation_box), 1, "system:d_my_box"); + cuda_malloc ( (void **) &system->d_my_ext_box, sizeof (simulation_box), 1, "d_my_ext_box"); + + //interaction parameters + cuda_malloc ((void **) &system->reax_param.d_sbp, system->reax_param.num_atom_types * sizeof (single_body_parameters), + 1, "system:d_sbp"); + + cuda_malloc ((void **) &system->reax_param.d_tbp, pow (system->reax_param.num_atom_types, 2) * sizeof (two_body_parameters), + 1, "system:d_tbp"); + + cuda_malloc ((void **) &system->reax_param.d_thbp, pow (system->reax_param.num_atom_types, 3) * sizeof (three_body_header), + 1, "system:d_thbp"); + + cuda_malloc ((void **) &system->reax_param.d_hbp, pow (system->reax_param.num_atom_types, 3) * sizeof 
(hbond_parameters), + 1, "system:d_hbp"); + + cuda_malloc ((void **) &system->reax_param.d_fbp, pow (system->reax_param.num_atom_types, 4) * sizeof (four_body_header), + 1, "system:d_fbp"); + + cuda_malloc ((void **) &system->reax_param.d_gp.l, system->reax_param.gp.n_global * sizeof (real), 1, "system:d_gp.l"); + + system->reax_param.d_gp.n_global = 0; + system->reax_param.d_gp.vdw_type = 0; + + return SUCCESS; + } + + int dev_realloc_system (reax_system *system, int local_cap, int total_cap, char *msg) + { + //free the existing storage for atoms + cuda_free (system->d_my_atoms, "system:d_my_atoms"); + + cuda_malloc ((void **) &system->d_my_atoms, sizeof (reax_atom) * total_cap, + 1, "system:d_my_atoms"); + return FAILURE; + } + + + int dev_alloc_simulation_data(simulation_data *data) + { + cuda_malloc ((void **) &(data->d_simulation_data), sizeof (simulation_data), 1, "simulation_data"); + return SUCCESS; + } + + int dev_alloc_workspace (reax_system *system, control_params *control, + storage *workspace, int local_cap, int total_cap, + char *msg) + { + int i, total_real, total_rvec, local_int, local_real, local_rvec; + + workspace->allocated = 1; + total_real = total_cap * sizeof(real); + total_rvec = total_cap * sizeof(rvec); + local_int = local_cap * sizeof(int); + local_real = local_cap * sizeof(real); + local_rvec = local_cap * sizeof(rvec); + + /* communication storage */ + /* + workspace->tmp_dbl = NULL; + workspace->tmp_rvec = NULL; + workspace->tmp_rvec2 = NULL; + */ + + //fprintf (stderr, "Deltap and TOTAL BOND ORDER size --> %d \n", total_cap ); + + /* bond order related storage */ + cuda_malloc ((void **) &workspace->within_bond_box, total_cap * sizeof (int), 1, "skin"); + cuda_malloc ((void **) &workspace->total_bond_order, total_real, 1, "total_bo"); + cuda_malloc ((void **) &workspace->Deltap, total_real, 1, "Deltap"); + cuda_malloc ((void **) &workspace->Deltap_boc, total_real, 1, "Deltap_boc"); + cuda_malloc ((void **) &workspace->dDeltap_self, 
total_rvec, 1, "dDeltap_self"); + cuda_malloc ((void **) &workspace->Delta, total_real, 1, "Delta" ); + cuda_malloc ((void **) &workspace->Delta_lp, total_real, 1, "Delta_lp" ); + cuda_malloc ((void **) &workspace->Delta_lp_temp, total_real, 1, "Delta_lp_temp" ); + cuda_malloc ((void **) &workspace->dDelta_lp, total_real, 1, "Delta_lp_temp" ); + cuda_malloc ((void **) &workspace->dDelta_lp_temp, total_real, 1, "dDelta_lp_temp" ); + cuda_malloc ((void **) &workspace->Delta_e, total_real, 1, "Delta_e" ); + cuda_malloc ((void **) &workspace->Delta_boc, total_real, 1, "Delta_boc"); + cuda_malloc ((void **) &workspace->nlp, total_real, 1, "nlp"); + cuda_malloc ((void **) &workspace->nlp_temp, total_real, 1, "nlp_temp"); + cuda_malloc ((void **) &workspace->Clp, total_real, 1, "Clp"); + cuda_malloc ((void **) &workspace->vlpex, total_real, 1, "vlpex"); + cuda_malloc ((void **) &workspace->bond_mark, total_real, 1, "bond_mark"); + cuda_malloc ((void **) &workspace->done_after, total_real, 1, "done_after"); + + + /* QEq storage */ + cuda_malloc ((void **) &workspace->Hdia_inv, total_cap * sizeof (real), 1, "Hdia_inv"); + cuda_malloc ((void **) &workspace->b_s, total_cap * sizeof (real), 1, "b_s"); + cuda_malloc ((void **) &workspace->b_t, total_cap * sizeof (real), 1, "b_t"); + cuda_malloc ((void **) &workspace->b_prc, total_cap * sizeof (real), 1, "b_prc"); + cuda_malloc ((void **) &workspace->b_prm, total_cap * sizeof (real), 1, "b_prm"); + cuda_malloc ((void **) &workspace->s, total_cap * sizeof (real), 1, "s"); + cuda_malloc ((void **) &workspace->t, total_cap * sizeof (real), 1, "t"); + cuda_malloc ((void **) &workspace->droptol, total_cap * sizeof (real), 1, "droptol"); + cuda_malloc ((void **) &workspace->b, total_cap * sizeof (rvec2), 1, "b"); + cuda_malloc ((void **) &workspace->x, total_cap * sizeof (rvec2), 1, "x"); + + /* GMRES storage */ + cuda_malloc ((void **) &workspace->y, (RESTART+1)*sizeof (real), 1, "y"); + cuda_malloc ((void **) &workspace->z, 
(RESTART+1)*sizeof (real), 1, "z"); + cuda_malloc ((void **) &workspace->g, (RESTART+1)*sizeof (real), 1, "g"); + cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "h"); + cuda_malloc ((void **) &workspace->hs, (RESTART+1)*sizeof (real), 1, "hs"); + cuda_malloc ((void **) &workspace->hc, (RESTART+1)*sizeof (real), 1, "hc"); + cuda_malloc ((void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "v"); + + /* CG storage */ + cuda_malloc ((void **) &workspace->r, total_cap * sizeof (real), 1, "r"); + cuda_malloc ((void **) &workspace->d, total_cap * sizeof (real), 1, "d"); + cuda_malloc ((void **) &workspace->q, total_cap * sizeof (real), 1, "q"); + cuda_malloc ((void **) &workspace->p, total_cap * sizeof (real), 1, "p"); + cuda_malloc ((void **) &workspace->r2, total_cap * sizeof (rvec2), 1, "r2"); + cuda_malloc ((void **) &workspace->d2, total_cap * sizeof (rvec2), 1, "d2"); + cuda_malloc ((void **) &workspace->q2, total_cap * sizeof (rvec2), 1, "q2"); + cuda_malloc ((void **) &workspace->p2, total_cap * sizeof (rvec2), 1, "p2"); + + /* integrator storage */ + cuda_malloc ((void **) &workspace->v_const, local_rvec, 1, "v_const"); + + /* storage for analysis */ + if( control->molecular_analysis || control->diffusion_coef ) { + cuda_malloc ((void **) &workspace->mark, local_cap * sizeof (int), 1, "mark"); + cuda_malloc ((void **) &workspace->old_mark, local_cap * sizeof (int), 1, "old_mark"); + } + else + workspace->mark = workspace->old_mark = NULL; + + if( control->diffusion_coef ) + cuda_malloc ((void **) &workspace->x_old, local_cap * sizeof (rvec), 1, "x_old"); + else + workspace->x_old = NULL; + + /* force related storage */ + cuda_malloc ((void **) &workspace->f, total_cap * sizeof (rvec), 1, "f"); + cuda_malloc ((void **) &workspace->CdDelta, total_cap * sizeof (rvec), 1, "CdDelta"); + + /* Taper params */ + cuda_malloc ((void **) &workspace->Tap, 8 * sizeof (real), 1, "Tap"); + + return SUCCESS; + } + + int 
dev_dealloc_workspace (reax_system *system, control_params *control, + storage *workspace, int local_cap, int total_cap, + char *msg) + { + /* communication storage */ + /* + workspace->tmp_dbl = NULL; + workspace->tmp_rvec = NULL; + workspace->tmp_rvec2 = NULL; + */ + + /* bond order related storage */ + cuda_free (workspace->within_bond_box, "skin"); + cuda_free (workspace->total_bond_order, "total_bo"); + cuda_free (workspace->Deltap, "Deltap"); + cuda_free (workspace->Deltap_boc, "Deltap_boc"); + cuda_free (workspace->dDeltap_self, "dDeltap_self"); + cuda_free (workspace->Delta, "Delta" ); + cuda_free (workspace->Delta_lp, "Delta_lp" ); + cuda_free (workspace->Delta_lp_temp, "Delta_lp_temp" ); + cuda_free (workspace->dDelta_lp, "Delta_lp_temp" ); + cuda_free (workspace->dDelta_lp_temp, "dDelta_lp_temp" ); + cuda_free (workspace->Delta_e, "Delta_e" ); + cuda_free (workspace->Delta_boc, "Delta_boc"); + cuda_free (workspace->nlp, "nlp"); + cuda_free (workspace->nlp_temp, "nlp_temp"); + cuda_free (workspace->Clp, "Clp"); + cuda_free (workspace->vlpex, "vlpex"); + cuda_free (workspace->bond_mark, "bond_mark"); + cuda_free (workspace->done_after, "done_after"); + + /* QEq storage */ + cuda_free (workspace->Hdia_inv, "Hdia_inv"); + cuda_free (workspace->b_s, "b_s"); + cuda_free (workspace->b_t, "b_t"); + cuda_free (workspace->b_prc, "b_prc"); + cuda_free (workspace->b_prm, "b_prm"); + cuda_free (workspace->s, "s"); + cuda_free (workspace->t, "t"); + cuda_free (workspace->droptol, "droptol"); + cuda_free (workspace->b, "b"); + cuda_free (workspace->x, "x"); + + /* GMRES storage */ + cuda_free (workspace->y, "y"); + cuda_free (workspace->z, "z"); + cuda_free (workspace->g, "g"); + cuda_free (workspace->h, "h"); + cuda_free (workspace->hs, "hs"); + cuda_free (workspace->hc, "hc"); + cuda_free (workspace->v, "v"); + + /* CG storage */ + cuda_free (workspace->r, "r"); + cuda_free (workspace->d, "d"); + cuda_free (workspace->q, "q"); + cuda_free (workspace->p, "p"); + 
cuda_free (workspace->r2, "r2"); + cuda_free (workspace->d2, "d2"); + cuda_free (workspace->q2, "q2"); + cuda_free (workspace->p2, "p2"); + + /* integrator storage */ + cuda_free (workspace->v_const, "v_const"); + + /* storage for analysis */ + if( control->molecular_analysis || control->diffusion_coef ) { + cuda_free (workspace->mark, "mark"); + cuda_free (workspace->old_mark, "old_mark"); + } + else + workspace->mark = workspace->old_mark = NULL; + + if( control->diffusion_coef ) + cuda_free (workspace->x_old, "x_old"); + else + workspace->x_old = NULL; + + /* force related storage */ + cuda_free (workspace->f, "f"); + cuda_free (workspace->CdDelta, "CdDelta"); + + /* Taper params */ + cuda_free (workspace->Tap, "Tap"); + + return FAILURE; + } + + + + + int dev_alloc_matrix (sparse_matrix *H, int cap, int m) + { + //sparse_matrix *H; + //H = *pH; + + H->cap = cap; + H->m = m; + cuda_malloc ((void **) &H->start, sizeof (int) * cap, 1, "matrix_start"); + cuda_malloc ((void **) &H->end, sizeof (int) * cap, 1, "matrix_end"); + cuda_malloc ((void **) &H->entries, sizeof (sparse_matrix_entry) * m, 1, "matrix_entries"); + + return SUCCESS; + } + + int dev_dealloc_matrix (sparse_matrix *H) + { + cuda_free (H->start, "matrix_start"); + cuda_free (H->end, "matrix_end"); + cuda_free (H->entries, "matrix_entries"); + + return SUCCESS; + } } diff --git a/PG-PuReMD/src/dev_list.cu b/PG-PuReMD/src/dev_list.cu index 35e74d4a..7453fc8e 100644 --- a/PG-PuReMD/src/dev_list.cu +++ b/PG-PuReMD/src/dev_list.cu @@ -33,80 +33,80 @@ extern "C" { - /************* allocate list space ******************/ - int Dev_Make_List(int n, int num_intrs, int type, reax_list *l) - { - l->allocated = 1; + /************* allocate list space ******************/ + int Dev_Make_List(int n, int num_intrs, int type, reax_list *l) + { + l->allocated = 1; - l->n = n; - l->num_intrs = num_intrs; + l->n = n; + l->num_intrs = num_intrs; - cuda_malloc ((void **) &l->index, n * sizeof (int), 1, "list:index"); - 
cuda_malloc ((void **) &l->end_index, n * sizeof (int), 1, "list:end_index"); + cuda_malloc ((void **) &l->index, n * sizeof (int), 1, "list:index"); + cuda_malloc ((void **) &l->end_index, n * sizeof (int), 1, "list:end_index"); - l->type = type; + l->type = type; #if defined(DEBUG_FOCUS) - fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type ); + fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type ); #endif - switch(l->type) { - - case TYP_FAR_NEIGHBOR: - cuda_malloc ((void **) &l->select.far_nbr_list, - l->num_intrs * sizeof (far_neighbor_data), 1, "list:far_nbrs"); - break; - - case TYP_THREE_BODY: - cuda_malloc ((void **) &l->select.three_body_list, - l->num_intrs * sizeof (three_body_interaction_data), 1, - "list:three_bodies" ); - break; - - case TYP_HBOND: - cuda_malloc ((void **) &l->select.hbond_list, - l->num_intrs * sizeof(hbond_data), 1, "list:hbonds" ); - break; - - case TYP_BOND: - cuda_malloc ((void **) &l->select.bond_list, - l->num_intrs * sizeof(bond_data), 1, "list:bonds" ); - break; - - default: - fprintf( stderr, "ERROR: no %d list type defined!\n", l->type ); - MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); - } - - return SUCCESS; - } - - - void Dev_Delete_List( reax_list *l) - { - if( l->allocated == 0 ) - return; - l->allocated = 0; - - cuda_free ( l->index, "index"); - cuda_free ( l->end_index, "end_index" ); - - switch (l->type) { - case TYP_HBOND: - cuda_free( l->select.hbond_list, "list:hbonds" ); - break; - case TYP_FAR_NEIGHBOR: - cuda_free( l->select.far_nbr_list, "list:far_nbrs" ); - break; - case TYP_BOND: - cuda_free( l->select.bond_list, "list:bonds" ); - break; - case TYP_THREE_BODY: - cuda_free( l->select.three_body_list, "list:three_bodies" ); - break; - default: - fprintf (stderr, "ERROR no %d list type defined !\n", l->type); - MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); - } - } + switch(l->type) { + + case TYP_FAR_NEIGHBOR: + cuda_malloc ((void **) &l->select.far_nbr_list, + 
l->num_intrs * sizeof (far_neighbor_data), 1, "list:far_nbrs"); + break; + + case TYP_THREE_BODY: + cuda_malloc ((void **) &l->select.three_body_list, + l->num_intrs * sizeof (three_body_interaction_data), 1, + "list:three_bodies" ); + break; + + case TYP_HBOND: + cuda_malloc ((void **) &l->select.hbond_list, + l->num_intrs * sizeof(hbond_data), 1, "list:hbonds" ); + break; + + case TYP_BOND: + cuda_malloc ((void **) &l->select.bond_list, + l->num_intrs * sizeof(bond_data), 1, "list:bonds" ); + break; + + default: + fprintf( stderr, "ERROR: no %d list type defined!\n", l->type ); + MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); + } + + return SUCCESS; + } + + + void Dev_Delete_List( reax_list *l) + { + if( l->allocated == 0 ) + return; + l->allocated = 0; + + cuda_free ( l->index, "index"); + cuda_free ( l->end_index, "end_index" ); + + switch (l->type) { + case TYP_HBOND: + cuda_free( l->select.hbond_list, "list:hbonds" ); + break; + case TYP_FAR_NEIGHBOR: + cuda_free( l->select.far_nbr_list, "list:far_nbrs" ); + break; + case TYP_BOND: + cuda_free( l->select.bond_list, "list:bonds" ); + break; + case TYP_THREE_BODY: + cuda_free( l->select.three_body_list, "list:three_bodies" ); + break; + default: + fprintf (stderr, "ERROR no %d list type defined !\n", l->type); + MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT ); + } + } } diff --git a/PG-PuReMD/src/dev_system_props.cu b/PG-PuReMD/src/dev_system_props.cu index 53bc68d3..fdb3a567 100644 --- a/PG-PuReMD/src/dev_system_props.cu +++ b/PG-PuReMD/src/dev_system_props.cu @@ -10,307 +10,307 @@ #include "cuda_shuffle.h" CUDA_GLOBAL void k_compute_total_mass (single_body_parameters *sbp, reax_atom *my_atoms, - real *block_results, int n) + real *block_results, int n) { #if defined(__SM_35__) - extern __shared__ real my_sbp[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real sdata = 0; + extern __shared__ real my_sbp[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real sdata = 0; - if (i < n) - sdata = 
sbp [ my_atoms [i].type ].mass; - __syncthreads (); + if (i < n) + sdata = sbp [ my_atoms [i].type ].mass; + __syncthreads (); - for(int z = 16; z >=1; z/=2) - sdata += shfl ( sdata, z); + for(int z = 16; z >=1; z/=2) + sdata += shfl ( sdata, z); - if (threadIdx.x % 32 == 0) - my_sbp[threadIdx.x >> 5] = sdata; + if (threadIdx.x % 32 == 0) + my_sbp[threadIdx.x >> 5] = sdata; - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset) - my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset]; + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset) + my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset]; - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - block_results[blockIdx.x] = my_sbp[0]; + if(threadIdx.x == 0) + block_results[blockIdx.x] = my_sbp[0]; #else - extern __shared__ real sdata []; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; + extern __shared__ real sdata []; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; - if (i < n) - x = sbp [ my_atoms [i].type ].mass; + if (i < n) + x = sbp [ my_atoms [i].type ].mass; - sdata[ threadIdx.x ] = x; - __syncthreads (); + sdata[ threadIdx.x ] = x; + __syncthreads (); - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ - if (threadIdx.x < offset) - sdata [threadIdx.x] += sdata [threadIdx.x + offset]; + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ + if (threadIdx.x < offset) + sdata [threadIdx.x] += sdata [threadIdx.x + offset]; - __syncthreads (); - } + __syncthreads (); + } - if (threadIdx.x == 0) - block_results[ blockIdx.x] = sdata [0]; + if (threadIdx.x == 0) + block_results[ blockIdx.x] = sdata [0]; #endif } extern "C" void dev_compute_total_mass (reax_system *system, real *local_val) { - real *block_mass = (real *) scratch; - cuda_memset (block_mass, 0, sizeof (real) * (1 + BLOCKS_POW_2), "total_mass:tmp"); + real 
*block_mass = (real *) scratch; + cuda_memset (block_mass, 0, sizeof (real) * (1 + BLOCKS_POW_2), "total_mass:tmp"); - k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n); - cudaThreadSynchronize (); - cudaCheckError (); + k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n); + cudaThreadSynchronize (); + cudaCheckError (); - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof (real), - cudaMemcpyDeviceToHost, "total_mass:tmp"); + copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof (real), + cudaMemcpyDeviceToHost, "total_mass:tmp"); } CUDA_GLOBAL void k_compute_kinetic_energy (single_body_parameters *sbp, reax_atom *my_atoms, - real *block_results, int n) + real *block_results, int n) { #if defined(__SM_35__) - extern __shared__ real my_sbpdot[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real sdata = 0; - rvec p; + extern __shared__ real my_sbpdot[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real sdata = 0; + rvec p; - if (i < n) { - sdata = sbp [ my_atoms [i].type ].mass; - rvec_Scale( p, sdata, my_atoms[ i ].v ); - sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); - } + if (i < n) { + sdata = sbp [ my_atoms [i].type ].mass; + rvec_Scale( p, sdata, my_atoms[ i ].v ); + sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); + } - __syncthreads (); + __syncthreads (); - for(int z = 16; z >=1; z/=2) - sdata += shfl ( sdata, z); + for(int z = 16; z >=1; z/=2) + sdata += shfl ( sdata, z); - if 
(threadIdx.x % 32 == 0) - my_sbpdot[threadIdx.x >> 5] = sdata; + if (threadIdx.x % 32 == 0) + my_sbpdot[threadIdx.x >> 5] = sdata; - __syncthreads (); + __syncthreads (); - for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ - if (threadIdx.x < offset) - my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset]; + for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){ + if (threadIdx.x < offset) + my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset]; - __syncthreads (); - } + __syncthreads (); + } - if (threadIdx.x == 0) - block_results[ blockIdx.x] = my_sbpdot[0]; + if (threadIdx.x == 0) + block_results[ blockIdx.x] = my_sbpdot[0]; #else - extern __shared__ real sdata []; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real m = 0; - rvec p; + extern __shared__ real sdata []; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real m = 0; + rvec p; - if (i < n) { - m = sbp [ my_atoms [i].type ].mass; - rvec_Scale( p, m, my_atoms[ i ].v ); - m = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); - } + if (i < n) { + m = sbp [ my_atoms [i].type ].mass; + rvec_Scale( p, m, my_atoms[ i ].v ); + m = 0.5 * rvec_Dot( p, my_atoms[ i ].v ); + } - sdata[ threadIdx.x ] = m; - __syncthreads (); + sdata[ threadIdx.x ] = m; + __syncthreads (); - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ - if (threadIdx.x < offset) - sdata [threadIdx.x] += sdata [threadIdx.x + offset]; + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ + if (threadIdx.x < offset) + sdata [threadIdx.x] += sdata [threadIdx.x + offset]; - __syncthreads (); - } + __syncthreads (); + } - if (threadIdx.x == 0) - block_results[ blockIdx.x] = sdata [0]; + if (threadIdx.x == 0) + block_results[ blockIdx.x] = sdata [0]; #endif } extern "C" void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *local_val) { - real *block_energy = (real *) scratch; - cuda_memset (block_energy, 0, sizeof (real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp"); 
- - k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> - (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> - (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (local_val, block_energy + BLOCKS_POW_2, - //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, - sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp"); - //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, - // sizeof (real), "kinetic_energy"); - } - - extern "C" void dev_compute_momentum (reax_system *system, rvec xcm, - rvec vcm, rvec amcm) - { - rvec *l_xcm, *l_vcm, *l_amcm; - rvec *r_scratch = (rvec *)scratch; + real *block_energy = (real *) scratch; + cuda_memset (block_energy, 0, sizeof (real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp"); + + k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>> + (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> + (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (local_val, block_energy + BLOCKS_POW_2, + //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, + sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp"); + //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, + // sizeof (real), "kinetic_energy"); + } + + extern "C" void dev_compute_momentum (reax_system *system, rvec xcm, + rvec vcm, rvec amcm) + { + rvec *l_xcm, *l_vcm, *l_amcm; + rvec *r_scratch = (rvec *)scratch; #if defined( 
__SM_35__) - // xcm - cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); - l_xcm = r_scratch; - - center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>> - (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm"); - - // vcm - cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); - l_vcm = r_scratch; - - center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>> - (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm"); - - // amcm - cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); - l_amcm = r_scratch; - - center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); - - k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>> - (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:amcm"); + // xcm + cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); + l_xcm = r_scratch; + + center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) 
>>> + (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n ); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>> + (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm"); + + // vcm + cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); + l_vcm = r_scratch; + + center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n ); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>> + (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm"); + + // amcm + cuda_memset( scratch, 0, sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); + l_amcm = r_scratch; + + center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n ); + cudaThreadSynchronize (); + cudaCheckError (); + + k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>> + (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:amcm"); #else - cuda_memset ( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); - - l_xcm = r_scratch; - l_vcm = r_scratch + (BLOCKS_POW_2 + 1); - l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); - - center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n); - cudaThreadSynchronize 
(); - cudaCheckError (); - - center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> - (l_xcm, l_vcm, l_amcm, - l_xcm + BLOCKS_POW_2, - l_vcm + BLOCKS_POW_2, - l_amcm + BLOCKS_POW_2, - BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" ); - copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" ); - copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" ); + cuda_memset ( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp"); + + l_xcm = r_scratch; + l_vcm = r_scratch + (BLOCKS_POW_2 + 1); + l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); + + center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> + (l_xcm, l_vcm, l_amcm, + l_xcm + BLOCKS_POW_2, + l_vcm + BLOCKS_POW_2, + l_amcm + BLOCKS_POW_2, + BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" ); + copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" ); + copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" ); #endif - } + } extern "C" void dev_compute_inertial_tensor (reax_system *system, real *local_results, rvec my_xcm) { #if defined(__SM_35__) - real *partial_results = (real *) scratch; - cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp"); - - compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - 
my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize (); - cudaCheckError (); - - compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> - (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results"); + real *partial_results = (real *) scratch; + cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp"); + + compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize (); + cudaCheckError (); + + compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> + (partial_results, partial_results + (BLOCKS_POW_2 
* 6), BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results"); #else - real *partial_results = (real *) scratch; - //real *local_results; + real *partial_results = (real *) scratch; + //real *local_results; - cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp"); - //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1)); + cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp"); + //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1)); - compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>> - (system->reax_param.d_sbp, system->d_my_atoms, partial_results, - my_xcm[0], my_xcm[1], my_xcm[2], system->n); - cudaThreadSynchronize (); - cudaCheckError (); + compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>> + (system->reax_param.d_sbp, system->d_my_atoms, partial_results, + my_xcm[0], my_xcm[1], my_xcm[2], system->n); + cudaThreadSynchronize (); + cudaCheckError (); - compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> - (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>> + (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, - sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results"); + copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, + sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results"); #endif } extern "C" void dev_sync_simulation_data (simulation_data *data) { - Output_Sync_Simulation_Data (data, 
(simulation_data *)data->d_simulation_data ); + Output_Sync_Simulation_Data (data, (simulation_data *)data->d_simulation_data ); } /* CUDA_GLOBAL void ker_kinetic_energy (reax_atom *my_atoms, diff --git a/PG-PuReMD/src/dual_matvec.cu b/PG-PuReMD/src/dual_matvec.cu index d27fc361..a674118f 100644 --- a/PG-PuReMD/src/dual_matvec.cu +++ b/PG-PuReMD/src/dual_matvec.cu @@ -5,26 +5,26 @@ //one thread per row CUDA_GLOBAL void k_dual_matvec(sparse_matrix H, rvec2 *vec, rvec2 *results, int rows) { - rvec2 results_row; - int col; - real val; + rvec2 results_row; + int col; + real val; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= rows) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= rows) return; - results_row [0] = results_row[1] = 0; + results_row [0] = results_row[1] = 0; - for (int c = H.start[i]; c < H.end[i]; c++) - { - col = H.entries [c].j; - val = H.entries[c].val; + for (int c = H.start[i]; c < H.end[i]; c++) + { + col = H.entries [c].j; + val = H.entries[c].val; - results_row[0] += val * vec [col][0]; - results_row[1] += val * vec [col][1]; - } + results_row[0] += val * vec [col][0]; + results_row[1] += val * vec [col][1]; + } - results [i][0] = results_row[0]; - results [i][1] = results_row[1]; + results [i][0] = results_row[0]; + results [i][1] = results_row[1]; } //32 thread warp per matrix row. 
@@ -35,106 +35,106 @@ CUDA_GLOBAL void k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, { #if defined(__SM_35__) - rvec2 vals; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; - int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1); + rvec2 vals; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; + int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1); - int row_start; - int row_end; + int row_start; + int row_end; - // one warp per row - int row = warp_id; + // one warp per row + int row = warp_id; - vals[0] = 0; - vals[1] = 0; + vals[0] = 0; + vals[1] = 0; - if (row < num_rows) { - row_start = H.start[row]; - row_end = H.end[row]; + if (row < num_rows) { + row_start = H.start[row]; + row_end = H.end[row]; - for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) { - vals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; - vals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; - } - } + for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) { + vals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; + vals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; + } + } - for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2){ - vals[0] += shfl( vals[0], s); - vals[1] += shfl( vals[1], s); - } + for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2){ + vals[0] += shfl( vals[0], s); + vals[1] += shfl( vals[1], s); + } - if (lane == 0 && row < num_rows){ - results[row][0] = vals[0]; - results[row][1] = vals[1]; - } + if (lane == 0 && row < num_rows){ + results[row][0] = vals[0]; + results[row][1] = vals[1]; + } #else - extern __shared__ rvec2 vals []; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / 32; - int lane = thread_id & (32 - 1); - - int row_start; - int row_end; - - // one warp per row - //int row = warp_id; - 
int row = warp_id; - //if (row < num_rows) - { - vals[threadIdx.x][0] = 0; - vals[threadIdx.x][1] = 0; - - if (row < num_rows) { - row_start = H.start[row]; - row_end = H.end[row]; - - // compute running sum per thread - for(int jj = row_start + lane; jj < row_end; jj += 32) { - vals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; - vals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; - } - } - - __syncthreads (); - - // parallel reduction in shared memory - //SIMD instructions with a WARP are synchronous -- so we do not need to synch here - if (lane < 16) { - vals[threadIdx.x][0] += vals[threadIdx.x + 16][0]; - vals[threadIdx.x][1] += vals[threadIdx.x + 16][1]; - } - __syncthreads(); - if (lane < 8) { - vals[threadIdx.x][0] += vals[threadIdx.x + 8][0]; - vals[threadIdx.x][1] += vals[threadIdx.x + 8][1]; - } - __syncthreads (); - if (lane < 4) { - vals[threadIdx.x][0] += vals[threadIdx.x + 4][0]; - vals[threadIdx.x][1] += vals[threadIdx.x + 4][1]; - } - __syncthreads (); - if (lane < 2) { - vals[threadIdx.x][0] += vals[threadIdx.x + 2][0]; - vals[threadIdx.x][1] += vals[threadIdx.x + 2][1]; - } - __syncthreads (); - if (lane < 1) { - vals[threadIdx.x][0] += vals[threadIdx.x + 1][0]; - vals[threadIdx.x][1] += vals[threadIdx.x + 1][1]; - } - __syncthreads (); - - // first thread writes the result - if (lane == 0 && row < num_rows) { - results[row][0] = vals[threadIdx.x][0]; - results[row][1] = vals[threadIdx.x][1]; - } - } + extern __shared__ rvec2 vals []; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / 32; + int lane = thread_id & (32 - 1); + + int row_start; + int row_end; + + // one warp per row + //int row = warp_id; + int row = warp_id; + //if (row < num_rows) + { + vals[threadIdx.x][0] = 0; + vals[threadIdx.x][1] = 0; + + if (row < num_rows) { + row_start = H.start[row]; + row_end = H.end[row]; + + // compute running sum per thread + for(int jj = row_start + lane; jj < row_end; jj += 
32) { + vals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0]; + vals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1]; + } + } + + __syncthreads (); + + // parallel reduction in shared memory + //SIMD instructions with a WARP are synchronous -- so we do not need to synch here + if (lane < 16) { + vals[threadIdx.x][0] += vals[threadIdx.x + 16][0]; + vals[threadIdx.x][1] += vals[threadIdx.x + 16][1]; + } + __syncthreads(); + if (lane < 8) { + vals[threadIdx.x][0] += vals[threadIdx.x + 8][0]; + vals[threadIdx.x][1] += vals[threadIdx.x + 8][1]; + } + __syncthreads (); + if (lane < 4) { + vals[threadIdx.x][0] += vals[threadIdx.x + 4][0]; + vals[threadIdx.x][1] += vals[threadIdx.x + 4][1]; + } + __syncthreads (); + if (lane < 2) { + vals[threadIdx.x][0] += vals[threadIdx.x + 2][0]; + vals[threadIdx.x][1] += vals[threadIdx.x + 2][1]; + } + __syncthreads (); + if (lane < 1) { + vals[threadIdx.x][0] += vals[threadIdx.x + 1][0]; + vals[threadIdx.x][1] += vals[threadIdx.x + 1][1]; + } + __syncthreads (); + + // first thread writes the result + if (lane == 0 && row < num_rows) { + results[row][0] = vals[threadIdx.x][0]; + results[row][1] = vals[threadIdx.x][1]; + } + } #endif } diff --git a/PG-PuReMD/src/matvec.cu b/PG-PuReMD/src/matvec.cu index 960b1dad..dcde4165 100644 --- a/PG-PuReMD/src/matvec.cu +++ b/PG-PuReMD/src/matvec.cu @@ -6,22 +6,22 @@ //one thread per row CUDA_GLOBAL void k_matvec (sparse_matrix H, real *vec, real *results, int rows) { - real results_row = 0; - int col; - real val; + real results_row = 0; + int col; + real val; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= rows) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= rows) return; - for (int c = H.start[i]; c < H.end[i]; c++) - { - col = H.entries [c].j; - val = H.entries[c].val; + for (int c = H.start[i]; c < H.end[i]; c++) + { + col = H.entries [c].j; + val = H.entries[c].val; - results_row += val * vec [col]; - } + results_row += 
val * vec [col]; + } - results [i] = results_row; + results [i] = results_row; } //32 thread warp per matrix row. @@ -31,61 +31,61 @@ CUDA_GLOBAL void k_matvec (sparse_matrix H, real *vec, real *results, int rows) CUDA_GLOBAL void k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows) { #if defined(__SM_35__) - real vals; + real vals; #else - extern __shared__ real vals []; + extern __shared__ real vals []; #endif - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; - int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1); + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW; + int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1); - int row_start; - int row_end; + int row_start; + int row_end; - // one warp per row - //int row = warp_id; - int row = warp_id; - //if (row < num_rows) - { + // one warp per row + //int row = warp_id; + int row = warp_id; + //if (row < num_rows) + { #if defined(__SM_35__) - vals = 0; + vals = 0; #else - vals[threadIdx.x] = 0; + vals[threadIdx.x] = 0; #endif - if (row < num_rows) { - row_start = H.start[row]; - row_end = H.end[row]; + if (row < num_rows) { + row_start = H.start[row]; + row_end = H.end[row]; - // compute running sum per thread - for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) + // compute running sum per thread + for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) #if defined(__SM_35__) - vals += H.entries[jj].val * vec [ H.entries[jj].j ]; - } + vals += H.entries[jj].val * vec [ H.entries[jj].j ]; + } #else - vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ]; - } - __syncthreads (); + vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ]; + } + __syncthreads (); #endif - // parallel reduction in shared memory - //SIMD instructions with a WARP are synchronous -- so we do not need to synch here + // 
parallel reduction in shared memory + //SIMD instructions with a WARP are synchronous -- so we do not need to synch here #if defined(__SM_35__) - for (int x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2) - vals += shfl( vals, x ); + for (int x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2) + vals += shfl( vals, x ); - if (lane == 0 && row < num_rows) - results[row] = vals; + if (lane == 0 && row < num_rows) + results[row] = vals; #else - if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads(); - if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads (); - if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads (); - if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads (); - if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads (); + if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads(); + if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads (); + if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads (); + if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads (); + if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads (); - // first thread writes the result - if (lane == 0 && row < num_rows) - results[row] = vals[threadIdx.x]; + // first thread writes the result + if (lane == 0 && row < num_rows) + results[row] = vals[threadIdx.x]; #endif } } diff --git a/PG-PuReMD/src/reduction.cu b/PG-PuReMD/src/reduction.cu index 770e4301..370e491b 100644 --- a/PG-PuReMD/src/reduction.cu +++ b/PG-PuReMD/src/reduction.cu @@ -7,62 +7,62 @@ CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results, const size_t n) { #if defined(__SM_35__) - extern __shared__ real my_results[]; - real sdata; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; + extern __shared__ real my_results[]; + real sdata; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; 
- if(i < n) - x = input[i]; + if(i < n) + x = input[i]; - sdata = x; - __syncthreads(); + sdata = x; + __syncthreads(); - for(int z = 16; z >=1; z/=2) - sdata+= shfl ( sdata, z); + for(int z = 16; z >=1; z/=2) + sdata+= shfl ( sdata, z); - if (threadIdx.x % 32 == 0) - my_results[threadIdx.x >> 5] = sdata; + if (threadIdx.x % 32 == 0) + my_results[threadIdx.x >> 5] = sdata; - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset) - my_results[threadIdx.x] += my_results[threadIdx.x + offset]; + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset) + my_results[threadIdx.x] += my_results[threadIdx.x + offset]; - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - per_block_results[blockIdx.x] = my_results[0]; + if(threadIdx.x == 0) + per_block_results[blockIdx.x] = my_results[0]; #else - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = input[i]; - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } #endif } @@ -71,70 +71,70 @@ CUDA_GLOBAL void k_reduction_rvec (rvec *input, rvec *results, size_t n) #if defined(__SM_35__) - extern __shared__ rvec my_rvec[]; - rvec sdata; - unsigned 
int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec_MakeZero( sdata ); + extern __shared__ rvec my_rvec[]; + rvec sdata; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec_MakeZero( sdata ); - if(i < n) - rvec_Copy (sdata, input[i]); + if(i < n) + rvec_Copy (sdata, input[i]); - __syncthreads(); + __syncthreads(); - for(int z = 16; z >=1; z/=2){ - sdata[0] += shfl ( sdata[0], z); - sdata[1] += shfl ( sdata[1], z); - sdata[2] += shfl ( sdata[2], z); - } + for(int z = 16; z >=1; z/=2){ + sdata[0] += shfl ( sdata[0], z); + sdata[1] += shfl ( sdata[1], z); + sdata[2] += shfl ( sdata[2], z); + } - if (threadIdx.x % 32 == 0) - rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata ); + if (threadIdx.x % 32 == 0) + rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata ); - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset) - rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] ); + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset) + rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] ); - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - rvec_Add (results[blockIdx.x], my_rvec[0]); + if(threadIdx.x == 0) + rvec_Add (results[blockIdx.x], my_rvec[0]); #else - extern __shared__ rvec svec_data[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec x; + extern __shared__ rvec svec_data[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec x; - rvec_MakeZero (x); + rvec_MakeZero (x); - if(i < n) - { - rvec_Copy (x, input[i]); - } + if(i < n) + { + rvec_Copy (x, input[i]); + } - rvec_Copy (svec_data[threadIdx.x], x); - __syncthreads(); + rvec_Copy (svec_data[threadIdx.x], x); + __syncthreads(); - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]); - } + for(int offset = blockDim.x / 2; offset > 
0; offset >>= 1) + { + if(threadIdx.x < offset) + { + rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]); + } - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - { - //rvec_Copy (results[blockIdx.x], svec_data[0]); - rvec_Add (results[blockIdx.x], svec_data[0]); - } + if(threadIdx.x == 0) + { + //rvec_Copy (results[blockIdx.x], svec_data[0]); + rvec_Add (results[blockIdx.x], svec_data[0]); + } #endif @@ -144,81 +144,81 @@ CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n) { #if defined(__SM_35__) - extern __shared__ rvec2 my_rvec2[]; - rvec2 sdata; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + extern __shared__ rvec2 my_rvec2[]; + rvec2 sdata; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - sdata[0] = 0.0; - sdata[1] = 0.0; + sdata[0] = 0.0; + sdata[1] = 0.0; - if(i < n){ - sdata[0] = input[i][0]; - sdata[1] = input[i][1]; - } + if(i < n){ + sdata[0] = input[i][0]; + sdata[1] = input[i][1]; + } - __syncthreads(); + __syncthreads(); - for(int z = 16; z >=1; z/=2){ - sdata[0] += shfl ( sdata[0], z); - sdata[1] += shfl ( sdata[1], z); - } + for(int z = 16; z >=1; z/=2){ + sdata[0] += shfl ( sdata[0], z); + sdata[1] += shfl ( sdata[1], z); + } - if (threadIdx.x % 32 == 0){ - my_rvec2[threadIdx.x >> 5][0] = sdata[0]; - my_rvec2[threadIdx.x >> 5][1] = sdata[1]; - } + if (threadIdx.x % 32 == 0){ + my_rvec2[threadIdx.x >> 5][0] = sdata[0]; + my_rvec2[threadIdx.x >> 5][1] = sdata[1]; + } - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset){ - my_rvec2[threadIdx.x][0] += my_rvec2[threadIdx.x + offset][0]; - my_rvec2[threadIdx.x][1] += my_rvec2[threadIdx.x + offset][1]; - } + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset){ + my_rvec2[threadIdx.x][0] += my_rvec2[threadIdx.x + offset][0]; + my_rvec2[threadIdx.x][1] += my_rvec2[threadIdx.x + offset][1]; + } - __syncthreads(); 
- } + __syncthreads(); + } - if(threadIdx.x == 0){ - results[blockIdx.x][0] = my_rvec2[0][0]; - results[blockIdx.x][1] = my_rvec2[0][1]; - } + if(threadIdx.x == 0){ + results[blockIdx.x][0] = my_rvec2[0][0]; + results[blockIdx.x][1] = my_rvec2[0][1]; + } #else - extern __shared__ rvec2 svec2_data[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec2 x; - - x[0] = 0.0; - x[1] = 0.0; - - if(i < n) - { - x[0] += input[i][0]; - x[1] += input[i][1]; - } - - svec2_data [threadIdx.x][0] = x[0]; - svec2_data [threadIdx.x][1] = x[1]; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - svec2_data [threadIdx.x][0] += svec2_data [threadIdx.x + offset][0]; - svec2_data [threadIdx.x][1] += svec2_data [threadIdx.x + offset][1]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - //rvec_Copy (results[blockIdx.x], svec_data[0]); - results [blockIdx.x][0] += svec2_data [0][0]; - results [blockIdx.x][1] += svec2_data [0][1]; - } + extern __shared__ rvec2 svec2_data[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec2 x; + + x[0] = 0.0; + x[1] = 0.0; + + if(i < n) + { + x[0] += input[i][0]; + x[1] += input[i][1]; + } + + svec2_data [threadIdx.x][0] = x[0]; + svec2_data [threadIdx.x][1] = x[1]; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + svec2_data [threadIdx.x][0] += svec2_data [threadIdx.x + offset][0]; + svec2_data [threadIdx.x][1] += svec2_data [threadIdx.x + offset][1]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + //rvec_Copy (results[blockIdx.x], svec_data[0]); + results [blockIdx.x][0] += svec2_data [0][0]; + results [blockIdx.x][1] += svec2_data [0][1]; + } #endif } @@ -226,61 +226,61 @@ CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results, c { #if defined(__SM_35__) - extern __shared__ real my_dot[]; - real sdot; - unsigned int i = blockIdx.x * blockDim.x + 
threadIdx.x; + extern __shared__ real my_dot[]; + real sdot; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - sdot = 0.0; - if(i < n) - sdot = a[i] * b[i]; + sdot = 0.0; + if(i < n) + sdot = a[i] * b[i]; - __syncthreads(); + __syncthreads(); - for(int z = 16; z >=1; z/=2) - sdot += shfl ( sdot, z); + for(int z = 16; z >=1; z/=2) + sdot += shfl ( sdot, z); - if (threadIdx.x % 32 == 0) - my_dot[threadIdx.x >> 5] = sdot; + if (threadIdx.x % 32 == 0) + my_dot[threadIdx.x >> 5] = sdot; - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset) - my_dot[threadIdx.x] += my_dot[threadIdx.x + offset]; + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset) + my_dot[threadIdx.x] += my_dot[threadIdx.x + offset]; - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - per_block_results[blockIdx.x] = my_dot[0]; + if(threadIdx.x == 0) + per_block_results[blockIdx.x] = my_dot[0]; #else - extern __shared__ real sdot[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; + extern __shared__ real sdot[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; - if(i < n) - { - x = a[i] * b[i]; - } - sdot[threadIdx.x] = x; - __syncthreads(); + if(i < n) + { + x = a[i] * b[i]; + } + sdot[threadIdx.x] = x; + __syncthreads(); - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdot[threadIdx.x] += sdot[threadIdx.x + offset]; - } + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdot[threadIdx.x] += sdot[threadIdx.x + offset]; + } - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdot[0]; - } + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdot[0]; + } #endif @@ -290,56 +290,56 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_ 
{ #if defined(__SM_35__) - extern __shared__ real my_norm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real snorm = 0.0; + extern __shared__ real my_norm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real snorm = 0.0; - if(i < n) - snorm = SQR (input[i]); + if(i < n) + snorm = SQR (input[i]); - __syncthreads(); + __syncthreads(); - for(int z = 16; z >=1; z/=2) - snorm += shfl ( snorm, z); + for(int z = 16; z >=1; z/=2) + snorm += shfl ( snorm, z); - if (threadIdx.x % 32 == 0) - my_norm[threadIdx.x >> 5] = snorm; + if (threadIdx.x % 32 == 0) + my_norm[threadIdx.x >> 5] = snorm; - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset) - my_norm[threadIdx.x] += my_norm[threadIdx.x + offset]; + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset) + my_norm[threadIdx.x] += my_norm[threadIdx.x + offset]; - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - per_block_results[blockIdx.x] = my_norm[0]; + if(threadIdx.x == 0) + per_block_results[blockIdx.x] = my_norm[0]; #else - extern __shared__ real snorm[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; + extern __shared__ real snorm[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; - if(i < n) - x = SQR (input[i]); + if(i < n) + x = SQR (input[i]); - snorm[threadIdx.x] = x; - __syncthreads(); + snorm[threadIdx.x] = x; + __syncthreads(); - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - snorm[threadIdx.x] += snorm[threadIdx.x + offset]; - } + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + snorm[threadIdx.x] += snorm[threadIdx.x + offset]; + } - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) - per_block_results[blockIdx.x] = snorm[0]; + if(threadIdx.x == 0) + per_block_results[blockIdx.x] = 
snorm[0]; #endif @@ -351,84 +351,84 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, con { #if defined(__SM_35__) - extern __shared__ rvec2 my_norm2[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec2 snorm2; - snorm2[0] = snorm2[1] = 0; - - if(i < n) { - if (pass == INITIAL) { - snorm2[0] = SQR (input[i][0]); - snorm2[1] = SQR (input[i][1]); - } else { - snorm2[0] = input[i][0]; - snorm2[1] = input[i][1]; - } - } - __syncthreads(); - - for(int z = 16; z >=1; z/=2){ - snorm2[0] += shfl ( snorm2[0], z); - snorm2[1] += shfl ( snorm2[1], z); - } - - if (threadIdx.x % 32 == 0){ - my_norm2[threadIdx.x >> 5][0] = snorm2[0]; - my_norm2[threadIdx.x >> 5][1] = snorm2[1]; - } - - __syncthreads (); - - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset){ - my_norm2[threadIdx.x][0] += my_norm2[threadIdx.x + offset][0]; - my_norm2[threadIdx.x][1] += my_norm2[threadIdx.x + offset][1]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) { - per_block_results[blockIdx.x][0] = my_norm2[0][0]; - per_block_results[blockIdx.x][1] = my_norm2[0][1]; - } + extern __shared__ rvec2 my_norm2[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec2 snorm2; + snorm2[0] = snorm2[1] = 0; + + if(i < n) { + if (pass == INITIAL) { + snorm2[0] = SQR (input[i][0]); + snorm2[1] = SQR (input[i][1]); + } else { + snorm2[0] = input[i][0]; + snorm2[1] = input[i][1]; + } + } + __syncthreads(); + + for(int z = 16; z >=1; z/=2){ + snorm2[0] += shfl ( snorm2[0], z); + snorm2[1] += shfl ( snorm2[1], z); + } + + if (threadIdx.x % 32 == 0){ + my_norm2[threadIdx.x >> 5][0] = snorm2[0]; + my_norm2[threadIdx.x >> 5][1] = snorm2[1]; + } + + __syncthreads (); + + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset){ + my_norm2[threadIdx.x][0] += my_norm2[threadIdx.x + offset][0]; + my_norm2[threadIdx.x][1] += my_norm2[threadIdx.x + offset][1]; + } + + __syncthreads(); + } + + 
if(threadIdx.x == 0) { + per_block_results[blockIdx.x][0] = my_norm2[0][0]; + per_block_results[blockIdx.x][1] = my_norm2[0][1]; + } #else - extern __shared__ rvec2 snorm2[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec2 x; - x[0] = x[1] = 0; - - if(i < n) { - if (pass == INITIAL) { - x[0] = SQR (input[i][0]); - x[1] = SQR (input[i][1]); - } else { - x[0] = input[i][0]; - x[1] = input[i][1]; - } - } - - snorm2[threadIdx.x][0] = x[0]; - snorm2[threadIdx.x][1] = x[1]; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - snorm2[threadIdx.x][0] += snorm2[threadIdx.x + offset][0]; - snorm2[threadIdx.x][1] += snorm2[threadIdx.x + offset][1]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) { - per_block_results[blockIdx.x][0] = snorm2[0][0]; - per_block_results[blockIdx.x][1] = snorm2[0][1]; - } + extern __shared__ rvec2 snorm2[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec2 x; + x[0] = x[1] = 0; + + if(i < n) { + if (pass == INITIAL) { + x[0] = SQR (input[i][0]); + x[1] = SQR (input[i][1]); + } else { + x[0] = input[i][0]; + x[1] = input[i][1]; + } + } + + snorm2[threadIdx.x][0] = x[0]; + snorm2[threadIdx.x][1] = x[1]; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + snorm2[threadIdx.x][0] += snorm2[threadIdx.x + offset][0]; + snorm2[threadIdx.x][1] += snorm2[threadIdx.x + offset][1]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) { + per_block_results[blockIdx.x][0] = snorm2[0][0]; + per_block_results[blockIdx.x][1] = snorm2[0][1]; + } #endif } @@ -436,76 +436,76 @@ CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t { #if defined(__SM_35__) - extern __shared__ rvec2 my_dot2[]; - rvec2 sdot2; + extern __shared__ rvec2 my_dot2[]; + rvec2 sdot2; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - sdot2[0] = sdot2[1] = 0; + unsigned int i = 
blockIdx.x * blockDim.x + threadIdx.x; + sdot2[0] = sdot2[1] = 0; - if(i < n) { - sdot2[0] = a[i][0] * b[i][0]; - sdot2[1] = a[i][1] * b[i][1]; - } + if(i < n) { + sdot2[0] = a[i][0] * b[i][0]; + sdot2[1] = a[i][1] * b[i][1]; + } - __syncthreads(); + __syncthreads(); - for(int z = 16; z >=1; z/=2){ - sdot2[0] += shfl ( sdot2[0], z); - sdot2[1] += shfl ( sdot2[1], z); - } + for(int z = 16; z >=1; z/=2){ + sdot2[0] += shfl ( sdot2[0], z); + sdot2[1] += shfl ( sdot2[1], z); + } - if (threadIdx.x % 32 == 0){ - my_dot2[threadIdx.x >> 5][0] = sdot2[0]; - my_dot2[threadIdx.x >> 5][1] = sdot2[1]; - } + if (threadIdx.x % 32 == 0){ + my_dot2[threadIdx.x >> 5][0] = sdot2[0]; + my_dot2[threadIdx.x >> 5][1] = sdot2[1]; + } - __syncthreads (); + __syncthreads (); - for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { - if(threadIdx.x < offset){ - my_dot2[threadIdx.x][0] += my_dot2[threadIdx.x + offset][0]; - my_dot2[threadIdx.x][1] += my_dot2[threadIdx.x + offset][1]; - } + for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) { + if(threadIdx.x < offset){ + my_dot2[threadIdx.x][0] += my_dot2[threadIdx.x + offset][0]; + my_dot2[threadIdx.x][1] += my_dot2[threadIdx.x + offset][1]; + } - __syncthreads(); - } + __syncthreads(); + } - if(threadIdx.x == 0) { - res[blockIdx.x][0] = my_dot2[0][0]; - res[blockIdx.x][1] = my_dot2[0][1]; - } + if(threadIdx.x == 0) { + res[blockIdx.x][0] = my_dot2[0][0]; + res[blockIdx.x][1] = my_dot2[0][1]; + } #else - extern __shared__ rvec2 sdot2[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec2 x; - x[0] = x[1] = 0; - - if(i < n) { - x[0] = a[i][0] * b[i][0]; - x[1] = a[i][1] * b[i][1]; - } - - sdot2[threadIdx.x][0] = x[0]; - sdot2[threadIdx.x][1] = x[1]; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdot2[threadIdx.x][0] += sdot2[threadIdx.x + offset][0]; - sdot2[threadIdx.x][1] += sdot2[threadIdx.x + offset][1]; - } - - __syncthreads(); - } - - 
if(threadIdx.x == 0) { - res[blockIdx.x][0] = sdot2[0][0]; - res[blockIdx.x][1] = sdot2[0][1]; - } + extern __shared__ rvec2 sdot2[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec2 x; + x[0] = x[1] = 0; + + if(i < n) { + x[0] = a[i][0] * b[i][0]; + x[1] = a[i][1] * b[i][1]; + } + + sdot2[threadIdx.x][0] = x[0]; + sdot2[threadIdx.x][1] = x[1]; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdot2[threadIdx.x][0] += sdot2[threadIdx.x + offset][0]; + sdot2[threadIdx.x][1] += sdot2[threadIdx.x + offset][1]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) { + res[blockIdx.x][0] = sdot2[0][0]; + res[blockIdx.x][1] = sdot2[0][1]; + } #endif } @@ -515,37 +515,37 @@ CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, int k ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= k) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= k) return; - dest[i] = c * v[i] + d * y[i]; + dest[i] = c * v[i] + d * y[i]; } CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= k) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= k) return; - dest[i] = v[i] * y[i]; + dest[i] = v[i] * y[i]; } CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= k) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= k) return; - dest[i][0] = v[i][0] * y[i][0]; - dest[i][1] = v[i][1] * y[i][1]; + dest[i][0] = v[i][0] * y[i][0]; + dest[i][1] = v[i][1] * y[i][1]; } CUDA_GLOBAL void k_rvec2_pbetad (rvec2 *dest, rvec2 *a, - real beta0, real beta1, - rvec2 *b, int n) + real beta0, real beta1, + rvec2 *b, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n) return; + int i = 
blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n) return; - dest[i][0] = a[i][0] + beta0 * b[i][0]; - dest[i][1] = a[i][1] + beta1 * b[i][1]; + dest[i][0] = a[i][0] + beta0 * b[i][0]; + dest[i][1] = a[i][1] + beta1 * b[i][1]; } diff --git a/PG-PuReMD/src/validation.cu b/PG-PuReMD/src/validation.cu index f3b74862..2bffc36f 100644 --- a/PG-PuReMD/src/validation.cu +++ b/PG-PuReMD/src/validation.cu @@ -9,1454 +9,1454 @@ bool check_zero (real p1, real p2) { - if (abs (p1 - p2) >= GPU_TOLERANCE) - return true; - else - return false; + if (abs (p1 - p2) >= GPU_TOLERANCE) + return true; + else + return false; } bool check_zero (rvec p1, rvec p2) { - if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || - ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) || - ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE )) - return true; - else return false; + if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || + ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) || + ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE )) + return true; + else return false; } bool check_zero_rvec2 (rvec2 p1, rvec2 p2) { - if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || - ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE )) - return true; - else return false; + if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || + ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE )) + return true; + else return false; } bool check_same (ivec p1, ivec p2) { - if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) ) - return true; - else - return false; + if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) ) + return true; + else + return false; } void print_bond_data (bond_order_data *s) { - /* - fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", - s->BO, - s->BO_s, - s->BO_pi, - s->BO_pi2 ); - */ - fprintf (stderr, " Cdbo (%e) ", s->Cdbo ); - fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi ); - fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 ); + /* + fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", + s->BO, + 
s->BO_s, + s->BO_pi, + s->BO_pi2 ); + */ + fprintf (stderr, " Cdbo (%e) ", s->Cdbo ); + fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi ); + fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 ); } int validate_neighbors (reax_system *system, reax_list **lists) { - reax_list *far_nbrs = *lists + FAR_NBRS; - reax_list *d_nbrs = *dev_lists + FAR_NBRS; - far_neighbor_data gpu, cpu; - int index, count, jicount; - int hostcount, dijcount, djicount; - int i; - - int *end = (int *)malloc (sizeof (int) * system->N); - int *start = (int *) malloc (sizeof (int) * system->N ); - - copy_host_device (start, d_nbrs->index, - sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:index"); - copy_host_device (end, d_nbrs->end_index, - sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:end_index"); - - far_neighbor_data *data = (far_neighbor_data *) - malloc (sizeof (far_neighbor_data)* d_nbrs->num_intrs); - copy_host_device (data, d_nbrs->select.far_nbr_list, - sizeof (far_neighbor_data) * d_nbrs->num_intrs, cudaMemcpyDeviceToHost, "far_nbr_list"); - - hostcount = dijcount = djicount = 0; - - for (i= 0; i < system->N-1; i++){ - if (end [i] > start [i+1]) - { - fprintf (stderr, " Far Neighbors index over write @ index %d (%d, %d) and (%d %d)\n", - i, start[i], end[i], start[i+1], end[i+1]); - return FAILURE; - } - hostcount += end[i] - start[i]; - } - hostcount += end[i] - start[i]; - fprintf (stderr, "Total Neighbors count: %d \n", hostcount); - hostcount = 0; - - return 0; - - /* - for (int i = 0; i < 2; i++) { - for (int j = start[i]; j < end[i]; j++){ - gpu = data[j]; - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr, - data[j].d, - data[j].rel_box[0], - data[j].rel_box[1], - data[j].rel_box[2], - data[j].dvec[0], - data[j].dvec[1], - data[j].dvec[2], - j - ); - } - } - - return SUCCESS; - */ - - for (int i = 0; i < system->N; i++){ - index = Start_Index (i, far_nbrs); - - for (int j = start[i]; j < end[i]; j++){ - - - if (i > 
data[j].nbr) { - - int src = data[j].nbr; - int dest = i; - int x; - - - for (x = start[src]; x < end[src]; x++) { - if (data[x].nbr != dest) continue; - - gpu = data[x]; - cpu = data[j]; - - if ( (gpu.d != cpu.d) || - (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr, - data[j].d, - data[j].rel_box[0], - data[j].rel_box[1], - data[j].rel_box[2], - data[j].dvec[0], - data[j].dvec[1], - data[j].dvec[2], - j - ); - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) - %d \n", data[j].nbr, data[x].nbr, - data[x].d, - data[x].rel_box[0], - data[x].rel_box[1], - data[x].rel_box[2], - data[x].dvec[0], - data[x].dvec[1], - data[x].dvec[2], - x - ); - jicount++; - - fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n"); - exit (-1); - } - djicount ++; - break; - } - - if (x >= end[src]) { - fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); - exit (-1); - } - continue; - } - - gpu = data[j]; - cpu = far_nbrs->select.far_nbr_list[index]; - if ( check_zero (gpu.d, cpu.d) || - (gpu.nbr != cpu.nbr) || - check_zero (cpu.dvec, gpu.dvec) || - !check_same (cpu.rel_box, gpu.rel_box)) { - - fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j ); - fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index); - fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); - fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); - fprintf (stdout, "d %f , %f \n", cpu.d, data[j].d); - fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", - cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], - gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); - - fprintf (stdout, "rel_box (%d %d %d) 
(%d %d %d) \n", - cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], - gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); - - fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host **** \n"); - return FAILURE; - count ++; - } - index ++; - hostcount ++; - dijcount ++; - } - - if (index != End_Index (i, far_nbrs)) - { - fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", - i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs), start[i], end[i]); - return FAILURE; - } - } - - fprintf (stderr, "FAR Neighbors match between device and host host:%d, device:%d dji: %d \n", - hostcount, dijcount, djicount); - free (start); - free (end); - free (data); - return SUCCESS; + reax_list *far_nbrs = *lists + FAR_NBRS; + reax_list *d_nbrs = *dev_lists + FAR_NBRS; + far_neighbor_data gpu, cpu; + int index, count, jicount; + int hostcount, dijcount, djicount; + int i; + + int *end = (int *)malloc (sizeof (int) * system->N); + int *start = (int *) malloc (sizeof (int) * system->N ); + + copy_host_device (start, d_nbrs->index, + sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:index"); + copy_host_device (end, d_nbrs->end_index, + sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:end_index"); + + far_neighbor_data *data = (far_neighbor_data *) + malloc (sizeof (far_neighbor_data)* d_nbrs->num_intrs); + copy_host_device (data, d_nbrs->select.far_nbr_list, + sizeof (far_neighbor_data) * d_nbrs->num_intrs, cudaMemcpyDeviceToHost, "far_nbr_list"); + + hostcount = dijcount = djicount = 0; + + for (i= 0; i < system->N-1; i++){ + if (end [i] > start [i+1]) + { + fprintf (stderr, " Far Neighbors index over write @ index %d (%d, %d) and (%d %d)\n", + i, start[i], end[i], start[i+1], end[i+1]); + return FAILURE; + } + hostcount += end[i] - start[i]; + } + hostcount += end[i] - start[i]; + fprintf (stderr, "Total Neighbors count: %d \n", hostcount); + hostcount = 0; + + return 0; + + /* + for (int i = 0; 
i < 2; i++) { + for (int j = start[i]; j < end[i]; j++){ + gpu = data[j]; + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr, + data[j].d, + data[j].rel_box[0], + data[j].rel_box[1], + data[j].rel_box[2], + data[j].dvec[0], + data[j].dvec[1], + data[j].dvec[2], + j + ); + } + } + + return SUCCESS; + */ + + for (int i = 0; i < system->N; i++){ + index = Start_Index (i, far_nbrs); + + for (int j = start[i]; j < end[i]; j++){ + + + if (i > data[j].nbr) { + + int src = data[j].nbr; + int dest = i; + int x; + + + for (x = start[src]; x < end[src]; x++) { + if (data[x].nbr != dest) continue; + + gpu = data[x]; + cpu = data[j]; + + if ( (gpu.d != cpu.d) || + (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr, + data[j].d, + data[j].rel_box[0], + data[j].rel_box[1], + data[j].rel_box[2], + data[j].dvec[0], + data[j].dvec[1], + data[j].dvec[2], + j + ); + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) - %d \n", data[j].nbr, data[x].nbr, + data[x].d, + data[x].rel_box[0], + data[x].rel_box[1], + data[x].rel_box[2], + data[x].dvec[0], + data[x].dvec[1], + data[x].dvec[2], + x + ); + jicount++; + + fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n"); + exit (-1); + } + djicount ++; + break; + } + + if (x >= end[src]) { + fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); + exit (-1); + } + continue; + } + + gpu = data[j]; + cpu = far_nbrs->select.far_nbr_list[index]; + if ( check_zero (gpu.d, cpu.d) || + (gpu.nbr != cpu.nbr) || + check_zero (cpu.dvec, gpu.dvec) || + !check_same (cpu.rel_box, gpu.rel_box)) { + + fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j ); + 
fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index); + fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); + fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); + fprintf (stdout, "d %f , %f \n", cpu.d, data[j].d); + fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", + cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], + gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); + + fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n", + cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], + gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); + + fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host **** \n"); + return FAILURE; + count ++; + } + index ++; + hostcount ++; + dijcount ++; + } + + if (index != End_Index (i, far_nbrs)) + { + fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", + i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs), start[i], end[i]); + return FAILURE; + } + } + + fprintf (stderr, "FAR Neighbors match between device and host host:%d, device:%d dji: %d \n", + hostcount, dijcount, djicount); + free (start); + free (end); + free (data); + return SUCCESS; } int validate_sym_dbond_indices (reax_system *system, storage *workspace, reax_list **lists) { - int start, end, index, count, miscount; - int hostcount, devicecount, h, d; - int *d_start, *d_end; - bond_data *d_bond_data; - reax_list *d_bonds = *dev_lists + BONDS; - reax_list *bonds = *lists + BONDS; - - d_end = (int *)malloc (sizeof (int) * system->N); - d_start = (int *) malloc (sizeof (int) * system->N ); - d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs); - //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); - - copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index"); - copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, 
cudaMemcpyDeviceToHost, "index"); - copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, cudaMemcpyDeviceToHost, "bond_data"); - - count = 0; - miscount = 0; - hostcount = 0; - devicecount = 0; - - for (int i = 0; i < system->N; i++) { - h= End_Index (i, bonds) - Start_Index (i, bonds); - d= d_end[i] - d_start[i]; - //if (h != d) - // fprintf (stderr, "Count does not match atom:%d, host:%d, device:%d \n", - // i, h, d); - hostcount += h; - devicecount += d; - } - fprintf (stderr, "Bonds count: host: %d device: %d \n", hostcount, devicecount); - - for (int i = 0; i < system->N; i++) { - - for (int j = d_start[i]; j < d_end[i]; j++) { - bond_data *src, *tgt; - src = &d_bond_data[j]; - - tgt = &d_bond_data[ src->sym_index ]; - - if ((src->dbond_index == tgt->dbond_index) ) - count ++; - else - miscount ++; - } - } - fprintf (stderr, "Sym and dbond indexes done count(device) --> %d (%d)\n", count, miscount); - - count = 0; - miscount = 0; - for (int i = 0; i < system->N; i++) { - - for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) { - bond_data *src, *tgt; - src = &bonds->select.bond_list [j]; - - tgt = &bonds->select.bond_list [ src->sym_index ]; - - if ((src->dbond_index == tgt->dbond_index) ) - count ++; - else - miscount ++; - } - } - fprintf (stderr, "Sym and dbond indexes done count (host) --> %d (%d)\n", count, miscount); - - free (d_end); - free (d_start); - free (d_bond_data); - - return SUCCESS; + int start, end, index, count, miscount; + int hostcount, devicecount, h, d; + int *d_start, *d_end; + bond_data *d_bond_data; + reax_list *d_bonds = *dev_lists + BONDS; + reax_list *bonds = *lists + BONDS; + + d_end = (int *)malloc (sizeof (int) * system->N); + d_start = (int *) malloc (sizeof (int) * system->N ); + d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs); + //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); + + 
copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index"); + copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index"); + copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, cudaMemcpyDeviceToHost, "bond_data"); + + count = 0; + miscount = 0; + hostcount = 0; + devicecount = 0; + + for (int i = 0; i < system->N; i++) { + h= End_Index (i, bonds) - Start_Index (i, bonds); + d= d_end[i] - d_start[i]; + //if (h != d) + // fprintf (stderr, "Count does not match atom:%d, host:%d, device:%d \n", + // i, h, d); + hostcount += h; + devicecount += d; + } + fprintf (stderr, "Bonds count: host: %d device: %d \n", hostcount, devicecount); + + for (int i = 0; i < system->N; i++) { + + for (int j = d_start[i]; j < d_end[i]; j++) { + bond_data *src, *tgt; + src = &d_bond_data[j]; + + tgt = &d_bond_data[ src->sym_index ]; + + if ((src->dbond_index == tgt->dbond_index) ) + count ++; + else + miscount ++; + } + } + fprintf (stderr, "Sym and dbond indexes done count(device) --> %d (%d)\n", count, miscount); + + count = 0; + miscount = 0; + for (int i = 0; i < system->N; i++) { + + for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) { + bond_data *src, *tgt; + src = &bonds->select.bond_list [j]; + + tgt = &bonds->select.bond_list [ src->sym_index ]; + + if ((src->dbond_index == tgt->dbond_index) ) + count ++; + else + miscount ++; + } + } + fprintf (stderr, "Sym and dbond indexes done count (host) --> %d (%d)\n", count, miscount); + + free (d_end); + free (d_start); + free (d_bond_data); + + return SUCCESS; } int validate_sparse_matrix( reax_system *system, storage *workspace ) { - sparse_matrix test; - int index, count, total; - test.start = (int *) malloc (sizeof (int) * (system->N)); - test.end = (int *) malloc (sizeof (int) * (system->N)); - - test.entries = (sparse_matrix_entry *) malloc - (sizeof (sparse_matrix_entry) * 
(dev_workspace->H.m)); - - memset (test.entries, 0xFF, - sizeof (sparse_matrix_entry) * dev_workspace->H.m); - copy_host_device ( test.entries, dev_workspace->H.entries, - sizeof (sparse_matrix_entry)* dev_workspace->H.m, - cudaMemcpyDeviceToHost, "sparse_matrix_entries"); - copy_host_device ( test.start, dev_workspace->H.start, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "start"); - copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "end"); - - for (int i = 0 ; i < system->N; i++) { - if ((test.end[i] >= dev_workspace->H.m)) { - fprintf (stderr, " exceeding number of entries for atom: %d \n", i); - exit (-1); - } - - if (( i < (system->N-1)) && (test.end[i] >= test.start[i+1])) - { - fprintf (stderr, " Index exceeding for atom : %d \n", i ); - fprintf (stderr, "end(i): %d \n", test.end[i]); - fprintf (stderr, "start(i+1): %d \n", test.start[i+1]); - exit (-1); - } - } - fprintf (stderr, "Sparse Matrix Boundary Check PASSED !!!\n"); - - //TODO - //TODO - //TODO - return SUCCESS; - - count = 0; - for (int i = 0 ; i < system->N; i++) - count += test.end[i] - test.start[i]; - fprintf (stderr, " Total number of entries : %d \n", count); - - fprintf (stderr, " ALlocated memeory for entries : %d\n", dev_workspace->H.m); - - //////////////////////////// - //for (int i = workspace->H.start[0]; i < workspace->H.end[0]; i++) { - // fprintf (stderr, "Row: 0, col: %d val: %f \n", workspace->H.entries[i].j, workspace->H.entries[i].val ); - //} - ////////////////////////////// - - count = 0; - total = 0; - for (int i = 0; i < system->n; i++) { - for (int j = workspace->H.start[i]; j < workspace->H.end[i]; j++) { - sparse_matrix_entry *src = &workspace->H.entries[j]; - - for (int k = test.start[i]; k < test.end[i]; k++) { - sparse_matrix_entry *tgt = &test.entries [k]; - if (src->j == tgt->j){ - if ( check_zero (src->val, tgt->val)) { - index = test.start [i]; - /* - fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", - 
test.start[i-1], test.end[i-1], - workspace->H.start[i-1], workspace->H.start[i]); - */ - fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", - i, k, test.start[i], test.end[i], - workspace->H.start[i], workspace->H.end[i]); - for (int x = workspace->H.start[i]; x < workspace->H.end[i]; x ++) - { - src = &workspace->H.entries[x]; - tgt = &test.entries [index]; - fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index); - index ++; - } - fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n"); - exit (-1); - count++; - } else - { - total ++; - if (i == tgt->j) continue; - //if (tgt->j >= system->n) continue; - - //success case here. check for row - k and column i; - for (int x = test.start[tgt->j]; x < test.end[tgt->j]; x++){ - sparse_matrix_entry *rtgt = &test.entries [x]; - if (i == rtgt->j) { - if (check_zero (tgt->val, rtgt->val)) { - fprintf (stderr, "symmetric entry not matching for (%d, %d) \n", i, tgt->j); - fprintf (stderr, "row: %d col: %d val: %f \n", i, tgt->j, tgt->val); - fprintf (stderr, "row: %d col: %d val: %f \n", tgt->j, rtgt->j, rtgt->val); - exit (-1); - } else { - total ++; - break; - } - } - } - } - } - } - } - } - - fprintf (stderr, "Sparse Matrix mismatch total: %d, miscount %d \n", total, count); - free (test.start); - free (test.end); - free (test.entries); - return SUCCESS; + sparse_matrix test; + int index, count, total; + test.start = (int *) malloc (sizeof (int) * (system->N)); + test.end = (int *) malloc (sizeof (int) * (system->N)); + + test.entries = (sparse_matrix_entry *) malloc + (sizeof (sparse_matrix_entry) * (dev_workspace->H.m)); + + memset (test.entries, 0xFF, + sizeof (sparse_matrix_entry) * dev_workspace->H.m); + copy_host_device ( test.entries, dev_workspace->H.entries, + sizeof (sparse_matrix_entry)* dev_workspace->H.m, + cudaMemcpyDeviceToHost, "sparse_matrix_entries"); + copy_host_device ( test.start, 
dev_workspace->H.start, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "start"); + copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "end"); + + for (int i = 0 ; i < system->N; i++) { + if ((test.end[i] >= dev_workspace->H.m)) { + fprintf (stderr, " exceeding number of entries for atom: %d \n", i); + exit (-1); + } + + if (( i < (system->N-1)) && (test.end[i] >= test.start[i+1])) + { + fprintf (stderr, " Index exceeding for atom : %d \n", i ); + fprintf (stderr, "end(i): %d \n", test.end[i]); + fprintf (stderr, "start(i+1): %d \n", test.start[i+1]); + exit (-1); + } + } + fprintf (stderr, "Sparse Matrix Boundary Check PASSED !!!\n"); + + //TODO + //TODO + //TODO + return SUCCESS; + + count = 0; + for (int i = 0 ; i < system->N; i++) + count += test.end[i] - test.start[i]; + fprintf (stderr, " Total number of entries : %d \n", count); + + fprintf (stderr, " ALlocated memeory for entries : %d\n", dev_workspace->H.m); + + //////////////////////////// + //for (int i = workspace->H.start[0]; i < workspace->H.end[0]; i++) { + // fprintf (stderr, "Row: 0, col: %d val: %f \n", workspace->H.entries[i].j, workspace->H.entries[i].val ); + //} + ////////////////////////////// + + count = 0; + total = 0; + for (int i = 0; i < system->n; i++) { + for (int j = workspace->H.start[i]; j < workspace->H.end[i]; j++) { + sparse_matrix_entry *src = &workspace->H.entries[j]; + + for (int k = test.start[i]; k < test.end[i]; k++) { + sparse_matrix_entry *tgt = &test.entries [k]; + if (src->j == tgt->j){ + if ( check_zero (src->val, tgt->val)) { + index = test.start [i]; + /* + fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", + test.start[i-1], test.end[i-1], + workspace->H.start[i-1], workspace->H.start[i]); + */ + fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", + i, k, test.start[i], test.end[i], + workspace->H.start[i], workspace->H.end[i]); + for (int x = workspace->H.start[i]; x < 
workspace->H.end[i]; x ++) + { + src = &workspace->H.entries[x]; + tgt = &test.entries [index]; + fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index); + index ++; + } + fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n"); + exit (-1); + count++; + } else + { + total ++; + if (i == tgt->j) continue; + //if (tgt->j >= system->n) continue; + + //success case here. check for row - k and column i; + for (int x = test.start[tgt->j]; x < test.end[tgt->j]; x++){ + sparse_matrix_entry *rtgt = &test.entries [x]; + if (i == rtgt->j) { + if (check_zero (tgt->val, rtgt->val)) { + fprintf (stderr, "symmetric entry not matching for (%d, %d) \n", i, tgt->j); + fprintf (stderr, "row: %d col: %d val: %f \n", i, tgt->j, tgt->val); + fprintf (stderr, "row: %d col: %d val: %f \n", tgt->j, rtgt->j, rtgt->val); + exit (-1); + } else { + total ++; + break; + } + } + } + } + } + } + } + } + + fprintf (stderr, "Sparse Matrix mismatch total: %d, miscount %d \n", total, count); + free (test.start); + free (test.end); + free (test.entries); + return SUCCESS; } bool print_hbonds (int *d_start, int *d_end, int i, hbond_data *data) { - hbond_data src, tgt; + hbond_data src, tgt; - fprintf (stderr, " start %d end %d count ---> %d \n", d_start[i], d_end[i], d_end[i] - d_start[i]); + fprintf (stderr, " start %d end %d count ---> %d \n", d_start[i], d_end[i], d_end[i] - d_start[i]); - for (int j = d_start[i]; j < d_end[i]; j++) - fprintf (stderr, "Atom : %d , Hbond Info . nbr: %d scl: %d index:%d\n", i, data[j].nbr, data[j].scl); - fprintf (stderr, " ========================================= \n"); + for (int j = d_start[i]; j < d_end[i]; j++) + fprintf (stderr, "Atom : %d , Hbond Info . 
nbr: %d scl: %d index:%d\n", i, data[j].nbr, data[j].scl); + fprintf (stderr, " ========================================= \n"); } int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists) { - int count, nbr, sym_count, dev_count; - int *d_start, *d_end, index, d_index; - hbond_data *data, src, tgt; - reax_list *d_hbonds = *dev_lists + HBONDS; - reax_list *hbonds = *lists + HBONDS; - - d_end = (int *)malloc (sizeof (int)* d_hbonds->n); - d_start = (int *) malloc (sizeof (int) * d_hbonds->n ); - fprintf (stderr, "Total index values: %d \n", d_hbonds->n); - - copy_host_device (d_start, d_hbonds->index, sizeof (int)* d_hbonds->n, cudaMemcpyDeviceToHost, "start"); - copy_host_device (d_end, d_hbonds->end_index, sizeof (int) * d_hbonds->n, cudaMemcpyDeviceToHost, "end"); - - //fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds); - data = (hbond_data *) malloc (sizeof (hbond_data) * d_hbonds->num_intrs); - copy_host_device (data, d_hbonds->select.hbond_list, sizeof (hbond_data) * d_hbonds->num_intrs, - cudaMemcpyDeviceToHost, "hbond_data"); - - count = 0; - dev_count = 0; - sym_count = 0; - for (int i = 0; i < system->n; i++) { - - if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == 1 ) - { - count += End_Index (i, hbonds) - Start_Index (i, hbonds); - dev_count += d_end [i] - d_start[i]; - - if ((d_end[ i] - d_start[i]) != - (End_Index (i, hbonds) - Start_Index (i, hbonds))) { - fprintf (stderr, "%d %d - d(%d %d) c(%d %d) \n",i, i, - d_start[i], d_end[ i], - Start_Index (i, hbonds), - End_Index (i, hbonds) ); - print_hbonds (d_start, d_end, i, data); - print_hbonds (hbonds->index, hbonds->end_index, i, hbonds->select.hbond_list); - exit (-1); - } - } - else { - sym_count += d_end[ i] - d_start[i]; - } - } - fprintf (stderr, "hbonds count match between host: %d and device: %d (%d) \n", count,dev_count, sym_count); - sym_count = 0; - - for (int i = system->n; i < system->N; i++) { - //if (system->reax_param.sbp[ 
system->my_atoms[i].type].p_hbond == 2) - { - sym_count += d_end[i] - d_start[i]; - } - } - fprintf (stderr, "Sym count outside 'n' : %d \n", sym_count ); - //print_hbonds (d_start, d_end, 0, data); - - - count = 0; - for (int i = 0; i < system->n; i++) { - - d_index = i; - /* - fprintf (stderr, " Count cpu %d gpu %d \n", - End_Index (workspace->hbond_index[i], hbonds) - index, - d_end[d_index] - d_start[d_index]); - */ - - if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond != 1 ) - { - /* - int x; - for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) - { - tgt = data [j]; - nbr = tgt.nbr; - for (x = d_start[nbr]; x < d_end[nbr]; x++) - { - src = data [x]; - if (src.nbr == i) { - break; - } - } - if (x >= d_end[nbr]) { - fprintf (stderr, "HBONDS is NOT SYMMETRIC \n"); - fprintf (stderr, "Atom: %d, nbr: %d (%d)\n", i, nbr); - fprintf (stderr, "Atom: %d, start: %d end: %d \n", nbr, d_start[nbr], d_end[nbr]); - for (x = d_start[nbr]; x < d_end[nbr]; x++) - { - src = data [x]; - fprintf (stderr, "Atom: %d, nbr: %d \n", nbr, src.nbr); - } - - exit (1); - } - } - */ - - for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) - { - tgt = data[j]; - nbr = tgt.sym_index; - - if (nbr >= d_hbonds->num_intrs || nbr < 0){ - fprintf (stderr, "Index out of range for atom: %d sym_index:%d Hbond index: %d, nbr: %d\n", i, nbr, j, data[j].nbr); - fprintf (stderr, "atom type: %d \n", system->reax_param.sbp[ system->my_atoms [ data[j].nbr ].type].p_hbond); - exit (1); - } - - if (data[nbr].sym_index != j) { - fprintf (stderr, "Sym Index for hydrogen bonds does not match \n"); - exit (1); - } - } - continue; - } - - for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) - { - tgt = data[j]; - - int k = 0; - for (k = Start_Index (i, hbonds); - k < End_Index (i, hbonds); k++) { - src = hbonds->select.hbond_list[k]; - - if ((src.nbr == tgt.nbr) && (src.scl == tgt.scl)) { - /* - fprintf (stderr, "Mismatch at atom %d index %d (%d %d) -- (%d %d) \n", i, k, - src.nbr, 
src.scl, - tgt.nbr, tgt.scl); - */ - count ++; - break; - } - } - - /* - if ( ((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && - (k >= End_Index (workspace->hbond_index[i], hbonds))) { - fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index ); - exit (-1); - } - */ - - if ( k >= (End_Index (i, hbonds) )){ - fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j); - fprintf (stderr, " ==========Host============ \n"); - print_hbonds (hbonds->index, hbonds->end_index, - i, hbonds->select.hbond_list); - fprintf (stderr, " ==========Device============ \n"); - print_hbonds (d_start, d_end, - i, data); - exit (-1); - } - } - - if ((End_Index (i, hbonds)- Start_Index(i, hbonds)) != (d_end[i] - d_start[i])){ - fprintf (stderr, "End index does not match between device and host \n"); - fprintf (stderr, " Atom: %d Host: %d %d \n", i, Start_Index (i, hbonds), End_Index (i, hbonds)); - fprintf (stderr, " Device: %d %d \n", d_start[i], d_end[i]); - exit (-1); - } - } - - fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count); - - free (d_start); - free (d_end); - free (data); - return SUCCESS; + int count, nbr, sym_count, dev_count; + int *d_start, *d_end, index, d_index; + hbond_data *data, src, tgt; + reax_list *d_hbonds = *dev_lists + HBONDS; + reax_list *hbonds = *lists + HBONDS; + + d_end = (int *)malloc (sizeof (int)* d_hbonds->n); + d_start = (int *) malloc (sizeof (int) * d_hbonds->n ); + fprintf (stderr, "Total index values: %d \n", d_hbonds->n); + + copy_host_device (d_start, d_hbonds->index, sizeof (int)* d_hbonds->n, cudaMemcpyDeviceToHost, "start"); + copy_host_device (d_end, d_hbonds->end_index, sizeof (int) * d_hbonds->n, cudaMemcpyDeviceToHost, "end"); + + //fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds); + data = (hbond_data *) malloc (sizeof (hbond_data) * d_hbonds->num_intrs); + copy_host_device (data, d_hbonds->select.hbond_list, sizeof 
(hbond_data) * d_hbonds->num_intrs, + cudaMemcpyDeviceToHost, "hbond_data"); + + count = 0; + dev_count = 0; + sym_count = 0; + for (int i = 0; i < system->n; i++) { + + if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == 1 ) + { + count += End_Index (i, hbonds) - Start_Index (i, hbonds); + dev_count += d_end [i] - d_start[i]; + + if ((d_end[ i] - d_start[i]) != + (End_Index (i, hbonds) - Start_Index (i, hbonds))) { + fprintf (stderr, "%d %d - d(%d %d) c(%d %d) \n",i, i, + d_start[i], d_end[ i], + Start_Index (i, hbonds), + End_Index (i, hbonds) ); + print_hbonds (d_start, d_end, i, data); + print_hbonds (hbonds->index, hbonds->end_index, i, hbonds->select.hbond_list); + exit (-1); + } + } + else { + sym_count += d_end[ i] - d_start[i]; + } + } + fprintf (stderr, "hbonds count match between host: %d and device: %d (%d) \n", count,dev_count, sym_count); + sym_count = 0; + + for (int i = system->n; i < system->N; i++) { + //if (system->reax_param.sbp[ system->my_atoms[i].type].p_hbond == 2) + { + sym_count += d_end[i] - d_start[i]; + } + } + fprintf (stderr, "Sym count outside 'n' : %d \n", sym_count ); + //print_hbonds (d_start, d_end, 0, data); + + + count = 0; + for (int i = 0; i < system->n; i++) { + + d_index = i; + /* + fprintf (stderr, " Count cpu %d gpu %d \n", + End_Index (workspace->hbond_index[i], hbonds) - index, + d_end[d_index] - d_start[d_index]); + */ + + if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond != 1 ) + { + /* + int x; + for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) + { + tgt = data [j]; + nbr = tgt.nbr; + for (x = d_start[nbr]; x < d_end[nbr]; x++) + { + src = data [x]; + if (src.nbr == i) { + break; + } + } + if (x >= d_end[nbr]) { + fprintf (stderr, "HBONDS is NOT SYMMETRIC \n"); + fprintf (stderr, "Atom: %d, nbr: %d (%d)\n", i, nbr); + fprintf (stderr, "Atom: %d, start: %d end: %d \n", nbr, d_start[nbr], d_end[nbr]); + for (x = d_start[nbr]; x < d_end[nbr]; x++) + { + src = data [x]; + fprintf 
(stderr, "Atom: %d, nbr: %d \n", nbr, src.nbr); + } + + exit (1); + } + } + */ + + for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) + { + tgt = data[j]; + nbr = tgt.sym_index; + + if (nbr >= d_hbonds->num_intrs || nbr < 0){ + fprintf (stderr, "Index out of range for atom: %d sym_index:%d Hbond index: %d, nbr: %d\n", i, nbr, j, data[j].nbr); + fprintf (stderr, "atom type: %d \n", system->reax_param.sbp[ system->my_atoms [ data[j].nbr ].type].p_hbond); + exit (1); + } + + if (data[nbr].sym_index != j) { + fprintf (stderr, "Sym Index for hydrogen bonds does not match \n"); + exit (1); + } + } + continue; + } + + for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) + { + tgt = data[j]; + + int k = 0; + for (k = Start_Index (i, hbonds); + k < End_Index (i, hbonds); k++) { + src = hbonds->select.hbond_list[k]; + + if ((src.nbr == tgt.nbr) && (src.scl == tgt.scl)) { + /* + fprintf (stderr, "Mismatch at atom %d index %d (%d %d) -- (%d %d) \n", i, k, + src.nbr, src.scl, + tgt.nbr, tgt.scl); + */ + count ++; + break; + } + } + + /* + if ( ((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && + (k >= End_Index (workspace->hbond_index[i], hbonds))) { + fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index ); + exit (-1); + } + */ + + if ( k >= (End_Index (i, hbonds) )){ + fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j); + fprintf (stderr, " ==========Host============ \n"); + print_hbonds (hbonds->index, hbonds->end_index, + i, hbonds->select.hbond_list); + fprintf (stderr, " ==========Device============ \n"); + print_hbonds (d_start, d_end, + i, data); + exit (-1); + } + } + + if ((End_Index (i, hbonds)- Start_Index(i, hbonds)) != (d_end[i] - d_start[i])){ + fprintf (stderr, "End index does not match between device and host \n"); + fprintf (stderr, " Atom: %d Host: %d %d \n", i, Start_Index (i, hbonds), End_Index (i, hbonds)); + fprintf (stderr, " Device: %d %d \n", d_start[i], d_end[i]); + 
exit (-1); + } + } + + fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count); + + free (d_start); + free (d_end); + free (data); + return SUCCESS; } int validate_bonds (reax_system *system, storage *workspace, reax_list **lists) { - int start, end, index, count, miscount; - int *d_start, *d_end; - bond_data *d_bond_data; - reax_list *d_bonds = *dev_lists + BONDS; - reax_list *bonds = *lists + BONDS; - - d_end = (int *)malloc (sizeof (int) * system->N); - d_start = (int *) malloc (sizeof (int) * system->N ); - d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs); - //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); - - copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "start"); - copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "end"); - copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, - cudaMemcpyDeviceToHost, "bond_data"); - - count = 0; - for (int i = 0; i < system->N; i++) { - start = Start_Index (i, bonds); - end = End_Index (i, bonds); - - count += end - start; - if ((end-start) != (d_end[i]-d_start[i])){ - fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n", - i, start, end, d_start[i], d_end[i]); - exit (-1); - } - - } - fprintf (stderr, "BOND LIST COUNT match on device and host count %d \n", count); - - for (int i = 0; i < system->N-1; i++) { - if ( d_end[i] >= d_start[i+1] ){ - fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i); - exit (-1); - } - } - //fprintf (stderr, " BOND LIST Overwrite *PASSED* \n"); - count = 0; - miscount = 0; - for (int i = 0; i < system->N; i++) { - - for (int j = d_start[i]; j < d_end[i]; j++) { - bond_data *src, *tgt; - src = &d_bond_data[j]; - bond_data *src_sym = & d_bond_data[ src->sym_index ]; - - //Previously this was commented out. 
Thats why it was working. - //if (i >= src->nbr) continue; - - int k = 0; - for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) { - tgt = & (bonds->select.bond_list[k]); - bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]); - - if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) && - !check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) { - - bond_order_data *s, *t; - s = &(src->bo_data); - t = &(tgt->bo_data); - - if ( !check_zero (s->BO,t->BO) && - !check_zero (s->BO_s,t->BO_s) && - !check_zero(s->BO_pi,t->BO_pi) && - !check_zero (s->BO_pi2,t->BO_pi2) && - !check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) && - !check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) && - !check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) && - !check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) && - !check_zero (s->dln_BOp_s, t->dln_BOp_s ) && - !check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) && - !check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) && - !check_zero (s->dBOp, t->dBOp )) { - count ++; - - //Check the sym index and dbond index here for double checking - // bond_ij on both device and hosts are matched now. 
- /* - bond_order_data *ss, *ts; - ss = & (src_sym->bo_data ); - ts = & (tgt_sym->bo_data ); - - if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) || - check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box) - || check_zero (ss->Cdbo, ts->Cdbo)){ - - fprintf (stderr, " Sym Index information does not match for atom %d \n", i); - fprintf (stderr, " atom --> %d \n", i); - fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr ); - fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d ); - fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr ); - fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", - src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2], - tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] ); - fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", - src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2], - tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] ); - - fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo ); - exit (-1); - } - */ - - break; - } - fprintf (stderr, " d --> %f %f \n", src->d, tgt->d ); - fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", - src->dvec[0], src->dvec[1], src->dvec[2], - tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] ); - fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", - src->rel_box[0], src->rel_box[1], src->rel_box[2], - tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] ); - - fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i, - src->nbr, tgt->nbr, - s->BO, t->BO, - s->BO_s, t->BO_s, - s->BO_pi, t->BO_pi, - s->BO_pi2, t->BO_pi2 - ); - fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2], - t->dBOp[0], t->dBOp[1], t->dBOp[2] ); - - fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo ); - fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi ); - fprintf (stderr, " 
Cdbopi2 (%e %e) \n", s->Cdbopi2,t->Cdbopi2 ); - fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo ); - fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi); - fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2); - fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n", - s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2], - t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] ); - fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n", - s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2], - t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] ); - fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n", - s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2], - t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] ); - - //miscount ++; - //break; - exit (-1); - } - } - - if (k >= End_Index (i, bonds)) { - miscount ++; - fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j); - exit (-1); - } - } - } - - fprintf (stderr, " BONDS matched count %d miscount %d (%d) \n", count, miscount, (count+miscount)); - free (d_start); - free (d_end); - free (d_bond_data); - return SUCCESS; + int start, end, index, count, miscount; + int *d_start, *d_end; + bond_data *d_bond_data; + reax_list *d_bonds = *dev_lists + BONDS; + reax_list *bonds = *lists + BONDS; + + d_end = (int *)malloc (sizeof (int) * system->N); + d_start = (int *) malloc (sizeof (int) * system->N ); + d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs); + //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); + + copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "start"); + copy_host_device (d_end, d_bonds->end_index, 
sizeof (int) * system->N, cudaMemcpyDeviceToHost, "end"); + copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, + cudaMemcpyDeviceToHost, "bond_data"); + + count = 0; + for (int i = 0; i < system->N; i++) { + start = Start_Index (i, bonds); + end = End_Index (i, bonds); + + count += end - start; + if ((end-start) != (d_end[i]-d_start[i])){ + fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n", + i, start, end, d_start[i], d_end[i]); + exit (-1); + } + + } + fprintf (stderr, "BOND LIST COUNT match on device and host count %d \n", count); + + for (int i = 0; i < system->N-1; i++) { + if ( d_end[i] >= d_start[i+1] ){ + fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i); + exit (-1); + } + } + //fprintf (stderr, " BOND LIST Overwrite *PASSED* \n"); + count = 0; + miscount = 0; + for (int i = 0; i < system->N; i++) { + + for (int j = d_start[i]; j < d_end[i]; j++) { + bond_data *src, *tgt; + src = &d_bond_data[j]; + bond_data *src_sym = & d_bond_data[ src->sym_index ]; + + //Previously this was commented out. Thats why it was working. 
+ //if (i >= src->nbr) continue; + + int k = 0; + for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) { + tgt = & (bonds->select.bond_list[k]); + bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]); + + if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) && + !check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) { + + bond_order_data *s, *t; + s = &(src->bo_data); + t = &(tgt->bo_data); + + if ( !check_zero (s->BO,t->BO) && + !check_zero (s->BO_s,t->BO_s) && + !check_zero(s->BO_pi,t->BO_pi) && + !check_zero (s->BO_pi2,t->BO_pi2) && + !check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) && + !check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) && + !check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) && + !check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) && + !check_zero (s->dln_BOp_s, t->dln_BOp_s ) && + !check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) && + !check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) && + !check_zero (s->dBOp, t->dBOp )) { + count ++; + + //Check the sym index and dbond index here for double checking + // bond_ij on both device and hosts are matched now. 
+ /* + bond_order_data *ss, *ts; + ss = & (src_sym->bo_data ); + ts = & (tgt_sym->bo_data ); + + if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) || + check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box) + || check_zero (ss->Cdbo, ts->Cdbo)){ + + fprintf (stderr, " Sym Index information does not match for atom %d \n", i); + fprintf (stderr, " atom --> %d \n", i); + fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr ); + fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d ); + fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr ); + fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", + src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2], + tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] ); + fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", + src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2], + tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] ); + + fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo ); + exit (-1); + } + */ + + break; + } + fprintf (stderr, " d --> %f %f \n", src->d, tgt->d ); + fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", + src->dvec[0], src->dvec[1], src->dvec[2], + tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] ); + fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", + src->rel_box[0], src->rel_box[1], src->rel_box[2], + tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] ); + + fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i, + src->nbr, tgt->nbr, + s->BO, t->BO, + s->BO_s, t->BO_s, + s->BO_pi, t->BO_pi, + s->BO_pi2, t->BO_pi2 + ); + fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2], + t->dBOp[0], t->dBOp[1], t->dBOp[2] ); + + fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo ); + fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi ); + fprintf (stderr, " 
Cdbopi2 (%e %e) \n", s->Cdbopi2,t->Cdbopi2 ); + fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo ); + fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi); + fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2); + fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n", + s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2], + t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] ); + fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n", + s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2], + t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] ); + fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n", + s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2], + t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] ); + + //miscount ++; + //break; + exit (-1); + } + } + + if (k >= End_Index (i, bonds)) { + miscount ++; + fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j); + exit (-1); + } + } + } + + fprintf (stderr, " BONDS matched count %d miscount %d (%d) \n", count, miscount, (count+miscount)); + free (d_start); + free (d_end); + free (d_bond_data); + return SUCCESS; } int validate_workspace (reax_system *system, storage *workspace) { - int miscount; - int count, tcount; - - /////////////////////// - //INIT FORCES - /////////////////////// - - // bond_mark - int *bond_mark = (int *)malloc (sizeof (int) * system->N); - copy_host_device (bond_mark, dev_workspace->bond_mark, sizeof (int) * system->N, - cudaMemcpyDeviceToHost, "bond_mark"); - miscount = 0; - for (int i = 0; i < system->N; i++) { - if (workspace->bond_mark [i] != bond_mark [i]) { - fprintf (stderr, "Bond_mark atom:%d -- %d:%d \n", i, bond_mark [i], workspace->bond_mark [i]); - miscount ++; - } - } - free 
(bond_mark); - fprintf (stderr, " Bond Mark : %d \n", miscount ); - - //total_bond_order - real *total_bond_order = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "total_bond_order"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){ - fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->total_bond_order[i], total_bond_order[i]); - exit (-1); - count ++; - } - } - free (total_bond_order); - fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count); - - ////////////////////////////// - //BOND ORDERS - ////////////////////////////// - - //deltap - real *deltap= (real *) malloc ( system->N * sizeof (real)); - copy_host_device (deltap, dev_workspace->Deltap, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "deltap"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Deltap[i], deltap[i])){ - fprintf (stderr, "deltap does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Deltap[i], deltap[i]); - exit (-1); - count ++; - } - } - free (deltap); - fprintf (stderr, "Deltap mismatch count %d\n", count); - - //deltap_boc - real *deltap_boc = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (deltap_boc, dev_workspace->Deltap_boc, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "deltap_boc"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Deltap_boc[i], deltap_boc[i])){ - fprintf (stderr, "deltap_boc does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Deltap_boc[i], deltap_boc[i]); - exit (-1); - count ++; - } - } - free (deltap_boc); - fprintf (stderr, "Deltap_boc mismatch count %d\n", count); - - - rvec *dDeltap_self; - dDeltap_self = (rvec *) calloc (system->N, sizeof (rvec) ); - copy_host_device (dDeltap_self, 
dev_workspace->dDeltap_self, system->N * sizeof (rvec), cudaMemcpyDeviceToHost, "ddeltap_self"); - - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i])) - { - fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, - workspace->dDeltap_self[i][0], - workspace->dDeltap_self[i][1], - workspace->dDeltap_self[i][2], - dDeltap_self[3*i+0], - dDeltap_self[3*i+1], - dDeltap_self[3*i+2] ); - exit (-1); - count ++; - } - } - free (dDeltap_self); - fprintf (stderr, "dDeltap_self mismatch count %d\n", count); - - //Delta - real *delta = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (delta, dev_workspace->Delta, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "Delta"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Delta[i], delta[i])){ - fprintf (stderr, "delta does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Delta[i], delta[i]); - exit (-1); - count ++; - } - } - free (delta); - fprintf (stderr, "Delta mismatch count %d\n", count); - - //Delta_e - real *deltae = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (deltae, dev_workspace->Delta_e, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "Deltae"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Delta_e[i], deltae[i])){ - fprintf (stderr, "deltae does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Delta_e[i], deltae[i]); - exit (-1); - count ++; - } - } - free (deltae); - fprintf (stderr, "Delta_e mismatch count %d\n", count); - - //vlpex - real *vlpex= (real *) malloc ( system->N * sizeof (real)); - copy_host_device (vlpex, dev_workspace->vlpex, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "vlpex"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->vlpex[i], vlpex[i])){ - fprintf (stderr, "vlpex does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->vlpex[i], 
vlpex[i]); - exit (-1); - count ++; - } - } - free (vlpex); - fprintf (stderr, "vlpex mismatch count %d\n", count); - - //nlp - real *nlp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (nlp, dev_workspace->nlp, system->N * sizeof (real), - cudaMemcpyDeviceToHost, ""); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->nlp[i], nlp[i])){ - fprintf (stderr, "nlp does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->nlp[i], nlp[i]); - exit (-1); - count ++; - } - } - free (nlp); - fprintf (stderr, "nlp mismatch count %d\n", count); - - //delta_lp - real *Delta_lp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (Delta_lp , dev_workspace->Delta_lp , system->N * sizeof (real), - cudaMemcpyDeviceToHost, "Delta_lp "); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Delta_lp [i], Delta_lp [i])){ - fprintf (stderr, "Delta_lp does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Delta_lp [i], Delta_lp [i]); - exit (-1); - count ++; - } - } - free (Delta_lp ); - fprintf (stderr, "Delta_lp mismatch count %d\n", count); - - //Clp - real *Clp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (Clp, dev_workspace->Clp, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "Clp"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Clp[i], Clp[i])){ - fprintf (stderr, "Clp does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Clp[i], Clp[i]); - exit (-1); - count ++; - } - } - free (Clp); - fprintf (stderr, "Clp mismatch count %d\n", count); - - //dDelta_lp - real *dDelta_lp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (dDelta_lp, dev_workspace->dDelta_lp, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "dDelta_lp"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->dDelta_lp[i], dDelta_lp[i])){ - fprintf (stderr, "dDelta_lp does not match 
for atom %d (%4.15e %4.15e)\n", - i, workspace->dDelta_lp[i], dDelta_lp[i]); - exit (-1); - count ++; - } - } - free (dDelta_lp); - fprintf (stderr, "dDelta_lp mismatch count %d\n", count); - - //nlp_temp - real *nlp_temp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (nlp_temp, dev_workspace->nlp_temp, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "nlp_temp"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->nlp_temp[i], nlp_temp[i])){ - fprintf (stderr, "nlp_temp does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->nlp_temp[i], nlp_temp[i]); - exit (-1); - count ++; - } - } - free (nlp_temp); - fprintf (stderr, "nlp_temp mismatch count %d\n", count); - - //Delta_lp_temp - real *Delta_lp_temp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (Delta_lp_temp, dev_workspace->Delta_lp_temp, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "Delta_lp_temp"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->Delta_lp_temp[i], Delta_lp_temp[i])){ - fprintf (stderr, "Delta_lp_temp does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->Delta_lp_temp[i], Delta_lp_temp[i]); - exit (-1); - count ++; - } - } - free (Delta_lp_temp); - fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count); - - - //dDelta_lp_temp - real *dDelta_lp_temp = (real *) malloc ( system->N * sizeof (real)); - copy_host_device (dDelta_lp_temp, dev_workspace->dDelta_lp_temp, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "dDelta_lp_temp"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->dDelta_lp_temp[i], dDelta_lp_temp[i])){ - fprintf (stderr, "dDelta_lp_temp does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->dDelta_lp_temp[i], dDelta_lp_temp[i]); - exit (-1); - count ++; - } - } - free (dDelta_lp_temp); - fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count); - - ////////////////////////////// - 
//BONDS - ////////////////////////////// - - //CdDelta - real *CdDelta= (real *) malloc ( system->N * sizeof (real)); - copy_host_device (CdDelta, dev_workspace->CdDelta, system->N * sizeof (real), - cudaMemcpyDeviceToHost, "CdDelta"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->CdDelta[i], CdDelta[i])){ - fprintf (stderr, "CdDelta does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->CdDelta[i], CdDelta[i]); - exit (-1); - count ++; - } - } - free (CdDelta); - fprintf (stderr, "CdDelta mismatch count %d\n", count); - - - ////////////////////////////////// - //ATOM ENERGY - ////////////////////////////////// - - ////////////////////////////////// - //VALENCE ANGLES - ////////////////////////////////// - rvec *f= (rvec *) malloc ( system->N * sizeof (rvec)); - copy_host_device (f, dev_workspace->f, system->N * sizeof (rvec), - cudaMemcpyDeviceToHost, "f"); - count = 0; - for (int i = 0; i < system->N; i++) { - - if ( check_zero (workspace->f[i], f[i])){ - fprintf (stderr, "f does not match for atom %d (%4.15e %4.15e, %4.15e) (%4.15e %4.15e, %4.15e)\n", - i, - workspace->f[i][0], workspace->f[i][1], workspace->f[i][2], - f[i][0], f[i][1], f[i][2]); - //exit (-1); - count ++; - } - } - free (f); - fprintf (stderr, "f mismatch count %d\n", count); - - ///////////////////////////////////////////////////// - //QEq part - ///////////////////////////////////////////////////// - compare_rvec2 (workspace->d2, dev_workspace->d2, system->N, "d2"); - - compare_rvec2 (workspace->q2, dev_workspace->q2, system->N, "q2"); - - compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x"); - - compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b"); - - return SUCCESS; + int miscount; + int count, tcount; + + /////////////////////// + //INIT FORCES + /////////////////////// + + // bond_mark + int *bond_mark = (int *)malloc (sizeof (int) * system->N); + copy_host_device (bond_mark, dev_workspace->bond_mark, sizeof (int) * 
system->N, + cudaMemcpyDeviceToHost, "bond_mark"); + miscount = 0; + for (int i = 0; i < system->N; i++) { + if (workspace->bond_mark [i] != bond_mark [i]) { + fprintf (stderr, "Bond_mark atom:%d -- %d:%d \n", i, bond_mark [i], workspace->bond_mark [i]); + miscount ++; + } + } + free (bond_mark); + fprintf (stderr, " Bond Mark : %d \n", miscount ); + + //total_bond_order + real *total_bond_order = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "total_bond_order"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){ + fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->total_bond_order[i], total_bond_order[i]); + exit (-1); + count ++; + } + } + free (total_bond_order); + fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count); + + ////////////////////////////// + //BOND ORDERS + ////////////////////////////// + + //deltap + real *deltap= (real *) malloc ( system->N * sizeof (real)); + copy_host_device (deltap, dev_workspace->Deltap, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "deltap"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Deltap[i], deltap[i])){ + fprintf (stderr, "deltap does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->Deltap[i], deltap[i]); + exit (-1); + count ++; + } + } + free (deltap); + fprintf (stderr, "Deltap mismatch count %d\n", count); + + //deltap_boc + real *deltap_boc = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (deltap_boc, dev_workspace->Deltap_boc, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "deltap_boc"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Deltap_boc[i], deltap_boc[i])){ + fprintf (stderr, "deltap_boc does not match for atom %d (%4.15e %4.15e)\n", 
+ i, workspace->Deltap_boc[i], deltap_boc[i]); + exit (-1); + count ++; + } + } + free (deltap_boc); + fprintf (stderr, "Deltap_boc mismatch count %d\n", count); + + + rvec *dDeltap_self; + dDeltap_self = (rvec *) calloc (system->N, sizeof (rvec) ); + copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * sizeof (rvec), cudaMemcpyDeviceToHost, "ddeltap_self"); + + count = 0; + for (int i = 0; i < system->N; i++ ) + { + if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i])) + { + fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, + workspace->dDeltap_self[i][0], + workspace->dDeltap_self[i][1], + workspace->dDeltap_self[i][2], + dDeltap_self[3*i+0], + dDeltap_self[3*i+1], + dDeltap_self[3*i+2] ); + exit (-1); + count ++; + } + } + free (dDeltap_self); + fprintf (stderr, "dDeltap_self mismatch count %d\n", count); + + //Delta + real *delta = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (delta, dev_workspace->Delta, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "Delta"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Delta[i], delta[i])){ + fprintf (stderr, "delta does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->Delta[i], delta[i]); + exit (-1); + count ++; + } + } + free (delta); + fprintf (stderr, "Delta mismatch count %d\n", count); + + //Delta_e + real *deltae = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (deltae, dev_workspace->Delta_e, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "Deltae"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Delta_e[i], deltae[i])){ + fprintf (stderr, "deltae does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->Delta_e[i], deltae[i]); + exit (-1); + count ++; + } + } + free (deltae); + fprintf (stderr, "Delta_e mismatch count %d\n", count); + + //vlpex + real *vlpex= (real *) malloc ( system->N * sizeof (real)); + copy_host_device (vlpex, 
dev_workspace->vlpex, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "vlpex"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->vlpex[i], vlpex[i])){ + fprintf (stderr, "vlpex does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->vlpex[i], vlpex[i]); + exit (-1); + count ++; + } + } + free (vlpex); + fprintf (stderr, "vlpex mismatch count %d\n", count); + + //nlp + real *nlp = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (nlp, dev_workspace->nlp, system->N * sizeof (real), + cudaMemcpyDeviceToHost, ""); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->nlp[i], nlp[i])){ + fprintf (stderr, "nlp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->nlp[i], nlp[i]); + exit (-1); + count ++; + } + } + free (nlp); + fprintf (stderr, "nlp mismatch count %d\n", count); + + //delta_lp + real *Delta_lp = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (Delta_lp , dev_workspace->Delta_lp , system->N * sizeof (real), + cudaMemcpyDeviceToHost, "Delta_lp "); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Delta_lp [i], Delta_lp [i])){ + fprintf (stderr, "Delta_lp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->Delta_lp [i], Delta_lp [i]); + exit (-1); + count ++; + } + } + free (Delta_lp ); + fprintf (stderr, "Delta_lp mismatch count %d\n", count); + + //Clp + real *Clp = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (Clp, dev_workspace->Clp, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "Clp"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Clp[i], Clp[i])){ + fprintf (stderr, "Clp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->Clp[i], Clp[i]); + exit (-1); + count ++; + } + } + free (Clp); + fprintf (stderr, "Clp mismatch count %d\n", count); + + //dDelta_lp + real *dDelta_lp = (real *) malloc ( system->N * sizeof 
(real)); + copy_host_device (dDelta_lp, dev_workspace->dDelta_lp, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "dDelta_lp"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->dDelta_lp[i], dDelta_lp[i])){ + fprintf (stderr, "dDelta_lp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->dDelta_lp[i], dDelta_lp[i]); + exit (-1); + count ++; + } + } + free (dDelta_lp); + fprintf (stderr, "dDelta_lp mismatch count %d\n", count); + + //nlp_temp + real *nlp_temp = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (nlp_temp, dev_workspace->nlp_temp, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "nlp_temp"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->nlp_temp[i], nlp_temp[i])){ + fprintf (stderr, "nlp_temp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->nlp_temp[i], nlp_temp[i]); + exit (-1); + count ++; + } + } + free (nlp_temp); + fprintf (stderr, "nlp_temp mismatch count %d\n", count); + + //Delta_lp_temp + real *Delta_lp_temp = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (Delta_lp_temp, dev_workspace->Delta_lp_temp, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "Delta_lp_temp"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->Delta_lp_temp[i], Delta_lp_temp[i])){ + fprintf (stderr, "Delta_lp_temp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->Delta_lp_temp[i], Delta_lp_temp[i]); + exit (-1); + count ++; + } + } + free (Delta_lp_temp); + fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count); + + + //dDelta_lp_temp + real *dDelta_lp_temp = (real *) malloc ( system->N * sizeof (real)); + copy_host_device (dDelta_lp_temp, dev_workspace->dDelta_lp_temp, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "dDelta_lp_temp"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->dDelta_lp_temp[i], dDelta_lp_temp[i])){ + fprintf 
(stderr, "dDelta_lp_temp does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->dDelta_lp_temp[i], dDelta_lp_temp[i]); + exit (-1); + count ++; + } + } + free (dDelta_lp_temp); + fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count); + + ////////////////////////////// + //BONDS + ////////////////////////////// + + //CdDelta + real *CdDelta= (real *) malloc ( system->N * sizeof (real)); + copy_host_device (CdDelta, dev_workspace->CdDelta, system->N * sizeof (real), + cudaMemcpyDeviceToHost, "CdDelta"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->CdDelta[i], CdDelta[i])){ + fprintf (stderr, "CdDelta does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->CdDelta[i], CdDelta[i]); + exit (-1); + count ++; + } + } + free (CdDelta); + fprintf (stderr, "CdDelta mismatch count %d\n", count); + + + ////////////////////////////////// + //ATOM ENERGY + ////////////////////////////////// + + ////////////////////////////////// + //VALENCE ANGLES + ////////////////////////////////// + rvec *f= (rvec *) malloc ( system->N * sizeof (rvec)); + copy_host_device (f, dev_workspace->f, system->N * sizeof (rvec), + cudaMemcpyDeviceToHost, "f"); + count = 0; + for (int i = 0; i < system->N; i++) { + + if ( check_zero (workspace->f[i], f[i])){ + fprintf (stderr, "f does not match for atom %d (%4.15e %4.15e, %4.15e) (%4.15e %4.15e, %4.15e)\n", + i, + workspace->f[i][0], workspace->f[i][1], workspace->f[i][2], + f[i][0], f[i][1], f[i][2]); + //exit (-1); + count ++; + } + } + free (f); + fprintf (stderr, "f mismatch count %d\n", count); + + ///////////////////////////////////////////////////// + //QEq part + ///////////////////////////////////////////////////// + compare_rvec2 (workspace->d2, dev_workspace->d2, system->N, "d2"); + + compare_rvec2 (workspace->q2, dev_workspace->q2, system->N, "q2"); + + compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x"); + + compare_rvec2 (workspace->b, dev_workspace->b, system->N, 
"b"); + + return SUCCESS; } void compare_rvec2( rvec2 *host, rvec2 *device, int N, char *msg) { - int count = 0; - int miscount = 0; - rvec2 *tmp = (rvec2 *) host_scratch; - copy_host_device (tmp, device, sizeof (rvec2) * N, cudaMemcpyDeviceToHost, msg); - - for (int i = 0; i < N; i++) - { - if (check_zero_rvec2 (host [i], tmp [i])) { - fprintf (stderr, " %s does not match at index: %d (%f %f) - (%f %f) \n", - msg, i, host[i][0], host[i][1], tmp[i][0], tmp[i][1] ); - // exit (-1); - miscount ++; - } - count ++; - } - fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount); + int count = 0; + int miscount = 0; + rvec2 *tmp = (rvec2 *) host_scratch; + copy_host_device (tmp, device, sizeof (rvec2) * N, cudaMemcpyDeviceToHost, msg); + + for (int i = 0; i < N; i++) + { + if (check_zero_rvec2 (host [i], tmp [i])) { + fprintf (stderr, " %s does not match at index: %d (%f %f) - (%f %f) \n", + msg, i, host[i][0], host[i][1], tmp[i][0], tmp[i][1] ); + // exit (-1); + miscount ++; + } + count ++; + } + fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount); } void compare_array( real *host, real *device, int N, char *msg) { - int count = 0; - int miscount = 0; - real *tmp = (real *) host_scratch; - copy_host_device (tmp, device, sizeof (real) * N, cudaMemcpyDeviceToHost, msg); - - for (int i = 0; i < N; i++) - { - if (check_zero (host [i], tmp [i])) { - fprintf (stderr, " %s does not match at index: %d (%f) - (%f) \n", - msg, i, host[i], tmp[i] ); - // exit (-1); - miscount ++; - } - count ++; - } - fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount); + int count = 0; + int miscount = 0; + real *tmp = (real *) host_scratch; + copy_host_device (tmp, device, sizeof (real) * N, cudaMemcpyDeviceToHost, msg); + + for (int i = 0; i < N; i++) + { + if (check_zero (host [i], tmp [i])) { + fprintf (stderr, " %s does not match at index: %d (%f) - (%f) \n", + msg, i, host[i], tmp[i] ); + 
// exit (-1); + miscount ++; + } + count ++; + } + fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount); } int validate_data (reax_system *system, simulation_data *host) { - simulation_data device; - - copy_host_device (&device, host->d_simulation_data, sizeof (simulation_data), - cudaMemcpyDeviceToHost, "simulation_data"); - - if (check_zero (host->my_en.e_bond, device.my_en.e_bond)){ - fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->my_en.e_bond, device.my_en.e_bond); - exit (-1); - } - - if (check_zero (host->my_en.e_lp, device.my_en.e_lp)){ - fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->my_en.e_lp, device.my_en.e_lp); - exit (-1); - } - - if (check_zero (host->my_en.e_ov, device.my_en.e_ov)){ - fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->my_en.e_ov, device.my_en.e_ov); - exit (-1); - } - - if (check_zero (host->my_en.e_un, device.my_en.e_un)){ - fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->my_en.e_un, device.my_en.e_un); - exit (-1); - } - - if (check_zero (host->my_en.e_tor, device.my_en.e_tor)) { - fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->my_en.e_tor, device.my_en.e_tor); - exit (-1); - } - - if (check_zero (host->my_en.e_con, device.my_en.e_con)) { - fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->my_en.e_con, device.my_en.e_con); - exit (-1); - } - - fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb); - if (check_zero (host->my_en.e_hb, device.my_en.e_hb)) { - fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb); - exit (-1); - } - - if (check_zero (host->my_en.e_ang, device.my_en.e_ang)) { - fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->my_en.e_ang, device.my_en.e_ang); - exit (-1); - } - - if (check_zero (host->my_en.e_pen, device.my_en.e_pen)) { - fprintf (stderr, "E_Pen does not match (%4.10e 
%4.10e) \n", host->my_en.e_pen, device.my_en.e_pen); - exit (-1); - } - - if (check_zero (host->my_en.e_coa, device.my_en.e_coa)) { - fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->my_en.e_coa, device.my_en.e_coa); - exit (-1); - } - - if (check_zero (host->my_en.e_vdW, device.my_en.e_vdW)) { - fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->my_en.e_vdW, device.my_en.e_vdW); - exit (-1); - } - - if (check_zero (host->my_en.e_pol, device.my_en.e_pol)) { - fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->my_en.e_pol, device.my_en.e_pol); - //exit (-1); - } - - if (check_zero (host->my_en.e_kin, device.my_en.e_kin)) { - fprintf (stderr, "E_Kin does not match (%4.10e %4.10e) \n", host->my_en.e_kin, device.my_en.e_kin); - //exit (-1); - } - - if (check_zero (host->my_en.e_ele, device.my_en.e_ele)) { - fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->my_en.e_ele, device.my_en.e_ele); - //exit (-1); - } - - fprintf (stderr, "Simulation Data match between host and device \n"); - return SUCCESS; + simulation_data device; + + copy_host_device (&device, host->d_simulation_data, sizeof (simulation_data), + cudaMemcpyDeviceToHost, "simulation_data"); + + if (check_zero (host->my_en.e_bond, device.my_en.e_bond)){ + fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->my_en.e_bond, device.my_en.e_bond); + exit (-1); + } + + if (check_zero (host->my_en.e_lp, device.my_en.e_lp)){ + fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->my_en.e_lp, device.my_en.e_lp); + exit (-1); + } + + if (check_zero (host->my_en.e_ov, device.my_en.e_ov)){ + fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->my_en.e_ov, device.my_en.e_ov); + exit (-1); + } + + if (check_zero (host->my_en.e_un, device.my_en.e_un)){ + fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->my_en.e_un, device.my_en.e_un); + exit (-1); + } + + if (check_zero (host->my_en.e_tor, device.my_en.e_tor)) { + 
fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->my_en.e_tor, device.my_en.e_tor); + exit (-1); + } + + if (check_zero (host->my_en.e_con, device.my_en.e_con)) { + fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->my_en.e_con, device.my_en.e_con); + exit (-1); + } + + fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb); + if (check_zero (host->my_en.e_hb, device.my_en.e_hb)) { + fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb); + exit (-1); + } + + if (check_zero (host->my_en.e_ang, device.my_en.e_ang)) { + fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->my_en.e_ang, device.my_en.e_ang); + exit (-1); + } + + if (check_zero (host->my_en.e_pen, device.my_en.e_pen)) { + fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->my_en.e_pen, device.my_en.e_pen); + exit (-1); + } + + if (check_zero (host->my_en.e_coa, device.my_en.e_coa)) { + fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->my_en.e_coa, device.my_en.e_coa); + exit (-1); + } + + if (check_zero (host->my_en.e_vdW, device.my_en.e_vdW)) { + fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->my_en.e_vdW, device.my_en.e_vdW); + exit (-1); + } + + if (check_zero (host->my_en.e_pol, device.my_en.e_pol)) { + fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->my_en.e_pol, device.my_en.e_pol); + //exit (-1); + } + + if (check_zero (host->my_en.e_kin, device.my_en.e_kin)) { + fprintf (stderr, "E_Kin does not match (%4.10e %4.10e) \n", host->my_en.e_kin, device.my_en.e_kin); + //exit (-1); + } + + if (check_zero (host->my_en.e_ele, device.my_en.e_ele)) { + fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->my_en.e_ele, device.my_en.e_ele); + //exit (-1); + } + + fprintf (stderr, "Simulation Data match between host and device \n"); + return SUCCESS; } int validate_grid (reax_system *system) { - int x,i, j, 
k,l, itr; //, tmp, tested; - int itr_nbr,itr_11, miscount; - ivec src, dest; - grid *g; - grid_cell *gci, *gcj, *gcj_nbr; - int found = 0; - - int *tmp = (int *) host_scratch; - int total; - - g = &( system->my_grid ); - miscount = 0; - - total = g->ncells[0] * g->ncells[1] * g->ncells[2]; - - copy_host_device (tmp, system->d_my_grid.str, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:str"); - copy_host_device (tmp + total, system->d_my_grid.end, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:end"); - - real *cutoff = (real *) (tmp + 2 * total); - copy_host_device (cutoff, system->d_my_grid.cutoff, sizeof (real) * total, cudaMemcpyDeviceToHost, "grid:cutoff"); - - for( i = 0; i < g->ncells[0]; i++ ) - for( j = 0; j < g->ncells[1]; j++ ) - for( k = 0; k < g->ncells[2]; k++ ) - { - if ((g->str [index_grid_3d (i, j, k, g)] != tmp [index_grid_3d (i, j, k, g)]) || - (g->end [index_grid_3d (i, j, k, g)] != tmp[total + index_grid_3d (i, j, k, g)]) || - (cutoff [index_grid_3d (i, j, k, g)] != g->cutoff [index_grid_3d (i, j, k, g)])) - { - fprintf (stderr, "we have a problem here \n"); - exit (0); - } - /* - fprintf (stderr, " %d %d %d - str: %d end: %d (%d %d) ( %f %f)\n", - i, j, k, g->str [index_grid_3d (i, j, k, g)], g->end [index_grid_3d (i, j, k, g)], - tmp [index_grid_3d (i, j, k, g)], tmp[total + index_grid_3d (i, j, k, g)], - cutoff [index_grid_3d (i, j, k, g)], g->cutoff [index_grid_3d (i, j, k, g)]); - */ - } - - rvec *tmpvec = (rvec *) host_scratch; - copy_host_device (tmpvec, system->d_my_grid.nbrs_cp, sizeof (rvec) * total * g->max_nbrs, - cudaMemcpyDeviceToHost, "grid:nbrs_cp"); - - ivec *tivec = (ivec *) (((rvec *)host_scratch) + total * g->max_nbrs); - copy_host_device (tivec, system->d_my_grid.nbrs_x, sizeof (ivec) * total * g->max_nbrs, - cudaMemcpyDeviceToHost, "grid:nbrs_x"); - - - for( i = 0; i < g->ncells[0]; i++ ) - for( j = 0; j < g->ncells[1]; j++ ) - for( k = 0; k < g->ncells[2]; k++ ) - for (l = 0; l < g->max_nbrs; l++) { - - if (( 
g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] != tmpvec[index_grid_nbrs(i, j, k, l, g)][0]) || - (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] != tmpvec[index_grid_nbrs(i, j, k, l, g)][1]) || - (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] != tmpvec[index_grid_nbrs(i, j, k, l, g)][2]) || - (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0] != tivec[index_grid_nbrs(i, j, k, l, g)][0]) || - (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1] != tivec[index_grid_nbrs(i, j, k, l, g)][1]) || - (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2] != tivec[index_grid_nbrs(i, j, k, l, g)][2] )) - { - fprintf (stderr, "we have a big problem here \n"); - exit (0); - } - - if ((g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] > NEG_INF) && - (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] > NEG_INF) && - (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] > NEG_INF) ) - ;/* - fprintf (stderr, "%d %d %d %d ---- %d %d %d - %d %d %d \n", - //fprintf (stderr, "%d %d %d %d ---- (%3.2f %3.2f %3.2f) - (%3.2f %3.2f %3.2f) \n", - i, j, k, l, - g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0], - g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1], - g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2], - tivec[index_grid_nbrs(i, j, k, l, g)][0], - tivec[index_grid_nbrs(i, j, k, l, g)][1], - tivec[index_grid_nbrs(i, j, k, l, g)][2] - ); - */ - } - - return 0; - - // for( i = 0; i < g->ncells[0]; i++ ) - // for( j = 0; j < g->ncells[1]; j++ ) - // for( k = 0; k < g->ncells[2]; k++ ) - // { - // gci = &(g->cells[ index_grid_3d (i, j, k, g) ]); - // //for (x = 0; x < g->max_nbrs; x++) - // // fprintf (stderr, "(%d, %d, %d) - (%d, %d, %d) \n", - // // i, j, k, - // // gci->nbrs_x[x][0], - // // gci->nbrs_x[x][1], - // // gci->nbrs_x[x][2] ); - // //exit (0); - // - // itr = 0; - // while( (gcj=gci->nbrs[itr]) != NULL ) - // { - // //iterate through the neighbors of gcj and find (i, j, k) - // itr_nbr = 0; - // found = 0; - // while ( (gcj_nbr=gcj->nbrs [itr_nbr]) != NULL ) - // { - // ivec_Copy (dest, gcj_nbr->nbrs_x[itr_nbr] ); - // 
- // if ( (i == dest[0]) && (j == dest[1]) && (k == dest[2])) - // { - // found = 1; - // break; - // } - // itr_nbr ++; - // } - // - // if (found == 0) { - // fprintf (stderr, "we have a problem here: (%d, %d, %d): (%d, %d, %d) type: (%d, %d) \n", - // i, j, k, - // gci->nbrs_x[itr][0], - // gci->nbrs_x[itr][1], - // gci->nbrs_x[itr][2], - // gci->type, - // gcj->type); - // itr_11 = 0; - // while ( (gcj_nbr=gcj->nbrs [itr_11]) != NULL ) - // { - // ivec_Copy (dest, gcj_nbr->nbrs_x[itr_11] ); - // fprintf (stderr, "%d, %d, %d \n", dest[0], dest[1], dest[2]); - // itr_11 ++; - // } - // exit (0); - // miscount ++; - // } - // - // itr ++; - // } - // } - // - // fprintf (stderr, " cell miscount: %d \n", miscount); + int x,i, j, k,l, itr; //, tmp, tested; + int itr_nbr,itr_11, miscount; + ivec src, dest; + grid *g; + grid_cell *gci, *gcj, *gcj_nbr; + int found = 0; + + int *tmp = (int *) host_scratch; + int total; + + g = &( system->my_grid ); + miscount = 0; + + total = g->ncells[0] * g->ncells[1] * g->ncells[2]; + + copy_host_device (tmp, system->d_my_grid.str, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:str"); + copy_host_device (tmp + total, system->d_my_grid.end, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:end"); + + real *cutoff = (real *) (tmp + 2 * total); + copy_host_device (cutoff, system->d_my_grid.cutoff, sizeof (real) * total, cudaMemcpyDeviceToHost, "grid:cutoff"); + + for( i = 0; i < g->ncells[0]; i++ ) + for( j = 0; j < g->ncells[1]; j++ ) + for( k = 0; k < g->ncells[2]; k++ ) + { + if ((g->str [index_grid_3d (i, j, k, g)] != tmp [index_grid_3d (i, j, k, g)]) || + (g->end [index_grid_3d (i, j, k, g)] != tmp[total + index_grid_3d (i, j, k, g)]) || + (cutoff [index_grid_3d (i, j, k, g)] != g->cutoff [index_grid_3d (i, j, k, g)])) + { + fprintf (stderr, "we have a problem here \n"); + exit (0); + } + /* + fprintf (stderr, " %d %d %d - str: %d end: %d (%d %d) ( %f %f)\n", + i, j, k, g->str [index_grid_3d (i, j, k, g)], g->end 
[index_grid_3d (i, j, k, g)], + tmp [index_grid_3d (i, j, k, g)], tmp[total + index_grid_3d (i, j, k, g)], + cutoff [index_grid_3d (i, j, k, g)], g->cutoff [index_grid_3d (i, j, k, g)]); + */ + } + + rvec *tmpvec = (rvec *) host_scratch; + copy_host_device (tmpvec, system->d_my_grid.nbrs_cp, sizeof (rvec) * total * g->max_nbrs, + cudaMemcpyDeviceToHost, "grid:nbrs_cp"); + + ivec *tivec = (ivec *) (((rvec *)host_scratch) + total * g->max_nbrs); + copy_host_device (tivec, system->d_my_grid.nbrs_x, sizeof (ivec) * total * g->max_nbrs, + cudaMemcpyDeviceToHost, "grid:nbrs_x"); + + + for( i = 0; i < g->ncells[0]; i++ ) + for( j = 0; j < g->ncells[1]; j++ ) + for( k = 0; k < g->ncells[2]; k++ ) + for (l = 0; l < g->max_nbrs; l++) { + + if (( g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] != tmpvec[index_grid_nbrs(i, j, k, l, g)][0]) || + (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] != tmpvec[index_grid_nbrs(i, j, k, l, g)][1]) || + (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] != tmpvec[index_grid_nbrs(i, j, k, l, g)][2]) || + (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0] != tivec[index_grid_nbrs(i, j, k, l, g)][0]) || + (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1] != tivec[index_grid_nbrs(i, j, k, l, g)][1]) || + (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2] != tivec[index_grid_nbrs(i, j, k, l, g)][2] )) + { + fprintf (stderr, "we have a big problem here \n"); + exit (0); + } + + if ((g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] > NEG_INF) && + (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] > NEG_INF) && + (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] > NEG_INF) ) + ;/* + fprintf (stderr, "%d %d %d %d ---- %d %d %d - %d %d %d \n", + //fprintf (stderr, "%d %d %d %d ---- (%3.2f %3.2f %3.2f) - (%3.2f %3.2f %3.2f) \n", + i, j, k, l, + g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0], + g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1], + g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2], + tivec[index_grid_nbrs(i, j, k, l, g)][0], + tivec[index_grid_nbrs(i, j, k, l, g)][1], + 
tivec[index_grid_nbrs(i, j, k, l, g)][2] + ); + */ + } + + return 0; + + // for( i = 0; i < g->ncells[0]; i++ ) + // for( j = 0; j < g->ncells[1]; j++ ) + // for( k = 0; k < g->ncells[2]; k++ ) + // { + // gci = &(g->cells[ index_grid_3d (i, j, k, g) ]); + // //for (x = 0; x < g->max_nbrs; x++) + // // fprintf (stderr, "(%d, %d, %d) - (%d, %d, %d) \n", + // // i, j, k, + // // gci->nbrs_x[x][0], + // // gci->nbrs_x[x][1], + // // gci->nbrs_x[x][2] ); + // //exit (0); + // + // itr = 0; + // while( (gcj=gci->nbrs[itr]) != NULL ) + // { + // //iterate through the neighbors of gcj and find (i, j, k) + // itr_nbr = 0; + // found = 0; + // while ( (gcj_nbr=gcj->nbrs [itr_nbr]) != NULL ) + // { + // ivec_Copy (dest, gcj_nbr->nbrs_x[itr_nbr] ); + // + // if ( (i == dest[0]) && (j == dest[1]) && (k == dest[2])) + // { + // found = 1; + // break; + // } + // itr_nbr ++; + // } + // + // if (found == 0) { + // fprintf (stderr, "we have a problem here: (%d, %d, %d): (%d, %d, %d) type: (%d, %d) \n", + // i, j, k, + // gci->nbrs_x[itr][0], + // gci->nbrs_x[itr][1], + // gci->nbrs_x[itr][2], + // gci->type, + // gcj->type); + // itr_11 = 0; + // while ( (gcj_nbr=gcj->nbrs [itr_11]) != NULL ) + // { + // ivec_Copy (dest, gcj_nbr->nbrs_x[itr_11] ); + // fprintf (stderr, "%d, %d, %d \n", dest[0], dest[1], dest[2]); + // itr_11 ++; + // } + // exit (0); + // miscount ++; + // } + // + // itr ++; + // } + // } + // + // fprintf (stderr, " cell miscount: %d \n", miscount); } int validate_three_bodies (reax_system *system, storage *workspace, reax_list **lists) { - reax_list *three = *lists + THREE_BODIES; - reax_list *bonds = *lists + BONDS; - - reax_list *d_three = *dev_lists + THREE_BODIES; - reax_list *d_bonds = *dev_lists + BONDS; - bond_data *d_bond_data; - real *test; - - three_body_interaction_data *data = (three_body_interaction_data *) - malloc ( sizeof (three_body_interaction_data) * d_three->num_intrs); - int *start = (int *) malloc (sizeof (int) * d_three->n); - int *end = 
(int *) malloc (sizeof (int) * d_three->n); - - int *b_start = (int *) malloc (sizeof (int) * d_bonds->n); - int *b_end = (int *) malloc (sizeof (int) * d_bonds->n); - int count; - int hcount, dcount; - - - copy_host_device ( start, d_three->index, - sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:start"); - copy_host_device ( end, d_three->end_index, - sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:end"); - copy_host_device ( data, d_three->select.three_body_list, - sizeof (three_body_interaction_data) * d_three->num_intrs, - cudaMemcpyDeviceToHost, "three:data"); - - d_bond_data = (bond_data *) malloc (sizeof (bond_data)* d_bonds->num_intrs); - - copy_host_device ( b_start, d_bonds->index, - sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:start"); - copy_host_device ( b_end, d_bonds->end_index, - sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:end"); - copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, - cudaMemcpyDeviceToHost, "bonds:data"); - - count = 0; - hcount = dcount = 0; - for (int i = 0; i < system->N; i++) - { - - int x, y, z; - for (x = b_start[i]; x < b_end[i]; x++) - { - int t_start = start[x]; - int t_end = end[x]; - - bond_data *dev_bond = &d_bond_data [x]; - bond_data *host_bond; - for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++) - { - host_bond = &bonds->select.bond_list [z]; - if ((dev_bond->nbr == host_bond->nbr) && - check_same (dev_bond->rel_box, host_bond->rel_box) && - !check_zero (dev_bond->dvec, host_bond->dvec) && - !check_zero (dev_bond->d, host_bond->d) ) - { - break; - } - } - if (z >= End_Index (i, bonds)){ - fprintf (stderr, "Could not find the matching bond on host and device \n"); - exit (-1); - } - - dcount += end[x] - start[x]; - hcount += Num_Entries (z, three); - - if ((end[x] - start[x]) != (End_Index (z, three) - Start_Index (z, three))) - { - count ++; - /* - fprintf (stderr, " Three body count does not match between 
host and device\n"); - fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (z, three), End_Index (z, three)); - fprintf (stderr, " atom: %d - bond: %d Device count: (%d, %d)\n", i, x, start[x], end[x]); - */ - } - } - - /* - if ((dcount != hcount)) { - - fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); - - for (int j = b_start[i]; j < b_end[i]; j ++) { - bond_order_data *src = &d_bond_data[j].bo_data; - dcount = end[j] - start[j]; - hcount = Num_Entries (j, three); - fprintf (stderr, "device \n"); - print_bond_data (src); - - fprintf (stderr, "\n"); - src = &bonds->select.bond_list[j].bo_data; - fprintf (stderr, "host \n"); - print_bond_data (src); - fprintf (stderr, "\n"); + reax_list *three = *lists + THREE_BODIES; + reax_list *bonds = *lists + BONDS; + + reax_list *d_three = *dev_lists + THREE_BODIES; + reax_list *d_bonds = *dev_lists + BONDS; + bond_data *d_bond_data; + real *test; + + three_body_interaction_data *data = (three_body_interaction_data *) + malloc ( sizeof (three_body_interaction_data) * d_three->num_intrs); + int *start = (int *) malloc (sizeof (int) * d_three->n); + int *end = (int *) malloc (sizeof (int) * d_three->n); + + int *b_start = (int *) malloc (sizeof (int) * d_bonds->n); + int *b_end = (int *) malloc (sizeof (int) * d_bonds->n); + int count; + int hcount, dcount; + + + copy_host_device ( start, d_three->index, + sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:start"); + copy_host_device ( end, d_three->end_index, + sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:end"); + copy_host_device ( data, d_three->select.three_body_list, + sizeof (three_body_interaction_data) * d_three->num_intrs, + cudaMemcpyDeviceToHost, "three:data"); + + d_bond_data = (bond_data *) malloc (sizeof (bond_data)* d_bonds->num_intrs); + + copy_host_device ( b_start, d_bonds->index, + sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:start"); + copy_host_device ( b_end, 
d_bonds->end_index, + sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:end"); + copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, + cudaMemcpyDeviceToHost, "bonds:data"); + + count = 0; + hcount = dcount = 0; + for (int i = 0; i < system->N; i++) + { + + int x, y, z; + for (x = b_start[i]; x < b_end[i]; x++) + { + int t_start = start[x]; + int t_end = end[x]; + + bond_data *dev_bond = &d_bond_data [x]; + bond_data *host_bond; + for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++) + { + host_bond = &bonds->select.bond_list [z]; + if ((dev_bond->nbr == host_bond->nbr) && + check_same (dev_bond->rel_box, host_bond->rel_box) && + !check_zero (dev_bond->dvec, host_bond->dvec) && + !check_zero (dev_bond->d, host_bond->d) ) + { + break; + } + } + if (z >= End_Index (i, bonds)){ + fprintf (stderr, "Could not find the matching bond on host and device \n"); + exit (-1); + } + + dcount += end[x] - start[x]; + hcount += Num_Entries (z, three); + + if ((end[x] - start[x]) != (End_Index (z, three) - Start_Index (z, three))) + { + count ++; + /* + fprintf (stderr, " Three body count does not match between host and device\n"); + fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (z, three), End_Index (z, three)); + fprintf (stderr, " atom: %d - bond: %d Device count: (%d, %d)\n", i, x, start[x], end[x]); + */ + } + } + + /* + if ((dcount != hcount)) { + + fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); + + for (int j = b_start[i]; j < b_end[i]; j ++) { + bond_order_data *src = &d_bond_data[j].bo_data; + dcount = end[j] - start[j]; + hcount = Num_Entries (j, three); + fprintf (stderr, "device \n"); + print_bond_data (src); + + fprintf (stderr, "\n"); + src = &bonds->select.bond_list[j].bo_data; + fprintf (stderr, "host \n"); + print_bond_data (src); + fprintf (stderr, "\n"); //fprintf (stderr, "--- Device bo is %f \n", test[j]); fprintf (stderr, "Device %d %d 
bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i], @@ -1467,65 +1467,65 @@ fprintf (stderr, "------\n"); fprintf (stderr, " Three Bodies count does not match between host and device \n"); exit (-1); } - */ + */ } fprintf (stderr, "Three body count on DEVICE %d HOST %d -- miscount: %d\n", dcount, hcount, count); count = 0; for (int i = 0; i < system->N; i++) { - int x, y, z; - for (x = b_start[i]; x < b_end[i]; x++) - { - int t_start = start[x]; - int t_end = end[x]; - - bond_data *dev_bond = &d_bond_data [x]; - bond_data *host_bond; - for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++) - { - host_bond = &bonds->select.bond_list [z]; - if ((dev_bond->nbr == host_bond->nbr) && - check_same (dev_bond->rel_box, host_bond->rel_box) && - !check_zero (dev_bond->dvec, host_bond->dvec) && - !check_zero (dev_bond->d, host_bond->d) ) - { - break; - } - } - if (z >= End_Index (i, bonds)){ - fprintf (stderr, "Could not find the matching bond on host and device \n"); - exit (-1); - } - - //find this three-body in the bonds on the host side. 
- for (y = t_start; y < t_end; y++) - { - three_body_interaction_data *device = data + y; - three_body_interaction_data *host; - - //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb); - - int xx; - for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++) - { - host = &three->select.three_body_list [xx]; - //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb); - //if ((host->thb == device->thb) && (host->pthb == device->pthb)) - if ((host->thb == device->thb) && !check_zero (host->theta, device->theta)) - { - count ++; - break; - } - } - - if ( xx >= End_Index (z, three) ) { - fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, - Start_Index (z, three), End_Index (z, three), start[x], end[x] ); - exit (-1); - }// else fprintf (stderr, "----------------- \n"); - } - } + int x, y, z; + for (x = b_start[i]; x < b_end[i]; x++) + { + int t_start = start[x]; + int t_end = end[x]; + + bond_data *dev_bond = &d_bond_data [x]; + bond_data *host_bond; + for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++) + { + host_bond = &bonds->select.bond_list [z]; + if ((dev_bond->nbr == host_bond->nbr) && + check_same (dev_bond->rel_box, host_bond->rel_box) && + !check_zero (dev_bond->dvec, host_bond->dvec) && + !check_zero (dev_bond->d, host_bond->d) ) + { + break; + } + } + if (z >= End_Index (i, bonds)){ + fprintf (stderr, "Could not find the matching bond on host and device \n"); + exit (-1); + } + + //find this three-body in the bonds on the host side. 
+ for (y = t_start; y < t_end; y++) + { + three_body_interaction_data *device = data + y; + three_body_interaction_data *host; + + //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb); + + int xx; + for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++) + { + host = &three->select.three_body_list [xx]; + //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb); + //if ((host->thb == device->thb) && (host->pthb == device->pthb)) + if ((host->thb == device->thb) && !check_zero (host->theta, device->theta)) + { + count ++; + break; + } + } + + if ( xx >= End_Index (z, three) ) { + fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, + Start_Index (z, three), End_Index (z, three), start[x], end[x] ); + exit (-1); + }// else fprintf (stderr, "----------------- \n"); + } + } } free (data); free (start); @@ -1542,170 +1542,170 @@ return SUCCESS; int validate_atoms (reax_system *system, reax_list **lists) { - int start, end, index, count, miscount; - reax_atom *test = (reax_atom *) malloc (sizeof (reax_atom)* system->N); - copy_host_device (test, system->d_my_atoms, sizeof (reax_atom) * system->N, cudaMemcpyDeviceToHost, "atoms"); - - /* - for (int i = system->n; i < system->n + 10; i++) - { - fprintf (stderr, " Atom: %d HIndex: %d \n", i, test[i].Hindex); - } - */ - - count = miscount = 0; - for (int i = 0; i < system->N; i++) - { - if (test[i].type != system->my_atoms[i].type) { - fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->my_atoms[i].type, test[i].type, i); - exit (-1); - } - - if ( check_zero (test[i].x, system->my_atoms[i].x) ) - { - fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i, - system->my_atoms[i].x[0], system->my_atoms[i].x[1], system->my_atoms[i].x[2], - test[i].x[0], test[i].x[1], test[i].x[2] ); - miscount ++; - exit (-1); - } - if ( check_zero (test[i].v, system->my_atoms[i].v) ) - { - fprintf (stderr, "Atom :%d v 
--> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, - system->my_atoms[i].v[0], system->my_atoms[i].v[1], system->my_atoms[i].v[2], - test[i].v[0], test[i].v[1], test[i].v[2] ); - miscount ++; - exit (-1); - } - if ( check_zero (test[i].f, system->my_atoms[i].f) ) - { - fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, - system->my_atoms[i].f[0], system->my_atoms[i].f[1], system->my_atoms[i].f[2], - test[i].f[0], test[i].f[1], test[i].f[2] ); - miscount ++; - exit (-1); - } - - if ( check_zero (test[i].q, system->my_atoms[i].q) ) - { - fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i, - system->my_atoms[i].q, test[i].q ); - miscount ++; - exit (-1); - } - - count ++; - } - - fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount); - - free (test); - return true; + int start, end, index, count, miscount; + reax_atom *test = (reax_atom *) malloc (sizeof (reax_atom)* system->N); + copy_host_device (test, system->d_my_atoms, sizeof (reax_atom) * system->N, cudaMemcpyDeviceToHost, "atoms"); + + /* + for (int i = system->n; i < system->n + 10; i++) + { + fprintf (stderr, " Atom: %d HIndex: %d \n", i, test[i].Hindex); + } + */ + + count = miscount = 0; + for (int i = 0; i < system->N; i++) + { + if (test[i].type != system->my_atoms[i].type) { + fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->my_atoms[i].type, test[i].type, i); + exit (-1); + } + + if ( check_zero (test[i].x, system->my_atoms[i].x) ) + { + fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i, + system->my_atoms[i].x[0], system->my_atoms[i].x[1], system->my_atoms[i].x[2], + test[i].x[0], test[i].x[1], test[i].x[2] ); + miscount ++; + exit (-1); + } + if ( check_zero (test[i].v, system->my_atoms[i].v) ) + { + fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, + 
system->my_atoms[i].v[0], system->my_atoms[i].v[1], system->my_atoms[i].v[2], + test[i].v[0], test[i].v[1], test[i].v[2] ); + miscount ++; + exit (-1); + } + if ( check_zero (test[i].f, system->my_atoms[i].f) ) + { + fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, + system->my_atoms[i].f[0], system->my_atoms[i].f[1], system->my_atoms[i].f[2], + test[i].f[0], test[i].f[1], test[i].f[2] ); + miscount ++; + exit (-1); + } + + if ( check_zero (test[i].q, system->my_atoms[i].q) ) + { + fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i, + system->my_atoms[i].q, test[i].q ); + miscount ++; + exit (-1); + } + + count ++; + } + + fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount); + + free (test); + return true; } int print_sparse_matrix (sparse_matrix *H) { - sparse_matrix test; - int index, count; - - test.start = (int *) malloc (sizeof (int) * (H->cap)); - test.end = (int *) malloc (sizeof (int) * (H->cap)); - - test.entries = (sparse_matrix_entry *) malloc (sizeof (sparse_matrix_entry) * (H->m)); - memset (test.entries, 0xFF, sizeof (sparse_matrix_entry) * H->m); - - copy_host_device ( test.entries, dev_workspace->H.entries, - sizeof (sparse_matrix_entry) * H->m, cudaMemcpyDeviceToHost, "H:m"); - copy_host_device ( test.start, dev_workspace->H.start, sizeof (int)* (H->cap), cudaMemcpyDeviceToHost, "H:start"); - copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (H->cap), cudaMemcpyDeviceToHost, "H:end"); - - count = 0; - for (int i = 0; i < 1; i++) { - for (int j = test.start[i]; j < test.end[i]; j++) { - sparse_matrix_entry *src = &test.entries[j]; - fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val); - } - } - fprintf (stderr, "--------------- "); - - free (test.start); - free (test.end); - free (test.entries); - - return SUCCESS; + sparse_matrix test; + int index, count; + + test.start = (int *) malloc (sizeof (int) * 
(H->cap)); + test.end = (int *) malloc (sizeof (int) * (H->cap)); + + test.entries = (sparse_matrix_entry *) malloc (sizeof (sparse_matrix_entry) * (H->m)); + memset (test.entries, 0xFF, sizeof (sparse_matrix_entry) * H->m); + + copy_host_device ( test.entries, dev_workspace->H.entries, + sizeof (sparse_matrix_entry) * H->m, cudaMemcpyDeviceToHost, "H:m"); + copy_host_device ( test.start, dev_workspace->H.start, sizeof (int)* (H->cap), cudaMemcpyDeviceToHost, "H:start"); + copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (H->cap), cudaMemcpyDeviceToHost, "H:end"); + + count = 0; + for (int i = 0; i < 1; i++) { + for (int j = test.start[i]; j < test.end[i]; j++) { + sparse_matrix_entry *src = &test.entries[j]; + fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val); + } + } + fprintf (stderr, "--------------- "); + + free (test.start); + free (test.end); + free (test.entries); + + return SUCCESS; } int print_sparse_matrix_host (sparse_matrix *H) { - int index, count; - - count = 0; - for (int i = 0; i < 1; i++) { - for (int j = H->start[i]; j < H->end[i]; j++) { - sparse_matrix_entry *src = &H->entries[j]; - fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val); - } - } - fprintf (stderr, "--------------- "); - return SUCCESS; + int index, count; + + count = 0; + for (int i = 0; i < 1; i++) { + for (int j = H->start[i]; j < H->end[i]; j++) { + sparse_matrix_entry *src = &H->entries[j]; + fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val); + } + } + fprintf (stderr, "--------------- "); + return SUCCESS; } int print_host_rvec2 (rvec2 *a, int n) { - for (int i = 0; i < n; i++) - fprintf (stderr, "a[%f][%f] \n", a[i][0], a[i][1]); - fprintf (stderr, " ---------------------------------\n"); + for (int i = 0; i < n; i++) + fprintf (stderr, "a[%f][%f] \n", a[i][0], a[i][1]); + fprintf (stderr, " ---------------------------------\n"); - return SUCCESS; + return SUCCESS; } int print_device_rvec2 (rvec2 *b, int n) { - rvec2 *a = (rvec2 *) 
host_scratch; + rvec2 *a = (rvec2 *) host_scratch; - copy_host_device (a, b, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, "rvec2"); + copy_host_device (a, b, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, "rvec2"); - return print_host_rvec2 (a, n); + return print_host_rvec2 (a, n); } int print_host_array (real *a, int n) { - for (int i = 0; i < n; i++) - fprintf (stderr," a[%d] = %f \n", i, a[i]); - fprintf(stderr, " ----------------------------------\n"); - return SUCCESS; + for (int i = 0; i < n; i++) + fprintf (stderr," a[%d] = %f \n", i, a[i]); + fprintf(stderr, " ----------------------------------\n"); + return SUCCESS; } int print_device_array (real *a, int n) { - real *b = (real *) host_scratch; - copy_host_device (b, a, sizeof (real) * n, cudaMemcpyDeviceToHost, "real"); - print_host_array (b, n); + real *b = (real *) host_scratch; + copy_host_device (b, a, sizeof (real) * n, cudaMemcpyDeviceToHost, "real"); + print_host_array (b, n); } int check_zeros_host (rvec2 *host, int n, char *msg) { - int count, count1; - count = count1 = 0; - for (int i = 0; i < n; i++){ - if (host[i][0] == 0) count ++; - if (host[i][1] == 0) count1 ++; - } + int count, count1; + count = count1 = 0; + for (int i = 0; i < n; i++){ + if (host[i][0] == 0) count ++; + if (host[i][1] == 0) count1 ++; + } - fprintf (stderr, "%s has %d, %d zero elements \n", msg, count, count1 ); + fprintf (stderr, "%s has %d, %d zero elements \n", msg, count, count1 ); - return 1; + return 1; } int check_zeros_device (rvec2 *device, int n, char *msg) { - rvec2 *a = (rvec2 *) host_scratch; + rvec2 *a = (rvec2 *) host_scratch; - copy_host_device (a, device, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, msg); + copy_host_device (a, device, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, msg); - check_zeros_host (a, n, msg); + check_zeros_host (a, n, msg); - return 1; + return 1; } diff --git a/PG-PuReMD/src/vector.cu b/PG-PuReMD/src/vector.cu index 2cfa0b41..489477f2 100644 --- a/PG-PuReMD/src/vector.cu +++ 
b/PG-PuReMD/src/vector.cu @@ -29,494 +29,494 @@ extern "C" { #endif - inline int Vector_isZero( real* v, int k ) - { - for( --k; k>=0; --k ) - if( fabs( v[k] ) > ALMOST_ZERO ) - return 0; + inline int Vector_isZero( real* v, int k ) + { + for( --k; k>=0; --k ) + if( fabs( v[k] ) > ALMOST_ZERO ) + return 0; - return 1; - } + return 1; + } - inline void Vector_MakeZero( real *v, int k ) - { - for( --k; k>=0; --k ) - v[k] = 0; - } + inline void Vector_MakeZero( real *v, int k ) + { + for( --k; k>=0; --k ) + v[k] = 0; + } - inline void Vector_Copy( real* dest, real* v, int k ) - { - for( --k; k>=0; --k ) - dest[k] = v[k]; - } + inline void Vector_Copy( real* dest, real* v, int k ) + { + for( --k; k>=0; --k ) + dest[k] = v[k]; + } - inline void Vector_Scale( real* dest, real c, real* v, int k ) - { - for( --k; k>=0; --k ) - dest[k] = c * v[k]; - } + inline void Vector_Scale( real* dest, real c, real* v, int k ) + { + for( --k; k>=0; --k ) + dest[k] = c * v[k]; + } - inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) - { - for( --k; k>=0; --k ) - dest[k] = c * v[k] + d * y[k]; - } + inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) + { + for( --k; k>=0; --k ) + dest[k] = c * v[k] + d * y[k]; + } - inline void Vector_Add( real* dest, real c, real* v, int k ) - { - for( --k; k>=0; --k ) - dest[k] += c * v[k]; - } + inline void Vector_Add( real* dest, real c, real* v, int k ) + { + for( --k; k>=0; --k ) + dest[k] += c * v[k]; + } - inline real Dot( real* v1, real* v2, int k ) - { - real ret = 0; + inline real Dot( real* v1, real* v2, int k ) + { + real ret = 0; - for( --k; k>=0; --k ) - ret += v1[k] * v2[k]; + for( --k; k>=0; --k ) + ret += v1[k] * v2[k]; - return ret; - } + return ret; + } - inline real Norm( real* v1, int k ) - { - real ret = 0; + inline real Norm( real* v1, int k ) + { + real ret = 0; - for( --k; k>=0; --k ) - ret += SQR( v1[k] ); + for( --k; k>=0; --k ) + ret += SQR( v1[k] ); - return SQRT( ret ); - 
} + return SQRT( ret ); + } - inline void Vector_Print( FILE *fout, char *vname, real *v, int k ) - { - int i; + inline void Vector_Print( FILE *fout, char *vname, real *v, int k ) + { + int i; - fprintf( fout, "%s:", vname ); - for( i = 0; i < k; ++i ) - fprintf( fout, "%24.15e\n", v[i] ); - fprintf( fout, "\n" ); - } + fprintf( fout, "%s:", vname ); + for( i = 0; i < k; ++i ) + fprintf( fout, "%24.15e\n", v[i] ); + fprintf( fout, "\n" ); + } - void rvec_Copy( rvec dest, rvec src ) - { - dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; - } + void rvec_Copy( rvec dest, rvec src ) + { + dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; + } - inline void rvec_Scale( rvec ret, real c, rvec v ) - { - ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2]; - } + inline void rvec_Scale( rvec ret, real c, rvec v ) + { + ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2]; + } - inline void rvec_Add( rvec ret, rvec v ) - { - ret[0] += v[0], ret[1] += v[1], ret[2] += v[2]; - } + inline void rvec_Add( rvec ret, rvec v ) + { + ret[0] += v[0], ret[1] += v[1], ret[2] += v[2]; + } - inline void rvec_ScaledAdd( rvec ret, real c, rvec v ) - { - ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2]; - } + inline void rvec_ScaledAdd( rvec ret, real c, rvec v ) + { + ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2]; + } - inline void rvec_Sum( rvec ret, rvec v1 ,rvec v2 ) - { - ret[0] = v1[0] + v2[0]; - ret[1] = v1[1] + v2[1]; - ret[2] = v1[2] + v2[2]; - } + inline void rvec_Sum( rvec ret, rvec v1 ,rvec v2 ) + { + ret[0] = v1[0] + v2[0]; + ret[1] = v1[1] + v2[1]; + ret[2] = v1[2] + v2[2]; + } - inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 ,real c2, rvec v2 ) - { - ret[0] = c1 * v1[0] + c2 * v2[0]; - ret[1] = c1 * v1[1] + c2 * v2[1]; - ret[2] = c1 * v1[2] + c2 * v2[2]; - } + inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 ,real c2, rvec v2 ) + { + ret[0] = c1 * v1[0] + c2 * v2[0]; + ret[1] = c1 * v1[1] + c2 * v2[1]; + ret[2] = c1 * v1[2] + 
c2 * v2[2]; + } - inline real rvec_Dot( rvec v1, rvec v2 ) - { - return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]; - } + inline real rvec_Dot( rvec v1, rvec v2 ) + { + return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]; + } - inline real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 ) - { - return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]); - } + inline real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 ) + { + return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]); + } - inline void rvec_Multiply( rvec r, rvec v1, rvec v2 ) - { - r[0] = v1[0] * v2[0]; - r[1] = v1[1] * v2[1]; - r[2] = v1[2] * v2[2]; - } + inline void rvec_Multiply( rvec r, rvec v1, rvec v2 ) + { + r[0] = v1[0] * v2[0]; + r[1] = v1[1] * v2[1]; + r[2] = v1[2] * v2[2]; + } - inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) - { - r[0] = v1[0] * v2[0]; - r[1] = v1[1] * v2[1]; - r[2] = v1[2] * v2[2]; - } + inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) + { + r[0] = v1[0] * v2[0]; + r[1] = v1[1] * v2[1]; + r[2] = v1[2] * v2[2]; + } - inline void rvec_Divide( rvec r, rvec v1, rvec v2 ) - { - r[0] = v1[0] / v2[0]; - r[1] = v1[1] / v2[1]; - r[2] = v1[2] / v2[2]; - } + inline void rvec_Divide( rvec r, rvec v1, rvec v2 ) + { + r[0] = v1[0] / v2[0]; + r[1] = v1[1] / v2[1]; + r[2] = v1[2] / v2[2]; + } - inline void rvec_iDivide( rvec r, rvec v1, ivec v2 ) - { - r[0] = v1[0] / v2[0]; - r[1] = v1[1] / v2[1]; - r[2] = v1[2] / v2[2]; - } + inline void rvec_iDivide( rvec r, rvec v1, ivec v2 ) + { + r[0] = v1[0] / v2[0]; + r[1] = v1[1] / v2[1]; + r[2] = v1[2] / v2[2]; + } - inline void rvec_Invert( rvec r, rvec v ) - { - r[0] = 1. / v[0]; - r[1] = 1. / v[1]; - r[2] = 1. / v[2]; - } + inline void rvec_Invert( rvec r, rvec v ) + { + r[0] = 1. / v[0]; + r[1] = 1. / v[1]; + r[2] = 1. 
/ v[2]; + } - inline void rvec_Cross( rvec ret, rvec v1, rvec v2 ) - { - ret[0] = v1[1] * v2[2] - v1[2] * v2[1]; - ret[1] = v1[2] * v2[0] - v1[0] * v2[2]; - ret[2] = v1[0] * v2[1] - v1[1] * v2[0]; - } + inline void rvec_Cross( rvec ret, rvec v1, rvec v2 ) + { + ret[0] = v1[1] * v2[2] - v1[2] * v2[1]; + ret[1] = v1[2] * v2[0] - v1[0] * v2[2]; + ret[2] = v1[0] * v2[1] - v1[1] * v2[0]; + } - inline void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 ) - { - int i, j; + inline void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 ) + { + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - r[i][j] = v1[i] * v2[j]; - } + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + r[i][j] = v1[i] * v2[j]; + } - inline real rvec_Norm_Sqr( rvec v ) - { - return SQR(v[0]) + SQR(v[1]) + SQR(v[2]); - } + inline real rvec_Norm_Sqr( rvec v ) + { + return SQR(v[0]) + SQR(v[1]) + SQR(v[2]); + } - inline real rvec_Norm( rvec v ) - { - return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) ); - } + inline real rvec_Norm( rvec v ) + { + return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) ); + } - inline int rvec_isZero( rvec v ) - { - if( fabs(v[0]) > ALMOST_ZERO || - fabs(v[1]) > ALMOST_ZERO || - fabs(v[2]) > ALMOST_ZERO ) - return 0; - return 1; - } + inline int rvec_isZero( rvec v ) + { + if( fabs(v[0]) > ALMOST_ZERO || + fabs(v[1]) > ALMOST_ZERO || + fabs(v[2]) > ALMOST_ZERO ) + return 0; + return 1; + } - inline void rvec_MakeZero( rvec v ) - { - // v[0] = v[1] = v[2] = 0.0000000000000; - v[0] = v[1] = v[2] = 0.000000000000000e+00; - } + inline void rvec_MakeZero( rvec v ) + { + // v[0] = v[1] = v[2] = 0.0000000000000; + v[0] = v[1] = v[2] = 0.000000000000000e+00; + } #if defined(PURE_REAX) - inline void rvec_Random( rvec v ) - { - v[0] = Random(2.0)-1.0; - v[1] = Random(2.0)-1.0; - v[2] = Random(2.0)-1.0; - } + inline void rvec_Random( rvec v ) + { + v[0] = Random(2.0)-1.0; + v[1] = Random(2.0)-1.0; + v[2] = Random(2.0)-1.0; + } #endif - inline void rtensor_Multiply( rtensor ret, 
rtensor m1, rtensor m2 ) - { - int i, j, k; - rtensor temp; - - // check if the result matrix is the same as one of m1, m2. - // if so, we cannot modify the contents of m1 or m2, so - // we have to use a temp matrix. - if( ret == m1 || ret == m2 ) - { - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - { - temp[i][j] = 0; - for( k = 0; k < 3; ++k ) - temp[i][j] += m1[i][k] * m2[k][j]; - } - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = temp[i][j]; - } - else - { - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - { - ret[i][j] = 0; - for( k = 0; k < 3; ++k ) - ret[i][j] += m1[i][k] * m2[k][j]; - } - } - } - - - inline void rtensor_MatVec( rvec ret, rtensor m, rvec v ) - { - int i; - rvec temp; - - // if ret is the same vector as v, we cannot modify the - // contents of v until all computation is finished. - if( ret == v ) - { - for( i = 0; i < 3; ++i ) - temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; - - for( i = 0; i < 3; ++i ) - ret[i] = temp[i]; - } - else - { - for( i = 0; i < 3; ++i ) - ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; - } - } - - - inline void rtensor_Scale( rtensor ret, real c, rtensor m ) - { - int i, j; - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = c * m[i][j]; - } - - - inline void rtensor_Add( rtensor ret, rtensor t ) - { - int i, j; - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] += t[i][j]; - } - - - inline void rtensor_ScaledAdd( rtensor ret, real c, rtensor t ) - { - int i, j; - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] += c * t[i][j]; - } - - - inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 ) - { - int i, j; - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = t1[i][j] + t2[i][j]; - } - - - inline void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, - real c2, rtensor t2 ) - { - int i, j; - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = c1 * t1[i][j] + c2 * 
t2[i][j]; - } - - - inline void rtensor_Copy( rtensor ret, rtensor t ) - { - int i, j; - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = t[i][j]; - } + inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 ) + { + int i, j, k; + rtensor temp; + + // check if the result matrix is the same as one of m1, m2. + // if so, we cannot modify the contents of m1 or m2, so + // we have to use a temp matrix. + if( ret == m1 || ret == m2 ) + { + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + { + temp[i][j] = 0; + for( k = 0; k < 3; ++k ) + temp[i][j] += m1[i][k] * m2[k][j]; + } + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = temp[i][j]; + } + else + { + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + { + ret[i][j] = 0; + for( k = 0; k < 3; ++k ) + ret[i][j] += m1[i][k] * m2[k][j]; + } + } + } + + + inline void rtensor_MatVec( rvec ret, rtensor m, rvec v ) + { + int i; + rvec temp; + + // if ret is the same vector as v, we cannot modify the + // contents of v until all computation is finished. 
+ if( ret == v ) + { + for( i = 0; i < 3; ++i ) + temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; + + for( i = 0; i < 3; ++i ) + ret[i] = temp[i]; + } + else + { + for( i = 0; i < 3; ++i ) + ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; + } + } + + + inline void rtensor_Scale( rtensor ret, real c, rtensor m ) + { + int i, j; + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = c * m[i][j]; + } + + + inline void rtensor_Add( rtensor ret, rtensor t ) + { + int i, j; + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] += t[i][j]; + } + + + inline void rtensor_ScaledAdd( rtensor ret, real c, rtensor t ) + { + int i, j; + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] += c * t[i][j]; + } + + + inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 ) + { + int i, j; + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = t1[i][j] + t2[i][j]; + } + + + inline void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, + real c2, rtensor t2 ) + { + int i, j; + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j]; + } + + + inline void rtensor_Copy( rtensor ret, rtensor t ) + { + int i, j; + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = t[i][j]; + } - - inline void rtensor_Identity( rtensor t ) - { - t[0][0] = t[1][1] = t[2][2] = 1; - t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0; - } + + inline void rtensor_Identity( rtensor t ) + { + t[0][0] = t[1][1] = t[2][2] = 1; + t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0; + } - inline void rtensor_MakeZero( rtensor t ) - { - t[0][0] = t[0][1] = t[0][2] = 0; - t[1][0] = t[1][1] = t[1][2] = 0; - t[2][0] = t[2][1] = t[2][2] = 0; - } + inline void rtensor_MakeZero( rtensor t ) + { + t[0][0] = t[0][1] = t[0][2] = 0; + t[1][0] = t[1][1] = t[1][2] = 0; + t[2][0] = t[2][1] = t[2][2] = 0; + } - inline void rtensor_Transpose( rtensor ret, rtensor t 
) - { - ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2]; - ret[0][1] = t[1][0], ret[0][2] = t[2][0]; - ret[1][0] = t[0][1], ret[1][2] = t[2][1]; - ret[2][0] = t[0][2], ret[2][1] = t[1][2]; - } + inline void rtensor_Transpose( rtensor ret, rtensor t ) + { + ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2]; + ret[0][1] = t[1][0], ret[0][2] = t[2][0]; + ret[1][0] = t[0][1], ret[1][2] = t[2][1]; + ret[2][0] = t[0][2], ret[2][1] = t[1][2]; + } - inline real rtensor_Det( rtensor t ) - { - return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) + - t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) + - t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) ); - } + inline real rtensor_Det( rtensor t ) + { + return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) + + t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) + + t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) ); + } - inline real rtensor_Trace( rtensor t ) - { - return (t[0][0] + t[1][1] + t[2][2]); - } + inline real rtensor_Trace( rtensor t ) + { + return (t[0][0] + t[1][1] + t[2][2]); + } - inline void Print_rTensor(FILE* fp, rtensor t) - { - int i, j; + inline void Print_rTensor(FILE* fp, rtensor t) + { + int i, j; - for (i=0; i < 3; i++) - { - fprintf(fp,"["); - for (j=0; j < 3; j++) - fprintf(fp,"%8.3f,\t",t[i][j]); - fprintf(fp,"]\n"); - } - } + for (i=0; i < 3; i++) + { + fprintf(fp,"["); + for (j=0; j < 3; j++) + fprintf(fp,"%8.3f,\t",t[i][j]); + fprintf(fp,"]\n"); + } + } - inline void ivec_MakeZero( ivec v ) - { - // LGJ v[0] = v[1] = v[2] = 0; - v[0] = v[1] = v[2] = 0.000000000000000e+00; - } + inline void ivec_MakeZero( ivec v ) + { + // LGJ v[0] = v[1] = v[2] = 0; + v[0] = v[1] = v[2] = 0.000000000000000e+00; + } - inline void ivec_Copy( ivec dest, ivec src ) - { - dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; - } + inline void ivec_Copy( ivec dest, ivec src ) + { + dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; + } - inline void ivec_Scale( ivec dest, 
real C, ivec src ) - { - dest[0] = (int)(C * src[0]); - dest[1] = (int)(C * src[1]); - dest[2] = (int)(C * src[2]); - } + inline void ivec_Scale( ivec dest, real C, ivec src ) + { + dest[0] = (int)(C * src[0]); + dest[1] = (int)(C * src[1]); + dest[2] = (int)(C * src[2]); + } - inline void ivec_rScale( ivec dest, real C, rvec src ) - { - dest[0] = (int)(C * src[0]); - dest[1] = (int)(C * src[1]); - dest[2] = (int)(C * src[2]); - } + inline void ivec_rScale( ivec dest, real C, rvec src ) + { + dest[0] = (int)(C * src[0]); + dest[1] = (int)(C * src[1]); + dest[2] = (int)(C * src[2]); + } - inline int ivec_isZero( ivec v ) - { - if( v[0]==0 && v[1]==0 && v[2]==0 ) - return 1; - return 0; - } + inline int ivec_isZero( ivec v ) + { + if( v[0]==0 && v[1]==0 && v[2]==0 ) + return 1; + return 0; + } - inline int ivec_isEqual( ivec v1, ivec v2 ) - { - if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] ) - return 1; - return 0; - } + inline int ivec_isEqual( ivec v1, ivec v2 ) + { + if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] ) + return 1; + return 0; + } - inline void ivec_Sum( ivec dest, ivec v1, ivec v2 ) - { - dest[0] = v1[0] + v2[0]; - dest[1] = v1[1] + v2[1]; - dest[2] = v1[2] + v2[2]; - } + inline void ivec_Sum( ivec dest, ivec v1, ivec v2 ) + { + dest[0] = v1[0] + v2[0]; + dest[1] = v1[1] + v2[1]; + dest[2] = v1[2] + v2[2]; + } - inline void ivec_ScaledSum( ivec dest, int k1, ivec v1, int k2, ivec v2 ) - { - dest[0] = k1*v1[0] + k2*v2[0]; - dest[1] = k1*v1[1] + k2*v2[1]; - dest[2] = k1*v1[2] + k2*v2[2]; - } + inline void ivec_ScaledSum( ivec dest, int k1, ivec v1, int k2, ivec v2 ) + { + dest[0] = k1*v1[0] + k2*v2[0]; + dest[1] = k1*v1[1] + k2*v2[1]; + dest[2] = k1*v1[2] + k2*v2[2]; + } - inline void ivec_Add( ivec dest, ivec v ) - { - dest[0] += v[0]; - dest[1] += v[1]; - dest[2] += v[2]; - } + inline void ivec_Add( ivec dest, ivec v ) + { + dest[0] += v[0]; + dest[1] += v[1]; + dest[2] += v[2]; + } - inline void ivec_ScaledAdd( ivec dest, int k, ivec v ) - { - 
dest[0] += k * v[0]; - dest[1] += k * v[1]; - dest[2] += k * v[2]; - } + inline void ivec_ScaledAdd( ivec dest, int k, ivec v ) + { + dest[0] += k * v[0]; + dest[1] += k * v[1]; + dest[2] += k * v[2]; + } - inline void ivec_Max( ivec res, ivec v1, ivec v2 ) - { - res[0] = MAX( v1[0], v2[0] ); - res[1] = MAX( v1[1], v2[1] ); - res[2] = MAX( v1[2], v2[2] ); - } + inline void ivec_Max( ivec res, ivec v1, ivec v2 ) + { + res[0] = MAX( v1[0], v2[0] ); + res[1] = MAX( v1[1], v2[1] ); + res[2] = MAX( v1[2], v2[2] ); + } - inline void ivec_Max3( ivec res, ivec v1, ivec v2, ivec v3 ) - { - res[0] = MAX3( v1[0], v2[0], v3[0] ); - res[1] = MAX3( v1[1], v2[1], v3[1] ); - res[2] = MAX3( v1[2], v2[2], v3[2] ); - } + inline void ivec_Max3( ivec res, ivec v1, ivec v2, ivec v3 ) + { + res[0] = MAX3( v1[0], v2[0], v3[0] ); + res[1] = MAX3( v1[1], v2[1], v3[1] ); + res[2] = MAX3( v1[2], v2[2], v3[2] ); + } #ifdef __cplusplus } diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu index 011c4eeb..d00100e9 100644 --- a/PuReMD-GPU/src/GMRES.cu +++ b/PuReMD-GPU/src/GMRES.cu @@ -34,186 +34,186 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b ) { - int i, j, k, n, si, ei; - real H; - - n = A->n; - for( i = 0; i < n; ++i ) - b[i] = 0; - - for( i = 0; i < n; ++i ) { - si = A->start[i]; - ei = A->start[i+1]-1; - - for( k = si; k < ei; ++k ) { - j = A->entries[k].j; - H = A->entries[k].val; - b[j] += H * x[i]; - b[i] += H * x[j]; - } - - // the diagonal entry is the last one in - b[i] += A->entries[k].val * x[i]; - } + int i, j, k, n, si, ei; + real H; + + n = A->n; + for( i = 0; i < n; ++i ) + b[i] = 0; + + for( i = 0; i < n; ++i ) { + si = A->start[i]; + ei = A->start[i+1]-1; + + for( k = si; k < ei; ++k ) { + j = A->entries[k].j; + H = A->entries[k].val; + b[j] += H * x[i]; + b[i] += H * x[j]; + } + + // the diagonal entry is the last one in + b[i] += A->entries[k].val * x[i]; + } } void Forward_Subs( sparse_matrix *L, real *b, real *y ) { - int i, pj, j, si, ei; - real 
val; - - for( i = 0; i < L->n; ++i ) { - y[i] = b[i]; - si = L->start[i]; - ei = L->start[i+1]; - for( pj = si; pj < ei-1; ++pj ){ - j = L->entries[pj].j; - val = L->entries[pj].val; - y[i] -= val * y[j]; - } - y[i] /= L->entries[pj].val; - } + int i, pj, j, si, ei; + real val; + + for( i = 0; i < L->n; ++i ) { + y[i] = b[i]; + si = L->start[i]; + ei = L->start[i+1]; + for( pj = si; pj < ei-1; ++pj ){ + j = L->entries[pj].j; + val = L->entries[pj].val; + y[i] -= val * y[j]; + } + y[i] /= L->entries[pj].val; + } } void Backward_Subs( sparse_matrix *U, real *y, real *x ) { - int i, pj, j, si, ei; - real val; - - for( i = U->n-1; i >= 0; --i ) { - x[i] = y[i]; - si = U->start[i]; - ei = U->start[i+1]; - for( pj = si+1; pj < ei; ++pj ){ - j = U->entries[pj].j; - val = U->entries[pj].val; - x[i] -= val * x[j]; - } - x[i] /= U->entries[si].val; - } + int i, pj, j, si, ei; + real val; + + for( i = U->n-1; i >= 0; --i ) { + x[i] = y[i]; + si = U->start[i]; + ei = U->start[i+1]; + for( pj = si+1; pj < ei; ++pj ){ + j = U->entries[pj].j; + val = U->entries[pj].val; + x[i] -= val * x[j]; + } + x[i] /= U->entries[si].val; + } } int GMRES( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system* system) + real *b, real tol, real *x, FILE *fout, reax_system* system) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - - N = H->n; - bnorm = Norm( b, N ); - - /* apply the diagonal pre-conditioner to rhs */ - for( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - Sparse_MatVec( H, x, workspace->b_prm ); - - for( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ - - - Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); - Vector_Scale( 
&workspace->v[ index_wkspace_sys (0,0,system) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system)], N ); - - /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { - /* matvec */ - Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - - for( k = 0; k < N; ++k ) - workspace->v[ index_wkspace_sys (j+1,k,system)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { - workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)], - -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N ); - } - - - workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], - 1. 
/ workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); - - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for( i = 0; i <= j; i++ ) { - if( i == j ) { - cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); - workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; - workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + - workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ]; - tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + - workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ]; - - workspace->h[ index_wkspace_res (i,j) ] = tmp1; - workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; - } - - /* apply Givens rotations to the rhs as well */ - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j+1] = tmp2; - - // fprintf( stderr, "h: " ); - // for( i = 0; i <= j+1; ++i ) - // fprintf( stderr, "%.6f ", workspace->h[i][j] ); - // fprintf( stderr, "\n" ); - //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); - } - - - /* solve Hy = g. 
- H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { - temp = workspace->g[i]; - for( k = j-1; k > i; k-- ) - temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; - - workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; - } - - - /* update x = x_0 + Vy */ - for( i = 0; i < j; i++ ) - Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N ); - - /* stopping condition */ - if( fabs(workspace->g[j]) / bnorm <= tol ) - break; - } - - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ - - // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - // data->timing.matvec += itr * RESTART + j; - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } - - return itr * (RESTART+1) + j + 1; + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + + N = H->n; + bnorm = Norm( b, N ); + + /* apply the diagonal pre-conditioner to rhs */ + for( i = 0; i < N; ++i ) + workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + Sparse_MatVec( H, x, workspace->b_prm ); + + for( i = 0; i < N; ++i ) + workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ + + + Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); + workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); + Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system)], N ); + + /* GMRES 
inner-loop */ + for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { + /* matvec */ + Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); + + for( k = 0; k < N; ++k ) + workspace->v[ index_wkspace_sys (j+1,k,system)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i <= j; i++ ) { + workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)], + -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N ); + } + + + workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], + 1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); + + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for( i = 0; i <= j; i++ ) { + if( i == j ) { + cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); + workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; + workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + + workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ]; + tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + + workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ]; + + workspace->h[ index_wkspace_res (i,j) ] = tmp1; + workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; + } + + /* apply Givens rotations to the rhs as well */ + tmp1 = workspace->hc[j] * workspace->g[j]; + tmp2 = -workspace->hs[j] * workspace->g[j]; + 
workspace->g[j] = tmp1; + workspace->g[j+1] = tmp2; + + // fprintf( stderr, "h: " ); + // for( i = 0; i <= j+1; ++i ) + // fprintf( stderr, "%.6f ", workspace->h[i][j] ); + // fprintf( stderr, "\n" ); + //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); + } + + + /* solve Hy = g. + H is now upper-triangular, do back-substitution */ + for( i = j-1; i >= 0; i-- ) { + temp = workspace->g[i]; + for( k = j-1; k > i; k-- ) + temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + + workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; + } + + + /* update x = x_0 + Vy */ + for( i = 0; i < j; i++ ) + Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N ); + + /* stopping condition */ + if( fabs(workspace->g[j]) / bnorm <= tol ) + break; + } + + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ + + // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + // data->timing.matvec += itr * RESTART + j; + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; + return itr * (RESTART+1) + j + 1; + } + + return itr * (RESTART+1) + j + 1; } @@ -223,916 +223,916 @@ int GMRES( static_storage *workspace, sparse_matrix *H, GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= entries) return; + if (i >= entries) return; - b_proc [i] = b[i] * Hdia_inv[i]; + b_proc [i] = b[i] * Hdia_inv[i]; } GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output) { - real tmp1, tmp2, 
cc; + real tmp1, tmp2, cc; - for( int i = 0; i <= j; i++ ) { - if( i == j ) { - cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) ); - hc[j] = h[ index_wkspace_res (j,j) ] / cc; - hs[j] = h[ index_wkspace_res (j+1,j) ] / cc; - } + for( int i = 0; i <= j; i++ ) { + if( i == j ) { + cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) ); + hc[j] = h[ index_wkspace_res (j,j) ] / cc; + hs[j] = h[ index_wkspace_res (j+1,j) ] / cc; + } - tmp1 = hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ]; - tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ]; + tmp1 = hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ]; + tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ]; - h[ index_wkspace_res (i,j) ] = tmp1; - h[ index_wkspace_res (i+1,j) ] = tmp2; - } + h[ index_wkspace_res (i,j) ] = tmp1; + h[ index_wkspace_res (i+1,j) ] = tmp2; + } - /* apply Givens rotations to the rhs as well */ - tmp1 = hc[j] * g_j; - tmp2 = -hs[j] * g_j; + /* apply Givens rotations to the rhs as well */ + tmp1 = hc[j] * g_j; + tmp2 = -hs[j] * g_j; - output[0] = tmp1; - output[1] = tmp2; + output[0] = tmp1; + output[1] = tmp2; } GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y) { - real temp; - for( int i = j-1; i >= 0; i-- ) { - temp = g[i]; - for( int k = j-1; k > i; k-- ) - temp -= h[ index_wkspace_res (i,k) ] * y[k]; - - y[i] = temp / h[ index_wkspace_res (i,i) ]; - } + real temp; + for( int i = j-1; i >= 0; i-- ) { + temp = g[i]; + for( int k = j-1; k > i; k-- ) + temp -= h[ index_wkspace_res (i,k) ] * y[k]; + + y[i] = temp / h[ index_wkspace_res (i,i) ]; + } } int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x ) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v_add_tmp; - sparse_matrix *H = &workspace->H; + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; 
+ real v_add_tmp; + sparse_matrix *H = &workspace->H; - real t_start, t_elapsed; + real t_start, t_elapsed; - real *spad = (real *)scratch; - real *g = (real *) calloc ((RESTART+1), REAL_SIZE); + real *spad = (real *)scratch; + real *g = (real *) calloc ((RESTART+1), REAL_SIZE); - N = H->n; + N = H->n; - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Norm of the array is %e \n", bnorm ); + fprintf (stderr, "Norm of the array is %e \n", bnorm ); #endif - /* apply the diagonal pre-conditioner to rhs */ - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (workspace->b_prc, b, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - //Sparse_MatVec( H, x, workspace->b_prm ); - Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> - (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); - 
cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> - (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - cudaThreadSynchronize (); - cudaCheckError (); - - //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); - { - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); - } - - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES inner-loop */ + /* apply the diagonal pre-conditioner to rhs */ + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (workspace->b_prc, b, workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + //Sparse_MatVec( H, x, workspace->b_prm ); + Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); + cudaThreadSynchronize (); + cudaCheckError (); + + GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> + (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> + (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); + cudaThreadSynchronize (); + cudaCheckError (); + + //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys 
(0,0,system)], N ); + { + cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); + } + + Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + + /* GMRES inner-loop */ #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); + fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); #endif - for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { - /* matvec */ - //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - Cuda_Matvec_csr - <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> - ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { - Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, 
REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - } - - - //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - GMRES_Givens_Rotation <<<1, 1>>> - (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - } - - copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - /* solve Hy = g. 
- H is now upper-triangular, do back-substitution */ - copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - GMRES_BackSubstitution <<<1, 1>>> - (j, spad, workspace->h, workspace->y); - cudaThreadSynchronize (); - cudaCheckError (); - - /* update x = x_0 + Vy */ - for( i = 0; i < j; i++ ) - { - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> - ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - } - - /* stopping condition */ - if( fabs(g[j]) / bnorm <= tol ) - break; - } - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return itr * (RESTART+1) + j + 1; - } + for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { + /* matvec */ + //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); + Cuda_Matvec_csr + <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); + cudaThreadSynchronize (); + cudaCheckError (); + + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i <= j; i++ ) { + Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&v_add_tmp, &workspace->h[ 
index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + } + + + //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + GMRES_Givens_Rotation <<<1, 1>>> + (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + } + + copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + /* solve Hy = g. 
+ H is now upper-triangular, do back-substitution */ + copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); + GMRES_BackSubstitution <<<1, 1>>> + (j, spad, workspace->h, workspace->y); + cudaThreadSynchronize (); + cudaCheckError (); + + /* update x = x_0 + Vy */ + for( i = 0; i < j; i++ ) + { + copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> + ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + } + + /* stopping condition */ + if( fabs(g[j]) / bnorm <= tol ) + break; + } + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + return itr * (RESTART+1) + j + 1; + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); + fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); #endif - return itr * (RESTART+1) + j + 1; + return itr * (RESTART+1) + j + 1; } int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real tol, real *x ) { - real CSR_ALPHA = 1, CSR_BETA = 0; + real CSR_ALPHA = 1, CSR_BETA = 0; - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v_add_tmp; - sparse_matrix *H = &workspace->H; + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + real v_add_tmp; + sparse_matrix *H = &workspace->H; - real t_start, t_elapsed; + real t_start, t_elapsed; - real *spad = (real *)scratch; - real *g = (real *) calloc ((RESTART+1), REAL_SIZE); + real *spad = (real *)scratch; + real *g = (real *) calloc ((RESTART+1), REAL_SIZE); - N = H->n; + N = H->n; - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - /* - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); + /* + Cuda_Norm 
<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - */ + copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + */ - cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm )); + cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm )); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Norm of the array is %e \n", bnorm ); + fprintf (stderr, "Norm of the array is %e \n", bnorm ); #endif - /* apply the diagonal pre-conditioner to rhs */ - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (workspace->b_prc, b, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - //Sparse_MatVec( H, x, workspace->b_prm ); - Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> - (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* - Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> - (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - cudaThreadSynchronize (); - cudaCheckError (); - */ - cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V); - - double D_ONE = 1.; - double D_MINUS_ONE = -1.; - cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, 
workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); - cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); - - //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); - { - /* - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); - */ - - cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g )); - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - } - - /* - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - double D_SCALE = 1.0 / g[0]; - cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); - - - /* GMRES inner-loop */ + /* apply the diagonal pre-conditioner to rhs */ + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (workspace->b_prc, b, workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + //Sparse_MatVec( H, x, workspace->b_prm ); + Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); + cudaThreadSynchronize (); + cudaCheckError (); + + GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> + (workspace->b_prm, workspace->b_prm, 
workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* + Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> + (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); + cudaThreadSynchronize (); + cudaCheckError (); + */ + cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V); + + double D_ONE = 1.; + double D_MINUS_ONE = -1.; + cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); + cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); + + //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); + { + /* + cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); + */ + + cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g )); + copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); + } + + /* + Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + double D_SCALE = 1.0 / g[0]; + cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); + + + /* GMRES inner-loop */ #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); + fprintf (stderr, " Inner loop inputs bnorm 
: %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); #endif - for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { - /* matvec */ - Cuda_Matvec_csr - <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> - ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { - - /* - Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - */ - - cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, - &workspace->v[index_wkspace_sys(j+1,0,N)], 1, - &v_add_tmp)); - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - /* - Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - double NEG_V_ADD_TMP = -v_add_tmp; - cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, - &workspace->v[index_wkspace_sys(j+1,0,N)], 1 )); - } - - - //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( 
&workspace->v[index_wkspace_sys(j+1,0,system)], N ); - /* - cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - */ - cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp )); - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - - /* - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - double REC_V_ADD_TMP = 1. / v_add_tmp; - cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP, &workspace->v[index_wkspace_sys(j+1,0,N)], 1)); - - - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - GMRES_Givens_Rotation <<<1, 1>>> - (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - } - - copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - /* solve Hy = g. 
- H is now upper-triangular, do back-substitution */ - copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - GMRES_BackSubstitution <<<1, 1>>> - (j, spad, workspace->h, workspace->y); - cudaThreadSynchronize (); - cudaCheckError (); - - /* update x = x_0 + Vy */ - for( i = 0; i < j; i++ ) - { - /* - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> - ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, - x, 1)); - } - - /* stopping condition */ - if( fabs(g[j]) / bnorm <= tol ) - break; - } - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return itr * (RESTART+1) + j + 1; - } + for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { + /* matvec */ + Cuda_Matvec_csr + <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); + cudaThreadSynchronize (); + cudaCheckError (); + + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i <= j; i++ ) { + + /* + Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], 
BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + */ + + cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, + &workspace->v[index_wkspace_sys(j+1,0,N)], 1, + &v_add_tmp)); + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + /* + Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + double NEG_V_ADD_TMP = -v_add_tmp; + cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, + &workspace->v[index_wkspace_sys(j+1,0,N)], 1 )); + } + + + //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + /* + cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + */ + cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp )); + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + + /* + Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + 1. 
/ v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + double REC_V_ADD_TMP = 1. / v_add_tmp; + cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP, &workspace->v[index_wkspace_sys(j+1,0,N)], 1)); + + + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + GMRES_Givens_Rotation <<<1, 1>>> + (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + } + + copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + /* solve Hy = g. + H is now upper-triangular, do back-substitution */ + copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); + GMRES_BackSubstitution <<<1, 1>>> + (j, spad, workspace->h, workspace->y); + cudaThreadSynchronize (); + cudaCheckError (); + + /* update x = x_0 + Vy */ + for( i = 0; i < j; i++ ) + { + /* + copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> + ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, + x, 1)); + } + + /* stopping condition */ + if( fabs(g[j]) / bnorm <= tol ) + break; + } + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + return itr * (RESTART+1) + j + 1; + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); + fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); #endif - return itr * (RESTART+1) + j + 1; + return itr * (RESTART+1) + j + 1; } int GMRES_HouseHolder( 
static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system *system) + real *b, real tol, real *x, FILE *fout, reax_system *system) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v[10000], z[RESTART+2][10000], w[RESTART+2]; - real u[RESTART+2][10000]; + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + real v[10000], z[RESTART+2][10000], w[RESTART+2]; + real u[RESTART+2][10000]; - N = H->n; - bnorm = Norm( b, N ); + N = H->n; + bnorm = Norm( b, N ); - /* apply the diagonal pre-conditioner to rhs */ - for( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + /* apply the diagonal pre-conditioner to rhs */ + for( i = 0; i < N; ++i ) + workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; - // memset( x, 0, sizeof(real) * N ); + // memset( x, 0, sizeof(real) * N ); - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* compute z = r0 */ - Sparse_MatVec( H, x, workspace->b_prm ); - for( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ - Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* compute z = r0 */ + Sparse_MatVec( H, x, workspace->b_prm ); + for( i = 0; i < N; ++i ) + workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ + Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); - Vector_MakeZero( w, RESTART+1 ); - w[0] = Norm( z[0], N ); + Vector_MakeZero( w, RESTART+1 ); + w[0] = Norm( z[0], N ); - Vector_Copy( u[0], z[0], N ); - u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; - Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); + Vector_Copy( u[0], z[0], N ); + u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; + Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); - w[0] *= ( u[0][0] < 0.0 ? 1 :-1 ); - // fprintf( stderr, "\n\n%12.6f\n", w[0] ); + w[0] *= ( u[0][0] < 0.0 ? 
1 :-1 ); + // fprintf( stderr, "\n\n%12.6f\n", w[0] ); - /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) { - /* compute v_j */ - Vector_Scale( z[j], -2 * u[j][j], u[j], N ); - z[j][j] += 1.; /* due to e_j */ + /* GMRES inner-loop */ + for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) { + /* compute v_j */ + Vector_Scale( z[j], -2 * u[j][j], u[j], N ); + z[j][j] += 1.; /* due to e_j */ - for( i = j-1; i >= 0; --i ) - Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i ); + for( i = j-1; i >= 0; --i ) + Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i ); - /* matvec */ - Sparse_MatVec( H, z[j], v ); + /* matvec */ + Sparse_MatVec( H, z[j], v ); - for( k = 0; k < N; ++k ) - v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */ + for( k = 0; k < N; ++k ) + v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */ - for( i = 0; i <= j; ++i ) - Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i ); + for( i = 0; i <= j; ++i ) + Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i ); - if( !Vector_isZero( v + (j+1), N - (j+1) ) ) { - /* compute the HouseHolder unit vector u_j+1 */ - for( i = 0; i <= j; ++i ) - u[j+1][i] = 0; + if( !Vector_isZero( v + (j+1), N - (j+1) ) ) { + /* compute the HouseHolder unit vector u_j+1 */ + for( i = 0; i <= j; ++i ) + u[j+1][i] = 0; - Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) ); + Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) ); - u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) ); + u[j+1][j+1] += ( v[j+1]<0.0 ? 
-1:1 ) * Norm( v+(j+1), N-(j+1) ); - Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N ); + Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N ); - /* overwrite v with P_m+1 * v */ - v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1]; - Vector_MakeZero( v + (j+2), N - (j+2) ); - // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N ); - } + /* overwrite v with P_m+1 * v */ + v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1]; + Vector_MakeZero( v + (j+2), N - (j+2) ); + // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N ); + } - /* prev Givens rots on the upper-Hessenberg matrix to make it U */ - for( i = 0; i < j; i++ ) { - tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1]; - tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1]; + /* prev Givens rots on the upper-Hessenberg matrix to make it U */ + for( i = 0; i < j; i++ ) { + tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1]; + tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1]; - v[i] = tmp1; - v[i+1] = tmp2; - } + v[i] = tmp1; + v[i+1] = tmp2; + } - /* apply the new Givens rotation to H and right-hand side */ - if( fabs(v[j+1]) >= ALMOST_ZERO ) { - cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) ); - workspace->hc[j] = v[j] / cc; - workspace->hs[j] = v[j+1] / cc; + /* apply the new Givens rotation to H and right-hand side */ + if( fabs(v[j+1]) >= ALMOST_ZERO ) { + cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) ); + workspace->hc[j] = v[j] / cc; + workspace->hs[j] = v[j+1] / cc; - tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1]; - tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1]; + tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1]; + tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1]; - v[j] = tmp1; - v[j+1] = tmp2; + v[j] = tmp1; + v[j+1] = tmp2; - /* Givens rotations to rhs */ - tmp1 = workspace->hc[j] * w[j]; - tmp2 = -workspace->hs[j] * w[j]; - w[j] = tmp1; - w[j+1] = tmp2; - } + /* Givens rotations to rhs */ + 
tmp1 = workspace->hc[j] * w[j]; + tmp2 = -workspace->hs[j] * w[j]; + w[j] = tmp1; + w[j+1] = tmp2; + } - /* extend R */ - for( i = 0; i <= j; ++i ) - workspace->h[ index_wkspace_res (i,j) ] = v[i]; + /* extend R */ + for( i = 0; i <= j; ++i ) + workspace->h[ index_wkspace_res (i,j) ] = v[i]; - // fprintf( stderr, "h:" ); - // for( i = 0; i <= j+1 ; ++i ) - // fprintf( stderr, "%.6f ", h[i][j] ); - // fprintf( stderr, "\n" ); - // fprintf( stderr, "%12.6f\n", w[j+1] ); - } + // fprintf( stderr, "h:" ); + // for( i = 0; i <= j+1 ; ++i ) + // fprintf( stderr, "%.6f ", h[i][j] ); + // fprintf( stderr, "\n" ); + // fprintf( stderr, "%12.6f\n", w[j+1] ); + } - /* solve Hy = w. - H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { - temp = w[i]; - for( k = j-1; k > i; k-- ) - temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + /* solve Hy = w. + H is now upper-triangular, do back-substitution */ + for( i = j-1; i >= 0; i-- ) { + temp = w[i]; + for( k = j-1; k > i; k-- ) + temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; - workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; - } + workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; + } - // fprintf( stderr, "y: " ); - // for( i = 0; i < RESTART+1; ++i ) - // fprintf( stderr, "%8.3f ", workspace->y[i] ); + // fprintf( stderr, "y: " ); + // for( i = 0; i < RESTART+1; ++i ) + // fprintf( stderr, "%8.3f ", workspace->y[i] ); - /* update x = x_0 + Vy */ - // memset( z, 0, sizeof(real) * N ); - // for( i = j-1; i >= 0; i-- ) - // { - // Vector_Copy( v, z, N ); - // v[i] += workspace->y[i]; - // - // Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N ); - // } - // - // fprintf( stderr, "\nz: " ); - // for( k = 0; k < N; ++k ) - // fprintf( stderr, "%6.2f ", z[k] ); + /* update x = x_0 + Vy */ + // memset( z, 0, sizeof(real) * N ); + // for( i = j-1; i >= 0; i-- ) + // { + // Vector_Copy( v, z, N ); + // v[i] += workspace->y[i]; + // + 
// Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N ); + // } + // + // fprintf( stderr, "\nz: " ); + // for( k = 0; k < N; ++k ) + // fprintf( stderr, "%6.2f ", z[k] ); - // fprintf( stderr, "\nx_bef: " ); - // for( i = 0; i < N; ++i ) - // fprintf( stderr, "%6.2f ", x[i] ); + // fprintf( stderr, "\nx_bef: " ); + // for( i = 0; i < N; ++i ) + // fprintf( stderr, "%6.2f ", x[i] ); - // Vector_Add( x, 1, z, N ); - for( i = j-1; i >= 0; i-- ) - Vector_Add( x, workspace->y[i], z[i], N ); + // Vector_Add( x, 1, z, N ); + for( i = j-1; i >= 0; i-- ) + Vector_Add( x, workspace->y[i], z[i], N ); - // fprintf( stderr, "\nx_aft: " ); - // for( i = 0; i < N; ++i ) - // fprintf( stderr, "%6.2f ", x[i] ); - - /* stopping condition */ - if( fabs( w[j] ) / bnorm <= tol ) - break; - } - - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] ); - - //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } - - return itr * (RESTART+1) + j + 1; + // fprintf( stderr, "\nx_aft: " ); + // for( i = 0; i < N; ++i ) + // fprintf( stderr, "%6.2f ", x[i] ); + + /* stopping condition */ + if( fabs( w[j] ) / bnorm <= tol ) + break; + } + + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] ); + + //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: 
%15.10f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; + return itr * (RESTART+1) + j + 1; + } + + return itr * (RESTART+1) + j + 1; } int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system ) + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system ) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - - N = H->n; - bnorm = Norm( b, N ); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - Sparse_MatVec( H, x, workspace->b_prm ); - Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N ); - Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] ); - Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] ); - workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system)], N ); - Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system)], 1. 
/ workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system)], N ); - //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); - - /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { - /* matvec */ - Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] ); - Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0; - - //for( i = 0; i <= j; i++ ) { - for( i = MAX(j-1,0); i <= j; i++ ) { - workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N ); - } - - workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system)], N ); - Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], - 1. 
/ workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for( i = MAX(j-1,0); i <= j; i++ ) { - if( i == j ) { - cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); - workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; - workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + - workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ]; - tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + - workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ]; - - workspace->h[ index_wkspace_res (i,j) ] = tmp1; - workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; - } - - /* apply Givens rotations to the rhs as well */ - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j+1] = tmp2; - - //fprintf( stderr, "h: " ); - //for( i = 0; i <= j+1; ++i ) - //fprintf( stderr, "%.6f ", workspace->h[i][j] ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); - } - - - /* solve Hy = g: H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { - temp = workspace->g[i]; - for( k = j-1; k > i; k-- ) - temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; - - workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)]; - } - - /* update x = x_0 + Vy */ - Vector_MakeZero( workspace->p, N ); - for( i = 0; i < j; i++ ) - Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N ); - //Backward_Subs( U, workspace->p, workspace->p ); - //Forward_Subs( L, workspace->p, workspace->p ); - Vector_Add( x, 1., workspace->p, N ); - - /* stopping condition */ - if( 
fabs(workspace->g[j]) / bnorm <= tol ) - break; - } - - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ - - // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - // data->timing.matvec += itr * RESTART + j; - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } - - return itr * (RESTART+1) + j + 1; - } - - - - int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system ) - { - int i, N; - real tmp, alpha, beta, b_norm, r_norm; - real sig0, sig_old, sig_new; - - N = A->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %.15e\n", b_norm ); - - Sparse_MatVec( A, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - r_norm = Norm(workspace->r, N); - //Print_Soln( workspace, x, q, b, N ); - //fprintf( stderr, "res: %.15e\n", r_norm ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->p ); - sig_new = Dot( workspace->r, workspace->p, N ); - sig0 = sig_new; - - for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) { - //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) { - Sparse_MatVec( A, workspace->p, workspace->q ); - tmp = Dot( workspace->q, workspace->p, N ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->p, N ); - //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", - // i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp ); - - Vector_Add( workspace->r, -alpha, workspace->q, N ); - r_norm = Norm(workspace->r, N); - //fprintf( stderr, "res: %.15e\n", 
r_norm ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->d ); - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->d, N ); - beta = sig_new / sig_old; - Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N ); - } - - //fprintf( fout, "CG took %d iterations\n", i ); - if( i >= 200 ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } - - return i; - } - - - int CG( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system *system) - { - int i, j, N; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new, sig0; - - N = H->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); - - Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_new = Dot( workspace->r, workspace->d, N ); - sig0 = sig_new; - //Print_Soln( workspace, x, q, b, N ); - //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", - // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) ); - //fprintf( stderr, "sig_new: %f\n", sig_new ); - - for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { - //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) { - Sparse_MatVec( H, workspace->d, workspace->q ); - tmp = Dot( workspace->d, workspace->q, N ); - //fprintf( stderr, "tmp: %f\n", tmp ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->d, N ); - //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", - // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); - - Vector_Add( workspace->r, -alpha, workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->p, N ); - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N ); 
- //fprintf( stderr, "sig_new: %f\n", sig_new ); - } - - fprintf( stderr, "CG took %d iterations\n", i ); - - if( i >= 300 ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } - - return i; - } - - - - /* Steepest Descent */ - int SDM( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout ) - { - int i, j, N; - real tmp, alpha, beta, b_norm; - real sig0, sig; - - N = H->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); - - Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig = Dot( workspace->r, workspace->d, N ); - sig0 = sig; - - for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) { - Sparse_MatVec( H, workspace->d, workspace->q ); - - sig = Dot( workspace->r, workspace->d, N ); - tmp = Dot( workspace->d, workspace->q, N ); - alpha = sig / tmp; - - Vector_Add( x, alpha, workspace->d, N ); - Vector_Add( workspace->r, -alpha, workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", - // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); - } - - fprintf( stderr, "SDM took %d iterations\n", i ); - - if( i >= 300 ) { - fprintf( stderr, "SDM convergence failed!\n" ); - return i; - } - - return i; - } + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + + N = H->n; + bnorm = Norm( b, N ); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + Sparse_MatVec( H, x, workspace->b_prm ); + Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N ); + Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] ); + Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system)], 
&workspace->v[index_wkspace_sys(0,0,system)] ); + workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system)], N ); + Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system)], N ); + //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); + + /* GMRES inner-loop */ + for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { + /* matvec */ + Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] ); + Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); + Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0; + + //for( i = 0; i <= j; i++ ) { + for( i = MAX(j-1,0); i <= j; i++ ) { + workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N ); + } + + workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system)], N ); + Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], + 1. 
/ workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); + // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for( i = MAX(j-1,0); i <= j; i++ ) { + if( i == j ) { + cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); + workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; + workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + + workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ]; + tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + + workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ]; + + workspace->h[ index_wkspace_res (i,j) ] = tmp1; + workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; + } + + /* apply Givens rotations to the rhs as well */ + tmp1 = workspace->hc[j] * workspace->g[j]; + tmp2 = -workspace->hs[j] * workspace->g[j]; + workspace->g[j] = tmp1; + workspace->g[j+1] = tmp2; + + //fprintf( stderr, "h: " ); + //for( i = 0; i <= j+1; ++i ) + //fprintf( stderr, "%.6f ", workspace->h[i][j] ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); + } + + + /* solve Hy = g: H is now upper-triangular, do back-substitution */ + for( i = j-1; i >= 0; i-- ) { + temp = workspace->g[i]; + for( k = j-1; k > i; k-- ) + temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + + workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)]; + } + + /* update x = x_0 + Vy */ + Vector_MakeZero( workspace->p, N ); + for( i = 0; i < j; i++ ) + Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N ); + //Backward_Subs( U, workspace->p, workspace->p ); + //Forward_Subs( L, workspace->p, workspace->p ); + Vector_Add( x, 1., workspace->p, N ); + + /* stopping condition */ + if( 
fabs(workspace->g[j]) / bnorm <= tol ) + break; + } + + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ + + // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + // data->timing.matvec += itr * RESTART + j; + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; + return itr * (RESTART+1) + j + 1; + } + + return itr * (RESTART+1) + j + 1; + } + + + + int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system ) + { + int i, N; + real tmp, alpha, beta, b_norm, r_norm; + real sig0, sig_old, sig_new; + + N = A->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %.15e\n", b_norm ); + + Sparse_MatVec( A, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + r_norm = Norm(workspace->r, N); + //Print_Soln( workspace, x, q, b, N ); + //fprintf( stderr, "res: %.15e\n", r_norm ); + + Forward_Subs( L, workspace->r, workspace->d ); + Backward_Subs( U, workspace->d, workspace->p ); + sig_new = Dot( workspace->r, workspace->p, N ); + sig0 = sig_new; + + for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) { + //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) { + Sparse_MatVec( A, workspace->p, workspace->q ); + tmp = Dot( workspace->q, workspace->p, N ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->p, N ); + //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", + // i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp ); + + Vector_Add( workspace->r, -alpha, workspace->q, N ); + r_norm = Norm(workspace->r, N); + //fprintf( stderr, "res: %.15e\n", 
r_norm ); + + Forward_Subs( L, workspace->r, workspace->d ); + Backward_Subs( U, workspace->d, workspace->d ); + sig_old = sig_new; + sig_new = Dot( workspace->r, workspace->d, N ); + beta = sig_new / sig_old; + Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N ); + } + + //fprintf( fout, "CG took %d iterations\n", i ); + if( i >= 200 ) { + fprintf( stderr, "CG convergence failed!\n" ); + return i; + } + + return i; + } + + + int CG( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout, reax_system *system) + { + int i, j, N; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new, sig0; + + N = H->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); + + Sparse_MatVec( H, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig_new = Dot( workspace->r, workspace->d, N ); + sig0 = sig_new; + //Print_Soln( workspace, x, q, b, N ); + //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", + // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) ); + //fprintf( stderr, "sig_new: %f\n", sig_new ); + + for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { + //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) { + Sparse_MatVec( H, workspace->d, workspace->q ); + tmp = Dot( workspace->d, workspace->q, N ); + //fprintf( stderr, "tmp: %f\n", tmp ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->d, N ); + //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", + // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); + + Vector_Add( workspace->r, -alpha, workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig_old = sig_new; + sig_new = Dot( workspace->r, workspace->p, N ); + beta = sig_new / sig_old; + Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N ); 
+ //fprintf( stderr, "sig_new: %f\n", sig_new ); + } + + fprintf( stderr, "CG took %d iterations\n", i ); + + if( i >= 300 ) { + fprintf( stderr, "CG convergence failed!\n" ); + return i; + } + + return i; + } + + + + /* Steepest Descent */ + int SDM( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout ) + { + int i, j, N; + real tmp, alpha, beta, b_norm; + real sig0, sig; + + N = H->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); + + Sparse_MatVec( H, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig = Dot( workspace->r, workspace->d, N ); + sig0 = sig; + + for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) { + Sparse_MatVec( H, workspace->d, workspace->q ); + + sig = Dot( workspace->r, workspace->d, N ); + tmp = Dot( workspace->d, workspace->q, N ); + alpha = sig / tmp; + + Vector_Add( x, alpha, workspace->d, N ); + Vector_Add( workspace->r, -alpha, workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", + // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); + } + + fprintf( stderr, "SDM took %d iterations\n", i ); + + if( i >= 300 ) { + fprintf( stderr, "SDM convergence failed!\n" ); + return i; + } + + return i; + } diff --git a/PuReMD-GPU/src/QEq.cu b/PuReMD-GPU/src/QEq.cu index 03f3fe74..5d849b26 100644 --- a/PuReMD-GPU/src/QEq.cu +++ b/PuReMD-GPU/src/QEq.cu @@ -36,416 +36,416 @@ HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) { - sparse_matrix_entry temp = array[index1]; - array[index1] = array[index2]; - array[index2] = temp; + sparse_matrix_entry temp = array[index1]; + array[index1] = array[index2]; + array[index2] = temp; } HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end) { - int i = 
start; - int k = end; - - if (end - start >= 1) - { - int pivot = array[start].j; - - while (k > i) - { - while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++; - while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--; - if (k > i) swap(array, i, k); - } - swap(array, start, k); - quick_sort(array, start, k - 1); - quick_sort(array, k + 1, end); - } + int i = start; + int k = end; + + if (end - start >= 1) + { + int pivot = array[start].j; + + while (k > i) + { + while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++; + while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--; + if (k > i) swap(array, i, k); + } + swap(array, start, k); + quick_sort(array, start, k - 1); + quick_sort(array, k + 1, end); + } } int compare_matrix_entry(const void *v1, const void *v2) { - return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; + return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; } void Sort_Matrix_Rows( sparse_matrix *A ) { - int i, si, ei; - - for( i = 0; i < A->n; ++i ) { - si = A->start[i]; - ei = A->start[i+1]; - qsort( &(A->entries[si]), ei - si, - sizeof(sparse_matrix_entry), compare_matrix_entry ); - } + int i, si, ei; + + for( i = 0; i < A->n; ++i ) { + si = A->start[i]; + ei = A->start[i+1]; + qsort( &(A->entries[si]), ei - si, + sizeof(sparse_matrix_entry), compare_matrix_entry ); + } } GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A ) { - int i; - int si, ei; + int i; + int si, ei; - i = blockIdx.x * blockDim.x + threadIdx.x; + i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= A.n ) return; + if ( i >= A.n ) return; - si = A.start[i]; - ei = A.end [i]; + si = A.start[i]; + ei = A.end [i]; - quick_sort( A.entries + si, 0, ei-si-1 ); + quick_sort( A.entries + si, 0, ei-si-1 ); } void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol ) { - int i, j, k; - real val; - - /* init droptol to 0 */ - for( i = 0; i < A->n; ++i ) - droptol[i] = 0; - - /* calculate sqaure of the norm of each 
row */ - for( i = 0; i < A->n; ++i ) { - for( k = A->start[i]; k < A->start[i+1]-1; ++k ) { - j = A->entries[k].j; - val = A->entries[k].val; - - droptol[i] += val*val; - droptol[j] += val*val; - } - - val = A->entries[k].val; // diagonal entry - droptol[i] += val*val; - } - - /* calculate local droptol for each row */ - //fprintf( stderr, "droptol: " ); - for( i = 0; i < A->n; ++i ) { - //fprintf( stderr, "%f-->", droptol[i] ); - droptol[i] = SQRT( droptol[i] ) * dtol; - //fprintf( stderr, "%f ", droptol[i] ); - } - //fprintf( stderr, "\n" ); + int i, j, k; + real val; + + /* init droptol to 0 */ + for( i = 0; i < A->n; ++i ) + droptol[i] = 0; + + /* calculate sqaure of the norm of each row */ + for( i = 0; i < A->n; ++i ) { + for( k = A->start[i]; k < A->start[i+1]-1; ++k ) { + j = A->entries[k].j; + val = A->entries[k].val; + + droptol[i] += val*val; + droptol[j] += val*val; + } + + val = A->entries[k].val; // diagonal entry + droptol[i] += val*val; + } + + /* calculate local droptol for each row */ + //fprintf( stderr, "droptol: " ); + for( i = 0; i < A->n; ++i ) { + //fprintf( stderr, "%f-->", droptol[i] ); + droptol[i] = SQRT( droptol[i] ) * dtol; + //fprintf( stderr, "%f ", droptol[i] ); + } + //fprintf( stderr, "\n" ); } GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; - real val; - sparse_matrix *A = &p_A; + int i = blockIdx.x * blockDim.x + threadIdx.x; + int k, j, offset, x, diagnol; + real val; + sparse_matrix *A = &p_A; - if ( i < A->n ) { - droptol [i] = 0; + if ( i < A->n ) { + droptol [i] = 0; - for (k = A->start[i]; k < A->end[i]; ++k ) { - val = A->entries[k].val; - droptol [i] += val*val; - } - } + for (k = A->start[i]; k < A->end[i]; ++k ) { + val = A->entries[k].val; + droptol [i] += val*val; + } + } - __syncthreads (); - if ( i < A->n ) { - droptol [i] = SQRT (droptol[i]) * dtol; - } + __syncthreads (); + if ( i < A->n ) { + 
droptol [i] = SQRT (droptol[i]) * dtol; + } } GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; - real val; - sparse_matrix *A = &p_A; - - for (x = 0; x < A->n; x ++) - { - if (i < (A->end[i]-1 - A->start[i])) { - offset = A->start [i] + i; - j = A->entries[offset].j; - val = A->entries[offset].val; - droptol [j] += val * val; - } - __syncthreads (); - } + int i = blockIdx.x * blockDim.x + threadIdx.x; + int k, j, offset, x, diagnol; + real val; + sparse_matrix *A = &p_A; + + for (x = 0; x < A->n; x ++) + { + if (i < (A->end[i]-1 - A->start[i])) { + offset = A->start [i] + i; + j = A->entries[offset].j; + val = A->entries[offset].val; + droptol [j] += val * val; + } + __syncthreads (); + } } GLOBAL void Cuda_Calculate_Droptol_diagnol ( sparse_matrix p_A, real *droptol, real dtol ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; - real val; - sparse_matrix *A = &p_A; - - if ( i < A->n ) { - //diagnol element - diagnol = A->end[i]-1; - val = A->entries [diagnol].val; - droptol [i] += val*val; - } - - /*calculate local droptol for each row*/ - if ( i < A->n ) - droptol [i] = SQRT (droptol[i]) * dtol; + int i = blockIdx.x * blockDim.x + threadIdx.x; + int k, j, offset, x, diagnol; + real val; + sparse_matrix *A = &p_A; + + if ( i < A->n ) { + //diagnol element + diagnol = A->end[i]-1; + val = A->entries [diagnol].val; + droptol [i] += val*val; + } + + /*calculate local droptol for each row*/ + if ( i < A->n ) + droptol [i] = SQRT (droptol[i]) * dtol; } int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) { - int i, j, pj; - int fillin; - real val; + int i, j, pj; + int fillin; + real val; - fillin = 0; + fillin = 0; - //fprintf( stderr, "n: %d\n", A->n ); - for( i = 0; i < A->n; ++i ) - for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ - j = A->entries[pj].j; - val = A->entries[pj].val; - //fprintf( stderr, "i: 
%d, j: %d", i, j ); + //fprintf( stderr, "n: %d\n", A->n ); + for( i = 0; i < A->n; ++i ) + for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ + j = A->entries[pj].j; + val = A->entries[pj].val; + //fprintf( stderr, "i: %d, j: %d", i, j ); - if( fabs(val) > droptol[i] ) - ++fillin; - } + if( fabs(val) > droptol[i] ) + ++fillin; + } - return fillin + A->n; + return fillin + A->n; } GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin) { - int i, j, pj; - real val; - sparse_matrix *A = &p_A; + int i, j, pj; + real val; + sparse_matrix *A = &p_A; - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= A->n) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= A->n) return; - fillin [i] = 0; + fillin [i] = 0; - for (pj = A->start[i]; pj < A->end[i]-1; ++pj) - { - j = A->entries [pj].j; - val = A->entries[pj].val; + for (pj = A->start[i]; pj < A->end[i]-1; ++pj) + { + j = A->entries [pj].j; + val = A->entries[pj].val; - if (fabs (val) > droptol [i]) ++fillin [i]; - } + if (fabs (val) > droptol [i]) ++fillin [i]; + } } void ICHOLT( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) + sparse_matrix *L, sparse_matrix *U ) { - sparse_matrix_entry tmp[1000]; - int i, j, pj, k1, k2, tmptop, Ltop; - real val; - int *Utop; - - Utop = (int*) malloc((A->n+1) * sizeof(int)); - - // clear variables - Ltop = 0; - tmptop = 0; - for( i = 0; i <= A->n; ++i ) - L->start[i] = U->start[i] = 0; - - for( i = 0; i < A->n; ++i ) - Utop[i] = 0; - - //fprintf( stderr, "n: %d\n", A->n ); - for( i = 0; i < A->n; ++i ){ - L->start[i] = Ltop; - tmptop = 0; - - for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ - j = A->entries[pj].j; - val = A->entries[pj].val; - //fprintf( stderr, "i: %d, j: %d", i, j ); - - if( fabs(val) > droptol[i] ){ - k1 = 0; - k2 = L->start[j]; - while( k1 < tmptop && k2 < L->start[j+1] ){ - if( tmp[k1].j < L->entries[k2].j ) - ++k1; - else if( tmp[k1].j > L->entries[k2].j ) - ++k2; - else - val -= 
(tmp[k1++].val * L->entries[k2++].val); - } - - // L matrix is lower triangular, - // so right before the start of next row comes jth diagonal - val /= L->entries[L->start[j+1]-1].val; - - tmp[tmptop].j = j; - tmp[tmptop].val = val; - ++tmptop; - } - //fprintf( stderr, " -- done\n" ); - } - - // compute the ith diagonal in L - // sanity check - if( A->entries[pj].j != i ) { - fprintf( stderr, "i=%d, badly built A matrix!\n", i ); - exit(999); - } - - val = A->entries[pj].val; - for( k1 = 0; k1 < tmptop; ++k1 ) - val -= (tmp[k1].val * tmp[k1].val); - - tmp[tmptop].j = i; - tmp[tmptop].val = SQRT(val); - - // apply the dropping rule once again - //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); - //for( k1 = 0; k1<= tmptop; ++k1 ) - // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); - for( k1 = 0; k1 < tmptop; ++k1 ) - if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - U->start[tmp[k1].j+1]++; - ++Ltop; - //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); - } - // keep the diagonal in any case - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - ++Ltop; - //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); - } - - L->start[i] = Ltop; - //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); - - for( i = 1; i <= U->n; ++i ) - Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; - - for( i = 0; i < L->n; ++i ) - for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ - j = L->entries[pj].j; - U->entries[Utop[j]].j = i; - U->entries[Utop[j]].val = L->entries[pj].val; - Utop[j]++; - } - - //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); + sparse_matrix_entry tmp[1000]; + int i, j, pj, k1, k2, tmptop, Ltop; + real val; + int *Utop; + + Utop = (int*) malloc((A->n+1) * sizeof(int)); + + // clear variables + Ltop = 0; + tmptop = 
0; + for( i = 0; i <= A->n; ++i ) + L->start[i] = U->start[i] = 0; + + for( i = 0; i < A->n; ++i ) + Utop[i] = 0; + + //fprintf( stderr, "n: %d\n", A->n ); + for( i = 0; i < A->n; ++i ){ + L->start[i] = Ltop; + tmptop = 0; + + for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ + j = A->entries[pj].j; + val = A->entries[pj].val; + //fprintf( stderr, "i: %d, j: %d", i, j ); + + if( fabs(val) > droptol[i] ){ + k1 = 0; + k2 = L->start[j]; + while( k1 < tmptop && k2 < L->start[j+1] ){ + if( tmp[k1].j < L->entries[k2].j ) + ++k1; + else if( tmp[k1].j > L->entries[k2].j ) + ++k2; + else + val -= (tmp[k1++].val * L->entries[k2++].val); + } + + // L matrix is lower triangular, + // so right before the start of next row comes jth diagonal + val /= L->entries[L->start[j+1]-1].val; + + tmp[tmptop].j = j; + tmp[tmptop].val = val; + ++tmptop; + } + //fprintf( stderr, " -- done\n" ); + } + + // compute the ith diagonal in L + // sanity check + if( A->entries[pj].j != i ) { + fprintf( stderr, "i=%d, badly built A matrix!\n", i ); + exit(999); + } + + val = A->entries[pj].val; + for( k1 = 0; k1 < tmptop; ++k1 ) + val -= (tmp[k1].val * tmp[k1].val); + + tmp[tmptop].j = i; + tmp[tmptop].val = SQRT(val); + + // apply the dropping rule once again + //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); + //for( k1 = 0; k1<= tmptop; ++k1 ) + // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); + for( k1 = 0; k1 < tmptop; ++k1 ) + if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + U->start[tmp[k1].j+1]++; + ++Ltop; + //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); + } + // keep the diagonal in any case + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + ++Ltop; + //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); + } + + L->start[i] = Ltop; + //fprintf( stderr, 
"nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + + for( i = 1; i <= U->n; ++i ) + Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; + + for( i = 0; i < L->n; ++i ) + for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ + j = L->entries[pj].j; + U->entries[Utop[j]].j = i; + U->entries[Utop[j]].val = L->entries[pj].val; + Utop[j]++; + } + + //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); } void Cuda_ICHOLT( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) + sparse_matrix *L, sparse_matrix *U ) { - sparse_matrix_entry tmp[1000]; - int i, j, pj, k1, k2, tmptop, Ltop; - real val; - int *Utop; - - Utop = (int*) malloc((A->n+1) * sizeof(int)); - - // clear variables - Ltop = 0; - tmptop = 0; - for( i = 0; i <= A->n; ++i ) - L->start[i] = U->start[i] = 0; - - for( i = 0; i < A->n; ++i ) - Utop[i] = 0; - - //fprintf( stderr, "n: %d\n", A->n ); - for( i = 0; i < A->n; ++i ){ - L->start[i] = Ltop; - tmptop = 0; - - for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){ - j = A->entries[pj].j; - val = A->entries[pj].val; - //fprintf( stderr, "i: %d, j: %d", i, j ); - - //CHANGE ORIGINAL - if (j >= i) break; - //CHANGE ORIGINAL - - if( fabs(val) > droptol[i] ){ - k1 = 0; - k2 = L->start[j]; - while( k1 < tmptop && k2 < L->start[j+1] ){ - if( tmp[k1].j < L->entries[k2].j ) - ++k1; - else if( tmp[k1].j > L->entries[k2].j ) - ++k2; - else - val -= (tmp[k1++].val * L->entries[k2++].val); - } - - // L matrix is lower triangular, - // so right before the start of next row comes jth diagonal - val /= L->entries[L->start[j+1]-1].val; - - tmp[tmptop].j = j; - tmp[tmptop].val = val; - ++tmptop; - } - - //fprintf( stderr, " -- done\n" ); - } - - // compute the ith diagonal in L - // sanity check - if( A->entries[pj].j != i ) { - fprintf( stderr, "i=%d, badly built A matrix!\n", i ); - exit(999); - } - - val = A->entries[pj].val; - for( k1 = 0; k1 < tmptop; ++k1 ) - val -= (tmp[k1].val * tmp[k1].val); - - tmp[tmptop].j = i; - tmp[tmptop].val = 
SQRT(val); - - // apply the dropping rule once again - //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); - //for( k1 = 0; k1<= tmptop; ++k1 ) - // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); - for( k1 = 0; k1 < tmptop; ++k1 ) - if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - U->start[tmp[k1].j+1]++; - ++Ltop; - //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); - } - // keep the diagonal in any case - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - ++Ltop; - //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); - } - - L->start[i] = Ltop; - //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); - - for( i = 1; i <= U->n; ++i ) - Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; - - for( i = 0; i < L->n; ++i ) - for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ - j = L->entries[pj].j; - U->entries[Utop[j]].j = i; - U->entries[Utop[j]].val = L->entries[pj].val; - Utop[j]++; - } - - //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); + sparse_matrix_entry tmp[1000]; + int i, j, pj, k1, k2, tmptop, Ltop; + real val; + int *Utop; + + Utop = (int*) malloc((A->n+1) * sizeof(int)); + + // clear variables + Ltop = 0; + tmptop = 0; + for( i = 0; i <= A->n; ++i ) + L->start[i] = U->start[i] = 0; + + for( i = 0; i < A->n; ++i ) + Utop[i] = 0; + + //fprintf( stderr, "n: %d\n", A->n ); + for( i = 0; i < A->n; ++i ){ + L->start[i] = Ltop; + tmptop = 0; + + for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){ + j = A->entries[pj].j; + val = A->entries[pj].val; + //fprintf( stderr, "i: %d, j: %d", i, j ); + + //CHANGE ORIGINAL + if (j >= i) break; + //CHANGE ORIGINAL + + if( fabs(val) > droptol[i] ){ + k1 = 0; + k2 = L->start[j]; + while( k1 < tmptop && k2 < L->start[j+1] ){ + if( tmp[k1].j < L->entries[k2].j ) + ++k1; + 
else if( tmp[k1].j > L->entries[k2].j ) + ++k2; + else + val -= (tmp[k1++].val * L->entries[k2++].val); + } + + // L matrix is lower triangular, + // so right before the start of next row comes jth diagonal + val /= L->entries[L->start[j+1]-1].val; + + tmp[tmptop].j = j; + tmp[tmptop].val = val; + ++tmptop; + } + + //fprintf( stderr, " -- done\n" ); + } + + // compute the ith diagonal in L + // sanity check + if( A->entries[pj].j != i ) { + fprintf( stderr, "i=%d, badly built A matrix!\n", i ); + exit(999); + } + + val = A->entries[pj].val; + for( k1 = 0; k1 < tmptop; ++k1 ) + val -= (tmp[k1].val * tmp[k1].val); + + tmp[tmptop].j = i; + tmp[tmptop].val = SQRT(val); + + // apply the dropping rule once again + //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); + //for( k1 = 0; k1<= tmptop; ++k1 ) + // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); + for( k1 = 0; k1 < tmptop; ++k1 ) + if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + U->start[tmp[k1].j+1]++; + ++Ltop; + //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); + } + // keep the diagonal in any case + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + ++Ltop; + //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); + } + + L->start[i] = Ltop; + //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + + for( i = 1; i <= U->n; ++i ) + Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; + + for( i = 0; i < L->n; ++i ) + for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ + j = L->entries[pj].j; + U->entries[Utop[j]].j = i; + U->entries[Utop[j]].val = L->entries[pj].val; + Utop[j]++; + } + + //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); } @@ -534,29 +534,29 @@ __syncthreads (); // sanity check if (kid == 0) { - if( A->entries[end].j != i ) { - //intentional 
core dump here for sanity sake - *null_val = 1; - } + if( A->entries[end].j != i ) { + //intentional core dump here for sanity sake + *null_val = 1; + } } //diagnol element //val = A->entries[pj].val; //for( k1 = 0; k1 < tmptop; ++k1 ) if (kid < count) - tmp_val[kid] = (tmp[kid].val * tmp[kid].val); + tmp_val[kid] = (tmp[kid].val * tmp[kid].val); - __syncthreads (); + __syncthreads (); if (kid == 0) { - val = A->entries [end].val; - for (i = 0; i < count; i++) - tempvalue += tmp_val [i]; + val = A->entries [end].val; + for (i = 0; i < count; i++) + tempvalue += tmp_val [i]; - val -= tempvalue; - tmp[tmptop].j = i; - tmp[tmptop].val = SQRT(val); + val -= tempvalue; + tmp[tmptop].j = i; + tmp[tmptop].val = SQRT(val); } __syncthreads (); @@ -564,510 +564,510 @@ __syncthreads (); //for( k1 = 0; k1 < count; ++k1 ) if (kid < count ) { - if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){ - L->entries[offset + kid].j = tmp[kid].j; - L->entries[offset + kid].val = tmp[kid].val; - U->start[tmp[kid].j+1]++; - } + if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){ + L->entries[offset + kid].j = tmp[kid].j; + L->entries[offset + kid].val = tmp[kid].val; + U->start[tmp[kid].j+1]++; + } } __syncthreads (); if (kid == 0) { - // keep the diagonal in any case - offset += count; - L->entries[offset].j = tmp[count].j; - L->entries[offset].val = tmp[count].val; - ++offset; - L->end [i] = offset; + // keep the diagonal in any case + offset += count; + L->entries[offset].j = tmp[count].j; + L->entries[offset].val = tmp[count].val; + ++offset; + L->end [i] = offset; } __syncthreads (); } // end of main for loop } -void Cuda_Fill_U ( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) +void Cuda_Fill_U ( sparse_matrix *A, real *droptol, + sparse_matrix *L, sparse_matrix *U ) { - int i, pj, j; - - for( i = 1; i <= U->n; ++i ) - Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; - - for( i = 0; i < L->n; ++i ) - for( pj = L->start[i]; pj < L->start[i+1]; 
++pj ){ - j = L->entries[pj].j; - U->entries[Utop[j]].j = i; - U->entries[Utop[j]].val = L->entries[pj].val; - Utop[j]++; - } + int i, pj, j; + + for( i = 1; i <= U->n; ++i ) + Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; + + for( i = 0; i < L->n; ++i ) + for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ + j = L->entries[pj].j; + U->entries[Utop[j]].j = i; + U->entries[Utop[j]].val = L->entries[pj].val; + Utop[j]++; + } } */ void Init_MatVec( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list *far_nbrs ) + simulation_data *data, static_storage *workspace, + list *far_nbrs ) { - int i, fillin; - real s_tmp, t_tmp; - //char fname[100]; + int i, fillin; + real s_tmp, t_tmp; + //char fname[100]; - if(control->refactor > 0 && - ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){ - //Print_Linear_System( system, control, workspace, data->step ); - Sort_Matrix_Rows( &workspace->H ); + if(control->refactor > 0 && + ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){ + //Print_Linear_System( system, control, workspace, data->step ); + Sort_Matrix_Rows( &workspace->H ); - //fprintf( stderr, "H matrix sorted\n" ); + //fprintf( stderr, "H matrix sorted\n" ); - Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); - //fprintf( stderr, "drop tolerances calculated\n" ); + Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); + //fprintf( stderr, "drop tolerances calculated\n" ); - if( workspace->L.entries == NULL ) { - fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol ); + if( workspace->L.entries == NULL ) { + fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol ); #ifdef __DEBUG_CUDA__ - fprintf( stderr, "fillin = %d\n", fillin ); + fprintf( stderr, "fillin = %d\n", fillin ); #endif - if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || - Allocate_Matrix( &(workspace->U), 
far_nbrs->n, fillin ) == 0 ){ - fprintf( stderr, "not enough memory for LU matrices. terminating.\n" ); - exit(INSUFFICIENT_SPACE); - } + if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || + Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){ + fprintf( stderr, "not enough memory for LU matrices. terminating.\n" ); + exit(INSUFFICIENT_SPACE); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "fillin = %d\n", fillin ); - fprintf( stderr, "allocated memory: L = U = %ldMB\n", - fillin * sizeof(sparse_matrix_entry) / (1024*1024) ); + fprintf( stderr, "fillin = %d\n", fillin ); + fprintf( stderr, "allocated memory: L = U = %ldMB\n", + fillin * sizeof(sparse_matrix_entry) / (1024*1024) ); #endif - } + } - ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U ); + ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "icholt-" ); - //sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); - //Print_Sparse_Matrix2( workspace->L, fname ); - //Print_Sparse_Matrix( U ); + fprintf( stderr, "icholt-" ); + //sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + //Print_Sparse_Matrix2( workspace->L, fname ); + //Print_Sparse_Matrix( U ); #endif - } - - /* extrapolation for s & t */ - for( i = 0; i < system->N; ++i ) { - // no extrapolation - //s_tmp = workspace->s[0][i]; - //t_tmp = workspace->t[0][i]; - - // linear - //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; - //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; - - // quadratic - //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); - t_tmp = workspace->t[index_wkspace_sys(2,i,system)] + 3*(workspace->t[index_wkspace_sys(0,i,system)]-workspace->t[index_wkspace_sys(1,i,system)]); - - // cubic - s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system)] + workspace->s[index_wkspace_sys(2,i,system)]) - - (6 * workspace->s[index_wkspace_sys(1,i,system)] + 
workspace->s[index_wkspace_sys(3,i,system)] ); - //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - - // (6 * workspace->t[1][i] + workspace->t[3][i] ); - - // 4th order - //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + - // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; - //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + - // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; - - workspace->s[index_wkspace_sys(4,i,system)] = workspace->s[index_wkspace_sys(3,i,system)]; - workspace->s[index_wkspace_sys(3,i,system)] = workspace->s[index_wkspace_sys(2,i,system)]; - workspace->s[index_wkspace_sys(2,i,system)] = workspace->s[index_wkspace_sys(1,i,system)]; - workspace->s[index_wkspace_sys(1,i,system)] = workspace->s[index_wkspace_sys(0,i,system)]; - workspace->s[index_wkspace_sys(0,i,system)] = s_tmp; - - workspace->t[index_wkspace_sys(4,i,system)] = workspace->t[index_wkspace_sys(3,i,system)]; - workspace->t[index_wkspace_sys(3,i,system)] = workspace->t[index_wkspace_sys(2,i,system)]; - workspace->t[index_wkspace_sys(2,i,system)] = workspace->t[index_wkspace_sys(1,i,system)]; - workspace->t[index_wkspace_sys(1,i,system)] = workspace->t[index_wkspace_sys(0,i,system)]; - workspace->t[index_wkspace_sys(0,i,system)] = t_tmp; - } + } + + /* extrapolation for s & t */ + for( i = 0; i < system->N; ++i ) { + // no extrapolation + //s_tmp = workspace->s[0][i]; + //t_tmp = workspace->t[0][i]; + + // linear + //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; + //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; + + // quadratic + //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); + t_tmp = workspace->t[index_wkspace_sys(2,i,system)] + 3*(workspace->t[index_wkspace_sys(0,i,system)]-workspace->t[index_wkspace_sys(1,i,system)]); + + // cubic + s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system)] + workspace->s[index_wkspace_sys(2,i,system)]) - + (6 * 
workspace->s[index_wkspace_sys(1,i,system)] + workspace->s[index_wkspace_sys(3,i,system)] ); + //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - + // (6 * workspace->t[1][i] + workspace->t[3][i] ); + + // 4th order + //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + + // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; + //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + + // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; + + workspace->s[index_wkspace_sys(4,i,system)] = workspace->s[index_wkspace_sys(3,i,system)]; + workspace->s[index_wkspace_sys(3,i,system)] = workspace->s[index_wkspace_sys(2,i,system)]; + workspace->s[index_wkspace_sys(2,i,system)] = workspace->s[index_wkspace_sys(1,i,system)]; + workspace->s[index_wkspace_sys(1,i,system)] = workspace->s[index_wkspace_sys(0,i,system)]; + workspace->s[index_wkspace_sys(0,i,system)] = s_tmp; + + workspace->t[index_wkspace_sys(4,i,system)] = workspace->t[index_wkspace_sys(3,i,system)]; + workspace->t[index_wkspace_sys(3,i,system)] = workspace->t[index_wkspace_sys(2,i,system)]; + workspace->t[index_wkspace_sys(2,i,system)] = workspace->t[index_wkspace_sys(1,i,system)]; + workspace->t[index_wkspace_sys(1,i,system)] = workspace->t[index_wkspace_sys(0,i,system)]; + workspace->t[index_wkspace_sys(0,i,system)] = t_tmp; + } } -void Cuda_Init_MatVec( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list *far_nbrs ) +void Cuda_Init_MatVec( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list *far_nbrs ) { - int i, fillin; - real s_tmp, t_tmp; - int *spad = (int *)scratch; - real start = 0, end = 0; + int i, fillin; + real s_tmp, t_tmp; + int *spad = (int *)scratch; + real start = 0, end = 0; - if(control->refactor > 0 && - ((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){ + if(control->refactor > 0 && + 
((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){ - Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H ); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H ); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Sorting done... \n"); + fprintf (stderr, "Sorting done... \n"); #endif - Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H, dev_workspace->droptol, control->droptol ); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H, dev_workspace->droptol, control->droptol ); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Droptol done... \n"); + fprintf (stderr, "Droptol done... \n"); #endif - if( dev_workspace->L.entries == NULL ) { + if( dev_workspace->L.entries == NULL ) { - cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH ); - Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H, dev_workspace->droptol, spad ); - cudaThreadSynchronize (); - cudaCheckError (); + cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH ); + Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H, dev_workspace->droptol, spad ); + cudaThreadSynchronize (); + cudaCheckError (); - //Reduction for fill in - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + //Reduction for fill in + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); - Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> - (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_reduction <<<1, 
BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> + (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); - fillin += dev_workspace->H.n; + copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); + fillin += dev_workspace->H.n; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin ); + fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin ); #endif - dev_workspace->L.n = far_nbrs->n; - dev_workspace->L.m = fillin; - Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n ); + dev_workspace->L.n = far_nbrs->n; + dev_workspace->L.m = fillin; + Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n ); - dev_workspace->U.n = far_nbrs->n; - dev_workspace->U.m = fillin; - Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n ); - } + dev_workspace->U.n = far_nbrs->n; + dev_workspace->U.m = fillin; + Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n ); + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, "LU matrix done...\n"); + fprintf (stderr, "LU matrix done...\n"); #endif - //TODO -- This is the ILU Factorization of the H Matrix. - //This is present in the CUDA 5.0 compilation which is not working currently. - //Fix this when CUDA 5.0 is correctly setup. - //TODO - //shared memory is per block - // here we have only one block - - /* - fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries ); - Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, - system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE) >>> - ( system, dev_workspace->H, - dev_workspace->droptol, - dev_workspace->L, - dev_workspace->U ); - cudaThreadSynchronize (); - fprintf (stderr, "Cuda_ICHOLT .. 
done ...-> %d\n ", cudaGetLastError ()); - */ - - //1. copy the H matrix from device to host - //2. Allocate the L/U matrices on the host and device. - //3. Compute the L/U on the host - //4. copy the results to the device - //5. Continue the computation. - sparse_matrix t_H, t_L, t_U; - real *t_droptol; - - t_droptol = (real *) malloc (REAL_SIZE * system->N); + //TODO -- This is the ILU Factorization of the H Matrix. + //This is present in the CUDA 5.0 compilation which is not working currently. + //Fix this when CUDA 5.0 is correctly setup. + //TODO + //shared memory is per block + // here we have only one block - + /* + fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries ); + Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, + system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE) >>> + ( system, dev_workspace->H, + dev_workspace->droptol, + dev_workspace->L, + dev_workspace->U ); + cudaThreadSynchronize (); + fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ()); + */ + + //1. copy the H matrix from device to host + //2. Allocate the L/U matrices on the host and device. + //3. Compute the L/U on the host + //4. copy the results to the device + //5. Continue the computation. 
+ sparse_matrix t_H, t_L, t_U; + real *t_droptol; + + t_droptol = (real *) malloc (REAL_SIZE * system->N); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m ); + fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m ); #endif - start = Get_Time (); - if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);} - if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); } - if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); } - - copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY ); - - copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL ); - - //fprintf (stderr, " Done copying LUH .. \n"); - Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U); - - Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice); - end += Get_Timing_Info (start); - - /* - fprintf (stderr, "Done syncing .... \n"); - free (t_droptol); - fprintf (stderr, "Freed droptol ... \n"); - Deallocate_Matrix (&t_H); - fprintf (stderr, "Freed H ... \n"); - Deallocate_Matrix (&t_L); - fprintf (stderr, "Freed l ... \n"); - Deallocate_Matrix (&t_U); - fprintf (stderr, "Freed u ... 
\n"); - */ - - //#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end); - //#endif - - //#ifdef __BUILD_DEBUG__ - // validate_lu (workspace); - //#endif - } + start = Get_Time (); + if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);} + if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); } + if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); } + + copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY ); + + copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL ); + + //fprintf (stderr, " Done copying LUH .. \n"); + Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U); + + Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice); + end += Get_Timing_Info (start); + + /* + fprintf (stderr, "Done syncing .... \n"); + free (t_droptol); + fprintf (stderr, "Freed droptol ... \n"); + Deallocate_Matrix (&t_H); + fprintf (stderr, "Freed H ... \n"); + Deallocate_Matrix (&t_L); + fprintf (stderr, "Freed l ... \n"); + Deallocate_Matrix (&t_U); + fprintf (stderr, "Freed u ... 
\n"); + */ + + //#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end); + //#endif + + //#ifdef __BUILD_DEBUG__ + // validate_lu (workspace); + //#endif + } } GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N ) { - static_storage *workspace = &p_workspace; - real s_tmp, t_tmp; - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= N) return; - // no extrapolation - //s_tmp = workspace->s[0][i]; - //t_tmp = workspace->t[0][i]; - - // linear - //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; - //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; - - // quadratic - //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); - t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]); - - // cubic - s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - - (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] ); - //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - - // (6 * workspace->t[1][i] + workspace->t[3][i] ); - - // 4th order - //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + - // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; - //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + - // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; - - workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)]; - workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; - workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)]; - workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)]; - workspace->s[index_wkspace_sys(0,i,N)] = s_tmp; - - workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)]; - workspace->t[index_wkspace_sys(3,i,N)] = 
workspace->t[index_wkspace_sys(2,i,N)]; - workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)]; - workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)]; - workspace->t[index_wkspace_sys(0,i,N)] = t_tmp; + static_storage *workspace = &p_workspace; + real s_tmp, t_tmp; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= N) return; + // no extrapolation + //s_tmp = workspace->s[0][i]; + //t_tmp = workspace->t[0][i]; + + // linear + //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; + //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; + + // quadratic + //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); + t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]); + + // cubic + s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - + (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] ); + //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - + // (6 * workspace->t[1][i] + workspace->t[3][i] ); + + // 4th order + //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + + // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; + //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + + // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; + + workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)]; + workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; + workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)]; + workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)]; + workspace->s[index_wkspace_sys(0,i,N)] = s_tmp; + + workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)]; + workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; + 
workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)]; + workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)]; + workspace->t[index_wkspace_sys(0,i,N)] = t_tmp; } void Calculate_Charges( reax_system *system, static_storage *workspace ) { - int i; - real u, s_sum, t_sum; + int i; + real u, s_sum, t_sum; - s_sum = t_sum = 0.; - for( i = 0; i < system->N; ++i ) { - s_sum += workspace->s[index_wkspace_sys(0,i,system)]; - t_sum += workspace->t[index_wkspace_sys(0,i,system)]; - } + s_sum = t_sum = 0.; + for( i = 0; i < system->N; ++i ) { + s_sum += workspace->s[index_wkspace_sys(0,i,system)]; + t_sum += workspace->t[index_wkspace_sys(0,i,system)]; + } - u = s_sum / t_sum; + u = s_sum / t_sum; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); + fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); #endif - for( i = 0; i < system->N; ++i ) - system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)]; + for( i = 0; i < system->N; ++i ) + system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)]; } GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N) { - int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i >= N) return; + int i = blockIdx.x*blockDim.x + threadIdx.x; + if (i >= N) return; - atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)]; + atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)]; } void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace) { - real *spad = (real *) scratch; - real u, s_sum, t_sum; + real *spad = (real *) scratch; + real u, s_sum, t_sum; - cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); + cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); - //s_sum - Cuda_reduction 
<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + //s_sum + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad, system->N); + cudaThreadSynchronize (); + cudaCheckError (); - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - //t_sum - cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + //t_sum + cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad, system->N); + cudaThreadSynchronize (); + cudaCheckError (); - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - //fraction here - u = s_sum / t_sum; + 
//fraction here + u = s_sum / t_sum; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); + fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); #endif - Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>> - ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>> + ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } void QEq( reax_system *system, control_params *control, simulation_data *data, - static_storage *workspace, list *far_nbrs, - output_controls *out_control ) + static_storage *workspace, list *far_nbrs, + output_controls *out_control ) { - int matvecs; + int matvecs; - //real t_start, t_elapsed; + //real t_start, t_elapsed; - //t_start = Get_Time (); - Init_MatVec( system, control, data, workspace, far_nbrs ); - //t_elapsed = Get_Timing_Info ( t_start ); + //t_start = Get_Time (); + Init_MatVec( system, control, data, workspace, far_nbrs ); + //t_elapsed = Get_Timing_Info ( t_start ); - //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed ); + //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed ); - //if( data->step % 10 == 0 ) - // Print_Linear_System( system, control, workspace, far_nbrs, data->step ); + //if( data->step % 10 == 0 ) + // Print_Linear_System( system, control, workspace, far_nbrs, data->step ); - //t_start = Get_Time ( ); - matvecs = GMRES( workspace, &workspace->H, - workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system ); - matvecs += GMRES( workspace, &workspace->H, - workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system ); - //t_elapsed = Get_Timing_Info ( t_start ); + //t_start = Get_Time ( ); + matvecs = GMRES( workspace, &workspace->H, + workspace->b_s, control->q_err, 
&workspace->s[0], out_control->log, system ); + matvecs += GMRES( workspace, &workspace->H, + workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system ); + //t_elapsed = Get_Timing_Info ( t_start ); - //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed ); + //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed ); - //matvecs = GMRES_HouseHolder( workspace, workspace->H, - // workspace->b_s, control->q_err, workspace->s[0], out_control->log ); - //matvecs += GMRES_HouseHolder( workspace, workspace->H, - // workspace->b_t, control->q_err, workspace->t[0], out_control->log ); + //matvecs = GMRES_HouseHolder( workspace, workspace->H, + // workspace->b_s, control->q_err, workspace->s[0], out_control->log ); + //matvecs += GMRES_HouseHolder( workspace, workspace->H, + // workspace->b_t, control->q_err, workspace->t[0], out_control->log ); - //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err, - // &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system)], out_control->log, system ); - //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err, - // &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system)], out_control->log, system ); + //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err, + // &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system)], out_control->log, system ); + //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err, + // &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system)], out_control->log, system ); - //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, - // workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1; - ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, - // workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1; + //matvecs=PCG( workspace, workspace->H, workspace->b_s, 
control->q_err, + // workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1; + ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, + // workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1; - //matvecs = CG( workspace, workspace->H, - // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; - //matvecs += CG( workspace, workspace->H, - // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; + //matvecs = CG( workspace, workspace->H, + // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; + //matvecs += CG( workspace, workspace->H, + // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; - //matvecs = SDM( workspace, workspace->H, - // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; - //matvecs += SDM( workspace, workspace->H, - // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; + //matvecs = SDM( workspace, workspace->H, + // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; + //matvecs += SDM( workspace, workspace->H, + // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; - //fprintf (stderr, " GMRES done with iterations %d \n", matvecs ); + //fprintf (stderr, " GMRES done with iterations %d \n", matvecs ); - data->timing.matvecs += matvecs; + data->timing.matvecs += matvecs; #if defined(DEBUG_FOCUS) - fprintf( stderr, "linsolve-" ); + fprintf( stderr, "linsolve-" ); #endif - Calculate_Charges( system, workspace ); - //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", - // data->step, - // workspace->s[0][0], workspace->t[0][0], - // workspace->s[0][1], workspace->t[0][1], - // workspace->s[0][2], workspace->t[0][2] ); - // if( data->step == control->nsteps ) - //Print_Charges( system, control, workspace, data->step ); + Calculate_Charges( system, workspace ); + //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", + // data->step, 
+ // workspace->s[0][0], workspace->t[0][0], + // workspace->s[0][1], workspace->t[0][1], + // workspace->s[0][2], workspace->t[0][2] ); + // if( data->step == control->nsteps ) + //Print_Charges( system, control, workspace, data->step ); } void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, - static_storage *workspace, list *far_nbrs, - output_controls *out_control ) + static_storage *workspace, list *far_nbrs, + output_controls *out_control ) { - int matvecs = 0; - real t_start, t_elapsed; + int matvecs = 0; + real t_start, t_elapsed; #ifdef __DEBUG_CUDA__ - t_start = Get_Time (); + t_start = Get_Time (); #endif - /* - //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs ); + /* + //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs ); - Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H ); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H ); + cudaThreadSynchronize (); + cudaCheckError (); - t_elapsed = Get_Timing_Info (t_start); - fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed); - */ - Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>> - (*dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + t_elapsed = Get_Timing_Info (t_start); + fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed); + */ + Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>> + (*dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info (t_start); - fprintf (stderr, "Done with post processing of init_matvec --> %d with time ---> %f \n", cudaGetLastError (), t_elapsed); + t_elapsed = Get_Timing_Info (t_start); + fprintf (stderr, "Done with post processing of init_matvec --> %d with time ---> %f \n", cudaGetLastError (), t_elapsed); #endif - //Here goes the GMRES part of the program () - //#ifdef __DEBUG_CUDA__ - t_start = Get_Time (); - 
//#endif + //Here goes the GMRES part of the program () + //#ifdef __DEBUG_CUDA__ + t_start = Get_Time (); + //#endif - //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); - //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); + //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); + //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); - matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); - matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); + matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); + matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); - d_timing.matvecs += matvecs; + d_timing.matvecs += matvecs; #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info ( t_start ); - fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed ); + t_elapsed = Get_Timing_Info ( t_start ); + fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed ); #endif - //Here cuda calculate charges - Cuda_Calculate_Charges (system, workspace); + //Here cuda calculate charges + Cuda_Calculate_Charges (system, workspace); } diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/allocate.cu index 3de11aa4..37b80693 100644 --- a/PuReMD-GPU/src/allocate.cu +++ b/PuReMD-GPU/src/allocate.cu @@ -26,480 +26,480 @@ void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) { - Delete_List( far_nbrs ); - if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){ - fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); - exit( INIT_ERR ); - } + Delete_List( far_nbrs ); + if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){ + fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); + exit( INIT_ERR ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", - num_intrs, far_nbrs->num_intrs ); - fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", - num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); + fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", + num_intrs, far_nbrs->num_intrs ); + fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", + num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); #endif } void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) { - Delete_List( far_nbrs, TYP_DEVICE ); - if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE )){ - fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); - exit( INIT_ERR ); - } + Delete_List( far_nbrs, TYP_DEVICE ); + if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE )){ + fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); + exit( INIT_ERR ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", - num_intrs, far_nbrs->num_intrs ); - fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", - num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); + fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", + num_intrs, far_nbrs->num_intrs ); + fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", + num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); #endif } int Allocate_Matrix( sparse_matrix *H, int n, int m ) { - H->n = n; - H->m = m; - if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL ) - return 0; + H->n = n; + H->m = m; + if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL ) + return 0; - if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL ) - return 0; + if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL ) + return 0; - if( (H->entries = - (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL ) - return 0; + if( (H->entries = + (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL ) + return 0; - return 1; + return 1; } int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m ) { - H->n = n; - H->m = m; + H->n = n; + H->m = m; - cuda_malloc ((void **) &H->start, INT_SIZE * (n+1), 0, RES_SPARSE_MATRIX_INDEX ); - cuda_malloc ((void **) &H->end, INT_SIZE *(n+1), 0, RES_SPARSE_MATRIX_INDEX ); - cuda_malloc ((void **) &H->entries, SPARSE_MATRIX_ENTRY_SIZE * m, 0, RES_SPARSE_MATRIX_ENTRY ); + cuda_malloc ((void **) &H->start, INT_SIZE * (n+1), 0, RES_SPARSE_MATRIX_INDEX ); + cuda_malloc ((void **) &H->end, INT_SIZE *(n+1), 0, RES_SPARSE_MATRIX_INDEX ); + cuda_malloc ((void **) &H->entries, SPARSE_MATRIX_ENTRY_SIZE * m, 0, RES_SPARSE_MATRIX_ENTRY ); - return 1; + return 1; } void Deallocate_Matrix( sparse_matrix *H ) { - free(H->start); - free(H->entries); - free(H->end); + free(H->start); + free(H->entries); + free(H->end); } void Cuda_Deallocate_Matrix( 
sparse_matrix *H ) { - cuda_free(H->start, RES_SPARSE_MATRIX_INDEX); - cuda_free(H->end, RES_SPARSE_MATRIX_INDEX); - cuda_free(H->entries, RES_SPARSE_MATRIX_ENTRY); + cuda_free(H->start, RES_SPARSE_MATRIX_INDEX); + cuda_free(H->end, RES_SPARSE_MATRIX_INDEX); + cuda_free(H->entries, RES_SPARSE_MATRIX_ENTRY); - H->start = NULL; - H->end = NULL; - H->entries = NULL; + H->start = NULL; + H->end = NULL; + H->entries = NULL; } int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) { - Deallocate_Matrix( H ); - if( !Allocate_Matrix( H, n, m ) ) { - fprintf(stderr, "not enough space for %s matrix. terminating!\n", name); - exit( 1 ); - } + Deallocate_Matrix( H ); + if( !Allocate_Matrix( H, n, m ) ) { + fprintf(stderr, "not enough space for %s matrix. terminating!\n", name); + exit( 1 ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", - name, n, m ); - fprintf( stderr, "memory allocated: %s = %ldMB\n", - name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); + fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", + name, n, m ); + fprintf( stderr, "memory allocated: %s = %ldMB\n", + name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); #endif - return 1; + return 1; } int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) { - Cuda_Deallocate_Matrix( H ); + Cuda_Deallocate_Matrix( H ); - if( !Cuda_Allocate_Matrix( H, n, m ) ) { - fprintf(stderr, "not enough space for %s matrix on GPU . terminating!\n", name); - exit( 1 ); - } + if( !Cuda_Allocate_Matrix( H, n, m ) ) { + fprintf(stderr, "not enough space for %s matrix on GPU . 
terminating!\n", name); + exit( 1 ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", - name, n, m ); - fprintf( stderr, "memory allocated: %s = %ldMB\n", - name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); + fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", + name, n, m ); + fprintf( stderr, "memory allocated: %s = %ldMB\n", + name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); #endif - return 1; + return 1; } int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, - list *hbonds ) + list *hbonds ) { - int i, num_hbonds; - - num_hbonds = 0; - /* find starting indexes for each H and the total number of hbonds */ - for( i = 1; i < n; ++i ) - hb_top[i] += hb_top[i-1]; - num_hbonds = hb_top[n-1]; - - if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) { - fprintf( stderr, "not enough space for hbonds list. terminating!\n" ); - exit( INIT_ERR ); - } - - for( i = 0; i < n; ++i ) - if( h_index[i] == 0 ){ - Set_Start_Index( 0, 0, hbonds ); - Set_End_Index( 0, 0, hbonds ); - } - else if( h_index[i] > 0 ){ - Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); - Set_End_Index( h_index[i], hb_top[i-1], hbonds ); - } + int i, num_hbonds; + + num_hbonds = 0; + /* find starting indexes for each H and the total number of hbonds */ + for( i = 1; i < n; ++i ) + hb_top[i] += hb_top[i-1]; + num_hbonds = hb_top[n-1]; + + if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) { + fprintf( stderr, "not enough space for hbonds list. 
terminating!\n" ); + exit( INIT_ERR ); + } + + for( i = 0; i < n; ++i ) + if( h_index[i] == 0 ){ + Set_Start_Index( 0, 0, hbonds ); + Set_End_Index( 0, 0, hbonds ); + } + else if( h_index[i] > 0 ){ + Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); + Set_End_Index( h_index[i], hb_top[i-1], hbonds ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds ); - fprintf( stderr, "memory allocated: hbonds = %ldMB\n", - num_hbonds * sizeof(hbond_data) / (1024*1024) ); + fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds ); + fprintf( stderr, "memory allocated: hbonds = %ldMB\n", + num_hbonds * sizeof(hbond_data) / (1024*1024) ); #endif - return 1; + return 1; } GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N ) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= N) return; - - if( h_index[index] == 0 ){ - Set_Start_Index( 0, 0, &hbonds ); - Set_End_Index( 0, 0, &hbonds ); - } - else if( h_index[index] > 0 ){ - Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); - Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); - } + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= N) return; + + if( h_index[index] == 0 ){ + Set_Start_Index( 0, 0, &hbonds ); + Set_End_Index( 0, 0, &hbonds ); + } + else if( h_index[index] > 0 ){ + Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); + Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); + } } int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list *hbonds ) { - int i, num_hbonds; - int blocks, block_size; - int *d_hb_top; - num_hbonds = 0; - - /* find starting indexes for each H and the total number of hbonds */ - for( i = 1; i < n; ++i ) - hb_top[i] += hb_top[i-1]; - num_hbonds = hb_top[n-1]; - - if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds , TYP_DEVICE) ) { - fprintf( stderr, "not enough space for hbonds list. 
terminating!\n" ); - exit( INIT_ERR ); - } - - //cuda_malloc ((void **) &d_hb_top, INT_SIZE * (n), 1, __LINE__); - d_hb_top = (int *) scratch; - cuda_memset ( d_hb_top, 0, INT_SIZE * n, RES_SCRATCH ); - copy_host_device (hb_top, (d_hb_top), INT_SIZE * n, cudaMemcpyHostToDevice, __LINE__); - - Init_HBond_Indexes <<< BLOCKS, BLOCK_SIZE >>> - ( h_index, d_hb_top, *hbonds, n); - cudaThreadSynchronize (); + int i, num_hbonds; + int blocks, block_size; + int *d_hb_top; + num_hbonds = 0; + + /* find starting indexes for each H and the total number of hbonds */ + for( i = 1; i < n; ++i ) + hb_top[i] += hb_top[i-1]; + num_hbonds = hb_top[n-1]; + + if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds , TYP_DEVICE) ) { + fprintf( stderr, "not enough space for hbonds list. terminating!\n" ); + exit( INIT_ERR ); + } + + //cuda_malloc ((void **) &d_hb_top, INT_SIZE * (n), 1, __LINE__); + d_hb_top = (int *) scratch; + cuda_memset ( d_hb_top, 0, INT_SIZE * n, RES_SCRATCH ); + copy_host_device (hb_top, (d_hb_top), INT_SIZE * n, cudaMemcpyHostToDevice, __LINE__); + + Init_HBond_Indexes <<< BLOCKS, BLOCK_SIZE >>> + ( h_index, d_hb_top, *hbonds, n); + cudaThreadSynchronize (); #ifdef __DEBUG_CUDA__ - fprintf( stderr, "Done with allocating hbonds - num_hbonds: %d\n", num_hbonds ); + fprintf( stderr, "Done with allocating hbonds - num_hbonds: %d\n", num_hbonds ); #endif - return 1; + return 1; } int Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) { - int i; - int *hb_top; + int i; + int *hb_top; #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating hbonds\n" ); + fprintf( stderr, "reallocating hbonds\n" ); #endif - hb_top = (int *)calloc( n, sizeof(int) ); - for( i = 0; i < n; ++i ) - if( h_index[i] >= 0 ) - hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS); + hb_top = (int *)calloc( n, sizeof(int) ); + for( i = 0; i < n; ++i ) + if( h_index[i] >= 0 ) + hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS); - Delete_List( 
hbonds ); + Delete_List( hbonds ); - Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); + Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); - free( hb_top ); + free( hb_top ); - return 1; + return 1; } int Cuda_Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) { - int i; - int *hb_top; - int *hb_start; - int *hb_end; + int i; + int *hb_top; + int *hb_start; + int *hb_end; #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating hbonds\n" ); + fprintf( stderr, "reallocating hbonds\n" ); #endif - hb_top = (int *)calloc( n, sizeof(int) ); - hb_start = (int *) calloc (hbonds->n, sizeof (int)); - hb_end = (int *) calloc (hbonds->n, sizeof (int)); + hb_top = (int *)calloc( n, sizeof(int) ); + hb_start = (int *) calloc (hbonds->n, sizeof (int)); + hb_end = (int *) calloc (hbonds->n, sizeof (int)); - copy_host_device (hb_start, hbonds->index, sizeof (int) * hbonds->n, - cudaMemcpyDeviceToHost, LIST_INDEX); - copy_host_device (hb_end , hbonds->end_index, sizeof (int) * hbonds->n, - cudaMemcpyDeviceToHost, LIST_END_INDEX); + copy_host_device (hb_start, hbonds->index, sizeof (int) * hbonds->n, + cudaMemcpyDeviceToHost, LIST_INDEX); + copy_host_device (hb_end , hbonds->end_index, sizeof (int) * hbonds->n, + cudaMemcpyDeviceToHost, LIST_END_INDEX); - for( i = 0; i < n; ++i ) - //if( h_index[i] >= 0 ) - hb_top[i] = MAX((hb_end [i] - hb_start[i])*SAFE_HBONDS, MIN_HBONDS); + for( i = 0; i < n; ++i ) + //if( h_index[i] >= 0 ) + hb_top[i] = MAX((hb_end [i] - hb_start[i])*SAFE_HBONDS, MIN_HBONDS); - Delete_List( hbonds, TYP_DEVICE ); + Delete_List( hbonds, TYP_DEVICE ); - Cuda_Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); + Cuda_Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); - free( hb_top ); - free( hb_start ); - free( hb_end ); + free( hb_top ); + free( hb_start ); + free( hb_end ); - return 1; + return 1; } GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N ) { - int index = blockIdx.x * blockDim.x + 
threadIdx.x; - - if (index >= N) return; - - if( index == 0 ){ - Set_Start_Index( 0, 0, &bonds ); - Set_End_Index( 0, 0, &bonds ); - } - else if( index > 0 ){ - Set_Start_Index( index, b_top[index-1], &bonds ); - Set_End_Index( index, b_top[index-1], &bonds ); - } + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= N) return; + + if( index == 0 ){ + Set_Start_Index( 0, 0, &bonds ); + Set_End_Index( 0, 0, &bonds ); + } + else if( index > 0 ){ + Set_Start_Index( index, b_top[index-1], &bonds ); + Set_End_Index( index, b_top[index-1], &bonds ); + } } int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds ) { - int i, num_bonds; - int *d_b_top = (int *) scratch; - num_bonds = 0; - - /* find starting indexes for each H and the total number of hbonds */ - for( i = 1; i < num_b; ++i ) - b_top[i] += b_top[i-1]; - num_bonds = b_top[num_b-1]; - - if( !Make_List(num_b, num_bonds, TYP_BOND, bonds, TYP_DEVICE) ) { - fprintf( stderr, "not enough space for bonds list. terminating!\n" ); - exit( INIT_ERR ); - } - - //cuda_malloc ((void **) &d_b_top, INT_SIZE * num_b, 1, __LINE__); - cuda_memset ( d_b_top, 0, INT_SIZE * num_b, RES_SCRATCH ); - copy_host_device (b_top, d_b_top, INT_SIZE * num_b, cudaMemcpyHostToDevice, __LINE__); - - Init_Bond_Indexes <<< BLOCKS, BLOCK_SIZE>>> - ( d_b_top, *bonds, num_b); - cudaThreadSynchronize (); - cudaCheckError (); - - return 1; + int i, num_bonds; + int *d_b_top = (int *) scratch; + num_bonds = 0; + + /* find starting indexes for each H and the total number of hbonds */ + for( i = 1; i < num_b; ++i ) + b_top[i] += b_top[i-1]; + num_bonds = b_top[num_b-1]; + + if( !Make_List(num_b, num_bonds, TYP_BOND, bonds, TYP_DEVICE) ) { + fprintf( stderr, "not enough space for bonds list. 
terminating!\n" ); + exit( INIT_ERR ); + } + + //cuda_malloc ((void **) &d_b_top, INT_SIZE * num_b, 1, __LINE__); + cuda_memset ( d_b_top, 0, INT_SIZE * num_b, RES_SCRATCH ); + copy_host_device (b_top, d_b_top, INT_SIZE * num_b, cudaMemcpyHostToDevice, __LINE__); + + Init_Bond_Indexes <<< BLOCKS, BLOCK_SIZE>>> + ( d_b_top, *bonds, num_b); + cudaThreadSynchronize (); + cudaCheckError (); + + return 1; } int Allocate_Bond_List( int n, int *bond_top, list *bonds ) { - int i, num_bonds; - - num_bonds = 0; - /* find starting indexes for each atom and the total number of bonds */ - for( i = 1; i < n; ++i ) - bond_top[i] += bond_top[i-1]; - num_bonds = bond_top[n-1]; - - if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) { - fprintf( stderr, "not enough space for bonds list. terminating!\n" ); - exit( INIT_ERR ); - } - - Set_Start_Index( 0, 0, bonds ); - Set_End_Index( 0, 0, bonds ); - for( i = 1; i < n; ++i ) { - Set_Start_Index( i, bond_top[i-1], bonds ); - Set_End_Index( i, bond_top[i-1], bonds ); - } + int i, num_bonds; + + num_bonds = 0; + /* find starting indexes for each atom and the total number of bonds */ + for( i = 1; i < n; ++i ) + bond_top[i] += bond_top[i-1]; + num_bonds = bond_top[n-1]; + + if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) { + fprintf( stderr, "not enough space for bonds list. 
terminating!\n" ); + exit( INIT_ERR ); + } + + Set_Start_Index( 0, 0, bonds ); + Set_End_Index( 0, 0, bonds ); + for( i = 1; i < n; ++i ) { + Set_Start_Index( i, bond_top[i-1], bonds ); + Set_End_Index( i, bond_top[i-1], bonds ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "memory allocated: bonds = %ldMB\n", - num_bonds * sizeof(bond_data) / (1024*1024) ); + fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "memory allocated: bonds = %ldMB\n", + num_bonds * sizeof(bond_data) / (1024*1024) ); #endif - return 1; + return 1; } int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body ) { - int i; - int *bond_top; + int i; + int *bond_top; #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating bonds\n" ); + fprintf( stderr, "reallocating bonds\n" ); #endif - bond_top = (int *)calloc( n, sizeof(int) ); - *est_3body = 0; - for( i = 0; i < n; ++i ){ - *est_3body += SQR( Num_Entries( i, bonds ) ); - bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS ); - } + bond_top = (int *)calloc( n, sizeof(int) ); + *est_3body = 0; + for( i = 0; i < n; ++i ){ + *est_3body += SQR( Num_Entries( i, bonds ) ); + bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS ); + } - Delete_List( bonds ); + Delete_List( bonds ); - Allocate_Bond_List( n, bond_top, bonds ); - *num_bonds = bond_top[n-1]; + Allocate_Bond_List( n, bond_top, bonds ); + *num_bonds = bond_top[n-1]; - free( bond_top ); + free( bond_top ); - return 1; + return 1; } void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n) { - extern __shared__ int sh_input[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = SQR (Num_Entries( i, &bonds ) ); - bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS ); - } - sh_input[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; 
offset >>= 1) - { - if(threadIdx.x < offset) - { - sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sh_input[0]; - } + extern __shared__ int sh_input[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = SQR (Num_Entries( i, &bonds ) ); + bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS ); + } + sh_input[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sh_input[0]; + } } int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body ) { - int i; - int *b_top; - int *b_start; - int *b_end; + int i; + int *b_top; + int *b_start; + int *b_end; #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating bonds\n" ); + fprintf( stderr, "reallocating bonds\n" ); #endif - b_top = (int *)calloc( n, sizeof(int) ); - b_start = (int *) calloc (bonds->n, sizeof (int)); - b_end = (int *) calloc (bonds->n, sizeof (int)); + b_top = (int *)calloc( n, sizeof(int) ); + b_start = (int *) calloc (bonds->n, sizeof (int)); + b_end = (int *) calloc (bonds->n, sizeof (int)); - copy_host_device (b_start, bonds->index, sizeof (int) * bonds->n, - cudaMemcpyDeviceToHost, LIST_INDEX); - copy_host_device (b_end , bonds->end_index, sizeof (int) * bonds->n, - cudaMemcpyDeviceToHost, LIST_END_INDEX); + copy_host_device (b_start, bonds->index, sizeof (int) * bonds->n, + cudaMemcpyDeviceToHost, LIST_INDEX); + copy_host_device (b_end , bonds->end_index, sizeof (int) * bonds->n, + cudaMemcpyDeviceToHost, LIST_END_INDEX); - for( i = 0; i < n; ++i ) { - *num_3body += SQR (b_end[i] - b_start[i]); - b_top[i] = MAX((b_end [i] - b_start[i])*2, MIN_BONDS); - } + for( i = 0; i < n; ++i ) { + *num_3body += SQR (b_end[i] - 
b_start[i]); + b_top[i] = MAX((b_end [i] - b_start[i])*2, MIN_BONDS); + } - Delete_List( bonds, TYP_DEVICE ); + Delete_List( bonds, TYP_DEVICE ); - Cuda_Allocate_Bond_List(n, b_top, bonds ); + Cuda_Allocate_Bond_List(n, b_top, bonds ); - i = b_top[ n-1 ]; + i = b_top[ n-1 ]; - free( b_top ); - free( b_start ); - free( b_end ); + free( b_top ); + free( b_start ); + free( b_end ); - return i; + return i; } int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count ) { - int i; - int thb_total = 0; - int *thb_start; - int *thb_end; + int i; + int thb_total = 0; + int *thb_start; + int *thb_end; - int new_total, new_count; + int new_total, new_count; #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating bonds\n" ); + fprintf( stderr, "reallocating bonds\n" ); #endif - thb_start = (int *) calloc (thblist->n, sizeof (int)); - thb_end = (int *) calloc (thblist->n, sizeof (int)); + thb_start = (int *) calloc (thblist->n, sizeof (int)); + thb_end = (int *) calloc (thblist->n, sizeof (int)); - copy_host_device (thb_start, thblist->index, sizeof (int) * thblist->n, - cudaMemcpyDeviceToHost, LIST_INDEX); - copy_host_device (thb_end , thblist->end_index, sizeof (int) * thblist->n, - cudaMemcpyDeviceToHost, LIST_END_INDEX); + copy_host_device (thb_start, thblist->index, sizeof (int) * thblist->n, + cudaMemcpyDeviceToHost, LIST_INDEX); + copy_host_device (thb_end , thblist->end_index, sizeof (int) * thblist->n, + cudaMemcpyDeviceToHost, LIST_END_INDEX); - for( i = 0; i < thblist->n; ++i ) - thb_total += (thb_end[i] - thb_start[i]) * SAFE_ZONE; + for( i = 0; i < thblist->n; ++i ) + thb_total += (thb_end[i] - thb_start[i]) * SAFE_ZONE; - //new_total = MAX( thb_total, thblist->num_intrs ); - //new_count = MAX( num_3body, thblist->n ); + //new_total = MAX( thb_total, thblist->num_intrs ); + //new_count = MAX( num_3body, thblist->n ); - new_total = thb_total; - new_count = count; + new_total = thb_total; + new_count = count; - Delete_List( thblist, TYP_DEVICE ); + Delete_List( 
thblist, TYP_DEVICE ); - /*Allocate the list */ - if(!Make_List( new_count, new_total, TYP_THREE_BODY, thblist, TYP_DEVICE )){ - fprintf(stderr, "Problem in reallocating three-body list. Terminating!\n"); - exit( INIT_ERR ); - } + /*Allocate the list */ + if(!Make_List( new_count, new_total, TYP_THREE_BODY, thblist, TYP_DEVICE )){ + fprintf(stderr, "Problem in reallocating three-body list. Terminating!\n"); + exit( INIT_ERR ); + } #if defined(__CUDA_MEM__) - fprintf( stderr, "reallocating 3 bodies - \n" ); - fprintf( stderr, "num_bonds: %d ", new_count); - fprintf( stderr, "num_3body: %d ", new_total); - fprintf( stderr, "3body memory: %ldMB\n", - new_total * sizeof(three_body_interaction_data)/ - (1024*1024) ); + fprintf( stderr, "reallocating 3 bodies - \n" ); + fprintf( stderr, "num_bonds: %d ", new_count); + fprintf( stderr, "num_3body: %d ", new_total); + fprintf( stderr, "3body memory: %ldMB\n", + new_total * sizeof(three_body_interaction_data)/ + (1024*1024) ); #endif - free( thb_start ); - free( thb_end ); + free( thb_start ); + free( thb_end ); - return 1; + return 1; } @@ -543,184 +543,184 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH ); void Reallocate( reax_system *system, static_storage *workspace, list **lists, - int nbr_flag ) + int nbr_flag ) { - int num_bonds, est_3body; - reallocate_data *realloc; - grid *g; - - realloc = &(workspace->realloc); - g = &(system->g); - - if( realloc->num_far > 0 && nbr_flag ) { - fprintf (stderr, " Reallocating neighbors \n"); - Reallocate_Neighbor_List( (*lists)+FAR_NBRS, - system->N, realloc->num_far * SAFE_ZONE ); - realloc->num_far = -1; - } - - if( realloc->Htop > 0 ){ - fprintf (stderr, " Reallocating Matrix \n"); - Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); - realloc->Htop = -1; - - Deallocate_Matrix( &workspace->L ); - Deallocate_Matrix( &workspace->U ); - } - - if( realloc->hbonds > 0 ){ - fprintf (stderr, " Reallocating hbonds \n"); - 
Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, - (*lists)+HBONDS ); - realloc->hbonds = -1; - } - - num_bonds = est_3body = -1; - if( realloc->bonds > 0 ){ - fprintf (stderr, " Reallocating bonds \n"); - Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body ); - realloc->bonds = -1; - realloc->num_3body = MAX( realloc->num_3body, est_3body ); - } - - if( realloc->num_3body > 0 ) { - fprintf (stderr, " Reallocating 3Body \n"); - Delete_List( (*lists)+THREE_BODIES ); - - if( num_bonds == -1 ) - num_bonds = ((*lists)+BONDS)->num_intrs; - realloc->num_3body *= SAFE_ZONE; - - if( !Make_List( num_bonds, realloc->num_3body, - TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) { - fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); - exit( INIT_ERR ); - } - realloc->num_3body = -1; + int num_bonds, est_3body; + reallocate_data *realloc; + grid *g; + + realloc = &(workspace->realloc); + g = &(system->g); + + if( realloc->num_far > 0 && nbr_flag ) { + fprintf (stderr, " Reallocating neighbors \n"); + Reallocate_Neighbor_List( (*lists)+FAR_NBRS, + system->N, realloc->num_far * SAFE_ZONE ); + realloc->num_far = -1; + } + + if( realloc->Htop > 0 ){ + fprintf (stderr, " Reallocating Matrix \n"); + Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); + realloc->Htop = -1; + + Deallocate_Matrix( &workspace->L ); + Deallocate_Matrix( &workspace->U ); + } + + if( realloc->hbonds > 0 ){ + fprintf (stderr, " Reallocating hbonds \n"); + Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, + (*lists)+HBONDS ); + realloc->hbonds = -1; + } + + num_bonds = est_3body = -1; + if( realloc->bonds > 0 ){ + fprintf (stderr, " Reallocating bonds \n"); + Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body ); + realloc->bonds = -1; + realloc->num_3body = MAX( realloc->num_3body, est_3body ); + } + + if( realloc->num_3body > 0 ) { + fprintf (stderr, " Reallocating 3Body 
\n"); + Delete_List( (*lists)+THREE_BODIES ); + + if( num_bonds == -1 ) + num_bonds = ((*lists)+BONDS)->num_intrs; + realloc->num_3body *= SAFE_ZONE; + + if( !Make_List( num_bonds, realloc->num_3body, + TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) { + fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); + exit( INIT_ERR ); + } + realloc->num_3body = -1; #if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating 3 bodies\n" ); - fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body ); - fprintf( stderr, "reallocated 3body memory: %ldMB\n", - realloc->num_3body*sizeof(three_body_interaction_data)/ - (1024*1024) ); + fprintf( stderr, "reallocating 3 bodies\n" ); + fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body ); + fprintf( stderr, "reallocated 3body memory: %ldMB\n", + realloc->num_3body*sizeof(three_body_interaction_data)/ + (1024*1024) ); #endif - } + } - if( realloc->gcell_atoms > -1 ){ + if( realloc->gcell_atoms > -1 ){ #if defined(DEBUG_FOCUS) - fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); + fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); #endif - free (g->atoms); - g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], - sizeof (int) * workspace->realloc.gcell_atoms); - realloc->gcell_atoms = -1; - } + free (g->atoms); + g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], + sizeof (int) * workspace->realloc.gcell_atoms); + realloc->gcell_atoms = -1; + } } void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *lists, - int nbr_flag, int step ) + int nbr_flag, int step ) { - int num_bonds, est_3body; - int old_count = 0; - reallocate_data *realloc; - grid *g; + int num_bonds, est_3body; + int old_count = 0; + reallocate_data *realloc; + grid *g; - realloc = &(workspace->realloc); - g = 
&(system->d_g); + realloc = &(workspace->realloc); + g = &(system->d_g); - if( realloc->num_far > 0 && nbr_flag ) { + if( realloc->num_far > 0 && nbr_flag ) { #ifdef __CUDA_MEM__ - fprintf (stderr, " Reallocating Neighbors: step: %d, old_count: %d new_count: %d size: %d (MB)\n", - step, (dev_lists+FAR_NBRS)->num_intrs, (int)(realloc->num_far * SAFE_ZONE), - (int)(sizeof (far_neighbor_data) * realloc->num_far * SAFE_ZONE)/(1024*1024)); + fprintf (stderr, " Reallocating Neighbors: step: %d, old_count: %d new_count: %d size: %d (MB)\n", + step, (dev_lists+FAR_NBRS)->num_intrs, (int)(realloc->num_far * SAFE_ZONE), + (int)(sizeof (far_neighbor_data) * realloc->num_far * SAFE_ZONE)/(1024*1024)); #endif - Cuda_Reallocate_Neighbor_List( lists+FAR_NBRS, - system->N, realloc->num_far * SAFE_ZONE ); + Cuda_Reallocate_Neighbor_List( lists+FAR_NBRS, + system->N, realloc->num_far * SAFE_ZONE ); - realloc->num_far = -1; - realloc->estimate_nbrs = 1; - } + realloc->num_far = -1; + realloc->estimate_nbrs = 1; + } - if( realloc->Htop > 0 ){ + if( realloc->Htop > 0 ){ #ifdef __CUDA_MEM__ - fprintf (stderr, " Reallocating Matrix : step: %d, old_count: %d new_count: %d size: %d (MB)\n", - step, dev_workspace->H.m, (int)(realloc->Htop * system->N * SAFE_ZONE), - (int) (sizeof (sparse_matrix_entry) * (realloc->Htop * system->N * SAFE_ZONE))/(1024 * 1024)); + fprintf (stderr, " Reallocating Matrix : step: %d, old_count: %d new_count: %d size: %d (MB)\n", + step, dev_workspace->H.m, (int)(realloc->Htop * system->N * SAFE_ZONE), + (int) (sizeof (sparse_matrix_entry) * (realloc->Htop * system->N * SAFE_ZONE))/(1024 * 1024)); #endif - //Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); - Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop * system->N * SAFE_ZONE,"H"); - system->max_sparse_matrix_entries = realloc->Htop * SAFE_ZONE; - realloc->Htop = -1; + //Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); + 
Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop * system->N * SAFE_ZONE,"H"); + system->max_sparse_matrix_entries = realloc->Htop * SAFE_ZONE; + realloc->Htop = -1; - /* - Cuda_Deallocate_Matrix( &workspace->L ); - fprintf (stderr, "Done deallocating the L ower matrix \n"); - Cuda_Deallocate_Matrix( &workspace->U ); - fprintf (stderr, "Done deallocating the Upper matrix \n"); - */ - } + /* + Cuda_Deallocate_Matrix( &workspace->L ); + fprintf (stderr, "Done deallocating the L ower matrix \n"); + Cuda_Deallocate_Matrix( &workspace->U ); + fprintf (stderr, "Done deallocating the Upper matrix \n"); + */ + } - if( realloc->hbonds > 0 ){ + if( realloc->hbonds > 0 ){ - old_count = (dev_lists+HBONDS)->num_intrs; + old_count = (dev_lists+HBONDS)->num_intrs; - Cuda_Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, - dev_lists+HBONDS ); + Cuda_Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, + dev_lists+HBONDS ); #ifdef __CUDA_MEM__ - fprintf (stderr, " Reallocating HBonds: step: %d, old_count: %d, new_count: %d, size: %d (MB)\n", - step, old_count,(dev_lists+HBONDS)->num_intrs, - (int) sizeof (hbond_data) * (dev_lists+HBONDS)->num_intrs / (1024 * 1024)); + fprintf (stderr, " Reallocating HBonds: step: %d, old_count: %d, new_count: %d, size: %d (MB)\n", + step, old_count,(dev_lists+HBONDS)->num_intrs, + (int) sizeof (hbond_data) * (dev_lists+HBONDS)->num_intrs / (1024 * 1024)); #endif - realloc->hbonds = -1; - } + realloc->hbonds = -1; + } - num_bonds = est_3body = -1; - if( realloc->bonds > 0 ){ + num_bonds = est_3body = -1; + if( realloc->bonds > 0 ){ - old_count = (dev_lists+BONDS)->num_intrs; - num_bonds = Cuda_Reallocate_Bonds_List( system->N, dev_lists+BONDS, &est_3body ); + old_count = (dev_lists+BONDS)->num_intrs; + num_bonds = Cuda_Reallocate_Bonds_List( system->N, dev_lists+BONDS, &est_3body ); #ifdef __CUDA_MEM__ - fprintf (stderr, " Reallocating Bonds: step: %d, old_count: %d, new_count: %d, 
size: %d (MB) \n", - step, old_count,(dev_lists+BONDS)->num_intrs, - (int) sizeof (bond_data) * (dev_lists+BONDS)->num_intrs / (1024 * 1024)); + fprintf (stderr, " Reallocating Bonds: step: %d, old_count: %d, new_count: %d, size: %d (MB) \n", + step, old_count,(dev_lists+BONDS)->num_intrs, + (int) sizeof (bond_data) * (dev_lists+BONDS)->num_intrs / (1024 * 1024)); #endif - realloc->bonds = -1; - realloc->num_3body = 1;//MAX( realloc->num_3body, est_3body ); - } + realloc->bonds = -1; + realloc->num_3body = 1;//MAX( realloc->num_3body, est_3body ); + } - /* - if( realloc->num_3body > 0 ) { + /* + if( realloc->num_3body > 0 ) { - if (num_bonds < 0) - num_bonds = (dev_lists+BONDS)->num_intrs; + if (num_bonds < 0) + num_bonds = (dev_lists+BONDS)->num_intrs; - Cuda_Reallocate_ThreeBody_List (dev_lists + THREE_BODIES, num_bonds); - realloc->num_3body = -1; - } - */ + Cuda_Reallocate_ThreeBody_List (dev_lists + THREE_BODIES, num_bonds); + realloc->num_3body = -1; + } + */ - if( realloc->gcell_atoms > -1 ){ + if( realloc->gcell_atoms > -1 ){ #if defined(DEBUG_FOCUS) - fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); + fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); #endif #ifdef __CUDA_MEM__ - fprintf (stderr, "Reallocating the atoms in the grid ---> %d \n", workspace->realloc.gcell_atoms ); + fprintf (stderr, "Reallocating the atoms in the grid ---> %d \n", workspace->realloc.gcell_atoms ); #endif - free (g->atoms); - g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], - sizeof (int) * workspace->realloc.gcell_atoms); + free (g->atoms); + g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], + sizeof (int) * workspace->realloc.gcell_atoms); - cuda_free (g->atoms, RES_GRID_ATOMS); - cuda_malloc ((void **) &g->atoms, INT_SIZE * workspace->realloc.gcell_atoms * g->ncell[0]*g->ncell[1]*g->ncell[2], 1, RES_GRID_ATOMS ); - realloc->gcell_atoms = -1; - } + cuda_free (g->atoms, RES_GRID_ATOMS); + cuda_malloc 
((void **) &g->atoms, INT_SIZE * workspace->realloc.gcell_atoms * g->ncell[0]*g->ncell[1]*g->ncell[2], 1, RES_GRID_ATOMS ); + realloc->gcell_atoms = -1; + } } diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.cu index e180e680..57f5baac 100644 --- a/PuReMD-GPU/src/bond_orders.cu +++ b/PuReMD-GPU/src/bond_orders.cu @@ -32,493 +32,493 @@ inline real Cf45( real p1, real p2 ) { - return -EXP(-p2 / 2) / - ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) ); + return -EXP(-p2 / 2) / + ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) ); } #ifdef TEST_FORCES void Get_dBO( reax_system *system, list **lists, - int i, int pj, real C, rvec *v ) + int i, int pj, real C, rvec *v ) { - list *bonds = (*lists) + BONDS; - list *dBOs = (*lists) + DBO; - int start_pj, end_pj, k; + list *bonds = (*lists) + BONDS; + list *dBOs = (*lists) + DBO; + int start_pj, end_pj, k; - pj = bonds->select.bond_list[pj].dbond_index; - start_pj = Start_Index(pj, dBOs); - end_pj = End_Index(pj, dBOs); + pj = bonds->select.bond_list[pj].dbond_index; + start_pj = Start_Index(pj, dBOs); + end_pj = End_Index(pj, dBOs); - for( k = start_pj; k < end_pj; ++k ) - rvec_Scale( v[dBOs->select.dbo_list[k].wrt], - C, dBOs->select.dbo_list[k].dBO ); + for( k = start_pj; k < end_pj; ++k ) + rvec_Scale( v[dBOs->select.dbo_list[k].wrt], + C, dBOs->select.dbo_list[k].dBO ); } void Get_dBOpinpi2( reax_system *system, list **lists, - int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 ) + int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 ) { - list *bonds = (*lists) + BONDS; - list *dBOs = (*lists) + DBO; - dbond_data *dbo_k; - int start_pj, end_pj, k; - - pj = bonds->select.bond_list[pj].dbond_index; - start_pj = Start_Index(pj, dBOs); - end_pj = End_Index(pj, dBOs); - - for( k = start_pj; k < end_pj; ++k ) { - dbo_k = &(dBOs->select.dbo_list[k]); - rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi ); - rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 ); 
- } + list *bonds = (*lists) + BONDS; + list *dBOs = (*lists) + DBO; + dbond_data *dbo_k; + int start_pj, end_pj, k; + + pj = bonds->select.bond_list[pj].dbond_index; + start_pj = Start_Index(pj, dBOs); + end_pj = End_Index(pj, dBOs); + + for( k = start_pj; k < end_pj; ++k ) { + dbo_k = &(dBOs->select.dbo_list[k]); + rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi ); + rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 ); + } } void Add_dBO( reax_system *system, list **lists, - int i, int pj, real C, rvec *v ) + int i, int pj, real C, rvec *v ) { - list *bonds = (*lists) + BONDS; - list *dBOs = (*lists) + DBO; - int start_pj, end_pj, k; + list *bonds = (*lists) + BONDS; + list *dBOs = (*lists) + DBO; + int start_pj, end_pj, k; - pj = bonds->select.bond_list[pj].dbond_index; - start_pj = Start_Index(pj, dBOs); - end_pj = End_Index(pj, dBOs); + pj = bonds->select.bond_list[pj].dbond_index; + start_pj = Start_Index(pj, dBOs); + end_pj = End_Index(pj, dBOs); - //fprintf( stderr, "i=%d j=%d start=%d end=%d\n", i, pj, start_pj, end_pj ); + //fprintf( stderr, "i=%d j=%d start=%d end=%d\n", i, pj, start_pj, end_pj ); - for( k = start_pj; k < end_pj; ++k ) - rvec_ScaledAdd( v[dBOs->select.dbo_list[k].wrt], - C, dBOs->select.dbo_list[k].dBO ); + for( k = start_pj; k < end_pj; ++k ) + rvec_ScaledAdd( v[dBOs->select.dbo_list[k].wrt], + C, dBOs->select.dbo_list[k].dBO ); } void Add_dBOpinpi2( reax_system *system, list **lists, - int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 ) + int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 ) { - list *bonds = (*lists) + BONDS; - list *dBOs = (*lists) + DBO; - dbond_data *dbo_k; - int start_pj, end_pj, k; - - pj = bonds->select.bond_list[pj].dbond_index; - start_pj = Start_Index(pj, dBOs); - end_pj = End_Index(pj, dBOs); - - for( k = start_pj; k < end_pj; ++k ) - { - dbo_k = &(dBOs->select.dbo_list[k]); - rvec_ScaledAdd( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi ); - rvec_ScaledAdd( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 ); - } + 
list *bonds = (*lists) + BONDS; + list *dBOs = (*lists) + DBO; + dbond_data *dbo_k; + int start_pj, end_pj, k; + + pj = bonds->select.bond_list[pj].dbond_index; + start_pj = Start_Index(pj, dBOs); + end_pj = End_Index(pj, dBOs); + + for( k = start_pj; k < end_pj; ++k ) + { + dbo_k = &(dBOs->select.dbo_list[k]); + rvec_ScaledAdd( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi ); + rvec_ScaledAdd( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 ); + } } void Add_dBO_to_Forces( reax_system *system, list **lists, - int i, int pj, real C ) + int i, int pj, real C ) { - list *bonds = (*lists) + BONDS; - list *dBOs = (*lists) + DBO; - int start_pj, end_pj, k; + list *bonds = (*lists) + BONDS; + list *dBOs = (*lists) + DBO; + int start_pj, end_pj, k; - pj = bonds->select.bond_list[pj].dbond_index; - start_pj = Start_Index(pj, dBOs); - end_pj = End_Index(pj, dBOs); + pj = bonds->select.bond_list[pj].dbond_index; + start_pj = Start_Index(pj, dBOs); + end_pj = End_Index(pj, dBOs); - for( k = start_pj; k < end_pj; ++k ) - rvec_ScaledAdd( system->atoms[dBOs->select.dbo_list[k].wrt].f, - C, dBOs->select.dbo_list[k].dBO ); + for( k = start_pj; k < end_pj; ++k ) + rvec_ScaledAdd( system->atoms[dBOs->select.dbo_list[k].wrt].f, + C, dBOs->select.dbo_list[k].dBO ); } void Add_dBOpinpi2_to_Forces( reax_system *system, list **lists, - int i, int pj, real Cpi, real Cpi2 ) + int i, int pj, real Cpi, real Cpi2 ) { - list *bonds = (*lists) + BONDS; - list *dBOs = (*lists) + DBO; - dbond_data *dbo_k; - int start_pj, end_pj, k; - - pj = bonds->select.bond_list[pj].dbond_index; - start_pj = Start_Index(pj, dBOs); - end_pj = End_Index(pj, dBOs); - - for( k = start_pj; k < end_pj; ++k ) - { - dbo_k = &(dBOs->select.dbo_list[k]); - rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi, dbo_k->dBOpi ); - rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi2, dbo_k->dBOpi2 ); - } + list *bonds = (*lists) + BONDS; + list *dBOs = (*lists) + DBO; + dbond_data *dbo_k; + int start_pj, end_pj, k; + + pj = 
bonds->select.bond_list[pj].dbond_index; + start_pj = Start_Index(pj, dBOs); + end_pj = End_Index(pj, dBOs); + + for( k = start_pj; k < end_pj; ++k ) + { + dbo_k = &(dBOs->select.dbo_list[k]); + rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi, dbo_k->dBOpi ); + rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi2, dbo_k->dBOpi2 ); + } } void Add_dDelta( reax_system *system, list **lists, int i, real C, rvec *v ) { - list *dDeltas = &((*lists)[DDELTA]); - int start = Start_Index(i, dDeltas); - int end = End_Index(i, dDeltas); - int k; - - for( k = start; k < end; ++k ) - rvec_ScaledAdd( v[dDeltas->select.dDelta_list[k].wrt], - C, dDeltas->select.dDelta_list[k].dVal ); + list *dDeltas = &((*lists)[DDELTA]); + int start = Start_Index(i, dDeltas); + int end = End_Index(i, dDeltas); + int k; + + for( k = start; k < end; ++k ) + rvec_ScaledAdd( v[dDeltas->select.dDelta_list[k].wrt], + C, dDeltas->select.dDelta_list[k].dVal ); } void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C ) { - list *dDeltas = &((*lists)[DDELTA]); - int start = Start_Index(i, dDeltas); - int end = End_Index(i, dDeltas); - int k; - - for( k = start; k < end; ++k ) - rvec_ScaledAdd( system->atoms[dDeltas->select.dDelta_list[k].wrt].f, - C, dDeltas->select.dDelta_list[k].dVal ); + list *dDeltas = &((*lists)[DDELTA]); + int start = Start_Index(i, dDeltas); + int end = End_Index(i, dDeltas); + int k; + + for( k = start; k < end; ++k ) + rvec_ScaledAdd( system->atoms[dDeltas->select.dDelta_list[k].wrt].f, + C, dDeltas->select.dDelta_list[k].dVal ); } HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, - list p_bonds, list p_dBOs, int *top ) + list p_bonds, list p_dBOs, int *top ) { - /* Initializations */ - int j, k, l, start_i, end_i, end_j; - rvec dDeltap_self, dBOp; - list *bonds, *dBOs; - bond_data *nbr_l, *nbr_k; - bond_order_data *bo_ij; - dbond_data *top_dbo; - - list *bonds = &p_bonds; - list *dBOs = &p_dBOs; - static_storage *workspace = 
&p_workspace; - - j = bonds->select.bond_list[pj].nbr; - bo_ij = &(bonds->select.bond_list[pj].bo_data); - - /*rvec due_j[1000], due_i[1000]; - rvec due_j_pi[1000], due_i_pi[1000]; - - memset(due_j, 0, sizeof(rvec)*1000 ); - memset(due_i, 0, sizeof(rvec)*1000 ); - memset(due_j_pi, 0, sizeof(rvec)*1000 ); - memset(due_i_pi, 0, sizeof(rvec)*1000 );*/ - - //fprintf( stderr,"dbo %d-%d\n",workspace->orig_id[i],workspace->orig_id[j] ); - - start_i = Start_Index(i, bonds); - end_i = End_Index(i, bonds); - - l = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - top_dbo = &(dBOs->select.dbo_list[ (*top) ]); - - for( k = start_i; k < end_i; ++k ) { - nbr_k = &(bonds->select.bond_list[k]); - //fprintf( stderr, "\tnbr_k = %d\n", workspace->orig_id[nbr_k->nbr] ); - - for( ; l < end_j && bonds->select.bond_list[l].nbr < nbr_k->nbr; ++l ) { - /* These are the neighbors of j which aren't in the neighbor_list of i - Note that they might also include i! */ - nbr_l = &(bonds->select.bond_list[l]); - top_dbo->wrt = nbr_l->nbr; - rvec_Copy( dBOp, nbr_l->bo_data.dBOp ); - //fprintf( stderr,"\t\tnbr_l = %d\n",workspace->orig_id[nbr_l->nbr] ); - - rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); // dBO, 3rd - rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp ); // dBOpi, 4th - rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );// dBOpipi, 4th - //rvec_ScaledAdd(due_j[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ji, dBOp); - - if( nbr_l->nbr == i ) { - rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] ); - - /* dBO */ - rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp ); //1st - rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd - - /* dBOpi */ - rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C1dbopi,bo_ij->dln_BOp_pi);//1 - rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C2dbopi,bo_ij->dBOp); //2nd - rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C3dbopi,dDeltap_self); //3rd - - /* dBOpp, 1st */ - rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2); - 
rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C2dbopi2,bo_ij->dBOp); //2nd - rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C3dbopi2,dDeltap_self);//3rd - - /* do the adjustments on i */ - //rvec_ScaledAdd( due_i[i], - //bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, bo_ij->dBOp );//1st,dBO - //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, - //dDeltap_self ); //2nd, dBO - } - - //rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO ); - ++(*top), ++top_dbo; - } - - /* Now we are processing neighbor k of i. */ - top_dbo->wrt = nbr_k->nbr; - rvec_Copy( dBOp, nbr_k->bo_data.dBOp ); - - rvec_Scale( top_dbo->dBO, -bo_ij->C2dbo, dBOp ); //dBO-2 - rvec_Scale( top_dbo->dBOpi, -bo_ij->C3dbopi, dBOp ); //dBOpi-3 - rvec_Scale( top_dbo->dBOpi2, -bo_ij->C3dbopi2, dBOp );//dBOpp-3 - //rvec_ScaledAdd(due_i[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ij,dBOp);//dBO-2 - - // fprintf( stderr, "\tnbr_k = %d, nbr_l = %d, l = %d, end_j = %d\n", - // workspace->orig_id[nbr_k->nbr], - // workspace->orig_id[bonds->select.bond_list[l].nbr], l, end_j ); - - if( l < end_j && bonds->select.bond_list[l].nbr == nbr_k->nbr ) { - /* This is a common neighbor of i and j. */ - nbr_l = &(bonds->select.bond_list[l]); - rvec_Copy( dBOp, nbr_l->bo_data.dBOp ); - - rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); //dBO,3rd - rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp ); //dBOpi,4th - rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//dBOpp.4th - ++l; - - //rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, - //nbr_l->bo_data.dBOp ); //3rd, dBO - } - else if( k == pj ) { - /* This negihbor is j. 
*/ - rvec_Copy( dDeltap_self, workspace->dDeltap_self[j] ); - - rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C1dbo, bo_ij->dBOp );// 1st, dBO - rvec_ScaledAdd( top_dbo->dBO, bo_ij->C3dbo, dDeltap_self );// 3rd, dBO - - /* dBOpi, 1st */ - rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C1dbopi,bo_ij->dln_BOp_pi); - rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C2dbopi,bo_ij->dBOp); //2nd - rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C4dbopi, dDeltap_self ); //4th - - /* dBOpi2, 1st */ - rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2 ); - rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C2dbopi2,bo_ij->dBOp ); //2nd - rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C4dbopi2,dDeltap_self ); //4th - - //rvec_ScaledAdd( due_j[j], -(bo_ij->A0_ij + bo_ij->BO*bo_ij->A1_ij), - //bo_ij->dBOp ); //1st, dBO - //rvec_ScaledAdd( due_j[j], bo_ij->BO * bo_ij->A2_ji, - //workspace->dDeltap_self[j] ); //3rd, dBO - } - - // rvec_Add( workspace->dDelta[nbr_k->nbr], top_dbo->dBO ); - ++(*top), ++top_dbo; - } - - for( ; l < end_j; ++l ) { - /* These are the remaining neighbors of j which are not in the - neighbor_list of i. 
Note that they might also include i!*/ - nbr_l = &(bonds->select.bond_list[l]); - top_dbo->wrt = nbr_l->nbr; - rvec_Copy( dBOp, nbr_l->bo_data.dBOp ); - //fprintf( stderr,"\tl=%d, nbr_l=%d\n",l,workspace->orig_id[nbr_l->nbr] ); - - rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); //3rd, dBO - rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp ); //4th, dBOpi - rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//4th, dBOpp - - // rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, - // nbr_l->bo_data.dBOp ); - - if( nbr_l->nbr == i ) { - /* do the adjustments on i */ - rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] ); - - /* dBO, 1st */ - rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp ); - rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd, dBO - - /* dBOpi, 1st */ - rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C1dbopi, bo_ij->dln_BOp_pi ); - rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C2dbopi, bo_ij->dBOp ); //2nd - rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C3dbopi, dDeltap_self ); //3rd - - /* dBOpipi, 1st */ - rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2); - rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd - rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self );//3rd - - //rvec_ScaledAdd( due_i[i], bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, - //bo_ij->dBOp ); /*1st, dBO*/ - //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, - //dDeltap_self ); /*2nd, dBO*/ - } - - // rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO ); - ++(*top), ++top_dbo; - } - - /*for( k = 0; k < 21; ++k ){ - fprintf( stderr, "%d %d %d, due_i:[%g %g %g]\n", - i+1, j+1, k+1, due_i[k][0], due_i[k][1], due_i[k][2] ); - fprintf( stderr, "%d %d %d, due_j:[%g %g %g]\n", - i+1, j+1, k+1, due_j[k][0], due_j[k][1], due_j[k][2] ); - }*/ + /* Initializations */ + int j, k, l, start_i, end_i, end_j; + rvec dDeltap_self, dBOp; + list *bonds, *dBOs; + bond_data *nbr_l, *nbr_k; + bond_order_data *bo_ij; + dbond_data 
*top_dbo; + + list *bonds = &p_bonds; + list *dBOs = &p_dBOs; + static_storage *workspace = &p_workspace; + + j = bonds->select.bond_list[pj].nbr; + bo_ij = &(bonds->select.bond_list[pj].bo_data); + + /*rvec due_j[1000], due_i[1000]; + rvec due_j_pi[1000], due_i_pi[1000]; + + memset(due_j, 0, sizeof(rvec)*1000 ); + memset(due_i, 0, sizeof(rvec)*1000 ); + memset(due_j_pi, 0, sizeof(rvec)*1000 ); + memset(due_i_pi, 0, sizeof(rvec)*1000 );*/ + + //fprintf( stderr,"dbo %d-%d\n",workspace->orig_id[i],workspace->orig_id[j] ); + + start_i = Start_Index(i, bonds); + end_i = End_Index(i, bonds); + + l = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + top_dbo = &(dBOs->select.dbo_list[ (*top) ]); + + for( k = start_i; k < end_i; ++k ) { + nbr_k = &(bonds->select.bond_list[k]); + //fprintf( stderr, "\tnbr_k = %d\n", workspace->orig_id[nbr_k->nbr] ); + + for( ; l < end_j && bonds->select.bond_list[l].nbr < nbr_k->nbr; ++l ) { + /* These are the neighbors of j which aren't in the neighbor_list of i + Note that they might also include i! 
*/ + nbr_l = &(bonds->select.bond_list[l]); + top_dbo->wrt = nbr_l->nbr; + rvec_Copy( dBOp, nbr_l->bo_data.dBOp ); + //fprintf( stderr,"\t\tnbr_l = %d\n",workspace->orig_id[nbr_l->nbr] ); + + rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); // dBO, 3rd + rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp ); // dBOpi, 4th + rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );// dBOpipi, 4th + //rvec_ScaledAdd(due_j[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ji, dBOp); + + if( nbr_l->nbr == i ) { + rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] ); + + /* dBO */ + rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp ); //1st + rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd + + /* dBOpi */ + rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C1dbopi,bo_ij->dln_BOp_pi);//1 + rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C2dbopi,bo_ij->dBOp); //2nd + rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C3dbopi,dDeltap_self); //3rd + + /* dBOpp, 1st */ + rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2); + rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C2dbopi2,bo_ij->dBOp); //2nd + rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C3dbopi2,dDeltap_self);//3rd + + /* do the adjustments on i */ + //rvec_ScaledAdd( due_i[i], + //bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, bo_ij->dBOp );//1st,dBO + //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, + //dDeltap_self ); //2nd, dBO + } + + //rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO ); + ++(*top), ++top_dbo; + } + + /* Now we are processing neighbor k of i. 
*/ + top_dbo->wrt = nbr_k->nbr; + rvec_Copy( dBOp, nbr_k->bo_data.dBOp ); + + rvec_Scale( top_dbo->dBO, -bo_ij->C2dbo, dBOp ); //dBO-2 + rvec_Scale( top_dbo->dBOpi, -bo_ij->C3dbopi, dBOp ); //dBOpi-3 + rvec_Scale( top_dbo->dBOpi2, -bo_ij->C3dbopi2, dBOp );//dBOpp-3 + //rvec_ScaledAdd(due_i[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ij,dBOp);//dBO-2 + + // fprintf( stderr, "\tnbr_k = %d, nbr_l = %d, l = %d, end_j = %d\n", + // workspace->orig_id[nbr_k->nbr], + // workspace->orig_id[bonds->select.bond_list[l].nbr], l, end_j ); + + if( l < end_j && bonds->select.bond_list[l].nbr == nbr_k->nbr ) { + /* This is a common neighbor of i and j. */ + nbr_l = &(bonds->select.bond_list[l]); + rvec_Copy( dBOp, nbr_l->bo_data.dBOp ); + + rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); //dBO,3rd + rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp ); //dBOpi,4th + rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//dBOpp.4th + ++l; + + //rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, + //nbr_l->bo_data.dBOp ); //3rd, dBO + } + else if( k == pj ) { + /* This negihbor is j. 
*/ + rvec_Copy( dDeltap_self, workspace->dDeltap_self[j] ); + + rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C1dbo, bo_ij->dBOp );// 1st, dBO + rvec_ScaledAdd( top_dbo->dBO, bo_ij->C3dbo, dDeltap_self );// 3rd, dBO + + /* dBOpi, 1st */ + rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C1dbopi,bo_ij->dln_BOp_pi); + rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C2dbopi,bo_ij->dBOp); //2nd + rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C4dbopi, dDeltap_self ); //4th + + /* dBOpi2, 1st */ + rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2 ); + rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C2dbopi2,bo_ij->dBOp ); //2nd + rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C4dbopi2,dDeltap_self ); //4th + + //rvec_ScaledAdd( due_j[j], -(bo_ij->A0_ij + bo_ij->BO*bo_ij->A1_ij), + //bo_ij->dBOp ); //1st, dBO + //rvec_ScaledAdd( due_j[j], bo_ij->BO * bo_ij->A2_ji, + //workspace->dDeltap_self[j] ); //3rd, dBO + } + + // rvec_Add( workspace->dDelta[nbr_k->nbr], top_dbo->dBO ); + ++(*top), ++top_dbo; + } + + for( ; l < end_j; ++l ) { + /* These are the remaining neighbors of j which are not in the + neighbor_list of i. 
Note that they might also include i!*/ + nbr_l = &(bonds->select.bond_list[l]); + top_dbo->wrt = nbr_l->nbr; + rvec_Copy( dBOp, nbr_l->bo_data.dBOp ); + //fprintf( stderr,"\tl=%d, nbr_l=%d\n",l,workspace->orig_id[nbr_l->nbr] ); + + rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); //3rd, dBO + rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp ); //4th, dBOpi + rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//4th, dBOpp + + // rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, + // nbr_l->bo_data.dBOp ); + + if( nbr_l->nbr == i ) { + /* do the adjustments on i */ + rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] ); + + /* dBO, 1st */ + rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp ); + rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd, dBO + + /* dBOpi, 1st */ + rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C1dbopi, bo_ij->dln_BOp_pi ); + rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C2dbopi, bo_ij->dBOp ); //2nd + rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C3dbopi, dDeltap_self ); //3rd + + /* dBOpipi, 1st */ + rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2); + rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd + rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self );//3rd + + //rvec_ScaledAdd( due_i[i], bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, + //bo_ij->dBOp ); /*1st, dBO*/ + //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, + //dDeltap_self ); /*2nd, dBO*/ + } + + // rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO ); + ++(*top), ++top_dbo; + } + + /*for( k = 0; k < 21; ++k ){ + fprintf( stderr, "%d %d %d, due_i:[%g %g %g]\n", + i+1, j+1, k+1, due_i[k][0], due_i[k][1], due_i[k][2] ); + fprintf( stderr, "%d %d %d, due_j:[%g %g %g]\n", + i+1, j+1, k+1, due_j[k][0], due_j[k][1], due_j[k][2] ); + }*/ } #endif void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, - simulation_data *data, static_storage *workspace, - list **lists ) + simulation_data *data, 
static_storage *workspace, + list **lists ) { - list *bonds = (*lists) + BONDS; - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - rvec temp, ext_press; - ivec rel_box; - int pk, k, j; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - - /************************************ - * forces related to atom i * - * first neighbors of atom i * - ************************************/ - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp ); /*2nd,dBO*/ - rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( 
system->atoms[k].f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - - /* if( !ivec_isZero( nbr_k->rel_box ) ) - fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] - ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", - i+1, - system->atoms[i].x[0],system->atoms[i].x[1],system->atoms[i].x[2], - j+1, k+1, - system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2], - nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], - nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2], - temp[0], temp[1], temp[2] ); */ - } - - /* then atom i itself */ - rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd, dBO*/ - - rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/ - - rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/ - - rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ; /*1st,dBO_pi2*/ - rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBO_pi2*/ - rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/ - - /* force */ - rvec_Add( system->atoms[i].f, temp ); - /* ext pressure due to i dropped, counting force on j only will be enough */ - - - /**************************************************************************** - * forces and pressure related to atom j * - * first neighbors of atom j * - ***************************************************************************/ - for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ - rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp 
);/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/ - - /* force */ - rvec_Add( system->atoms[k].f, temp ); - /* pressure */ - if( k != i ) { - ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box wrt i - rvec_iMultiply( ext_press, rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - - /* if( !ivec_isZero( rel_box ) ) - fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] - ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", - i+1, j+1, - system->atoms[j].x[0],system->atoms[j].x[1],system->atoms[j].x[2], - k+1, - system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2], - nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], - rel_box[0], rel_box[1], rel_box[2], - temp[0], temp[1], temp[2] ); */ - } - } - - /* then atom j itself */ - rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ - - rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/ - - rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/ - - rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2); /*1st,dBOpi2*/ - rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBOpi2*/ - rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( system->atoms[j].f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - - /* if( !ivec_isZero( nbr_j->rel_box ) ) - fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] - ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", - i+1, system->atoms[i].x[0], system->atoms[i].x[1], 
system->atoms[i].x[2], - j+1, system->atoms[j].x[0], system->atoms[j].x[1], system->atoms[j].x[2], - j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2], - nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2], - temp[0], temp[1], temp[2] ); */ + list *bonds = (*lists) + BONDS; + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + rvec temp, ext_press; + ivec rel_box; + int pk, k, j; + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + + /************************************ + * forces related to atom i * + * first neighbors of atom i * + ************************************/ + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp ); /*2nd,dBO*/ + rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp 
);/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( system->atoms[k].f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + + /* if( !ivec_isZero( nbr_k->rel_box ) ) + fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] + ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", + i+1, + system->atoms[i].x[0],system->atoms[i].x[1],system->atoms[i].x[2], + j+1, k+1, + system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2], + nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], + nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2], + temp[0], temp[1], temp[2] ); */ + } + + /* then atom i itself */ + rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd, dBO*/ + + rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/ + + rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/ + + rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ; /*1st,dBO_pi2*/ + rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBO_pi2*/ + rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/ + + /* force */ + rvec_Add( system->atoms[i].f, temp ); + /* ext pressure due to i dropped, counting force on j only will be enough */ + + + /**************************************************************************** + * forces and pressure related to atom j * + * first neighbors of atom j * + ***************************************************************************/ + for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk 
) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ + rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/ + + /* force */ + rvec_Add( system->atoms[k].f, temp ); + /* pressure */ + if( k != i ) { + ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box wrt i + rvec_iMultiply( ext_press, rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + + /* if( !ivec_isZero( rel_box ) ) + fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] + ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", + i+1, j+1, + system->atoms[j].x[0],system->atoms[j].x[1],system->atoms[j].x[2], + k+1, + system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2], + nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2], + rel_box[0], rel_box[1], rel_box[2], + temp[0], temp[1], temp[2] ); */ + } + } + + /* then atom j itself */ + rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/ + + rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2); /*1st,dBOpi2*/ + rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBOpi2*/ + rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( system->atoms[j].f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + + /* if( 
!ivec_isZero( nbr_j->rel_box ) ) + fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] + ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n", + i+1, system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], + j+1, system->atoms[j].x[0], system->atoms[j].x[1], system->atoms[j].x[2], + j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2], + nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2], + temp[0], temp[1], temp[2] ); */ } ///////////////////////////////////////////////////////////// @@ -526,388 +526,388 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, ///////////////////////////////////////////////////////////// HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, - simulation_data *data, static_storage *workspace, - list *bonds ) + simulation_data *data, static_storage *workspace, + list *bonds ) { - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - rvec temp, ext_press; - ivec rel_box; - int pk, k, j; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * 
(workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - - /************************************ - * forces related to atom i * - * first neighbors of atom i * - ************************************/ - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp ); /*2nd,dBO*/ - rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( atoms[k].f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - } - - /* then atom i itself */ - rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd, dBO*/ - - rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/ - - rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/ - - rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ; /*1st,dBO_pi2*/ - rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBO_pi2*/ - rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/ - - /* force */ - rvec_Add( atoms[i].f, temp ); - /* ext pressure due to i dropped, counting force on j only will be enough */ - - - /**************************************************************************** - * forces and pressure related to atom j * - * first neighbors of atom j * 
- ***************************************************************************/ - for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ - rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/ - - /* force */ - rvec_Add( atoms[k].f, temp ); - /* pressure */ - if( k != i ) { - ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box wrt i - rvec_iMultiply( ext_press, rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - } - } - - /* then atom j itself */ - rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ - - rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/ - - rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/ - - rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2); /*1st,dBOpi2*/ - rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBOpi2*/ - rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( atoms[j].f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + rvec temp, ext_press; + ivec rel_box; + int pk, k, j; + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index 
].bo_data); + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + + /************************************ + * forces related to atom i * + * first neighbors of atom i * + ************************************/ + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp ); /*2nd,dBO*/ + rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( atoms[k].f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + } + + /* then atom i itself */ + rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd, dBO*/ + + rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, 
coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/ + + rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/ + + rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ; /*1st,dBO_pi2*/ + rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBO_pi2*/ + rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/ + + /* force */ + rvec_Add( atoms[i].f, temp ); + /* ext pressure due to i dropped, counting force on j only will be enough */ + + + /**************************************************************************** + * forces and pressure related to atom j * + * first neighbors of atom j * + ***************************************************************************/ + for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ + rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/ + + /* force */ + rvec_Add( atoms[k].f, temp ); + /* pressure */ + if( k != i ) { + ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box wrt i + rvec_iMultiply( ext_press, rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + } + } + + /* then atom j itself */ + rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); 
/*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/ + + rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2); /*1st,dBOpi2*/ + rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBOpi2*/ + rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( atoms[j].f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); } ///////////////////////////////////////////////////////////// //Cuda Functions ///////////////////////////////////////////////////////////// void Add_dBond_to_Forces( int i, int pj, reax_system *system, - simulation_data *data, static_storage *workspace, - list **lists ) + simulation_data *data, static_storage *workspace, + list **lists ) { - list *bonds = (*lists) + BONDS; - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - int pk, k, j; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * 
(workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_ScaledAdd( system->atoms[k].f, -coef.C2dbo, nbr_k->bo_data.dBOp ); - /*2nd, dBO*/ - rvec_ScaledAdd( system->atoms[k].f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi*/ - rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi2*/ - } - - rvec_ScaledAdd( system->atoms[i].f, coef.C1dbo, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd( system->atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] ); - /*2nd, dBO*/ - - rvec_ScaledAdd(system->atoms[i].f, coef.C1dDelta, bo_ij->dBOp); - /*1st, dBO*/ - rvec_ScaledAdd(system->atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]); - /*2nd, dBO*/ - - rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*1st, dBOpi*/ - rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi, bo_ij->dBOp ); - /*2nd, dBOpi*/ - rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]); - /*3rd, dBOpi*/ - - rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*1st, dBO_pi2*/ - rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi2, bo_ij->dBOp ); - /*2nd, dBO_pi2*/ - rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]); - /*3rd, dBO_pi2*/ - - - for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbo, nbr_k->bo_data.dBOp ); - /*3rd, dBO*/ - rvec_ScaledAdd( system->atoms[k].f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); - /*4th, dBOpi*/ - rvec_ScaledAdd( system->atoms[k].f, 
-coef.C4dbopi2, nbr_k->bo_data.dBOp ); - /*4th, dBOpi2*/ - } - - rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbo, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd( system->atoms[j].f, coef.C3dbo, workspace->dDeltap_self[j] ); - /*2nd, dBO*/ - - rvec_ScaledAdd( system->atoms[j].f, -coef.C1dDelta, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd(system->atoms[j].f, coef.C3dDelta, workspace->dDeltap_self[j]); - /*2nd, dBO*/ - - rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*1st, dBOpi*/ - rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi, bo_ij->dBOp ); - /*2nd, dBOpi*/ - rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi, workspace->dDeltap_self[j]); - /*3rd, dBOpi*/ - - rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*1st, dBOpi2*/ - rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi2, bo_ij->dBOp ); - /*2nd, dBOpi2*/ - rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi2, workspace->dDeltap_self[j]); - /*3rd, dBOpi2*/ + list *bonds = (*lists) + BONDS; + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + int pk, k, j; + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * 
(bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_ScaledAdd( system->atoms[k].f, -coef.C2dbo, nbr_k->bo_data.dBOp ); + /*2nd, dBO*/ + rvec_ScaledAdd( system->atoms[k].f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi*/ + rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi2*/ + } + + rvec_ScaledAdd( system->atoms[i].f, coef.C1dbo, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd( system->atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] ); + /*2nd, dBO*/ + + rvec_ScaledAdd(system->atoms[i].f, coef.C1dDelta, bo_ij->dBOp); + /*1st, dBO*/ + rvec_ScaledAdd(system->atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]); + /*2nd, dBO*/ + + rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*1st, dBOpi*/ + rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi, bo_ij->dBOp ); + /*2nd, dBOpi*/ + rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]); + /*3rd, dBOpi*/ + + rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*1st, dBO_pi2*/ + rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi2, bo_ij->dBOp ); + /*2nd, dBO_pi2*/ + rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]); + /*3rd, dBO_pi2*/ + + + for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbo, nbr_k->bo_data.dBOp ); + /*3rd, dBO*/ + rvec_ScaledAdd( system->atoms[k].f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); + 
/*dDelta*/ + rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); + /*4th, dBOpi*/ + rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); + /*4th, dBOpi2*/ + } + + rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbo, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd( system->atoms[j].f, coef.C3dbo, workspace->dDeltap_self[j] ); + /*2nd, dBO*/ + + rvec_ScaledAdd( system->atoms[j].f, -coef.C1dDelta, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd(system->atoms[j].f, coef.C3dDelta, workspace->dDeltap_self[j]); + /*2nd, dBO*/ + + rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*1st, dBOpi*/ + rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi, bo_ij->dBOp ); + /*2nd, dBOpi*/ + rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi, workspace->dDeltap_self[j]); + /*3rd, dBOpi*/ + + rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*1st, dBOpi2*/ + rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi2, bo_ij->dBOp ); + /*2nd, dBOpi2*/ + rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi2, workspace->dDeltap_self[j]); + /*3rd, dBOpi2*/ } HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, - static_storage *workspace, list *bonds ) + static_storage *workspace, list *bonds ) { - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - int pk, k, j; - rvec t_f; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - - if (i < j) - { - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } else { - bo_ji = &(nbr_j->bo_data); - bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * 
(bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - if ( i < j) { - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - rvec_MakeZero (t_f); - - rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); - /*2nd, dBO*/ - rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi*/ - rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi2*/ - - //Store in the temp place - rvec_Add (nbr_k->t_f, t_f); - } - - rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] ); - /*2nd, dBO*/ - - rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp); - /*1st, dBO*/ - rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]); - /*2nd, dBO*/ - - rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*1st, dBOpi*/ - rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp ); - /*2nd, dBOpi*/ - rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]); - /*3rd, dBOpi*/ - - rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*1st, dBO_pi2*/ - rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp ); - /*2nd, dBO_pi2*/ - rvec_ScaledAdd( 
atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]); - /*3rd, dBO_pi2*/ - } - else - { - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - rvec_MakeZero (t_f); - - rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp ); - /*3rd, dBO*/ - rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); - /*4th, dBOpi*/ - rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); - /*4th, dBOpi2*/ - - //Store in the temp place - rvec_Add (nbr_k->t_f, t_f); - } - - rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] ); - /*2nd, dBO*/ - - rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]); - /*2nd, dBO*/ - - rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*1st, dBOpi*/ - rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp ); - /*2nd, dBOpi*/ - rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]); - /*3rd, dBOpi*/ - - rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*1st, dBOpi2*/ - rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp ); - /*2nd, dBOpi2*/ - rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]); - /*3rd, dBOpi2*/ - } + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + int pk, k, j; + rvec t_f; + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + + if (i < j) + { + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } else { + bo_ji = &(nbr_j->bo_data); + bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + 
bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + if ( i < j) { + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + rvec_MakeZero (t_f); + + rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); + /*2nd, dBO*/ + rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi*/ + rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi2*/ + + //Store in the temp place + rvec_Add (nbr_k->t_f, t_f); + } + + rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] ); + /*2nd, dBO*/ + + rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp); + /*1st, dBO*/ + rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]); + /*2nd, dBO*/ + + rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*1st, dBOpi*/ + rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp ); + /*2nd, dBOpi*/ + rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]); + /*3rd, dBOpi*/ + + 
rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*1st, dBO_pi2*/ + rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp ); + /*2nd, dBO_pi2*/ + rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]); + /*3rd, dBO_pi2*/ + } + else + { + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + rvec_MakeZero (t_f); + + rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp ); + /*3rd, dBO*/ + rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); + /*4th, dBOpi*/ + rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); + /*4th, dBOpi2*/ + + //Store in the temp place + rvec_Add (nbr_k->t_f, t_f); + } + + rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] ); + /*2nd, dBO*/ + + rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]); + /*2nd, dBO*/ + + rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*1st, dBOpi*/ + rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp ); + /*2nd, dBOpi*/ + rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]); + /*3rd, dBOpi*/ + + rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*1st, dBOpi2*/ + rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp ); + /*2nd, dBOpi2*/ + rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]); + /*3rd, dBOpi2*/ + } } HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds) { - int pk; - bond_data *nbr_k, *nbr_k_sym; - - /* - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - rvec_Add (atoms[i].f, nbr_k->t_f); - } - */ - - for( pk = Start_Index(i, bonds); pk < End_Index(i, 
bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] ); - - rvec_Add (atoms[i].f, nbr_k_sym->t_f); - } + int pk; + bond_data *nbr_k, *nbr_k_sym; + + /* + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + rvec_Add (atoms[i].f, nbr_k->t_f); + } + */ + + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] ); + + rvec_Add (atoms[i].f, nbr_k_sym->t_f); + } } /* Locate j on i's list. @@ -915,52 +915,52 @@ HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list And this is the case given our method of neighbor generation*/ int Locate_Symmetric_Bond( list *bonds, int i, int j ) { - int start = Start_Index(i, bonds); - int end = End_Index(i, bonds); - int mid = (start + end) / 2; - int mid_nbr; - - while( (mid_nbr = bonds->select.bond_list[mid].nbr) != j ) { - /*fprintf( stderr, "\tstart: %d end: %d mid: %d\n", - start, end, mid );*/ - if( mid_nbr < j ) - start = mid+1; - else end = mid - 1; - - mid = (start + end) / 2; - } - - return mid; + int start = Start_Index(i, bonds); + int end = End_Index(i, bonds); + int mid = (start + end) / 2; + int mid_nbr; + + while( (mid_nbr = bonds->select.bond_list[mid].nbr) != j ) { + /*fprintf( stderr, "\tstart: %d end: %d mid: %d\n", + start, end, mid );*/ + if( mid_nbr < j ) + start = mid+1; + else end = mid - 1; + + mid = (start + end) / 2; + } + + return mid; } inline void Copy_Neighbor_Data( bond_data *dest, near_neighbor_data *src ) { - dest->nbr = src->nbr; - dest->d = src->d; - rvec_Copy( dest->dvec, src->dvec ); - ivec_Copy( dest->rel_box, src->rel_box ); - /* rvec_Copy( dest->ext_factor, src->ext_factor );*/ + dest->nbr = src->nbr; + dest->d = src->d; + rvec_Copy( dest->dvec, src->dvec ); + ivec_Copy( dest->rel_box, src->rel_box ); + /* rvec_Copy( 
dest->ext_factor, src->ext_factor );*/ } inline void Copy_Bond_Order_Data( bond_order_data *dest, bond_order_data *src ) { - dest->BO = src->BO; - dest->BO_s = src->BO_s; - dest->BO_pi = src->BO_pi; - dest->BO_pi2 = src->BO_pi2; - - rvec_Scale( dest->dBOp, -1.0, src->dBOp ); - rvec_Scale( dest->dln_BOp_s, -1.0, src->dln_BOp_s ); - rvec_Scale( dest->dln_BOp_pi, -1.0, src->dln_BOp_pi ); - rvec_Scale( dest->dln_BOp_pi2, -1.0, src->dln_BOp_pi2 ); + dest->BO = src->BO; + dest->BO_s = src->BO_s; + dest->BO_pi = src->BO_pi; + dest->BO_pi2 = src->BO_pi2; + + rvec_Scale( dest->dBOp, -1.0, src->dBOp ); + rvec_Scale( dest->dln_BOp_s, -1.0, src->dln_BOp_s ); + rvec_Scale( dest->dln_BOp_pi, -1.0, src->dln_BOp_pi ); + rvec_Scale( dest->dln_BOp_pi2, -1.0, src->dln_BOp_pi2 ); } int compare_bonds( const void *p1, const void *p2 ) { - return ((bond_data *)p1)->nbr - ((bond_data *)p2)->nbr; + return ((bond_data *)p1)->nbr - ((bond_data *)p2)->nbr; } @@ -968,257 +968,257 @@ int compare_bonds( const void *p1, const void *p2 ) belonging to a different atom in nbrhoods->nbr_list is sorted in its own. 
This can either be done in the general coordinator function or here */ void Calculate_Bond_Orders( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, pj, type_i, type_j; - int start_i, end_i; - int num_bonds, sym_index; - real p_boc1, p_boc2; - real val_i, Deltap_i, Deltap_boc_i; - real val_j, Deltap_j, Deltap_boc_j; - real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; - real exp_p1i, exp_p2i, exp_p1j, exp_p2j; - real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; - real Cf45_ij, Cf45_ji, p_lp1; - real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; - real explp1; - two_body_parameters *twbp; - bond_order_data *bo_ij, *bo_ji; - single_body_parameters *sbp_i, *sbp_j; - list *bonds = (*lists) + BONDS; + int i, j, pj, type_i, type_j; + int start_i, end_i; + int num_bonds, sym_index; + real p_boc1, p_boc2; + real val_i, Deltap_i, Deltap_boc_i; + real val_j, Deltap_j, Deltap_boc_j; + real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; + real exp_p1i, exp_p2i, exp_p1j, exp_p2j; + real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; + real Cf45_ij, Cf45_ji, p_lp1; + real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; + real explp1; + two_body_parameters *twbp; + bond_order_data *bo_ij, *bo_ji; + single_body_parameters *sbp_i, *sbp_j; + list *bonds = (*lists) + BONDS; #if defined(TEST_FORCES) - int k, pk, start_j, end_j; - int top_dbo=0, top_dDelta=0; - dbond_data *pdbo; - dDelta_data *ptop_dDelta; - list *dDeltas = (*lists) + DDELTA; - list *dBOs = (*lists) + DBO; + int k, pk, start_j, end_j; + int top_dbo=0, top_dDelta=0; + dbond_data *pdbo; + dDelta_data *ptop_dDelta; + list *dDeltas = (*lists) + DDELTA; + list *dBOs = (*lists) + DBO; #endif - num_bonds = 0; - p_boc1 = system->reaxprm.gp.l[0]; - p_boc2 = system->reaxprm.gp.l[1]; - - /* Calculate Deltaprime, Deltaprime_boc 
values */ - for( i = 0; i < system->N; ++i ) { - type_i = system->atoms[i].type; - sbp_i = &(system->reaxprm.sbp[type_i]); - workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; - workspace->Deltap_boc[i] = - workspace->total_bond_order[i] - sbp_i->valency_val; - workspace->total_bond_order[i] = 0; - } - // fprintf( stderr, "done with uncorrected bond orders\n" ); - - - /* Corrected Bond Order calculations */ - for( i = 0; i < system->N; ++i ) { - type_i = system->atoms[i].type; - sbp_i = &(system->reaxprm.sbp[type_i]); - val_i = sbp_i->valency; - Deltap_i = workspace->Deltap[i]; - Deltap_boc_i = workspace->Deltap_boc[i]; - start_i = Start_Index(i, bonds); - end_i = End_Index(i, bonds); - //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", - // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); - - for( pj = start_i; pj < end_i; ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = system->atoms[j].type; - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); - - if( i < j ) { - twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] ); + num_bonds = 0; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + /* Calculate Deltaprime, Deltaprime_boc values */ + for( i = 0; i < system->N; ++i ) { + type_i = system->atoms[i].type; + sbp_i = &(system->reaxprm.sbp[type_i]); + workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency; + workspace->Deltap_boc[i] = + workspace->total_bond_order[i] - sbp_i->valency_val; + workspace->total_bond_order[i] = 0; + } + // fprintf( stderr, "done with uncorrected bond orders\n" ); + + + /* Corrected Bond Order calculations */ + for( i = 0; i < system->N; ++i ) { + type_i = system->atoms[i].type; + sbp_i = &(system->reaxprm.sbp[type_i]); + val_i = sbp_i->valency; + Deltap_i = workspace->Deltap[i]; + Deltap_boc_i = workspace->Deltap_boc[i]; + start_i = Start_Index(i, bonds); + end_i = End_Index(i, 
bonds); + //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", + // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); + + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = system->atoms[j].type; + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); + + if( i < j ) { + twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] ); #ifdef TEST_FORCES - Set_Start_Index( pj, top_dbo, dBOs ); - /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", - workspace->reverse_map[i], workspace->reverse_map[j], - twbp->ovc, twbp->v13cor, bo_ij->BO ); */ + Set_Start_Index( pj, top_dbo, dBOs ); + /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", + workspace->reverse_map[i], workspace->reverse_map[j], + twbp->ovc, twbp->v13cor, bo_ij->BO ); */ #endif - if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { - /* There is no correction to bond orders nor to derivatives of - bond order prime! So we leave bond orders unchanged and - set derivative of bond order coefficients s.t. - dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ - bo_ij->C1dbo = 1.000000; - bo_ij->C2dbo = 0.000000; - bo_ij->C3dbo = 0.000000; - - bo_ij->C1dbopi = bo_ij->BO_pi; - bo_ij->C2dbopi = 0.000000; - bo_ij->C3dbopi = 0.000000; - bo_ij->C4dbopi = 0.000000; - - bo_ij->C1dbopi2 = bo_ij->BO_pi2; - bo_ij->C2dbopi2 = 0.000000; - bo_ij->C3dbopi2 = 0.000000; - bo_ij->C4dbopi2 = 0.000000; + if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { + /* There is no correction to bond orders nor to derivatives of + bond order prime! So we leave bond orders unchanged and + set derivative of bond order coefficients s.t. 
+ dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ + bo_ij->C1dbo = 1.000000; + bo_ij->C2dbo = 0.000000; + bo_ij->C3dbo = 0.000000; + + bo_ij->C1dbopi = bo_ij->BO_pi; + bo_ij->C2dbopi = 0.000000; + bo_ij->C3dbopi = 0.000000; + bo_ij->C4dbopi = 0.000000; + + bo_ij->C1dbopi2 = bo_ij->BO_pi2; + bo_ij->C2dbopi2 = 0.000000; + bo_ij->C3dbopi2 = 0.000000; + bo_ij->C4dbopi2 = 0.000000; #ifdef TEST_FORCES - pdbo = &(dBOs->select.dbo_list[ top_dbo ]); - - // compute dBO_ij/dr_i - pdbo->wrt = i; - rvec_Copy( pdbo->dBO, bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 ); - - // compute dBO_ij/dr_j - pdbo++; - pdbo->wrt = j; - rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 ); - - top_dbo += 2; + pdbo = &(dBOs->select.dbo_list[ top_dbo ]); + + // compute dBO_ij/dr_i + pdbo->wrt = i; + rvec_Copy( pdbo->dBO, bo_ij->dBOp ); + rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); + rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 ); + + // compute dBO_ij/dr_j + pdbo++; + pdbo->wrt = j; + rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp ); + rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi ); + rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 ); + + top_dbo += 2; #endif - } - else { - val_j = system->reaxprm.sbp[type_j].valency; - Deltap_j = workspace->Deltap[j]; - Deltap_boc_j = workspace->Deltap_boc[j]; - - /* on page 1 */ - if( twbp->ovc >= 0.001 ) { - /* Correction for overcoordination */ - exp_p1i = EXP( -p_boc1 * Deltap_i ); - exp_p2i = EXP( -p_boc2 * Deltap_i ); - exp_p1j = EXP( -p_boc1 * Deltap_j ); - exp_p2j = EXP( -p_boc2 * Deltap_j ); - - f2 = exp_p1i + exp_p1j; - f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); - f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + - ( val_j + f2 )/( val_j + f2 + f3 ) ); - - /*fprintf( stderr,"%6d%6d\t%g 
%g j:%g %g p_boc:%g %g\n", - i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 ); - fprintf( stderr,"\tf:%g %g %g, exp:%g %g %g %g\n", - f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ - - /* Now come the derivates */ - /* Bond Order pages 5-7, derivative of f1 */ - temp = f2 + f3; - u1_ij = val_i + temp; - u1_ji = val_j + temp; - Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji )); - Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + - ( u1_ji - f3 ) / SQR( u1_ji )); - - //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + - // Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j ); - Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - - ((val_i+f2) / SQR(u1_ij)) * - ( -p_boc1 * exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) ) + - -p_boc1 * exp_p1i / u1_ji - - ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) )); - - Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + - Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); - //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); - } - else { - /* No overcoordination correction! */ - f1 = 1.0; - Cf1_ij = Cf1_ji = 0.0; - } - - if( twbp->v13cor >= 0.001 ) { - /* Correction for 1-3 bond orders */ - exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); - exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); - - f4 = 1. / (1. + exp_f4); - f5 = 1. / (1. 
+ exp_f5); - f4f5 = f4 * f5; - - /* Bond Order pages 8-9, derivative of f4 and f5 */ - /*temp = twbp->p_boc5 - - twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); - u_ij = temp + twbp->p_boc3 * Deltap_boc_i; - u_ji = temp + twbp->p_boc3 * Deltap_boc_j; - Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; - Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ - Cf45_ij = -f4 * exp_f4; - Cf45_ji = -f5 * exp_f5; - } - else { - f4 = f5 = f4f5 = 1.0; - Cf45_ij = Cf45_ji = 0.0; - } - - /* Bond Order page 10, derivative of total bond order */ - A0_ij = f1 * f4f5; - A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * - (Cf45_ij + Cf45_ji); - A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; - A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; - A3_ij = A2_ij + Cf1_ij / f1; - A3_ji = A2_ji + Cf1_ji / f1; - - /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f + } + else { + val_j = system->reaxprm.sbp[type_j].valency; + Deltap_j = workspace->Deltap[j]; + Deltap_boc_j = workspace->Deltap_boc[j]; + + /* on page 1 */ + if( twbp->ovc >= 0.001 ) { + /* Correction for overcoordination */ + exp_p1i = EXP( -p_boc1 * Deltap_i ); + exp_p2i = EXP( -p_boc2 * Deltap_i ); + exp_p1j = EXP( -p_boc1 * Deltap_j ); + exp_p2j = EXP( -p_boc2 * Deltap_j ); + + f2 = exp_p1i + exp_p1j; + f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); + f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + + ( val_j + f2 )/( val_j + f2 + f3 ) ); + + /*fprintf( stderr,"%6d%6d\t%g %g j:%g %g p_boc:%g %g\n", + i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 ); + fprintf( stderr,"\tf:%g %g %g, exp:%g %g %g %g\n", + f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ + + /* Now come the derivates */ + /* Bond Order pages 5-7, derivative of f1 */ + temp = f2 + f3; + u1_ij = val_i + temp; + u1_ji = val_j + temp; + Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji )); + Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + + ( u1_ji - f3 ) / SQR( u1_ji )); + + //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + + // Cf1B_ij * exp_p2i / ( 
exp_p2i + exp_p2j ); + Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - + ((val_i+f2) / SQR(u1_ij)) * + ( -p_boc1 * exp_p1i + + exp_p2i / ( exp_p2i + exp_p2j ) ) + + -p_boc1 * exp_p1i / u1_ji - + ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i + + exp_p2i / ( exp_p2i + exp_p2j ) )); + + Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + + Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); + //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); + } + else { + /* No overcoordination correction! */ + f1 = 1.0; + Cf1_ij = Cf1_ji = 0.0; + } + + if( twbp->v13cor >= 0.001 ) { + /* Correction for 1-3 bond orders */ + exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - + Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); + exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - + Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); + + f4 = 1. / (1. + exp_f4); + f5 = 1. / (1. + exp_f5); + f4f5 = f4 * f5; + + /* Bond Order pages 8-9, derivative of f4 and f5 */ + /*temp = twbp->p_boc5 - + twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); + u_ij = temp + twbp->p_boc3 * Deltap_boc_i; + u_ji = temp + twbp->p_boc3 * Deltap_boc_j; + Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; + Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ + Cf45_ij = -f4 * exp_f4; + Cf45_ji = -f5 * exp_f5; + } + else { + f4 = f5 = f4f5 = 1.0; + Cf45_ij = Cf45_ji = 0.0; + } + + /* Bond Order page 10, derivative of total bond order */ + A0_ij = f1 * f4f5; + A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * + (Cf45_ij + Cf45_ji); + A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; + A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; + A3_ij = A2_ij + Cf1_ij / f1; + A3_ji = A2_ji + Cf1_ji / f1; + + /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n", bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/ - /* find corrected bond order values and their deriv coefs */ - bo_ij->BO = bo_ij->BO * A0_ij; - bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; - bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; - bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + 
/* find corrected bond order values and their deriv coefs */ + bo_ij->BO = bo_ij->BO * A0_ij; + bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; + bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; + bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; - bo_ij->C2dbo = bo_ij->BO * A2_ij; - bo_ij->C3dbo = bo_ij->BO * A2_ji; + bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; + bo_ij->C2dbo = bo_ij->BO * A2_ij; + bo_ij->C3dbo = bo_ij->BO * A2_ji; - bo_ij->C1dbopi = f1*f1*f4*f5; - bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; - bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; - bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; + bo_ij->C1dbopi = f1*f1*f4*f5; + bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; + bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; + bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; - bo_ij->C1dbopi2 = f1*f1*f4*f5; - bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; - bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; - bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji; + bo_ij->C1dbopi2 = f1*f1*f4*f5; + bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; + bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; + bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji; #ifdef TEST_FORCES - /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", - i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/ - - /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", - //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n", - workspace->orig_id[i], workspace->orig_id[j] - A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji - bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s, - bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, - bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi, - bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2 - ); */ - - Calculate_dBO( i, pj, workspace, lists, &top_dbo ); + /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", + i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/ + + /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", + 
//"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n", + workspace->orig_id[i], workspace->orig_id[j] + A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji + bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s, + bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, + bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi, + bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2 + ); */ + + Calculate_dBO( i, pj, workspace, lists, &top_dbo ); #endif - } + } - /* neglect bonds that are < 1e-10 */ - if( bo_ij->BO < 1e-10 ) - bo_ij->BO = 0.0; - if( bo_ij->BO_s < 1e-10 ) - bo_ij->BO_s = 0.0; - if( bo_ij->BO_pi < 1e-10 ) - bo_ij->BO_pi = 0.0; - if( bo_ij->BO_pi2 < 1e-10 ) - bo_ij->BO_pi2 = 0.0; + /* neglect bonds that are < 1e-10 */ + if( bo_ij->BO < 1e-10 ) + bo_ij->BO = 0.0; + if( bo_ij->BO_s < 1e-10 ) + bo_ij->BO_s = 0.0; + if( bo_ij->BO_pi < 1e-10 ) + bo_ij->BO_pi = 0.0; + if( bo_ij->BO_pi2 < 1e-10 ) + bo_ij->BO_pi2 = 0.0; - workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO + workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO - /* fprintf( stderr, "%d %d\t%g %g %g %g\n + /* fprintf( stderr, "%d %d\t%g %g %g %g\n Cdbo:\t%g %g %g\n Cdbopi:\t%g %g %g %g\n Cdbopi2:%g %g %g %g\n\n", @@ -1229,148 +1229,148 @@ bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi, bo_ij->C1dbopi2, bo_ij->C2dbopi2, bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */ - /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", - i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */ + /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", + i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */ #ifdef TEST_FORCES - Set_End_Index( pj, top_dbo, dBOs ); - //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); + Set_End_Index( pj, top_dbo, dBOs ); + //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); #endif - } - else { - /* We only need to update bond orders from bo_ji - 
everything else is set in uncorrected_bo calculations */ - sym_index = bonds->select.bond_list[pj].sym_index; - bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; - bo_ij->BO_pi2 = bo_ji->BO_pi2; - - workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO + } + else { + /* We only need to update bond orders from bo_ji + everything else is set in uncorrected_bo calculations */ + sym_index = bonds->select.bond_list[pj].sym_index; + bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data); + bo_ij->BO = bo_ji->BO; + bo_ij->BO_s = bo_ji->BO_s; + bo_ij->BO_pi = bo_ji->BO_pi; + bo_ij->BO_pi2 = bo_ji->BO_pi2; + + workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO #ifdef TEST_FORCES - //Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta ); + //Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta ); #endif - } - } + } + } #ifdef TEST_FORCES - // fprintf( stderr, "dDelta computations\nj:" ); - Set_Start_Index( i, top_dDelta, dDeltas ); - ptop_dDelta = &( dDeltas->select.dDelta_list[top_dDelta] ); - - for( pj = start_i; pj < end_i; ++pj ) { - j = bonds->select.bond_list[pj].nbr; - // fprintf( stderr, "%d ", j ); - - if( !rvec_isZero( workspace->dDelta[j] ) ) { - ptop_dDelta->wrt = j; - rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[j] ); - rvec_MakeZero( workspace->dDelta[j] ); - ++top_dDelta, ++ptop_dDelta; - } - - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - for( pk = start_j; pk < end_j; ++pk ) { - k = bonds->select.bond_list[pk].nbr; - if( !rvec_isZero( workspace->dDelta[k] ) ) { - ptop_dDelta->wrt = k; - rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[k] ); - rvec_MakeZero( workspace->dDelta[k] ); - ++top_dDelta, ++ptop_dDelta; - } - } - } - - Set_End_Index( i, top_dDelta, dDeltas ); - - /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj ) - fprintf( stdout, "dDel: %d %d [%g %g %g]\n", - i+1, 
dDeltas->select.dDelta_list[pj].wrt+1, - dDeltas->select.dDelta_list[pj].dVal[0], - dDeltas->select.dDelta_list[pj].dVal[1], - dDeltas->select.dDelta_list[pj].dVal[2] );*/ + // fprintf( stderr, "dDelta computations\nj:" ); + Set_Start_Index( i, top_dDelta, dDeltas ); + ptop_dDelta = &( dDeltas->select.dDelta_list[top_dDelta] ); + + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds->select.bond_list[pj].nbr; + // fprintf( stderr, "%d ", j ); + + if( !rvec_isZero( workspace->dDelta[j] ) ) { + ptop_dDelta->wrt = j; + rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[j] ); + rvec_MakeZero( workspace->dDelta[j] ); + ++top_dDelta, ++ptop_dDelta; + } + + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + for( pk = start_j; pk < end_j; ++pk ) { + k = bonds->select.bond_list[pk].nbr; + if( !rvec_isZero( workspace->dDelta[k] ) ) { + ptop_dDelta->wrt = k; + rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[k] ); + rvec_MakeZero( workspace->dDelta[k] ); + ++top_dDelta, ++ptop_dDelta; + } + } + } + + Set_End_Index( i, top_dDelta, dDeltas ); + + /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj ) + fprintf( stdout, "dDel: %d %d [%g %g %g]\n", + i+1, dDeltas->select.dDelta_list[pj].wrt+1, + dDeltas->select.dDelta_list[pj].dVal[0], + dDeltas->select.dDelta_list[pj].dVal[1], + dDeltas->select.dDelta_list[pj].dVal[2] );*/ #endif - } - - /*fprintf(stderr,"\tCalculated actual bond orders ...\n" ); - fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", - "atom", "Delta", "Delta_e", "Delta_boc", "nlp", - "Delta_lp", "Clp", "dDelta_lp" );*/ - - p_lp1 = system->reaxprm.gp.l[15]; - /* Calculate some helper variables that are used at many places - throughout force calculations */ - for( j = 0; j < system->N; ++j ) { - type_j = system->atoms[j].type; - sbp_j = &(system->reaxprm.sbp[ type_j ]); - - workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency; - workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e; - workspace->Delta_boc[j] = 
workspace->total_bond_order[j] - - sbp_j->valency_boc; - - workspace->vlpex[j] = workspace->Delta_e[j] - - 2.0 * (int)(workspace->Delta_e[j]/2.0); - explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j])); - workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0); - workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j]; - workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]); - /* Adri uses different dDelta_lp values than the ones in notes... */ - workspace->dDelta_lp[j] = workspace->Clp[j]; - //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * - //((fabs(workspace->Delta_e[j]/2.0 - - // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 ); - - if( sbp_j->mass > 21.0 ) { - workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); - workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; - workspace->dDelta_lp_temp[j] = 0.; - } - else { - workspace->nlp_temp[j] = workspace->nlp[j]; - workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; - workspace->dDelta_lp_temp[j] = workspace->Clp[j]; - } - - //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", - //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], - //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, - //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); - } - - //Print_Bonds( system, bonds, "sbonds.out" ); + } + + /*fprintf(stderr,"\tCalculated actual bond orders ...\n" ); + fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", + "atom", "Delta", "Delta_e", "Delta_boc", "nlp", + "Delta_lp", "Clp", "dDelta_lp" );*/ + + p_lp1 = system->reaxprm.gp.l[15]; + /* Calculate some helper variables that are used at many places + throughout force calculations */ + for( j = 0; j < system->N; ++j ) { + type_j = system->atoms[j].type; + sbp_j = &(system->reaxprm.sbp[ type_j ]); + + workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency; + workspace->Delta_e[j] = 
workspace->total_bond_order[j] - sbp_j->valency_e; + workspace->Delta_boc[j] = workspace->total_bond_order[j] - + sbp_j->valency_boc; + + workspace->vlpex[j] = workspace->Delta_e[j] - + 2.0 * (int)(workspace->Delta_e[j]/2.0); + explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j])); + workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0); + workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j]; + workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]); + /* Adri uses different dDelta_lp values than the ones in notes... */ + workspace->dDelta_lp[j] = workspace->Clp[j]; + //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * + //((fabs(workspace->Delta_e[j]/2.0 - + // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 ); + + if( sbp_j->mass > 21.0 ) { + workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); + workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; + workspace->dDelta_lp_temp[j] = 0.; + } + else { + workspace->nlp_temp[j] = workspace->nlp[j]; + workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j]; + workspace->dDelta_lp_temp[j] = workspace->Clp[j]; + } + + //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", + //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], + //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, + //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); + } + + //Print_Bonds( system, bonds, "sbonds.out" ); #if defined(DEBUG) - fprintf( stderr, "Number of bonds: %d\n", num_bonds ); - Print_Bond_Orders( system, control, data, workspace, lists, out_control ); + fprintf( stderr, "Number of bonds: %d\n", num_bonds ); + Print_Bond_Orders( system, control, data, workspace, lists, out_control ); #endif } //Cuda Functions GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, - static_storage workspace, int num_atom_types, int N 
) + static_storage workspace, int num_atom_types, int N ) { - int i, type_i; - real p_boc1, p_boc2; - single_body_parameters *sbp_i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - /* Calculate Deltaprime, Deltaprime_boc values */ - type_i = atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; - workspace.Deltap_boc[i] = - workspace.total_bond_order[i] - sbp_i->valency_val; - workspace.total_bond_order[i] = 0; + int i, type_i; + real p_boc1, p_boc2; + single_body_parameters *sbp_i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + /* Calculate Deltaprime, Deltaprime_boc values */ + type_i = atoms[i].type; + sbp_i = &(sbp[type_i]); + workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; + workspace.Deltap_boc[i] = + workspace.total_bond_order[i] - sbp_i->valency_val; + workspace.total_bond_order[i] = 0; } @@ -1379,267 +1379,267 @@ GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *atoms, global_paramete This can either be done in the general coordinator function or here */ GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, - two_body_parameters *tbp, static_storage workspace, list bonds, - list dDeltas, list dBOs, int num_atom_types, int N ) + two_body_parameters *tbp, static_storage workspace, list bonds, + list dDeltas, list dBOs, int num_atom_types, int N ) { - int i, j, pj, type_i, type_j; - int start_i, end_i; - int num_bonds, sym_index; - real p_boc1, p_boc2; - real val_i, Deltap_i, Deltap_boc_i; - real val_j, Deltap_j, Deltap_boc_j; - real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; - real exp_p1i, exp_p2i, exp_p1j, exp_p2j; - real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; - real Cf45_ij, Cf45_ji, p_lp1; - real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; 
- real explp1; - two_body_parameters *twbp; - bond_order_data *bo_ij, *bo_ji; - single_body_parameters *sbp_i, *sbp_j; + int i, j, pj, type_i, type_j; + int start_i, end_i; + int num_bonds, sym_index; + real p_boc1, p_boc2; + real val_i, Deltap_i, Deltap_boc_i; + real val_j, Deltap_j, Deltap_boc_j; + real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; + real exp_p1i, exp_p2i, exp_p1j, exp_p2j; + real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; + real Cf45_ij, Cf45_ji, p_lp1; + real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; + real explp1; + two_body_parameters *twbp; + bond_order_data *bo_ij, *bo_ji; + single_body_parameters *sbp_i, *sbp_j; #if defined(TEST_FORCES) - int k, pk, start_j, end_j; - int top_dbo=0, top_dDelta=0; - dbond_data *pdbo; - dDelta_data *ptop_dDelta; + int k, pk, start_j, end_j; + int top_dbo=0, top_dDelta=0; + dbond_data *pdbo; + dDelta_data *ptop_dDelta; #endif - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - num_bonds = 0; - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - /* Calculate Deltaprime, Deltaprime_boc values */ - //for( i = 0; i < system->N; ++i ) { - /* - if (i < N) { - type_i = atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; - workspace.Deltap_boc[i] = - workspace.total_bond_order[i] - sbp_i->valency_val; - workspace.total_bond_order[i] = 0; - - } - - __syncthreads (); - */ - - - // fprintf( stderr, "done with uncorrected bond orders\n" ); - - - /* Corrected Bond Order calculations */ - //for( i = 0; i < system->N; ++i ) { - type_i = atoms[i].type; - sbp_i = &(sbp[type_i]); - val_i = sbp_i->valency; - Deltap_i = workspace.Deltap[i]; - Deltap_boc_i = workspace.Deltap_boc[i]; - start_i = Start_Index(i, &bonds); - end_i = End_Index(i, &bonds); - //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", - // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); - - for( pj = start_i; pj < end_i; ++pj ) { - j = bonds.select.bond_list[pj].nbr; - 
type_j = atoms[j].type; - bo_ij = &( bonds.select.bond_list[pj].bo_data ); - //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); - - if( i < j ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + num_bonds = 0; + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + /* Calculate Deltaprime, Deltaprime_boc values */ + //for( i = 0; i < system->N; ++i ) { + /* + if (i < N) { + type_i = atoms[i].type; + sbp_i = &(sbp[type_i]); + workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; + workspace.Deltap_boc[i] = + workspace.total_bond_order[i] - sbp_i->valency_val; + workspace.total_bond_order[i] = 0; + + } + + __syncthreads (); + */ + + + // fprintf( stderr, "done with uncorrected bond orders\n" ); + + + /* Corrected Bond Order calculations */ + //for( i = 0; i < system->N; ++i ) { + type_i = atoms[i].type; + sbp_i = &(sbp[type_i]); + val_i = sbp_i->valency; + Deltap_i = workspace.Deltap[i]; + Deltap_boc_i = workspace.Deltap_boc[i]; + start_i = Start_Index(i, &bonds); + end_i = End_Index(i, &bonds); + //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", + // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); + + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds.select.bond_list[pj].nbr; + type_j = atoms[j].type; + bo_ij = &( bonds.select.bond_list[pj].bo_data ); + //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); + + if( i < j ) { + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); #ifdef TEST_FORCES - Set_Start_Index( pj, top_dbo, &dBOs ); - /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", - workspace->reverse_map[i], workspace->reverse_map[j], - twbp->ovc, twbp->v13cor, bo_ij->BO ); */ + Set_Start_Index( pj, top_dbo, &dBOs ); + /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", + workspace->reverse_map[i], workspace->reverse_map[j], + twbp->ovc, twbp->v13cor, bo_ij->BO ); */ #endif - if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) 
{ - /* There is no correction to bond orders nor to derivatives of - bond order prime! So we leave bond orders unchanged and - set derivative of bond order coefficients s.t. - dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ - bo_ij->C1dbo = 1.000000; - bo_ij->C2dbo = 0.000000; - bo_ij->C3dbo = 0.000000; - - bo_ij->C1dbopi = bo_ij->BO_pi; - bo_ij->C2dbopi = 0.000000; - bo_ij->C3dbopi = 0.000000; - bo_ij->C4dbopi = 0.000000; - - bo_ij->C1dbopi2 = bo_ij->BO_pi2; - bo_ij->C2dbopi2 = 0.000000; - bo_ij->C3dbopi2 = 0.000000; - bo_ij->C4dbopi2 = 0.000000; + if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { + /* There is no correction to bond orders nor to derivatives of + bond order prime! So we leave bond orders unchanged and + set derivative of bond order coefficients s.t. + dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ + bo_ij->C1dbo = 1.000000; + bo_ij->C2dbo = 0.000000; + bo_ij->C3dbo = 0.000000; + + bo_ij->C1dbopi = bo_ij->BO_pi; + bo_ij->C2dbopi = 0.000000; + bo_ij->C3dbopi = 0.000000; + bo_ij->C4dbopi = 0.000000; + + bo_ij->C1dbopi2 = bo_ij->BO_pi2; + bo_ij->C2dbopi2 = 0.000000; + bo_ij->C3dbopi2 = 0.000000; + bo_ij->C4dbopi2 = 0.000000; #ifdef TEST_FORCES - pdbo = &(dBOs.select.dbo_list[ top_dbo ]); - - // compute dBO_ij/dr_i - pdbo->wrt = i; - rvec_Copy( pdbo->dBO, bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 ); - - // compute dBO_ij/dr_j - pdbo++; - pdbo->wrt = j; - rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 ); - - top_dbo += 2; + pdbo = &(dBOs.select.dbo_list[ top_dbo ]); + + // compute dBO_ij/dr_i + pdbo->wrt = i; + rvec_Copy( pdbo->dBO, bo_ij->dBOp ); + rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); + rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 ); + + // compute dBO_ij/dr_j + pdbo++; + pdbo->wrt = j; + 
rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp ); + rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi ); + rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 ); + + top_dbo += 2; #endif - } - else { - val_j = sbp[type_j].valency; - Deltap_j = workspace.Deltap[j]; - Deltap_boc_j = workspace.Deltap_boc[j]; - - /* on page 1 */ - if( twbp->ovc >= 0.001 ) { - /* Correction for overcoordination */ - exp_p1i = EXP( -p_boc1 * Deltap_i ); - exp_p2i = EXP( -p_boc2 * Deltap_i ); - exp_p1j = EXP( -p_boc1 * Deltap_j ); - exp_p2j = EXP( -p_boc2 * Deltap_j ); - - f2 = exp_p1i + exp_p1j; - f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); - f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + - ( val_j + f2 )/( val_j + f2 + f3 ) ); - - /*fprintf( stderr,"%6d%6d\t%g %g j:%g %g p_boc:%g %g\n", - i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 ); - fprintf( stderr,"\tf:%g %g %g, exp:%g %g %g %g\n", - f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ - - /* Now come the derivates */ - /* Bond Order pages 5-7, derivative of f1 */ - temp = f2 + f3; - u1_ij = val_i + temp; - u1_ji = val_j + temp; - Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji )); - Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + - ( u1_ji - f3 ) / SQR( u1_ji )); - - //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + - // Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j ); - Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - - ((val_i+f2) / SQR(u1_ij)) * - ( -p_boc1 * exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) ) + - -p_boc1 * exp_p1i / u1_ji - - ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) )); - - Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + - Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); - //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); - } - else { - /* No overcoordination correction! 
*/ - f1 = 1.0; - Cf1_ij = Cf1_ji = 0.0; - } - - if( twbp->v13cor >= 0.001 ) { - /* Correction for 1-3 bond orders */ - exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); - exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); - - f4 = 1. / (1. + exp_f4); - f5 = 1. / (1. + exp_f5); - f4f5 = f4 * f5; - - /* Bond Order pages 8-9, derivative of f4 and f5 */ - /*temp = twbp->p_boc5 - - twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); - u_ij = temp + twbp->p_boc3 * Deltap_boc_i; - u_ji = temp + twbp->p_boc3 * Deltap_boc_j; - Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; - Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ - Cf45_ij = -f4 * exp_f4; - Cf45_ji = -f5 * exp_f5; - } - else { - f4 = f5 = f4f5 = 1.0; - Cf45_ij = Cf45_ji = 0.0; - } - - /* Bond Order page 10, derivative of total bond order */ - A0_ij = f1 * f4f5; - A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * - (Cf45_ij + Cf45_ji); - A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; - A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; - A3_ij = A2_ij + Cf1_ij / f1; - A3_ji = A2_ji + Cf1_ji / f1; - - /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f + } + else { + val_j = sbp[type_j].valency; + Deltap_j = workspace.Deltap[j]; + Deltap_boc_j = workspace.Deltap_boc[j]; + + /* on page 1 */ + if( twbp->ovc >= 0.001 ) { + /* Correction for overcoordination */ + exp_p1i = EXP( -p_boc1 * Deltap_i ); + exp_p2i = EXP( -p_boc2 * Deltap_i ); + exp_p1j = EXP( -p_boc1 * Deltap_j ); + exp_p2j = EXP( -p_boc2 * Deltap_j ); + + f2 = exp_p1i + exp_p1j; + f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); + f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + + ( val_j + f2 )/( val_j + f2 + f3 ) ); + + /*fprintf( stderr,"%6d%6d\t%g %g j:%g %g p_boc:%g %g\n", + i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 ); + fprintf( stderr,"\tf:%g %g %g, exp:%g %g %g %g\n", + f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ + + /* Now come the derivates */ + 
/* Bond Order pages 5-7, derivative of f1 */ + temp = f2 + f3; + u1_ij = val_i + temp; + u1_ji = val_j + temp; + Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji )); + Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + + ( u1_ji - f3 ) / SQR( u1_ji )); + + //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + + // Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j ); + Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - + ((val_i+f2) / SQR(u1_ij)) * + ( -p_boc1 * exp_p1i + + exp_p2i / ( exp_p2i + exp_p2j ) ) + + -p_boc1 * exp_p1i / u1_ji - + ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i + + exp_p2i / ( exp_p2i + exp_p2j ) )); + + Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + + Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); + //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); + } + else { + /* No overcoordination correction! */ + f1 = 1.0; + Cf1_ij = Cf1_ji = 0.0; + } + + if( twbp->v13cor >= 0.001 ) { + /* Correction for 1-3 bond orders */ + exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - + Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); + exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - + Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); + + f4 = 1. / (1. + exp_f4); + f5 = 1. / (1. 
+ exp_f5); + f4f5 = f4 * f5; + + /* Bond Order pages 8-9, derivative of f4 and f5 */ + /*temp = twbp->p_boc5 - + twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); + u_ij = temp + twbp->p_boc3 * Deltap_boc_i; + u_ji = temp + twbp->p_boc3 * Deltap_boc_j; + Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; + Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ + Cf45_ij = -f4 * exp_f4; + Cf45_ji = -f5 * exp_f5; + } + else { + f4 = f5 = f4f5 = 1.0; + Cf45_ij = Cf45_ji = 0.0; + } + + /* Bond Order page 10, derivative of total bond order */ + A0_ij = f1 * f4f5; + A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * + (Cf45_ij + Cf45_ji); + A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; + A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; + A3_ij = A2_ij + Cf1_ij / f1; + A3_ji = A2_ji + Cf1_ji / f1; + + /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n", bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/ - /* find corrected bond order values and their deriv coefs */ - bo_ij->BO = bo_ij->BO * A0_ij; - bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; - bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; - bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + /* find corrected bond order values and their deriv coefs */ + bo_ij->BO = bo_ij->BO * A0_ij; + bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; + bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; + bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; - bo_ij->C2dbo = bo_ij->BO * A2_ij; - bo_ij->C3dbo = bo_ij->BO * A2_ji; + bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; + bo_ij->C2dbo = bo_ij->BO * A2_ij; + bo_ij->C3dbo = bo_ij->BO * A2_ji; - bo_ij->C1dbopi = f1*f1*f4*f5; - bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; - bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; - bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; + bo_ij->C1dbopi = f1*f1*f4*f5; + bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; + bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; + bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; - bo_ij->C1dbopi2 = f1*f1*f4*f5; - 
bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; - bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; + bo_ij->C1dbopi2 = f1*f1*f4*f5; + bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; + bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; #ifdef TEST_FORCES - /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", - i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/ - - /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", - //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n", - workspace->orig_id[i], workspace->orig_id[j] - A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji - bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s, - bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, - bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi, - bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2 - ); */ - - Calculate_dBO( i, pj, workspace, lists, &top_dbo ); + /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", + i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/ + + /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", + //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n", + workspace->orig_id[i], workspace->orig_id[j] + A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji + bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s, + bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, + bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi, + bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2 + ); */ + + Calculate_dBO( i, pj, workspace, lists, &top_dbo ); #endif - } + } - /* neglect bonds that are < 1e-10 */ - if( bo_ij->BO < 1e-10 ) - bo_ij->BO = 0.0; - if( bo_ij->BO_s < 1e-10 ) - bo_ij->BO_s = 0.0; - if( bo_ij->BO_pi < 1e-10 ) - bo_ij->BO_pi = 0.0; - if( bo_ij->BO_pi2 < 1e-10 ) - bo_ij->BO_pi2 = 0.0; + /* neglect bonds that are < 1e-10 */ + if( bo_ij->BO < 1e-10 ) + bo_ij->BO = 0.0; + if( bo_ij->BO_s < 1e-10 ) + bo_ij->BO_s = 0.0; + if( bo_ij->BO_pi < 1e-10 ) + bo_ij->BO_pi = 
0.0; + if( bo_ij->BO_pi2 < 1e-10 ) + bo_ij->BO_pi2 = 0.0; - workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO + workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO - /* fprintf( stderr, "%d %d\t%g %g %g %g\n + /* fprintf( stderr, "%d %d\t%g %g %g %g\n Cdbo:\t%g %g %g\n Cdbopi:\t%g %g %g %g\n Cdbopi2:%g %g %g %g\n\n", @@ -1650,32 +1650,32 @@ bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi, bo_ij->C1dbopi2, bo_ij->C2dbopi2, bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */ - /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", - i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */ + /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", + i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */ #ifdef TEST_FORCES - Set_End_Index( pj, top_dbo, &dBOs ); - //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); + Set_End_Index( pj, top_dbo, &dBOs ); + //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); #endif - } - /* - else { - // We only need to update bond orders from bo_ji - // everything else is set in uncorrected_bo calculations - sym_index = bonds.select.bond_list[pj].sym_index; - bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; - bo_ij->BO_pi2 = bo_ji->BO_pi2; - - workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO + } + /* + else { + // We only need to update bond orders from bo_ji + // everything else is set in uncorrected_bo calculations + sym_index = bonds.select.bond_list[pj].sym_index; + bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); + bo_ij->BO = bo_ji->BO; + bo_ij->BO_s = bo_ji->BO_s; + bo_ij->BO_pi = bo_ji->BO_pi; + bo_ij->BO_pi2 = bo_ji->BO_pi2; + + workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO #ifdef TEST_FORCES - //Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta ); + //Add_dBO( system, lists, j, sym_index, 1.0, 
workspace.dDelta ); #endif -} - */ +} + */ } #ifdef TEST_FORCES @@ -1684,27 +1684,27 @@ Set_Start_Index( i, top_dDelta, &dDeltas ); ptop_dDelta = &( dDeltas.select.dDelta_list[top_dDelta] ); for( pj = start_i; pj < end_i; ++pj ) { - j = bonds.select.bond_list[pj].nbr; - // fprintf( stderr, "%d ", j ); - - if( !rvec_isZero( workspace.dDelta[j] ) ) { - ptop_dDelta->wrt = j; - rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] ); - rvec_MakeZero( workspace.dDelta[j] ); - ++top_dDelta, ++ptop_dDelta; - } - - start_j = Start_Index(j, &bonds); - end_j = End_Index(j, &bonds); - for( pk = start_j; pk < end_j; ++pk ) { - k = bonds.select.bond_list[pk].nbr; - if( !rvec_isZero( workspace.dDelta[k] ) ) { - ptop_dDelta->wrt = k; - rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] ); - rvec_MakeZero( workspace.dDelta[k] ); - ++top_dDelta, ++ptop_dDelta; - } - } + j = bonds.select.bond_list[pj].nbr; + // fprintf( stderr, "%d ", j ); + + if( !rvec_isZero( workspace.dDelta[j] ) ) { + ptop_dDelta->wrt = j; + rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] ); + rvec_MakeZero( workspace.dDelta[j] ); + ++top_dDelta, ++ptop_dDelta; + } + + start_j = Start_Index(j, &bonds); + end_j = End_Index(j, &bonds); + for( pk = start_j; pk < end_j; ++pk ) { + k = bonds.select.bond_list[pk].nbr; + if( !rvec_isZero( workspace.dDelta[k] ) ) { + ptop_dDelta->wrt = k; + rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] ); + rvec_MakeZero( workspace.dDelta[k] ); + ++top_dDelta, ++ptop_dDelta; + } + } } Set_End_Index( i, top_dDelta, &dDeltas ); @@ -1780,125 +1780,125 @@ workspace.dDelta_lp_temp[j] = workspace.Clp[j]; GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage workspace, list bonds, int N ) { - int i, j, pj; - int start_i, end_i; - int sym_index; + int i, j, pj; + int start_i, end_i; + int sym_index; - bond_order_data *bo_ij, *bo_ji; + bond_order_data *bo_ij, *bo_ji; - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) 
return; - start_i = Start_Index(i, &bonds); - end_i = End_Index(i, &bonds); + start_i = Start_Index(i, &bonds); + end_i = End_Index(i, &bonds); - for( pj = start_i; pj < end_i; ++pj ) { + for( pj = start_i; pj < end_i; ++pj ) { - j = bonds.select.bond_list[pj].nbr; - bo_ij = &( bonds.select.bond_list[pj].bo_data ); + j = bonds.select.bond_list[pj].nbr; + bo_ij = &( bonds.select.bond_list[pj].bo_data ); - if( i >= j ) { - // We only need to update bond orders from bo_ji - // everything else is set in uncorrected_bo calculations - sym_index = bonds.select.bond_list[pj].sym_index; - bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; - bo_ij->BO_pi2 = bo_ji->BO_pi2; + if( i >= j ) { + // We only need to update bond orders from bo_ji + // everything else is set in uncorrected_bo calculations + sym_index = bonds.select.bond_list[pj].sym_index; + bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); + bo_ij->BO = bo_ji->BO; + bo_ij->BO_s = bo_ji->BO_s; + bo_ij->BO_pi = bo_ji->BO_pi; + bo_ij->BO_pi2 = bo_ji->BO_pi2; - workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO - } - } + workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO + } + } } GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, - static_storage workspace, int N ) + static_storage workspace, int N ) { - int j, type_j; - real explp1; - real p_lp1; - single_body_parameters *sbp_i, *sbp_j; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - p_lp1 = g_params.l[15]; - - /* Calculate some helper variables that are used at many places - throughout force calculations */ - //for( j = 0; j < system->N; ++j ) { - type_j = atoms[j].type; - sbp_j = &(sbp[ type_j ]); - - workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency; - workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e; 
- workspace.Delta_boc[j] = workspace.total_bond_order[j] - - sbp_j->valency_boc; - - workspace.vlpex[j] = workspace.Delta_e[j] - - 2.0 * (int)(workspace.Delta_e[j]/2.0); - explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j])); - workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0); - workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j]; - workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]); - /* Adri uses different dDelta_lp values than the ones in notes... */ - workspace.dDelta_lp[j] = workspace.Clp[j]; - //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * - //((fabs(workspace->Delta_e[j]/2.0 - - // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 ); - - if( sbp_j->mass > 21.0 ) { - workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); - workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; - workspace.dDelta_lp_temp[j] = 0.; - } - else { - workspace.nlp_temp[j] = workspace.nlp[j]; - workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; - workspace.dDelta_lp_temp[j] = workspace.Clp[j]; - } - - //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", - //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], - //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, - //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); - //} + int j, type_j; + real explp1; + real p_lp1; + single_body_parameters *sbp_i, *sbp_j; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + p_lp1 = g_params.l[15]; + + /* Calculate some helper variables that are used at many places + throughout force calculations */ + //for( j = 0; j < system->N; ++j ) { + type_j = atoms[j].type; + sbp_j = &(sbp[ type_j ]); + + workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency; + workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e; + workspace.Delta_boc[j] = workspace.total_bond_order[j] - + sbp_j->valency_boc; + 
+ workspace.vlpex[j] = workspace.Delta_e[j] - + 2.0 * (int)(workspace.Delta_e[j]/2.0); + explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j])); + workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0); + workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j]; + workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]); + /* Adri uses different dDelta_lp values than the ones in notes... */ + workspace.dDelta_lp[j] = workspace.Clp[j]; + //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * + //((fabs(workspace->Delta_e[j]/2.0 - + // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 ); + + if( sbp_j->mass > 21.0 ) { + workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); + workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; + workspace.dDelta_lp_temp[j] = 0.; + } + else { + workspace.nlp_temp[j] = workspace.nlp[j]; + workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; + workspace.dDelta_lp_temp[j] = workspace.Clp[j]; + } + + //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", + //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], + //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, + //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); + //} } //Import from the forces file. 
GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, - static_storage workspace, list p_bonds, int ensemble, int N) + static_storage workspace, list p_bonds, int ensemble, int N) { - int i, pj; - list *bonds = &p_bonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N) - { - for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj) - { - //int j = bonds->select.bond_list[pj].nbr; - if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) - Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds ); - else - Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds ); - } - } + int i, pj; + list *bonds = &p_bonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) + { + for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj) + { + //int j = bonds->select.bond_list[pj].nbr; + if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) + Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds ); + else + Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds ); + } + } } GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, - static_storage workspace, list p_bonds, int ensemble, int N) + static_storage workspace, list p_bonds, int ensemble, int N) { - int i, pj; - list *bonds = &p_bonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N) - { - if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) - Cuda_dbond_to_Forces_postprocess (i, atoms, bonds ); - } + int i, pj; + list *bonds = &p_bonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) + { + if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) + Cuda_dbond_to_Forces_postprocess (i, atoms, bonds ); + } } diff --git a/PuReMD-GPU/src/box.cu b/PuReMD-GPU/src/box.cu index ae9a07ed..e42395c5 100644 --- a/PuReMD-GPU/src/box.cu +++ b/PuReMD-GPU/src/box.cu @@ -23,295 +23,295 @@ void Init_Box_From_CRYST(real a, real b, real c, - real alpha, real beta, real 
gamma, - simulation_box* box ) + real alpha, real beta, real gamma, + simulation_box* box ) { - double c_alpha, c_beta, c_gamma, s_gamma, zi; + double c_alpha, c_beta, c_gamma, s_gamma, zi; - c_alpha = cos(DEG2RAD(alpha)); - c_beta = cos(DEG2RAD(beta)); - c_gamma = cos(DEG2RAD(gamma)); - s_gamma = sin(DEG2RAD(gamma)); + c_alpha = cos(DEG2RAD(alpha)); + c_beta = cos(DEG2RAD(beta)); + c_gamma = cos(DEG2RAD(gamma)); + s_gamma = sin(DEG2RAD(gamma)); - zi = (c_alpha - c_beta * c_gamma)/s_gamma; + zi = (c_alpha - c_beta * c_gamma)/s_gamma; - box->box[0][0] = a; - box->box[0][1] = 0.0; - box->box[0][2] = 0.0; + box->box[0][0] = a; + box->box[0][1] = 0.0; + box->box[0][2] = 0.0; - box->box[1][0] = b * c_gamma; - box->box[1][1] = b * s_gamma; - box->box[1][2] = 0.0; + box->box[1][0] = b * c_gamma; + box->box[1][1] = b * s_gamma; + box->box[1][2] = 0.0; - box->box[2][0] = c * c_beta; - box->box[2][1] = c * zi; - box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi)); + box->box[2][0] = c * c_beta; + box->box[2][1] = c * zi; + box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi)); - Make_Consistent( box ); + Make_Consistent( box ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n", - box->box[0][0], box->box[1][1], box->box[2][2] ); + fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n", + box->box[0][0], box->box[1][1], box->box[2][2] ); #endif } void Update_Box( rtensor box_tensor, simulation_box* box ) { - int i, j; + int i, j; - for (i=0; i < 3; i++) - for (j=0; j < 3; j++) - box->box[i][j] = box_tensor[i][j]; + for (i=0; i < 3; i++) + for (j=0; j < 3; j++) + box->box[i][j] = box_tensor[i][j]; - Make_Consistent( box ); + Make_Consistent( box ); } void Update_Box_Isotropic( simulation_box *box, real mu ) { - /*box->box[0][0] = - POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 ); - box->box[1][1] = box->box[0][0] * box->side_prop[1]; - box->box[2][2] = box->box[0][0] * box->side_prop[2]; - */ - rtensor_Copy( box->old_box, box->box ); 
- box->box[0][0] *= mu; - box->box[1][1] *= mu; - box->box[2][2] *= mu; - - box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2]; - Make_Consistent(box/*, periodic*/); + /*box->box[0][0] = + POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 ); + box->box[1][1] = box->box[0][0] * box->side_prop[1]; + box->box[2][2] = box->box[0][0] * box->side_prop[2]; + */ + rtensor_Copy( box->old_box, box->box ); + box->box[0][0] *= mu; + box->box[1][1] *= mu; + box->box[2][2] *= mu; + + box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2]; + Make_Consistent(box/*, periodic*/); } void Update_Box_SemiIsotropic( simulation_box *box, rvec mu ) { - /*box->box[0][0] = - POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 ); - box->box[1][1] = box->box[0][0] * box->side_prop[1]; - box->box[2][2] = box->box[0][0] * box->side_prop[2]; */ - rtensor_Copy( box->old_box, box->box ); - box->box[0][0] *= mu[0]; - box->box[1][1] *= mu[1]; - box->box[2][2] *= mu[2]; - - box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2]; - Make_Consistent(box); + /*box->box[0][0] = + POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 ); + box->box[1][1] = box->box[0][0] * box->side_prop[1]; + box->box[2][2] = box->box[0][0] * box->side_prop[2]; */ + rtensor_Copy( box->old_box, box->box ); + box->box[0][0] *= mu[0]; + box->box[1][1] *= mu[1]; + box->box[2][2] *= mu[2]; + + box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2]; + Make_Consistent(box); } void Make_Consistent(simulation_box* box) { - real one_vol; - - box->volume = - box->box[0][0] * (box->box[1][1]*box->box[2][2] - - box->box[2][1]*box->box[2][1]) + - box->box[0][1] * (box->box[2][0]*box->box[1][2] - - box->box[1][0]*box->box[2][2]) + - box->box[0][2] * (box->box[1][0]*box->box[2][1] - - box->box[2][0]*box->box[1][1]); - - one_vol = 1.0/box->volume; - - box->box_inv[0][0] = (box->box[1][1]*box->box[2][2] - - box->box[1][2]*box->box[2][1]) * one_vol; - box->box_inv[0][1] = 
(box->box[0][2]*box->box[2][1] - - box->box[0][1]*box->box[2][2]) * one_vol; - box->box_inv[0][2] = (box->box[0][1]*box->box[1][2] - - box->box[0][2]*box->box[1][1]) * one_vol; - - box->box_inv[1][0] = (box->box[1][2]*box->box[2][0] - - box->box[1][0]*box->box[2][2]) * one_vol; - box->box_inv[1][1] = (box->box[0][0]*box->box[2][2] - - box->box[0][2]*box->box[2][0]) * one_vol; - box->box_inv[1][2] = (box->box[0][2]*box->box[1][0] - - box->box[0][0]*box->box[1][2]) * one_vol; - - box->box_inv[2][0] = (box->box[1][0]*box->box[2][1] - - box->box[1][1]*box->box[2][0]) * one_vol; - box->box_inv[2][1] = (box->box[0][1]*box->box[2][0] - - box->box[0][0]*box->box[2][1]) * one_vol; - box->box_inv[2][2] = (box->box[0][0]*box->box[1][1] - - box->box[0][1]*box->box[1][0]) * one_vol; - - box->box_norms[0] = SQRT( SQR(box->box[0][0]) + - SQR(box->box[0][1]) + - SQR(box->box[0][2]) ); - box->box_norms[1] = SQRT( SQR(box->box[1][0]) + - SQR(box->box[1][1]) + - SQR(box->box[1][2]) ); - box->box_norms[2] = SQRT( SQR(box->box[2][0]) + - SQR(box->box[2][1]) + - SQR(box->box[2][2]) ); - - box->trans[0][0] = box->box[0][0]/box->box_norms[0]; - box->trans[0][1] = box->box[1][0]/box->box_norms[0]; - box->trans[0][2] = box->box[2][0]/box->box_norms[0]; - - box->trans[1][0] = box->box[0][1]/box->box_norms[1]; - box->trans[1][1] = box->box[1][1]/box->box_norms[1]; - box->trans[1][2] = box->box[2][1]/box->box_norms[1]; - - box->trans[2][0] = box->box[0][2]/box->box_norms[2]; - box->trans[2][1] = box->box[1][2]/box->box_norms[2]; - box->trans[2][2] = box->box[2][2]/box->box_norms[2]; - - one_vol = box->box_norms[0]*box->box_norms[1]*box->box_norms[2]*one_vol; - - box->trans_inv[0][0] = (box->trans[1][1]*box->trans[2][2] - - box->trans[1][2]*box->trans[2][1]) * one_vol; - box->trans_inv[0][1] = (box->trans[0][2]*box->trans[2][1] - - box->trans[0][1]*box->trans[2][2]) * one_vol; - box->trans_inv[0][2] = (box->trans[0][1]*box->trans[1][2] - - box->trans[0][2]*box->trans[1][1]) * one_vol; - - 
box->trans_inv[1][0] = (box->trans[1][2]*box->trans[2][0] - - box->trans[1][0]*box->trans[2][2]) * one_vol; - box->trans_inv[1][1] = (box->trans[0][0]*box->trans[2][2] - - box->trans[0][2]*box->trans[2][0]) * one_vol; - box->trans_inv[1][2] = (box->trans[0][2]*box->trans[1][0] - - box->trans[0][0]*box->trans[1][2]) * one_vol; - - box->trans_inv[2][0] = (box->trans[1][0]*box->trans[2][1] - - box->trans[1][1]*box->trans[2][0]) * one_vol; - box->trans_inv[2][1] = (box->trans[0][1]*box->trans[2][0] - - box->trans[0][0]*box->trans[2][1]) * one_vol; - box->trans_inv[2][2] = (box->trans[0][0]*box->trans[1][1] - - box->trans[0][1]*box->trans[1][0]) * one_vol; - - // for (i=0; i < 3; i++) - // { - // for (j=0; j < 3; j++) - // fprintf(stderr,"%lf\t",box->trans[i][j]); - // fprintf(stderr,"\n"); - // } - // fprintf(stderr,"\n"); - // for (i=0; i < 3; i++) - // { - // for (j=0; j < 3; j++) - // fprintf(stderr,"%lf\t",box->trans_inv[i][j]); - // fprintf(stderr,"\n"); - // } - - - box->g[0][0] = box->box[0][0] * box->box[0][0] + - box->box[0][1] * box->box[0][1] + - box->box[0][2] * box->box[0][2]; - box->g[1][0] = - box->g[0][1] = box->box[0][0] * box->box[1][0] + - box->box[0][1] * box->box[1][1] + - box->box[0][2] * box->box[1][2]; - box->g[2][0] = - box->g[0][2] = box->box[0][0] * box->box[2][0] + - box->box[0][1] * box->box[2][1] + - box->box[0][2] * box->box[2][2]; - - box->g[1][1] = box->box[1][0] * box->box[1][0] + - box->box[1][1] * box->box[1][1] + - box->box[1][2] * box->box[1][2]; - box->g[1][2] = - box->g[2][1] = box->box[1][0] * box->box[2][0] + - box->box[1][1] * box->box[2][1] + - box->box[1][2] * box->box[2][2]; - - box->g[2][2] = box->box[2][0] * box->box[2][0] + - box->box[2][1] * box->box[2][1] + - box->box[2][2] * box->box[2][2]; - - // These proportions are only used for isotropic_NPT! 
- box->side_prop[0] = box->box[0][0] / box->box[0][0]; - box->side_prop[1] = box->box[1][1] / box->box[0][0]; - box->side_prop[2] = box->box[2][2] / box->box[0][0]; + real one_vol; + + box->volume = + box->box[0][0] * (box->box[1][1]*box->box[2][2] - + box->box[2][1]*box->box[2][1]) + + box->box[0][1] * (box->box[2][0]*box->box[1][2] - + box->box[1][0]*box->box[2][2]) + + box->box[0][2] * (box->box[1][0]*box->box[2][1] - + box->box[2][0]*box->box[1][1]); + + one_vol = 1.0/box->volume; + + box->box_inv[0][0] = (box->box[1][1]*box->box[2][2] - + box->box[1][2]*box->box[2][1]) * one_vol; + box->box_inv[0][1] = (box->box[0][2]*box->box[2][1] - + box->box[0][1]*box->box[2][2]) * one_vol; + box->box_inv[0][2] = (box->box[0][1]*box->box[1][2] - + box->box[0][2]*box->box[1][1]) * one_vol; + + box->box_inv[1][0] = (box->box[1][2]*box->box[2][0] - + box->box[1][0]*box->box[2][2]) * one_vol; + box->box_inv[1][1] = (box->box[0][0]*box->box[2][2] - + box->box[0][2]*box->box[2][0]) * one_vol; + box->box_inv[1][2] = (box->box[0][2]*box->box[1][0] - + box->box[0][0]*box->box[1][2]) * one_vol; + + box->box_inv[2][0] = (box->box[1][0]*box->box[2][1] - + box->box[1][1]*box->box[2][0]) * one_vol; + box->box_inv[2][1] = (box->box[0][1]*box->box[2][0] - + box->box[0][0]*box->box[2][1]) * one_vol; + box->box_inv[2][2] = (box->box[0][0]*box->box[1][1] - + box->box[0][1]*box->box[1][0]) * one_vol; + + box->box_norms[0] = SQRT( SQR(box->box[0][0]) + + SQR(box->box[0][1]) + + SQR(box->box[0][2]) ); + box->box_norms[1] = SQRT( SQR(box->box[1][0]) + + SQR(box->box[1][1]) + + SQR(box->box[1][2]) ); + box->box_norms[2] = SQRT( SQR(box->box[2][0]) + + SQR(box->box[2][1]) + + SQR(box->box[2][2]) ); + + box->trans[0][0] = box->box[0][0]/box->box_norms[0]; + box->trans[0][1] = box->box[1][0]/box->box_norms[0]; + box->trans[0][2] = box->box[2][0]/box->box_norms[0]; + + box->trans[1][0] = box->box[0][1]/box->box_norms[1]; + box->trans[1][1] = box->box[1][1]/box->box_norms[1]; + box->trans[1][2] = 
box->box[2][1]/box->box_norms[1]; + + box->trans[2][0] = box->box[0][2]/box->box_norms[2]; + box->trans[2][1] = box->box[1][2]/box->box_norms[2]; + box->trans[2][2] = box->box[2][2]/box->box_norms[2]; + + one_vol = box->box_norms[0]*box->box_norms[1]*box->box_norms[2]*one_vol; + + box->trans_inv[0][0] = (box->trans[1][1]*box->trans[2][2] - + box->trans[1][2]*box->trans[2][1]) * one_vol; + box->trans_inv[0][1] = (box->trans[0][2]*box->trans[2][1] - + box->trans[0][1]*box->trans[2][2]) * one_vol; + box->trans_inv[0][2] = (box->trans[0][1]*box->trans[1][2] - + box->trans[0][2]*box->trans[1][1]) * one_vol; + + box->trans_inv[1][0] = (box->trans[1][2]*box->trans[2][0] - + box->trans[1][0]*box->trans[2][2]) * one_vol; + box->trans_inv[1][1] = (box->trans[0][0]*box->trans[2][2] - + box->trans[0][2]*box->trans[2][0]) * one_vol; + box->trans_inv[1][2] = (box->trans[0][2]*box->trans[1][0] - + box->trans[0][0]*box->trans[1][2]) * one_vol; + + box->trans_inv[2][0] = (box->trans[1][0]*box->trans[2][1] - + box->trans[1][1]*box->trans[2][0]) * one_vol; + box->trans_inv[2][1] = (box->trans[0][1]*box->trans[2][0] - + box->trans[0][0]*box->trans[2][1]) * one_vol; + box->trans_inv[2][2] = (box->trans[0][0]*box->trans[1][1] - + box->trans[0][1]*box->trans[1][0]) * one_vol; + + // for (i=0; i < 3; i++) + // { + // for (j=0; j < 3; j++) + // fprintf(stderr,"%lf\t",box->trans[i][j]); + // fprintf(stderr,"\n"); + // } + // fprintf(stderr,"\n"); + // for (i=0; i < 3; i++) + // { + // for (j=0; j < 3; j++) + // fprintf(stderr,"%lf\t",box->trans_inv[i][j]); + // fprintf(stderr,"\n"); + // } + + + box->g[0][0] = box->box[0][0] * box->box[0][0] + + box->box[0][1] * box->box[0][1] + + box->box[0][2] * box->box[0][2]; + box->g[1][0] = + box->g[0][1] = box->box[0][0] * box->box[1][0] + + box->box[0][1] * box->box[1][1] + + box->box[0][2] * box->box[1][2]; + box->g[2][0] = + box->g[0][2] = box->box[0][0] * box->box[2][0] + + box->box[0][1] * box->box[2][1] + + box->box[0][2] * box->box[2][2]; + + 
box->g[1][1] = box->box[1][0] * box->box[1][0] + + box->box[1][1] * box->box[1][1] + + box->box[1][2] * box->box[1][2]; + box->g[1][2] = + box->g[2][1] = box->box[1][0] * box->box[2][0] + + box->box[1][1] * box->box[2][1] + + box->box[1][2] * box->box[2][2]; + + box->g[2][2] = box->box[2][0] * box->box[2][0] + + box->box[2][1] * box->box[2][1] + + box->box[2][2] * box->box[2][2]; + + // These proportions are only used for isotropic_NPT! + box->side_prop[0] = box->box[0][0] / box->box[0][0]; + box->side_prop[1] = box->box[1][1] / box->box[0][0]; + box->side_prop[2] = box->box[2][2] / box->box[0][0]; } void Transform( rvec x1, simulation_box *box, char flag, rvec x2 ) { - int i, j; - real tmp; - - // printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]); - - if (flag > 0) { - for (i=0; i < 3; i++) { - tmp = 0.0; - for (j=0; j < 3; j++) - tmp += box->trans[i][j]*x1[j]; - x2[i] = tmp; - } - } - else { - for (i=0; i < 3; i++) { - tmp = 0.0; - for (j=0; j < 3; j++) - tmp += box->trans_inv[i][j]*x1[j]; - x2[i] = tmp; - } - } - // printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]); + int i, j; + real tmp; + + // printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]); + + if (flag > 0) { + for (i=0; i < 3; i++) { + tmp = 0.0; + for (j=0; j < 3; j++) + tmp += box->trans[i][j]*x1[j]; + x2[i] = tmp; + } + } + else { + for (i=0; i < 3; i++) { + tmp = 0.0; + for (j=0; j < 3; j++) + tmp += box->trans_inv[i][j]*x1[j]; + x2[i] = tmp; + } + } + // printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]); } void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 ) { - Transform( x1, box, flag, x2 ); + Transform( x1, box, flag, x2 ); - x2[0] /= box->box_norms[0]; - x2[1] /= box->box_norms[1]; - x2[2] /= box->box_norms[2]; + x2[0] /= box->box_norms[0]; + x2[1] /= box->box_norms[1]; + x2[2] /= box->box_norms[2]; } void Distance_on_T3_Gen( rvec x1, rvec x2, simulation_box* box, rvec r ) { - rvec xa, xb, ra; + rvec xa, xb, ra; - Transform( x1, box, -1, xa ); - Transform( x2, 
box, -1, xb ); + Transform( x1, box, -1, xa ); + Transform( x2, box, -1, xb ); - //printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]); - //printf(">xb: (%lf, %lf, %lf)\n",xb[0],xb[1],xb[2]); + //printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]); + //printf(">xb: (%lf, %lf, %lf)\n",xb[0],xb[1],xb[2]); - Sq_Distance_on_T3( xa, xb, box, ra ); + Sq_Distance_on_T3( xa, xb, box, ra ); - Transform( ra, box, 1, r ); + Transform( ra, box, 1, r ); } void Inc_on_T3_Gen( rvec x, rvec dx, simulation_box* box ) { - rvec xa, dxa; + rvec xa, dxa; - Transform( x, box, -1, xa ); - Transform( dx, box, -1, dxa ); + Transform( x, box, -1, xa ); + Transform( dx, box, -1, dxa ); - //printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]); - //printf(">dxa: (%lf, %lf, %lf)\n",dxa[0],dxa[1],dxa[2]); + //printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]); + //printf(">dxa: (%lf, %lf, %lf)\n",dxa[0],dxa[1],dxa[2]); - Inc_on_T3( xa, dxa, box ); + Inc_on_T3( xa, dxa, box ); - //printf(">new_xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]); + //printf(">new_xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]); - Transform( xa, box, 1, x ); + Transform( xa, box, 1, x ); } real Metric_Product( rvec x1, rvec x2, simulation_box* box ) { - int i, j; - real dist=0.0, tmp; - - for( i = 0; i < 3; i++ ) - { - tmp = 0.0; - for( j = 0; j < 3; j++ ) - tmp += box->g[i][j] * x2[j]; - dist += x1[i] * tmp; - } - - return dist; + int i, j; + real dist=0.0, tmp; + + for( i = 0; i < 3; i++ ) + { + tmp = 0.0; + for( j = 0; j < 3; j++ ) + tmp += box->g[i][j] * x2[j]; + dist += x1[i] * tmp; + } + + return dist; } @@ -319,23 +319,23 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box ) If so, this neighborhood is added to the list of far neighbors. Periodic boundary conditions do not apply. 
*/ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, - control_params *control, - far_neighbor_data *new_nbrs, int *count ) + control_params *control, + far_neighbor_data *new_nbrs, int *count ) { - real norm_sqr; + real norm_sqr; - rvec_ScaledSum( new_nbrs[0].dvec, 1.0, x2, -1.0, x1 ); + rvec_ScaledSum( new_nbrs[0].dvec, 1.0, x2, -1.0, x1 ); - norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec ); + norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec ); - if( norm_sqr <= SQR( control->vlist_cut ) ) { - *count = 1; - new_nbrs[0].d = SQRT( norm_sqr ); + if( norm_sqr <= SQR( control->vlist_cut ) ) { + *count = 1; + new_nbrs[0].d = SQRT( norm_sqr ); - ivec_MakeZero( new_nbrs[0].rel_box ); - // rvec_MakeZero( new_nbrs[0].ext_factor ); - } - else *count = 0; + ivec_MakeZero( new_nbrs[0].rel_box ); + // rvec_MakeZero( new_nbrs[0].ext_factor ); + } + else *count = 0; } @@ -344,49 +344,49 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, If the periodic distance between x1 and x2 is than vlist_cut, this neighborhood is added to the list of far neighbors. 
*/ void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box, - control_params *control, - far_neighbor_data *periodic_nbrs, - int *count ) + control_params *control, + far_neighbor_data *periodic_nbrs, + int *count ) { - real norm_sqr, d, tmp; - int i; - - norm_sqr = 0; - - for( i = 0; i < 3; i++ ) { - d = x2[i] - x1[i]; - tmp = SQR(d); - // fprintf(out,"Inside Sq_Distance_on_T3, %d, %lf, %lf\n", - // i,tmp,SQR(box->box_norms[i]/2.0)); - - if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) { - if( x2[i] > x1[i] ) { - d -= box->box_norms[i]; - periodic_nbrs[0].rel_box[i] = -1; - // periodic_nbrs[0].ext_factor[i] = +1; - } - else { - d += box->box_norms[i]; - periodic_nbrs[0].rel_box[i] = +1; - // periodic_nbrs[0].ext_factor[i] = -1; - } - - periodic_nbrs[0].dvec[i] = d; - norm_sqr += SQR(d); - } - else { - periodic_nbrs[0].dvec[i] = d; - norm_sqr += tmp; - periodic_nbrs[0].rel_box[i] = 0; - // periodic_nbrs[0].ext_factor[i] = 0; - } - } - - if( norm_sqr <= SQR( control->vlist_cut ) ) { - *count = 1; - periodic_nbrs[0].d = SQRT( norm_sqr ); - } - else *count = 0; + real norm_sqr, d, tmp; + int i; + + norm_sqr = 0; + + for( i = 0; i < 3; i++ ) { + d = x2[i] - x1[i]; + tmp = SQR(d); + // fprintf(out,"Inside Sq_Distance_on_T3, %d, %lf, %lf\n", + // i,tmp,SQR(box->box_norms[i]/2.0)); + + if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) { + if( x2[i] > x1[i] ) { + d -= box->box_norms[i]; + periodic_nbrs[0].rel_box[i] = -1; + // periodic_nbrs[0].ext_factor[i] = +1; + } + else { + d += box->box_norms[i]; + periodic_nbrs[0].rel_box[i] = +1; + // periodic_nbrs[0].ext_factor[i] = -1; + } + + periodic_nbrs[0].dvec[i] = d; + norm_sqr += SQR(d); + } + else { + periodic_nbrs[0].dvec[i] = d; + norm_sqr += tmp; + periodic_nbrs[0].rel_box[i] = 0; + // periodic_nbrs[0].ext_factor[i] = 0; + } + } + + if( norm_sqr <= SQR( control->vlist_cut ) ) { + *count = 1; + periodic_nbrs[0].d = SQRT( norm_sqr ); + } + else *count = 0; } @@ -398,69 +398,69 @@ might get too small (such as 
<5 A!). In this case we have to consider the periodic images of x2 that are two boxs away!!! */ void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box, - control_params *control, - far_neighbor_data *periodic_nbrs, - int *count ) + control_params *control, + far_neighbor_data *periodic_nbrs, + int *count ) { - int i, j, k; - int imax, jmax, kmax; - real sqr_norm, d_i, d_j, d_k; - - *count = 0; - /* determine the max stretch of imaginary boxs in each direction - to handle periodic boundary conditions correctly. */ - imax = (int)(control->vlist_cut / box->box_norms[0] + 1); - jmax = (int)(control->vlist_cut / box->box_norms[1] + 1); - kmax = (int)(control->vlist_cut / box->box_norms[2] + 1); - /*if( imax > 1 || jmax > 1 || kmax > 1 ) - fprintf( stderr, "box %8.3f x %8.3f x %8.3f --> %2d %2d %2d\n", - box->box_norms[0], box->box_norms[1], box->box_norms[2], - imax, jmax, kmax ); */ - - - for( i = -imax; i <= imax; ++i ) - if(fabs(d_i=((x2[0]+i*box->box_norms[0])-x1[0]))<=control->vlist_cut) { - for( j = -jmax; j <= jmax; ++j ) - if(fabs(d_j=((x2[1]+j*box->box_norms[1])-x1[1]))<=control->vlist_cut) { - for( k = -kmax; k <= kmax; ++k ) - if(fabs(d_k=((x2[2]+k*box->box_norms[2])-x1[2]))<=control->vlist_cut) { - sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k); - if( sqr_norm <= SQR(control->vlist_cut) ) { - periodic_nbrs[ *count ].d = SQRT( sqr_norm ); - - periodic_nbrs[ *count ].dvec[0] = d_i; - periodic_nbrs[ *count ].dvec[1] = d_j; - periodic_nbrs[ *count ].dvec[2] = d_k; - - periodic_nbrs[ *count ].rel_box[0] = i; - periodic_nbrs[ *count ].rel_box[1] = j; - periodic_nbrs[ *count ].rel_box[2] = k; - - /* if( i || j || k ) { - fprintf(stderr, "x1: %.2f %.2f %.2f\n", x1[0], x1[1], x1[2]); - fprintf(stderr, "x2: %.2f %.2f %.2f\n", x2[0], x2[1], x2[2]); - fprintf( stderr, "d : %8.2f%8.2f%8.2f\n\n", d_i, d_j, d_k ); - } */ - - /* if(i) periodic_nbrs[*count].ext_factor[0] = (real)i/-abs(i); - else periodic_nbrs[*count].ext_factor[0] = 0; - - if(j) 
periodic_nbrs[*count].ext_factor[1] = (real)j/-abs(j); - else periodic_nbrs[*count].ext_factor[1] = 0; - - if(k) periodic_nbrs[*count].ext_factor[2] = (real)k/-abs(k); - else periodic_nbrs[*count].ext_factor[2] = 0; */ - - - /* if( i == 0 && j == 0 && k == 0 ) - * periodic_nbrs[ *count ].imaginary = 0; - * else periodic_nbrs[ *count ].imaginary = 1; - */ - ++(*count); - } - } - } - } + int i, j, k; + int imax, jmax, kmax; + real sqr_norm, d_i, d_j, d_k; + + *count = 0; + /* determine the max stretch of imaginary boxs in each direction + to handle periodic boundary conditions correctly. */ + imax = (int)(control->vlist_cut / box->box_norms[0] + 1); + jmax = (int)(control->vlist_cut / box->box_norms[1] + 1); + kmax = (int)(control->vlist_cut / box->box_norms[2] + 1); + /*if( imax > 1 || jmax > 1 || kmax > 1 ) + fprintf( stderr, "box %8.3f x %8.3f x %8.3f --> %2d %2d %2d\n", + box->box_norms[0], box->box_norms[1], box->box_norms[2], + imax, jmax, kmax ); */ + + + for( i = -imax; i <= imax; ++i ) + if(fabs(d_i=((x2[0]+i*box->box_norms[0])-x1[0]))<=control->vlist_cut) { + for( j = -jmax; j <= jmax; ++j ) + if(fabs(d_j=((x2[1]+j*box->box_norms[1])-x1[1]))<=control->vlist_cut) { + for( k = -kmax; k <= kmax; ++k ) + if(fabs(d_k=((x2[2]+k*box->box_norms[2])-x1[2]))<=control->vlist_cut) { + sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k); + if( sqr_norm <= SQR(control->vlist_cut) ) { + periodic_nbrs[ *count ].d = SQRT( sqr_norm ); + + periodic_nbrs[ *count ].dvec[0] = d_i; + periodic_nbrs[ *count ].dvec[1] = d_j; + periodic_nbrs[ *count ].dvec[2] = d_k; + + periodic_nbrs[ *count ].rel_box[0] = i; + periodic_nbrs[ *count ].rel_box[1] = j; + periodic_nbrs[ *count ].rel_box[2] = k; + + /* if( i || j || k ) { + fprintf(stderr, "x1: %.2f %.2f %.2f\n", x1[0], x1[1], x1[2]); + fprintf(stderr, "x2: %.2f %.2f %.2f\n", x2[0], x2[1], x2[2]); + fprintf( stderr, "d : %8.2f%8.2f%8.2f\n\n", d_i, d_j, d_k ); + } */ + + /* if(i) periodic_nbrs[*count].ext_factor[0] = (real)i/-abs(i); + else 
periodic_nbrs[*count].ext_factor[0] = 0; + + if(j) periodic_nbrs[*count].ext_factor[1] = (real)j/-abs(j); + else periodic_nbrs[*count].ext_factor[1] = 0; + + if(k) periodic_nbrs[*count].ext_factor[2] = (real)k/-abs(k); + else periodic_nbrs[*count].ext_factor[2] = 0; */ + + + /* if( i == 0 && j == 0 && k == 0 ) + * periodic_nbrs[ *count ].imaginary = 0; + * else periodic_nbrs[ *count ].imaginary = 1; + */ + ++(*count); + } + } + } + } } @@ -505,39 +505,39 @@ rvec_Add( box->nbr_box_press[map], v ); void Print_Box_Information( simulation_box* box, FILE *out ) { - int i, j; - - fprintf( out, "box: {" ); - for( i = 0; i < 3; ++i ) - { - fprintf( out, "{" ); - for( j = 0; j < 3; ++j ) - fprintf( out, "%8.3f ", box->box[i][j] ); - fprintf( out, "}" ); - } - fprintf( out, "}\n" ); - - fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n", - box->volume, - box->box_norms[0], box->box_norms[1], box->box_norms[2] ); - - fprintf( out, "box_trans: {" ); - for( i = 0; i < 3; ++i ) - { - fprintf( out, "{" ); - for( j = 0; j < 3; ++j ) - fprintf( out, "%8.3f ", box->trans[i][j] ); - fprintf( out, "}" ); - } - fprintf( out, "}\n" ); - - fprintf( out, "box_trinv: {" ); - for( i = 0; i < 3; ++i ) - { - fprintf( out, "{" ); - for( j = 0; j < 3; ++j ) - fprintf( out, "%8.3f ", box->trans_inv[i][j] ); - fprintf( out, "}" ); - } - fprintf( out, "}\n" ); + int i, j; + + fprintf( out, "box: {" ); + for( i = 0; i < 3; ++i ) + { + fprintf( out, "{" ); + for( j = 0; j < 3; ++j ) + fprintf( out, "%8.3f ", box->box[i][j] ); + fprintf( out, "}" ); + } + fprintf( out, "}\n" ); + + fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n", + box->volume, + box->box_norms[0], box->box_norms[1], box->box_norms[2] ); + + fprintf( out, "box_trans: {" ); + for( i = 0; i < 3; ++i ) + { + fprintf( out, "{" ); + for( j = 0; j < 3; ++j ) + fprintf( out, "%8.3f ", box->trans[i][j] ); + fprintf( out, "}" ); + } + fprintf( out, "}\n" ); + + fprintf( out, "box_trinv: {" ); + for( i = 0; i < 3; ++i ) + { + 
fprintf( out, "{" ); + for( j = 0; j < 3; ++j ) + fprintf( out, "%8.3f ", box->trans_inv[i][j] ); + fprintf( out, "}" ); + } + fprintf( out, "}\n" ); } diff --git a/PuReMD-GPU/src/center_mass.cu b/PuReMD-GPU/src/center_mass.cu index d81e769e..ea8f7998 100644 --- a/PuReMD-GPU/src/center_mass.cu +++ b/PuReMD-GPU/src/center_mass.cu @@ -25,235 +25,235 @@ #include "vector.h" GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) + rvec *res_xcm, + rvec *res_vcm, + rvec *res_amcm, + size_t n) { - extern __shared__ rvec xcm[]; - extern __shared__ rvec vcm[]; - extern __shared__ rvec amcm[]; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - unsigned int xcm_id = threadIdx.x; - unsigned int vcm_id = blockDim.x; - unsigned int amcm_id = 2 *(blockDim.x); - - unsigned int index = 0; - rvec tmp; - real m; - - rvec_MakeZero (xcm [threadIdx.x]); - rvec_MakeZero (vcm [vcm_id + threadIdx.x]); - rvec_MakeZero (amcm[amcm_id + threadIdx.x]); - rvec_MakeZero (tmp); - - if (i < n){ - m = sbp [ atoms[i].type ].mass; - rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); - rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); - rvec_Cross (tmp, atoms[i].x, atoms [i].v); - rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); - } - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { - - if ((threadIdx.x < offset)) { - index = threadIdx.x + offset; - rvec_Add (xcm [threadIdx.x], xcm[index]); - rvec_Add (vcm [vcm_id + threadIdx.x], vcm[vcm_id + index]); - rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); - } - __syncthreads (); - } - - if ((threadIdx.x == 0)){ - rvec_Copy (res_xcm[blockIdx.x], xcm[0]); - rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); - rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); - } + extern __shared__ rvec xcm[]; + extern __shared__ rvec vcm[]; + extern __shared__ rvec amcm[]; + + unsigned int i = blockIdx.x * 
blockDim.x + threadIdx.x; + + unsigned int xcm_id = threadIdx.x; + unsigned int vcm_id = blockDim.x; + unsigned int amcm_id = 2 *(blockDim.x); + + unsigned int index = 0; + rvec tmp; + real m; + + rvec_MakeZero (xcm [threadIdx.x]); + rvec_MakeZero (vcm [vcm_id + threadIdx.x]); + rvec_MakeZero (amcm[amcm_id + threadIdx.x]); + rvec_MakeZero (tmp); + + if (i < n){ + m = sbp [ atoms[i].type ].mass; + rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x); + rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v); + rvec_Cross (tmp, atoms[i].x, atoms [i].v); + rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp); + } + __syncthreads (); + + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + + if ((threadIdx.x < offset)) { + index = threadIdx.x + offset; + rvec_Add (xcm [threadIdx.x], xcm[index]); + rvec_Add (vcm [vcm_id + threadIdx.x], vcm[vcm_id + index]); + rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]); + } + __syncthreads (); + } + + if ((threadIdx.x == 0)){ + rvec_Copy (res_xcm[blockIdx.x], xcm[0]); + rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]); + rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]); + } } GLOBAL void center_of_mass (rvec *xcm, - rvec *vcm, - rvec *amcm, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) + rvec *vcm, + rvec *amcm, + rvec *res_xcm, + rvec *res_vcm, + rvec *res_amcm, + size_t n) { - extern __shared__ rvec sh_xcm[]; - extern __shared__ rvec sh_vcm[]; - extern __shared__ rvec sh_amcm[]; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - - unsigned int xcm_id = threadIdx.x; - unsigned int vcm_id = blockDim.x; - unsigned int amcm_id = 2 * (blockDim.x); - - unsigned int index = 0; - rvec t_xcm, t_vcm, t_amcm; - - rvec_MakeZero (t_xcm); - rvec_MakeZero (t_vcm); - rvec_MakeZero (t_amcm); - - if (i < n){ - rvec_Copy ( t_xcm, xcm[threadIdx.x]); - rvec_Copy ( t_vcm, vcm[threadIdx.x]); - rvec_Copy ( t_amcm, amcm[threadIdx.x]); - } - - rvec_Copy (sh_xcm[xcm_id], t_xcm); - rvec_Copy (sh_vcm[vcm_id 
+ threadIdx.x], t_vcm); - rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm); - - __syncthreads (); - - for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { - - if (threadIdx.x < offset) { - index = threadIdx.x + offset; - rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]); - rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]); - rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]); - } - __syncthreads (); - } - - if (threadIdx.x == 0){ - rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]); - rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]); - rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]); - } + extern __shared__ rvec sh_xcm[]; + extern __shared__ rvec sh_vcm[]; + extern __shared__ rvec sh_amcm[]; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned int xcm_id = threadIdx.x; + unsigned int vcm_id = blockDim.x; + unsigned int amcm_id = 2 * (blockDim.x); + + unsigned int index = 0; + rvec t_xcm, t_vcm, t_amcm; + + rvec_MakeZero (t_xcm); + rvec_MakeZero (t_vcm); + rvec_MakeZero (t_amcm); + + if (i < n){ + rvec_Copy ( t_xcm, xcm[threadIdx.x]); + rvec_Copy ( t_vcm, vcm[threadIdx.x]); + rvec_Copy ( t_amcm, amcm[threadIdx.x]); + } + + rvec_Copy (sh_xcm[xcm_id], t_xcm); + rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm); + rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm); + + __syncthreads (); + + for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { + + if (threadIdx.x < offset) { + index = threadIdx.x + offset; + rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]); + rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]); + rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]); + } + __syncthreads (); + } + + if (threadIdx.x == 0){ + rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]); + rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]); + rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]); + } } GLOBAL void compute_center_mass (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real 
xcm0, real xcm1, real xcm2, - size_t n) + reax_atom *atoms, + real *results, + real xcm0, real xcm1, real xcm2, + size_t n) { - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - - unsigned int xx_i = threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int xz_i = 2 * blockDim.x; - unsigned int yy_i = 3 * blockDim.x; - unsigned int yz_i = 4 * blockDim.x; - unsigned int zz_i = 5 * blockDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - - rvec diff, xcm; - real m = 0; - rvec_MakeZero (diff); - xcm[0] = xcm0; - xcm[1] = xcm1; - xcm[2] = xcm2; - - - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = - yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - - if (i < n){ - m = sbp[ atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); - xx[ xx_i ] = diff[0] * diff[0] * m; - xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m; - xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m; - yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m; - yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m; - zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m; - } - __syncthreads (); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ - index = threadIdx.x + offset; - xx[ threadIdx.x ] += xx[ index ]; - xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; - xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ]; - yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ]; - yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ]; - zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - results [ blockIdx.x*6 ] = xx [ 0 ]; - results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ]; - results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ]; - results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ]; - results [ blockIdx.x*6 + 4 ] = yz [ 
yz_i + 0 ]; - results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ]; - } + extern __shared__ real xx[]; + extern __shared__ real xy[]; + extern __shared__ real xz[]; + extern __shared__ real yy[]; + extern __shared__ real yz[]; + extern __shared__ real zz[]; + + unsigned int xx_i = threadIdx.x; + unsigned int xy_i = blockDim.x; + unsigned int xz_i = 2 * blockDim.x; + unsigned int yy_i = 3 * blockDim.x; + unsigned int yz_i = 4 * blockDim.x; + unsigned int zz_i = 5 * blockDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + + rvec diff, xcm; + real m = 0; + rvec_MakeZero (diff); + xcm[0] = xcm0; + xcm[1] = xcm1; + xcm[2] = xcm2; + + + xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = + yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; + + if (i < n){ + m = sbp[ atoms[i].type ].mass; + rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); + xx[ xx_i ] = diff[0] * diff[0] * m; + xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m; + xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m; + yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m; + yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m; + zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m; + } + __syncthreads (); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ + if (threadIdx.x < offset){ + index = threadIdx.x + offset; + xx[ threadIdx.x ] += xx[ index ]; + xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; + xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ]; + yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ]; + yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ]; + zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) { + results [ blockIdx.x*6 ] = xx [ 0 ]; + results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ]; + results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ]; + results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ]; + results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ]; + results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 
]; + } } GLOBAL void compute_center_mass (real *input, real *output, size_t n) { - extern __shared__ real xx[]; - extern __shared__ real xy[]; - extern __shared__ real xz[]; - extern __shared__ real yy[]; - extern __shared__ real yz[]; - extern __shared__ real zz[]; - - unsigned int xx_i = threadIdx.x; - unsigned int xy_i = blockDim.x; - unsigned int xz_i = 2 * blockDim.x; - unsigned int yy_i = 3 * blockDim.x; - unsigned int yz_i = 4 * blockDim.x; - unsigned int zz_i = 5 * blockDim.x; - - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int index = 0; - - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = - yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - - if (i < n) - { - xx [ xx_i ] = input [ threadIdx.x*6 + 0 ]; - xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ]; - xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ]; - yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ]; - yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ]; - zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ]; - } - __syncthreads (); - - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if (threadIdx.x < offset ) - { - index = threadIdx.x + offset; - xx [ threadIdx.x ] += xx [ index ]; - xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ]; - xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ]; - yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ]; - yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ]; - zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) - { - output[0] = xx[0]; - output[1] = xy[xy_i]; - output[2] = xz[xz_i]; - output[3] = xz[yy_i]; - output[4] = xz[yz_i]; - output[5] = xz[zz_i]; - } + extern __shared__ real xx[]; + extern __shared__ real xy[]; + extern __shared__ real xz[]; + extern __shared__ real yy[]; + extern __shared__ real yz[]; + extern __shared__ real zz[]; + + unsigned int xx_i = threadIdx.x; + unsigned int xy_i = blockDim.x; + 
unsigned int xz_i = 2 * blockDim.x; + unsigned int yy_i = 3 * blockDim.x; + unsigned int yz_i = 4 * blockDim.x; + unsigned int zz_i = 5 * blockDim.x; + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + + xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = + yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; + + if (i < n) + { + xx [ xx_i ] = input [ threadIdx.x*6 + 0 ]; + xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ]; + xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ]; + yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ]; + yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ]; + zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ]; + } + __syncthreads (); + + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset ) + { + index = threadIdx.x + offset; + xx [ threadIdx.x ] += xx [ index ]; + xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ]; + xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ]; + yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ]; + yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ]; + zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) + { + output[0] = xx[0]; + output[1] = xy[xy_i]; + output[2] = xz[xz_i]; + output[3] = xz[yy_i]; + output[4] = xz[yz_i]; + output[5] = xz[zz_i]; + } } diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu index 6ce94690..2db79e37 100644 --- a/PuReMD-GPU/src/cuda_copy.cu +++ b/PuReMD-GPU/src/cuda_copy.cu @@ -26,70 +26,70 @@ void Sync_Host_Device (grid *host, grid *dev, enum cudaMemcpyKind dir) { - copy_host_device (host->top, dev->top, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP); + copy_host_device (host->top, dev->top, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP); - copy_host_device (host->mark, dev->mark, - INT_SIZE * 
host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK); + copy_host_device (host->mark, dev->mark, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK); - copy_host_device (host->start, dev->start, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START); + copy_host_device (host->start, dev->start, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START); - copy_host_device (host->end, dev->end, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END); + copy_host_device (host->end, dev->end, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END); - copy_host_device (host->atoms, dev->atoms, - INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS); + copy_host_device (host->atoms, dev->atoms, + INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS); - copy_host_device (host->nbrs, dev->nbrs, - IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS); + copy_host_device (host->nbrs, dev->nbrs, + IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS); - copy_host_device (host->nbrs_cp, dev->nbrs_cp, - RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP); + copy_host_device (host->nbrs_cp, dev->nbrs_cp, + RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP); } void Sync_Host_Device (reax_system *sys, enum cudaMemcpyKind dir) { - copy_host_device (sys->atoms, sys->d_atoms, - REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS); + copy_host_device (sys->atoms, sys->d_atoms, + REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS); - copy_host_device (&(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX ); + copy_host_device (&(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, 
RES_SYSTEM_SIMULATION_BOX ); - //synch bonds here. - copy_host_device (sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, - dir, RES_REAX_INT_SBP ); - copy_host_device (sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), - dir, RES_REAX_INT_TBP ); - copy_host_device (sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), - dir, RES_REAX_INT_THBP ); - copy_host_device (sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), - dir, RES_REAX_INT_HBP ); - copy_host_device (sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4), - dir, RES_REAX_INT_FBP ); + //synch bonds here. + copy_host_device (sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, + dir, RES_REAX_INT_SBP ); + copy_host_device (sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), + dir, RES_REAX_INT_TBP ); + copy_host_device (sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), + dir, RES_REAX_INT_THBP ); + copy_host_device (sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), + dir, RES_REAX_INT_HBP ); + copy_host_device (sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4), + dir, RES_REAX_INT_FBP ); - copy_host_device (sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, - dir, RES_GLOBAL_PARAMS ); + copy_host_device (sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, + dir, RES_GLOBAL_PARAMS ); - sys->reaxprm.d_gp.n_global = sys->reaxprm.gp.n_global; - sys->reaxprm.d_gp.vdw_type = sys->reaxprm.gp.vdw_type; + sys->reaxprm.d_gp.n_global = sys->reaxprm.gp.n_global; + sys->reaxprm.d_gp.vdw_type = sys->reaxprm.gp.vdw_type; } void Sync_Host_Device (simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir) { - copy_host_device 
(host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA ); + copy_host_device (host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA ); } void Sync_Host_Device (sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir ) { - copy_host_device ( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY ); + copy_host_device ( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY ); - copy_host_device ( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY ); + copy_host_device ( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY ); } void Sync_Host_Device (output_controls *, control_params *, enum cudaMemcpyKind) @@ -98,86 +98,86 @@ void Sync_Host_Device (output_controls *, control_params *, enum cudaMemcpyKind) void Sync_Host_Device (control_params *host, control_params *device, enum cudaMemcpyKind) { - copy_host_device (host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); + copy_host_device (host, 
device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); } void Prep_Device_For_Output (reax_system *system, simulation_data *data ) { - //int size = sizeof (simulation_data) - (2*sizeof (reax_timing) + sizeof (void *)); - //unsigned long start_address = (unsigned long)data->d_simulation_data + (unsigned long) (2 * INT_SIZE + REAL_SIZE); - - //fprintf (stderr, "Address of Simulation data (address) --> %ld \n", data->d_simulation_data ); - //fprintf (stderr, "Size of simulation_data --> %d \n", sizeof (simulation_data)); - //fprintf (stderr, "size to copy --> %d \n", size ); - //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost ); - /* - copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, - REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, - REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - copy_host_device (&data->int_press, &((simulation_data *)data->d_simulation_data)->int_press, - 3*(RVEC_SIZE) + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - - copy_host_device (&data->therm.T, &((simulation_data *)data->d_simulation_data)->therm.T, - REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - */ - - simulation_data local_data; - copy_host_device (&local_data, (simulation_data *)data->d_simulation_data, - SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - data->E_BE = local_data.E_BE; - data->E_Ov = local_data.E_Ov; - data->E_Un = local_data.E_Un; - data->E_Lp = local_data.E_Lp; - data->E_Ang = local_data.E_Ang; - data->E_Pen = local_data.E_Pen; - data->E_Coa = local_data.E_Coa; - data->E_HB = local_data.E_HB; - data->E_Tor = local_data.E_Tor; - data->E_Con = local_data.E_Con; - data->E_vdW = 
local_data.E_vdW; - data->E_Ele = local_data.E_Ele; - data->E_Kin = local_data.E_Kin; - rvec_Copy (data->int_press, local_data.int_press); - rvec_Copy (data->ext_press, local_data.ext_press); - data->kin_press = local_data.kin_press; - data->therm.T = local_data.therm.T; - - //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost ); - Sync_Host_Device (system, cudaMemcpyDeviceToHost ); + //int size = sizeof (simulation_data) - (2*sizeof (reax_timing) + sizeof (void *)); + //unsigned long start_address = (unsigned long)data->d_simulation_data + (unsigned long) (2 * INT_SIZE + REAL_SIZE); + + //fprintf (stderr, "Address of Simulation data (address) --> %ld \n", data->d_simulation_data ); + //fprintf (stderr, "Size of simulation_data --> %d \n", sizeof (simulation_data)); + //fprintf (stderr, "size to copy --> %d \n", size ); + //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + + //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost ); + /* + copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, + REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + copy_host_device (&data->int_press, &((simulation_data *)data->d_simulation_data)->int_press, + 3*(RVEC_SIZE) + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + + copy_host_device (&data->therm.T, &((simulation_data *)data->d_simulation_data)->therm.T, + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + */ + + simulation_data local_data; + copy_host_device (&local_data, (simulation_data *)data->d_simulation_data, + SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + data->E_BE = local_data.E_BE; + data->E_Ov = local_data.E_Ov; + data->E_Un = local_data.E_Un; + data->E_Lp = 
local_data.E_Lp; + data->E_Ang = local_data.E_Ang; + data->E_Pen = local_data.E_Pen; + data->E_Coa = local_data.E_Coa; + data->E_HB = local_data.E_HB; + data->E_Tor = local_data.E_Tor; + data->E_Con = local_data.E_Con; + data->E_vdW = local_data.E_vdW; + data->E_Ele = local_data.E_Ele; + data->E_Kin = local_data.E_Kin; + rvec_Copy (data->int_press, local_data.int_press); + rvec_Copy (data->ext_press, local_data.ext_press); + data->kin_press = local_data.kin_press; + data->therm.T = local_data.therm.T; + + //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost ); + Sync_Host_Device (system, cudaMemcpyDeviceToHost ); } void Sync_Host_Device (list *host, list *device, int type) { - //list is already allocated -- discard it first - if (host->n > 0) - Delete_List (host, TYP_HOST); - - //memory is allocated on the host - Make_List(device->n, device->num_intrs, type, host, TYP_HOST ); - - //memcpy the entries from device to host - copy_host_device (host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX ); - copy_host_device (host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX ); - - switch (type) - { - case TYP_BOND: - copy_host_device (host->select.bond_list, device->select.bond_list, - BOND_DATA_SIZE * device->num_intrs, cudaMemcpyDeviceToHost, LIST_BOND_DATA ); - break; - - case TYP_THREE_BODY: - copy_host_device (host->select.three_body_list, device->select.three_body_list, - sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA ); - break; - - default: - fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type ); - exit (1); - break; - } + //list is already allocated -- discard it first + if (host->n > 0) + Delete_List (host, TYP_HOST); + + //memory is allocated on the host + Make_List(device->n, device->num_intrs, type, host, TYP_HOST ); + + //memcpy the entries from device to host + copy_host_device 
(host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX ); + copy_host_device (host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX ); + + switch (type) + { + case TYP_BOND: + copy_host_device (host->select.bond_list, device->select.bond_list, + BOND_DATA_SIZE * device->num_intrs, cudaMemcpyDeviceToHost, LIST_BOND_DATA ); + break; + + case TYP_THREE_BODY: + copy_host_device (host->select.three_body_list, device->select.three_body_list, + sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA ); + break; + + default: + fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type ); + exit (1); + break; + } } diff --git a/PuReMD-GPU/src/cuda_init.cu b/PuReMD-GPU/src/cuda_init.cu index 9574a275..09515038 100644 --- a/PuReMD-GPU/src/cuda_init.cu +++ b/PuReMD-GPU/src/cuda_init.cu @@ -29,274 +29,274 @@ void Cuda_Init_System ( reax_system *system) { - cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS ); + cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS ); - cuda_malloc ( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX ); + cuda_malloc ( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX ); - //interaction parameters - cuda_malloc ((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE, - 1, RES_REAX_INT_SBP ); + //interaction parameters + cuda_malloc ((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE, + 1, RES_REAX_INT_SBP ); - cuda_malloc ((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, - 1, RES_REAX_INT_TBP ); + cuda_malloc ((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, + 1, RES_REAX_INT_TBP ); - cuda_malloc ((void **) &system->reaxprm.d_thbp, pow 
(system->reaxprm.num_atom_types, 3) * THBP_SIZE, - 1, RES_REAX_INT_THBP ); + cuda_malloc ((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE, + 1, RES_REAX_INT_THBP ); - cuda_malloc ((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE, - 1, RES_REAX_INT_HBP ); + cuda_malloc ((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE, + 1, RES_REAX_INT_HBP ); - cuda_malloc ((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE, - 1, RES_REAX_INT_FBP ); + cuda_malloc ((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE, + 1, RES_REAX_INT_FBP ); - cuda_malloc ((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS ); + cuda_malloc ((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS ); - system->reaxprm.d_gp.n_global = 0; - system->reaxprm.d_gp.vdw_type = 0; + system->reaxprm.d_gp.n_global = 0; + system->reaxprm.d_gp.vdw_type = 0; } void Cuda_Init_Control (control_params *control) { - cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS ); - copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); + cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS ); + copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); } void Cuda_Init_Simulation_Data (simulation_data *data) { - cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA ); + cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA ); } GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index 
>= N) return; + if (index >= N) return; - nbrs[index][0] = -1; - nbrs[index][1] = -1; - nbrs[index][2] = -1; - nbrs_cp[index][0] = -1; - nbrs_cp[index][1] = -1; - nbrs_cp[index][2] = -1; + nbrs[index][0] = -1; + nbrs[index][1] = -1; + nbrs[index][2] = -1; + nbrs_cp[index][0] = -1; + nbrs_cp[index][1] = -1; + nbrs_cp[index][2] = -1; } void Cuda_Init_Grid (grid *host, grid *dev) { - int total = host->ncell[0] * host->ncell[1] * host->ncell[2]; - dev->max_atoms = host->max_atoms; - dev->max_nbrs = host->max_nbrs; - dev->total = host->total; - dev->max_cuda_nbrs = host->max_cuda_nbrs; - dev->cell_size = host->cell_size; - - ivec_Copy (dev->spread, host->spread); - ivec_Copy (dev->ncell, host->ncell); - rvec_Copy (dev->len, host->len); - rvec_Copy (dev->inv_len, host->inv_len); - - cuda_malloc ((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP ); - cuda_malloc ((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK ); - cuda_malloc ((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START ); - cuda_malloc ((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END ); - - cuda_malloc ((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS ); - cuda_malloc ((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS ); - cuda_malloc ((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP ); - - int block_size = 512; - int blocks = (total*dev->max_nbrs) / block_size + ((total*dev->max_nbrs) % block_size == 0 ? 
0 : 1); - - Initialize_Grid <<<blocks, block_size>>> - (dev->nbrs, dev->nbrs_cp, total * host->max_nbrs ); - cudaThreadSynchronize (); - cudaCheckError (); + int total = host->ncell[0] * host->ncell[1] * host->ncell[2]; + dev->max_atoms = host->max_atoms; + dev->max_nbrs = host->max_nbrs; + dev->total = host->total; + dev->max_cuda_nbrs = host->max_cuda_nbrs; + dev->cell_size = host->cell_size; + + ivec_Copy (dev->spread, host->spread); + ivec_Copy (dev->ncell, host->ncell); + rvec_Copy (dev->len, host->len); + rvec_Copy (dev->inv_len, host->inv_len); + + cuda_malloc ((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP ); + cuda_malloc ((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK ); + cuda_malloc ((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START ); + cuda_malloc ((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END ); + + cuda_malloc ((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS ); + cuda_malloc ((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS ); + cuda_malloc ((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP ); + + int block_size = 512; + int blocks = (total*dev->max_nbrs) / block_size + ((total*dev->max_nbrs) % block_size == 0 ? 
0 : 1); + + Initialize_Grid <<<blocks, block_size>>> + (dev->nbrs, dev->nbrs_cp, total * host->max_nbrs ); + cudaThreadSynchronize (); + cudaCheckError (); } GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms, - static_storage workspace, int N) + static_storage workspace, int N) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= N) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= N) return; - workspace.Hdia_inv[i] = 1./sbp[atoms[i].type].eta; - workspace.b_s[i] = -sbp[ atoms[i].type ].chi; - workspace.b_t[i] = -1.0; + workspace.Hdia_inv[i] = 1./sbp[atoms[i].type].eta; + workspace.b_s[i] = -sbp[ atoms[i].type ].chi; + workspace.b_t[i] = -1.0; - workspace.b[i] = -sbp[ atoms[i].type ].chi; - workspace.b[i+N] = -1.0; + workspace.b[i] = -sbp[ atoms[i].type ].chi; + workspace.b[i+N] = -1.0; } GLOBAL void Init_Map_Serials (int *input, int N) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; - input[i] = -1; + input[i] = -1; } void Cuda_Init_Workspace_System (reax_system *system, static_storage *workspace ) { - int blocks, block_size = BLOCK_SIZE; - compute_blocks (&blocks, &block_size, MAX_ATOM_ID ); + int blocks, block_size = BLOCK_SIZE; + compute_blocks (&blocks, &block_size, MAX_ATOM_ID ); - cuda_malloc ( (void **) &workspace->map_serials, INT_SIZE * MAX_ATOM_ID, 0, RES_STORAGE_MAP_SERIALS ); + cuda_malloc ( (void **) &workspace->map_serials, INT_SIZE * MAX_ATOM_ID, 0, RES_STORAGE_MAP_SERIALS ); - Init_Map_Serials <<< blocks, block_size >>> - ( workspace->map_serials, MAX_ATOM_ID ); - cudaThreadSynchronize (); - cudaCheckError (); + Init_Map_Serials <<< blocks, block_size >>> + ( workspace->map_serials, MAX_ATOM_ID ); + cudaThreadSynchronize (); + cudaCheckError (); - cuda_malloc ( (void **) &workspace->orig_id, INT_SIZE * system->N, 0, RES_STORAGE_ORIG_ID ); - cuda_malloc ( (void **) &workspace->restricted, INT_SIZE * 
system->N, 0, RES_STORAGE_RESTRICTED ); - cuda_malloc ( (void **) &workspace->restricted_list, system->N * MAX_RESTRICT * INT_SIZE, 0, RES_STORAGE_RESTRICTED_LIST ); + cuda_malloc ( (void **) &workspace->orig_id, INT_SIZE * system->N, 0, RES_STORAGE_ORIG_ID ); + cuda_malloc ( (void **) &workspace->restricted, INT_SIZE * system->N, 0, RES_STORAGE_RESTRICTED ); + cuda_malloc ( (void **) &workspace->restricted_list, system->N * MAX_RESTRICT * INT_SIZE, 0, RES_STORAGE_RESTRICTED_LIST ); } void Cuda_Init_Workspace( reax_system *system, control_params *control, - static_storage *workspace ) + static_storage *workspace ) { - int i; - - /* Allocate space for hydrogen bond list */ - cuda_malloc ((void **) &workspace->hbond_index, system->N * INT_SIZE, 0, RES_STORAGE_HBOND_INDEX ); - - /* bond order related storage */ - cuda_malloc ((void **) &workspace->total_bond_order, system->N * REAL_SIZE, 0, RES_STORAGE_TOTAL_BOND_ORDER ); - cuda_malloc ((void **) &workspace->Deltap, system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP ); - cuda_malloc ((void **) &workspace->Deltap_boc, system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP_BOC ); - cuda_malloc ((void **) &workspace->dDeltap_self, system->N * RVEC_SIZE, 0, RES_STORAGE_DDELTAP_SELF ); - - cuda_malloc ((void **) &workspace->Delta, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA ); - cuda_malloc ((void **) &workspace->Delta_lp, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP ); - cuda_malloc ((void **) &workspace->Delta_lp_temp, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP_TEMP ); - cuda_malloc ((void **) &workspace->dDelta_lp, system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP ); - cuda_malloc ((void **) &workspace->dDelta_lp_temp, system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP_TEMP ); - cuda_malloc ((void **) &workspace->Delta_e, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_E ); - cuda_malloc ((void **) &workspace->Delta_boc, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_BOC ); - cuda_malloc ((void **) &workspace->nlp, system->N * REAL_SIZE, 0, 
RES_STORAGE_NLP ); - cuda_malloc ((void **) &workspace->nlp_temp, system->N * REAL_SIZE, 0, RES_STORAGE_NLP_TEMP ); - cuda_malloc ((void **) &workspace->Clp, system->N * REAL_SIZE, 0, RES_STORAGE_CLP ); - cuda_malloc ((void **) &workspace->CdDelta, system->N * REAL_SIZE, 0, RES_STORAGE_CDDELTA ); - cuda_malloc ((void **) &workspace->vlpex, system->N * REAL_SIZE, 0, RES_STORAGE_VLPEX ); - - /* QEq storage */ - workspace->H.start = NULL; - //cuda_malloc ((void **) &workspace->H.start, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); - workspace->L.start = NULL; - //cuda_malloc ((void **) &workspace->L.start, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); - workspace->U.start = NULL; - //cuda_malloc ((void **) &workspace->U.start, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); - - workspace->H.end = NULL; - //cuda_malloc ((void **) &workspace->H.end, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); - workspace->L.end = NULL; - //cuda_malloc ((void **) &workspace->L.end, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); - workspace->U.end = NULL; - //cuda_malloc ((void **) &workspace->U.end, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); - - workspace->H.entries = NULL; - workspace->L.entries = NULL; - workspace->U.entries = NULL; - - cuda_malloc ((void **) &workspace->droptol, system->N * REAL_SIZE, 1, RES_STORAGE_DROPTOL ); - cuda_malloc ((void **) &workspace->w, system->N * REAL_SIZE, 1, RES_STORAGE_W ); - cuda_malloc ((void **) &workspace->Hdia_inv, system->N * REAL_SIZE, 1, RES_STORAGE_HDIA_INV ); - cuda_malloc ((void **) &workspace->b, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B ); - cuda_malloc ((void **) &workspace->b_s, system->N * REAL_SIZE, 1, RES_STORAGE_B_S ); - cuda_malloc ((void **) &workspace->b_t, system->N * REAL_SIZE, 1, RES_STORAGE_B_T ); - cuda_malloc ((void **) &workspace->b_prc, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRC ); - cuda_malloc ((void **) &workspace->b_prm, system->N * 2 * REAL_SIZE, 1, 
RES_STORAGE_B_PRM ); - cuda_malloc ((void **) &workspace->s_t, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_S_T ); - cuda_malloc ((void **) &workspace->s, 5 * system->N * REAL_SIZE, 1, RES_STORAGE_S ); - cuda_malloc ((void **) &workspace->t, 5 * system->N * REAL_SIZE, 1, RES_STORAGE_T ); - - Init_Workspace_Arrays <<<BLOCKS, BLOCK_SIZE>>> - (system->reaxprm.d_sbp, system->d_atoms, *workspace, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES storage */ - cuda_malloc ((void **) &workspace->y, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Y ); - cuda_malloc ((void **) &workspace->z, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Z ); - cuda_malloc ((void **) &workspace->g, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_G ); - cuda_malloc ((void **) &workspace->hs, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HS ); - cuda_malloc ((void **) &workspace->hc, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HC ); - - cuda_malloc ((void **) &workspace->rn, (RESTART+1)*system->N * 2 * REAL_SIZE, 1, RES_STORAGE_RN ); - cuda_malloc ((void **) &workspace->v, (RESTART+1)*system->N * REAL_SIZE, 1, RES_STORAGE_V ); - cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1) * REAL_SIZE, 1, RES_STORAGE_H ); - - /* CG storage */ - cuda_malloc ((void **) &workspace->r, system->N * REAL_SIZE, 1, RES_STORAGE_R ); - cuda_malloc ((void **) &workspace->d, system->N * REAL_SIZE, 1, RES_STORAGE_D ); - cuda_malloc ((void **) &workspace->q, system->N * REAL_SIZE, 1, RES_STORAGE_Q ); - cuda_malloc ((void **) &workspace->p, system->N * REAL_SIZE, 1, RES_STORAGE_P ); - - - /* integrator storage */ - cuda_malloc ((void **) &workspace->a, system->N * RVEC_SIZE, 1, RES_STORAGE_A ); - cuda_malloc ((void **) &workspace->f_old, system->N * RVEC_SIZE, 1, RES_STORAGE_F_OLD ); - cuda_malloc ((void **) &workspace->v_const,system->N * RVEC_SIZE, 1, RES_STORAGE_V_CONST ); - - /* storage for analysis */ - if( control->molec_anal || control->diffusion_coef ) - { - cuda_malloc ((void **) &workspace->mark, system->N * 
INT_SIZE, 1, RES_STORAGE_MARK ); - cuda_malloc ((void **) &workspace->old_mark, system->N * INT_SIZE, 1, RES_STORAGE_OLD_MARK); - } - else - workspace->mark = workspace->old_mark = NULL; - - if( control->diffusion_coef ) - cuda_malloc ((void **) &workspace->x_old, system->N * RVEC_SIZE, 1, RES_STORAGE_X_OLD ); - else workspace->x_old = NULL; - - workspace->realloc.num_far = -1; - workspace->realloc.Htop = -1; - workspace->realloc.hbonds = -1; - workspace->realloc.bonds = -1; - workspace->realloc.num_3body = -1; - workspace->realloc.gcell_atoms = -1; - - Cuda_Reset_Workspace( system, workspace ); + int i; + + /* Allocate space for hydrogen bond list */ + cuda_malloc ((void **) &workspace->hbond_index, system->N * INT_SIZE, 0, RES_STORAGE_HBOND_INDEX ); + + /* bond order related storage */ + cuda_malloc ((void **) &workspace->total_bond_order, system->N * REAL_SIZE, 0, RES_STORAGE_TOTAL_BOND_ORDER ); + cuda_malloc ((void **) &workspace->Deltap, system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP ); + cuda_malloc ((void **) &workspace->Deltap_boc, system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP_BOC ); + cuda_malloc ((void **) &workspace->dDeltap_self, system->N * RVEC_SIZE, 0, RES_STORAGE_DDELTAP_SELF ); + + cuda_malloc ((void **) &workspace->Delta, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA ); + cuda_malloc ((void **) &workspace->Delta_lp, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP ); + cuda_malloc ((void **) &workspace->Delta_lp_temp, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP_TEMP ); + cuda_malloc ((void **) &workspace->dDelta_lp, system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP ); + cuda_malloc ((void **) &workspace->dDelta_lp_temp, system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP_TEMP ); + cuda_malloc ((void **) &workspace->Delta_e, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_E ); + cuda_malloc ((void **) &workspace->Delta_boc, system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_BOC ); + cuda_malloc ((void **) &workspace->nlp, system->N * REAL_SIZE, 0, RES_STORAGE_NLP ); + 
cuda_malloc ((void **) &workspace->nlp_temp, system->N * REAL_SIZE, 0, RES_STORAGE_NLP_TEMP ); + cuda_malloc ((void **) &workspace->Clp, system->N * REAL_SIZE, 0, RES_STORAGE_CLP ); + cuda_malloc ((void **) &workspace->CdDelta, system->N * REAL_SIZE, 0, RES_STORAGE_CDDELTA ); + cuda_malloc ((void **) &workspace->vlpex, system->N * REAL_SIZE, 0, RES_STORAGE_VLPEX ); + + /* QEq storage */ + workspace->H.start = NULL; + //cuda_malloc ((void **) &workspace->H.start, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); + workspace->L.start = NULL; + //cuda_malloc ((void **) &workspace->L.start, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); + workspace->U.start = NULL; + //cuda_malloc ((void **) &workspace->U.start, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); + + workspace->H.end = NULL; + //cuda_malloc ((void **) &workspace->H.end, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); + workspace->L.end = NULL; + //cuda_malloc ((void **) &workspace->L.end, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); + workspace->U.end = NULL; + //cuda_malloc ((void **) &workspace->U.end, (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX ); + + workspace->H.entries = NULL; + workspace->L.entries = NULL; + workspace->U.entries = NULL; + + cuda_malloc ((void **) &workspace->droptol, system->N * REAL_SIZE, 1, RES_STORAGE_DROPTOL ); + cuda_malloc ((void **) &workspace->w, system->N * REAL_SIZE, 1, RES_STORAGE_W ); + cuda_malloc ((void **) &workspace->Hdia_inv, system->N * REAL_SIZE, 1, RES_STORAGE_HDIA_INV ); + cuda_malloc ((void **) &workspace->b, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B ); + cuda_malloc ((void **) &workspace->b_s, system->N * REAL_SIZE, 1, RES_STORAGE_B_S ); + cuda_malloc ((void **) &workspace->b_t, system->N * REAL_SIZE, 1, RES_STORAGE_B_T ); + cuda_malloc ((void **) &workspace->b_prc, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRC ); + cuda_malloc ((void **) &workspace->b_prm, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRM ); + 
cuda_malloc ((void **) &workspace->s_t, system->N * 2 * REAL_SIZE, 1, RES_STORAGE_S_T ); + cuda_malloc ((void **) &workspace->s, 5 * system->N * REAL_SIZE, 1, RES_STORAGE_S ); + cuda_malloc ((void **) &workspace->t, 5 * system->N * REAL_SIZE, 1, RES_STORAGE_T ); + + Init_Workspace_Arrays <<<BLOCKS, BLOCK_SIZE>>> + (system->reaxprm.d_sbp, system->d_atoms, *workspace, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + /* GMRES storage */ + cuda_malloc ((void **) &workspace->y, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Y ); + cuda_malloc ((void **) &workspace->z, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Z ); + cuda_malloc ((void **) &workspace->g, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_G ); + cuda_malloc ((void **) &workspace->hs, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HS ); + cuda_malloc ((void **) &workspace->hc, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HC ); + + cuda_malloc ((void **) &workspace->rn, (RESTART+1)*system->N * 2 * REAL_SIZE, 1, RES_STORAGE_RN ); + cuda_malloc ((void **) &workspace->v, (RESTART+1)*system->N * REAL_SIZE, 1, RES_STORAGE_V ); + cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1) * REAL_SIZE, 1, RES_STORAGE_H ); + + /* CG storage */ + cuda_malloc ((void **) &workspace->r, system->N * REAL_SIZE, 1, RES_STORAGE_R ); + cuda_malloc ((void **) &workspace->d, system->N * REAL_SIZE, 1, RES_STORAGE_D ); + cuda_malloc ((void **) &workspace->q, system->N * REAL_SIZE, 1, RES_STORAGE_Q ); + cuda_malloc ((void **) &workspace->p, system->N * REAL_SIZE, 1, RES_STORAGE_P ); + + + /* integrator storage */ + cuda_malloc ((void **) &workspace->a, system->N * RVEC_SIZE, 1, RES_STORAGE_A ); + cuda_malloc ((void **) &workspace->f_old, system->N * RVEC_SIZE, 1, RES_STORAGE_F_OLD ); + cuda_malloc ((void **) &workspace->v_const,system->N * RVEC_SIZE, 1, RES_STORAGE_V_CONST ); + + /* storage for analysis */ + if( control->molec_anal || control->diffusion_coef ) + { + cuda_malloc ((void **) &workspace->mark, system->N * INT_SIZE, 1, 
RES_STORAGE_MARK ); + cuda_malloc ((void **) &workspace->old_mark, system->N * INT_SIZE, 1, RES_STORAGE_OLD_MARK); + } + else + workspace->mark = workspace->old_mark = NULL; + + if( control->diffusion_coef ) + cuda_malloc ((void **) &workspace->x_old, system->N * RVEC_SIZE, 1, RES_STORAGE_X_OLD ); + else workspace->x_old = NULL; + + workspace->realloc.num_far = -1; + workspace->realloc.Htop = -1; + workspace->realloc.hbonds = -1; + workspace->realloc.bonds = -1; + workspace->realloc.num_3body = -1; + workspace->realloc.gcell_atoms = -1; + + Cuda_Reset_Workspace( system, workspace ); } void Cuda_Init_Workspace_Device ( static_storage *workspace ) { - workspace->realloc.estimate_nbrs = -1; - workspace->realloc.num_far = -1; - workspace->realloc.Htop = -1; - workspace->realloc.hbonds = -1; - workspace->realloc.bonds = -1; - workspace->realloc.num_3body = -1; - workspace->realloc.gcell_atoms = -1; + workspace->realloc.estimate_nbrs = -1; + workspace->realloc.num_far = -1; + workspace->realloc.Htop = -1; + workspace->realloc.hbonds = -1; + workspace->realloc.bonds = -1; + workspace->realloc.num_3body = -1; + workspace->realloc.gcell_atoms = -1; } void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N) { - cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX ); - cuda_malloc ((void **) &matrix->end, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX ); - cuda_malloc ((void **) &matrix->entries, SPARSE_MATRIX_ENTRY_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY ); + cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX ); + cuda_malloc ((void **) &matrix->end, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX ); + cuda_malloc ((void **) &matrix->entries, SPARSE_MATRIX_ENTRY_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY ); - cuda_malloc ((void **) &matrix->j, INT_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY ); - cuda_malloc ((void **) &matrix->val, REAL_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY ); + 
cuda_malloc ((void **) &matrix->j, INT_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY ); + cuda_malloc ((void **) &matrix->val, REAL_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY ); } void Cuda_Init_Scratch () { - cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH ); - - /* - cudaError_t retval = cudaErrorInvalidDevice; - - retval = cudaMallocHost ( (void **) &scratch, SCRATCH_SIZE ); - //retval = cudaHostAlloc ((void **) &scratch, SCRATCH_SIZE, cudaHostAllocDefault ); - if (retval != cudaSuccess) - { - fprintf (stderr, "Error allocating the scratch area on the device \n"); - exit (0); - } - */ + cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH ); + + /* + cudaError_t retval = cudaErrorInvalidDevice; + + retval = cudaMallocHost ( (void **) &scratch, SCRATCH_SIZE ); + //retval = cudaHostAlloc ((void **) &scratch, SCRATCH_SIZE, cudaHostAllocDefault ); + if (retval != cudaSuccess) + { + fprintf (stderr, "Error allocating the scratch area on the device \n"); + exit (0); + } + */ } diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu index 05bc0e2d..2c632c05 100644 --- a/PuReMD-GPU/src/cuda_utils.cu +++ b/PuReMD-GPU/src/cuda_utils.cu @@ -26,112 +26,112 @@ void cuda_malloc (void **ptr, int size, int memset, int err_code) { - cudaError_t retVal = cudaSuccess; - - //fprintf (stderr, "&ptr --. %ld \n", &ptr); - //fprintf (stderr, "ptr --> %ld \n", ptr ); - - retVal = cudaMalloc (ptr, size); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d \n", - err_code, retVal, size); - exit (err_code); - } - - //fprintf (stderr, "&ptr --. %ld \n", &ptr); - //fprintf (stderr, "ptr --> %ld \n", ptr ); - - if (memset) { - retVal = cudaMemset (*ptr, 0, size); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to memset memory on device... 
exiting with code %d\n", - err_code); - exit (err_code); - } - } + cudaError_t retVal = cudaSuccess; + + //fprintf (stderr, "&ptr --. %ld \n", &ptr); + //fprintf (stderr, "ptr --> %ld \n", ptr ); + + retVal = cudaMalloc (ptr, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d \n", + err_code, retVal, size); + exit (err_code); + } + + //fprintf (stderr, "&ptr --. %ld \n", &ptr); + //fprintf (stderr, "ptr --> %ld \n", ptr ); + + if (memset) { + retVal = cudaMemset (*ptr, 0, size); + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to memset memory on device... exiting with code %d\n", + err_code); + exit (err_code); + } + } } void cuda_free (void *ptr, int err_code) { - cudaError_t retVal = cudaSuccess; - if (!ptr) return; + cudaError_t retVal = cudaSuccess; + if (!ptr) return; - retVal = cudaFree (ptr); + retVal = cudaFree (ptr); - if (retVal != cudaSuccess) { - fprintf (stderr, "Failed to release memory on device for res %d... exiting with code %d -- Address %ld\n", - err_code, retVal, ptr); - return; - } + if (retVal != cudaSuccess) { + fprintf (stderr, "Failed to release memory on device for res %d... exiting with code %d -- Address %ld\n", + err_code, retVal, ptr); + return; + } } void cuda_memset (void *ptr, int data, size_t count, int err_code){ - cudaError_t retVal = cudaSuccess; - - retVal = cudaMemset (ptr, data, count); - if (retVal != cudaSuccess) { - fprintf (stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr); - fprintf (stderr, " size to memset: %d \n", count); - fprintf (stderr, " target data is : %d \n", data); - fprintf (stderr, "Failed to memset memory on device... 
exiting with code %d, cuda code %d\n", - err_code, retVal); - exit (err_code); - } + cudaError_t retVal = cudaSuccess; + + retVal = cudaMemset (ptr, data, count); + if (retVal != cudaSuccess) { + fprintf (stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr); + fprintf (stderr, " size to memset: %d \n", count); + fprintf (stderr, " target data is : %d \n", data); + fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d\n", + err_code, retVal); + exit (err_code); + } } void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid) { - cudaError_t retVal = cudaErrorNotReady; - - if (dir == cudaMemcpyHostToDevice) - retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice); - else - retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost); - - if (retVal != cudaSuccess) { - fprintf (stderr, "could not copy resource %d from host to device: reason %d \n", - resid, retVal); - exit (resid); - } + cudaError_t retVal = cudaErrorNotReady; + + if (dir == cudaMemcpyHostToDevice) + retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice); + else + retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost); + + if (retVal != cudaSuccess) { + fprintf (stderr, "could not copy resource %d from host to device: reason %d \n", + resid, retVal); + exit (resid); + } } void copy_device (void *dest, void *src, int size, int resid) { - cudaError_t retVal = cudaErrorNotReady; - - retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice); - if (retVal != cudaSuccess) { - fprintf (stderr, "could not copy resource %d from host to device: reason %d \n", - resid, retVal); - exit (resid); - } + cudaError_t retVal = cudaErrorNotReady; + + retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice); + if (retVal != cudaSuccess) { + fprintf (stderr, "could not copy resource %d from host to device: reason %d \n", + resid, retVal); + exit (resid); + } } void compute_blocks ( int *blocks, int *block_size, int 
count ) { - *block_size = CUDA_BLOCK_SIZE; - *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1); + *block_size = CUDA_BLOCK_SIZE; + *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1); } void compute_nearest_pow_2 (int blocks, int *result) { - int power = 1; - while (power < blocks) power *= 2; + int power = 1; + while (power < blocks) power *= 2; - *result = power; + *result = power; } void print_device_mem_usage () { - size_t total, free; - cudaMemGetInfo (&free, &total); - if (cudaGetLastError () != cudaSuccess ) - { - fprintf (stderr, "Error on the memory call \n"); - return; - } - - fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", - total, total/(1024*1024), total/ (1024*1024*1024), - free, free/(1024*1024), free/ (1024*1024*1024) ); + size_t total, free; + cudaMemGetInfo (&free, &total); + if (cudaGetLastError () != cudaSuccess ) + { + fprintf (stderr, "Error on the memory call \n"); + return; + } + + fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", + total, total/(1024*1024), total/ (1024*1024*1024), + free, free/(1024*1024), free/ (1024*1024*1024) ); } diff --git a/PuReMD-GPU/src/forces.cu b/PuReMD-GPU/src/forces.cu index 08f9d8a5..e8e1e291 100644 --- a/PuReMD-GPU/src/forces.cu +++ b/PuReMD-GPU/src/forces.cu @@ -43,2838 +43,2838 @@ void Dummy_Interaction( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { } void Init_Bonded_Force_Functions( control_params *control ) { - Interaction_Functions[0] = Calculate_Bond_Orders; - Interaction_Functions[1] = Bond_Energy; //*/Dummy_Interaction; - Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy; - //*/Dummy_Interaction; - Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction; - 
Interaction_Functions[4] = Four_Body_Interactions; //*/Dummy_Interaction; - if( control->hb_cut > 0 ) - Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction; - else Interaction_Functions[5] = Dummy_Interaction; - Interaction_Functions[6] = Dummy_Interaction; //empty - Interaction_Functions[7] = Dummy_Interaction; //empty - Interaction_Functions[8] = Dummy_Interaction; //empty - Interaction_Functions[9] = Dummy_Interaction; //empty + Interaction_Functions[0] = Calculate_Bond_Orders; + Interaction_Functions[1] = Bond_Energy; //*/Dummy_Interaction; + Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy; + //*/Dummy_Interaction; + Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction; + Interaction_Functions[4] = Four_Body_Interactions; //*/Dummy_Interaction; + if( control->hb_cut > 0 ) + Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction; + else Interaction_Functions[5] = Dummy_Interaction; + Interaction_Functions[6] = Dummy_Interaction; //empty + Interaction_Functions[7] = Dummy_Interaction; //empty + Interaction_Functions[8] = Dummy_Interaction; //empty + Interaction_Functions[9] = Dummy_Interaction; //empty } void Compute_Bonded_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i; - real t_start, t_elapsed; + int i; + real t_start, t_elapsed; #ifdef TEST_ENERGY - /* Mark beginning of a new timestep in each energy file */ - fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", - data->step, "atom1", "atom2", "bo", "ebond", "total" ); - fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", - data->step, "atom", "nlp", "elp", "total" ); - fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", - data->step, "atom", "eov", "total" ); - fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", - 
data->step, "atom", "eun", "total" ); - fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "angle", "bo(12)", "bo(23)", "eval", "epen", "total" ); - fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "angle", "bo(12)", "bo(23)", "epen", "total" ); - fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "angle", "bo(12)", "bo(23)", "ecoa", "total" ); - fprintf( out_control->ehb, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "r(23)", "angle", "bo(12)", "ehb", "total" ); - fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", "atom4", - "phi", "bo(23)", "etor", "total" ); - fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", "atom4", - "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" ); + /* Mark beginning of a new timestep in each energy file */ + fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", + data->step, "atom1", "atom2", "bo", "ebond", "total" ); + fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", + data->step, "atom", "nlp", "elp", "total" ); + fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", + data->step, "atom", "eov", "total" ); + fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", + data->step, "atom", "eun", "total" ); + fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "angle", "bo(12)", "bo(23)", "eval", "epen", "total" ); + fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "angle", "bo(12)", "bo(23)", "epen", "total" ); + fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", 
"atom3", + "angle", "bo(12)", "bo(23)", "ecoa", "total" ); + fprintf( out_control->ehb, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "r(23)", "angle", "bo(12)", "ehb", "total" ); + fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", "atom4", + "phi", "bo(23)", "etor", "total" ); + fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", "atom4", + "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" ); #endif - /* Implement all the function calls as function pointers */ - for( i = 0; i < NO_OF_INTERACTIONS; i++ ) { - //for( i = 0; i < 5; i++ ) { - t_start = Get_Time (); - (Interaction_Functions[i])(system, control, data, workspace, - lists, out_control); - t_elapsed = Get_Timing_Info ( t_start ); + /* Implement all the function calls as function pointers */ + for( i = 0; i < NO_OF_INTERACTIONS; i++ ) { + //for( i = 0; i < 5; i++ ) { + t_start = Get_Time (); + (Interaction_Functions[i])(system, control, data, workspace, + lists, out_control); + t_elapsed = Get_Timing_Info ( t_start ); #ifdef __DEBUG_CUDA__ - fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed ); + fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed ); #endif #if defined(DEBUG_FOCUS) - fprintf( stderr, "f%d-", i ); + fprintf( stderr, "f%d-", i ); #endif #ifdef TEST_FORCES - (Print_Interactions[i])(system, control, data, workspace, - lists, out_control); + (Print_Interactions[i])(system, control, data, workspace, + lists, out_control); #endif - } - } - - void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - real t_start, t_elapsed; - real *spad = (real *)scratch; - rvec *rvec_spad; - - //Compute the bonded for interaction here. - //Step 1. 
+ } + } + + void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { + real t_start, t_elapsed; + real *spad = (real *)scratch; + rvec *rvec_spad; + + //Compute the bonded for interaction here. + //Step 1. #ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); - fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE); + t_start = Get_Time( ); + fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE); #endif - Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, - *dev_workspace, system->reaxprm.num_atom_types, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, - system->reaxprm.d_tbp, *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), - system->reaxprm.num_atom_types, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>> - (*dev_workspace, *(dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, - *dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, + *dev_workspace, system->reaxprm.num_atom_types, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, + system->reaxprm.d_tbp, *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), + 
system->reaxprm.num_atom_types, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>> + (*dev_workspace, *(dev_lists + BONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, + *dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n"); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n"); #endif - //Step 2. + //Step 2. #ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); + t_start = Get_Time( ); #endif - //cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH ); - cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH ); - - Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp, - (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), - system->N, system->reaxprm.num_atom_types, spad ); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_BE - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - //(spad + system->N, spad + system->N + 16, 16); - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + 
//cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH ); + cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH ); + + Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp, + (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), + system->N, system->reaxprm.num_atom_types, spad ); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_BE + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + //(spad + system->N, spad + system->N + 16, 16); + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "Cuda_Bond_Energy Done... \n"); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "Cuda_Bond_Energy Done... \n"); #endif - //Step 3. + //Step 3. 
#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); + t_start = Get_Time( ); #endif - cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH ); - - test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - *dev_workspace, (simulation_data *)data->d_simulation_data, - *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, - spad, spad + 2 * system->N, spad + 4*system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - *dev_workspace, (simulation_data *)data->d_simulation_data, - *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, - spad, spad + 2 * system->N, spad + 4*system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - test_LonePair_Postprocess <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - *dev_workspace, (simulation_data *)data->d_simulation_data, - *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types); - cudaThreadSynchronize (); - cudaCheckError (); - - - //Reduction for E_Lp - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Ov - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, 
BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Un - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 4*system->N, spad + 5*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH ); + + test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + *dev_workspace, (simulation_data *)data->d_simulation_data, + *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, + spad, spad + 2 * system->N, spad + 4*system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + *dev_workspace, (simulation_data *)data->d_simulation_data, + *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, + spad, spad + 2 * system->N, spad + 4*system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + test_LonePair_Postprocess <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + *dev_workspace, (simulation_data *)data->d_simulation_data, + *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types); + cudaThreadSynchronize (); + cudaCheckError (); + + + //Reduction for E_Lp + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2); + 
cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Ov + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Un + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 4*system->N, spad + 5*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "test_LonePair_postprocess Done... \n"); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "test_LonePair_postprocess Done... \n"); #endif - //Step 4. + //Step 4. 
#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); + t_start = Get_Time( ); #endif - cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH); - Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, - (control_params *)control->d_control, - *(dev_lists + BONDS), - system->N, (int *)spad); - cudaThreadSynchronize (); - cudaCheckError (); + cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH); + Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, + (control_params *)control->d_control, + *(dev_lists + BONDS), + system->N, (int *)spad); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Three_Body_Estimate... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Three_Body_Estimate... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); #endif - int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs); - memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs); - copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH); + int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs); + memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs); + copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH); - int total_3body = thbody [0] * SAFE_ZONE; - for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) { - total_3body += thbody [x]*SAFE_ZONE; - thbody [x] += thbody [x-1]; - } - system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1]; + int total_3body = thbody [0] * SAFE_ZONE; + for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) { + total_3body += thbody [x]*SAFE_ZONE; + thbody [x] += thbody [x-1]; + } + system->num_thbodies = thbody 
[(dev_lists+BONDS)->num_intrs-1]; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs); + fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs); #endif - if (!system->init_thblist) - { - system->init_thblist = true; - if(!Make_List((dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { - fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); - exit( INIT_ERR ); - } + if (!system->init_thblist) + { + system->init_thblist = true; + if(!Make_List((dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { + fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); + exit( INIT_ERR ); + } #ifdef __CUDA_MEM__ - fprintf (stderr, "Device memory allocated: three body list = %d MB\n", - sizeof (three_body_interaction_data) * total_3body / (1024*1024)); + fprintf (stderr, "Device memory allocated: three body list = %d MB\n", + sizeof (three_body_interaction_data) * total_3body / (1024*1024)); #endif - } else { - if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) { - int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs); + } else { + if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) { + int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs); - /*Delete Three-body list*/ - Delete_List( dev_lists + THREE_BODIES, TYP_DEVICE ); + /*Delete Three-body list*/ + Delete_List( dev_lists + THREE_BODIES, TYP_DEVICE ); #ifdef __CUDA_MEM__ - fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", - data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies); + fprintf (stderr, "Reallocating Three-body list: step: %d 
n - %d num_intrs - %d used: %d \n", + data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies); #endif - /*Recreate Three-body list */ - if(!Make_List(size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { - fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); - exit( INIT_ERR ); - } - } - } - - //copy the indexes into the thb list; - copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), - cudaMemcpyHostToDevice, LIST_INDEX); - copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), - cudaMemcpyHostToDevice, LIST_END_INDEX); - - free (thbody ); + /*Recreate Three-body list */ + if(!Make_List(size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { + fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); + exit( INIT_ERR ); + } + } + } + + //copy the indexes into the thb list; + copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), + cudaMemcpyHostToDevice, LIST_INDEX); + copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), + cudaMemcpyHostToDevice, LIST_END_INDEX); + + free (thbody ); #ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); + t_start = Get_Time( ); #endif - cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); - - Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, - system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), - system->N, system->reaxprm.num_atom_types, - spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); - 
cudaThreadSynchronize (); - cudaCheckError (); - - //Not necessary to validate three-body list anymore, - // Estimate is already done at the beginning which makes sure that - // we have sufficient size for this list - //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step ); - - //Reduction for E_Ang - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Pen - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Coa - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 4*system->N, spad + 5*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 6*system->N); - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - real 
t_1, t_2; - t_1 = Get_Time (); - //Sum up the f vector for each atom and collect the CdDelta from all the bonds - Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, - (control_params *)control->d_control, - *dev_workspace, - *(dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - t_2 = Get_Timing_Info (t_1); + cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); + + Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, + system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), + system->N, system->reaxprm.num_atom_types, + spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); + cudaThreadSynchronize (); + cudaCheckError (); + + //Not necessary to validate three-body list anymore, + // Estimate is already done at the beginning which makes sure that + // we have sufficient size for this list + //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step ); + + //Reduction for E_Ang + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Pen + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError 
(); + + //Reduction for E_Coa + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 4*system->N, spad + 5*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 6*system->N); + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + real t_1, t_2; + t_1 = Get_Time (); + //Sum up the f vector for each atom and collect the CdDelta from all the bonds + Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, + (control_params *)control->d_control, + *dev_workspace, + *(dev_lists + BONDS), + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + t_2 = Get_Timing_Info (t_1); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2); - fprintf (stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed ); - fprintf (stderr, "Three_Body_Interactions Done... \n"); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2); + fprintf (stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed ); + fprintf (stderr, "Three_Body_Interactions Done... \n"); #endif - //Step 5. + //Step 5. 
#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); + t_start = Get_Time( ); #endif - cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); - Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> - //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>> - ( system->d_atoms, - system->reaxprm.d_gp, - system->reaxprm.d_fbp, - (control_params *)control->d_control, - *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), - (simulation_box *)system->d_box, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - system->N, system->reaxprm.num_atom_types, - spad, spad + 2*system->N, (rvec *) (spad + 4*system->N)); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Tor - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Con - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 4*system->N); - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); 
- - //Post process here - Four_Body_Postprocess <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, - *dev_workspace, - *(dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); + cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); + Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> + //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>> + ( system->d_atoms, + system->reaxprm.d_gp, + system->reaxprm.d_fbp, + (control_params *)control->d_control, + *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), + (simulation_box *)system->d_box, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + system->N, system->reaxprm.num_atom_types, + spad, spad + 2*system->N, (rvec *) (spad + 4*system->N)); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Tor + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Con + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 4*system->N); + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> + (rvec_spad + 
system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Post process here + Four_Body_Postprocess <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, + *dev_workspace, + *(dev_lists + BONDS), + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, " Four_Body_ Done... \n"); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, " Four_Body_ Done... \n"); #endif - //Step 6. - if (control->hb_cut > 0) { + //Step 6. + if (control->hb_cut > 0) { #ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); + t_start = Get_Time( ); #endif - cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH ); - - /* - Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>> - ( system->d_atoms, - system->reaxprm.d_sbp, - system->reaxprm.d_hbp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->reaxprm.num_atom_types, - spad, (rvec *) (spad + 2*system->N), NULL); - cudaThreadSynchronize (); - cudaCheckError (); - */ + cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH ); + + /* + Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>> + ( system->d_atoms, + system->reaxprm.d_sbp, + system->reaxprm.d_hbp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->reaxprm.num_atom_types, + spad, (rvec *) (spad + 
2*system->N), NULL); + cudaThreadSynchronize (); + cudaCheckError (); + */ #ifdef __DEBUG_CUDA__ - real test1,test2; - test1 = Get_Time (); + real test1,test2; + test1 = Get_Time (); #endif - int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + - (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1); - Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE ) >>> - ( system->d_atoms, - system->reaxprm.d_sbp, - system->reaxprm.d_hbp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->reaxprm.num_atom_types, - spad, (rvec *) (spad + 2*system->N), NULL); - cudaThreadSynchronize (); - cudaCheckError (); + int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + + (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1); + Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE ) >>> + ( system->d_atoms, + system->reaxprm.d_sbp, + system->reaxprm.d_hbp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->reaxprm.num_atom_types, + spad, (rvec *) (spad + 2*system->N), NULL); + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - test2 = Get_Timing_Info (test1); - fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2); + test2 = Get_Timing_Info (test1); + fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2); #endif - //Reduction for E_HB - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + //Reduction for E_HB + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + 
cudaThreadSynchronize (); + cudaCheckError (); - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 2*system->N); - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 2*system->N); + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - //Post process here + //Post process here #ifdef __DEBUG_CUDA__ - real t_1, t_2; - t_1 = Get_Time (); + real t_1, t_2; + t_1 = Get_Time (); #endif - Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>> - ( system->d_atoms, - system->reaxprm.d_sbp, - *dev_workspace, - *(dev_lists + BONDS), - *(dev_lists + HBONDS), - *(dev_lists + FAR_NBRS), - system->N, - spad); //this is for the fix to use the shared memory - cudaThreadSynchronize (); - cudaCheckError (); + Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>> + ( system->d_atoms, + 
system->reaxprm.d_sbp, + *dev_workspace, + *(dev_lists + BONDS), + *(dev_lists + HBONDS), + *(dev_lists + FAR_NBRS), + system->N, + spad); //this is for the fix to use the shared memory + cudaThreadSynchronize (); + cudaCheckError (); #ifdef __DEBUG_CUDA__ - t_2 = Get_Timing_Info ( t_1 ); - fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); - t_1 = Get_Time (); + t_2 = Get_Timing_Info ( t_1 ); + fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); + t_1 = Get_Time (); #endif - //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>> - Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>> - ( system->d_atoms, - system->reaxprm.d_sbp, - *dev_workspace, - *(dev_lists + BONDS), - *(dev_lists + HBONDS), - *(dev_lists + FAR_NBRS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - t_2 = Get_Timing_Info ( t_1 ); + //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>> + Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>> + ( system->d_atoms, + system->reaxprm.d_sbp, + *dev_workspace, + *(dev_lists + BONDS), + *(dev_lists + HBONDS), + *(dev_lists + FAR_NBRS), + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + t_2 = Get_Timing_Info ( t_1 ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "Hydrogen_Bond Done... \n"); + fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "Hydrogen_Bond Done... 
\n"); #endif - } - return; - } - - void Compute_NonBonded_Forces( reax_system *system, control_params *control, - simulation_data *data,static_storage *workspace, - list** lists, output_controls *out_control ) - { - real t_start, t_elapsed; + } + return; + } + + void Compute_NonBonded_Forces( reax_system *system, control_params *control, + simulation_data *data,static_storage *workspace, + list** lists, output_controls *out_control ) + { + real t_start, t_elapsed; #ifdef TEST_ENERGY - fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n", - data->step, "atom1", "atom2", "r12", "evdw", "total" ); - fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" ); + fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n", + data->step, "atom1", "atom2", "r12", "evdw", "total" ); + fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" ); #endif - t_start = Get_Time( ); - QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.QEq += t_elapsed; + t_start = Get_Time( ); + QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.QEq += t_elapsed; #if defined(DEBUG_FOCUS) - fprintf( stderr, "qeq - " ); + fprintf( stderr, "qeq - " ); #endif - if ( control->tabulate == 0) - vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control ); - else - Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, - lists, out_control ); + if ( control->tabulate == 0) + vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control ); + else + Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, + lists, out_control ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "nonb forces - " ); + fprintf( stderr, "nonb forces - " ); #endif #ifdef 
TEST_FORCES - Print_vdW_Coulomb_Forces( system, control, data, workspace, - lists, out_control ); + Print_vdW_Coulomb_Forces( system, control, data, workspace, + lists, out_control ); #endif - } - - void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, - simulation_data *data,static_storage *workspace, - list** lists, output_controls *out_control ) - { - real t_start, t_elapsed; - real t1 = 0, t2 = 0; - real *spad = (real *) scratch; - rvec *rvec_spad; - int cblks; - - t_start = Get_Time( ); - Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); - t_elapsed = Get_Timing_Info( t_start ); - d_timing.QEq += t_elapsed; + } + + void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, + simulation_data *data,static_storage *workspace, + list** lists, output_controls *out_control ) + { + real t_start, t_elapsed; + real t1 = 0, t2 = 0; + real *spad = (real *) scratch; + rvec *rvec_spad; + int cblks; + + t_start = Get_Time( ); + Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); + t_elapsed = Get_Timing_Info( t_start ); + d_timing.QEq += t_elapsed; #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed ); + fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed ); #endif - cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH ); - - t_start = Get_Time (); - if ( control->tabulate == 0) - { - cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + - ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 
0 : 1); - Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>> - ( system->d_atoms, - system->reaxprm.d_tbp, - system->reaxprm.d_gp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *(dev_lists + FAR_NBRS), - spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), - system->reaxprm.num_atom_types, - system->N ) ; - cudaThreadSynchronize (); - cudaCheckError (); - } - else - { - cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + - ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1); - Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>> - ( (reax_atom *)system->d_atoms, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *(dev_lists + FAR_NBRS), - spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), - d_LR, - system->reaxprm.num_atom_types, - out_control->energy_update_freq, - system->N ) ; - - cudaThreadSynchronize (); - cudaCheckError (); - } - - t_elapsed = Get_Timing_Info (t_start ); + cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH ); + + t_start = Get_Time (); + if ( control->tabulate == 0) + { + cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + + ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1); + Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>> + ( system->d_atoms, + system->reaxprm.d_tbp, + system->reaxprm.d_gp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *(dev_lists + FAR_NBRS), + spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), + system->reaxprm.num_atom_types, + system->N ) ; + cudaThreadSynchronize (); + cudaCheckError (); + } + else + { + cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + + ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 
0 : 1); + Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>> + ( (reax_atom *)system->d_atoms, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *(dev_lists + FAR_NBRS), + spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), + d_LR, + system->reaxprm.num_atom_types, + out_control->energy_update_freq, + system->N ) ; + + cudaThreadSynchronize (); + cudaCheckError (); + } + + t_elapsed = Get_Timing_Info (t_start ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2)); - fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed)); + fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2)); + fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed)); #endif - //Reduction on E_vdW - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //reduction on E_Ele - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - rvec_spad = (rvec *) (spad + 4*system->N); - - //reduction on ext_press - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE 
* BLOCKS_POW_2>>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - } - - - /* This version of Compute_Total_Force computes forces from coefficients - accumulated by all interaction functions. Saves enormous time & space! */ - void Compute_Total_Force( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists ) - { - int i, pj; - list *bonds = (*lists) + BONDS; - - for( i = 0; i < system->N; ++i ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) - Add_dBond_to_Forces( i, pj, system, data, workspace, lists ); - else - Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists ); - } - } - - - void Validate_Lists( static_storage *workspace, list **lists, int step, int n, - int Hmax, int Htop, int num_bonds, int num_hbonds ) - { - int i, flag; - list *bonds, *hbonds; - - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - /* far neighbors */ - if( Htop > Hmax * DANGER_ZONE ) { - workspace->realloc.Htop = Htop; - if( Htop > Hmax ) { - fprintf( stderr, - "step%d - ran out of space on H matrix: Htop=%d, max = %d", - step, Htop, Hmax ); - exit(INSUFFICIENT_SPACE); - } - } - - /* bond list */ - flag = -1; - workspace->realloc.num_bonds = num_bonds; - for( i = 0; i < n-1; ++i ) - if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) { - workspace->realloc.bonds = 1; - if( End_Index(i, bonds) > Start_Index(i+1, bonds) ) - flag = i; - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", - step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) ); - exit(INSUFFICIENT_SPACE); - } - - if( End_Index(i, bonds) >= bonds->num_intrs-2 ) { - workspace->realloc.bonds = 1; - - if( End_Index(i, bonds) > bonds->num_intrs ) { - 
fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", - step, flag, End_Index(i,bonds), bonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - - - /* hbonds list */ - if( workspace->num_H > 0 ) { - flag = -1; - workspace->realloc.num_hbonds = num_hbonds; - for( i = 0; i < workspace->num_H-1; ++i ) - if( Num_Entries(i, hbonds) >= - (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) ) - flag = i; - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n", - step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) ); - exit(INSUFFICIENT_SPACE); - } - - if( Num_Entries(i,hbonds) >= - (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - - if( End_Index(i, hbonds) > hbonds->num_intrs ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", - step, flag, End_Index(i,hbonds), hbonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - } - } - - - void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n, - int num_bonds, int num_hbonds ) - { - int i, flag; - list *bonds, *hbonds, *thblist; - int *bonds_start, *bonds_end; - int *hbonds_start, *hbonds_end; - int *mat_start, *mat_end; - int max_sparse_entries = 0; - - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - bonds_start = (int *) calloc (bonds->n, INT_SIZE); - bonds_end = (int *) calloc (bonds->n, INT_SIZE); - - hbonds_start = (int *) calloc (hbonds->n, INT_SIZE ); - hbonds_end = (int *) calloc (hbonds->n, INT_SIZE ); - - mat_start = (int *) calloc (workspace->H.n, INT_SIZE ); - mat_end = (int *) calloc (workspace->H.n, INT_SIZE ); - - copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, 
__LINE__ ); - - copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - - copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - - /* Sparse Matrix entries */ + //Reduction on E_vdW + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduction on E_Ele + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + rvec_spad = (rvec *) (spad + 4*system->N); + + //reduction on ext_press + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + } + + + /* This version of Compute_Total_Force computes forces from coefficients + accumulated by all interaction functions. Saves enormous time & space! 
*/ + void Compute_Total_Force( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists ) + { + int i, pj; + list *bonds = (*lists) + BONDS; + + for( i = 0; i < system->N; ++i ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) + Add_dBond_to_Forces( i, pj, system, data, workspace, lists ); + else + Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists ); + } + } + + + void Validate_Lists( static_storage *workspace, list **lists, int step, int n, + int Hmax, int Htop, int num_bonds, int num_hbonds ) + { + int i, flag; + list *bonds, *hbonds; + + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + /* far neighbors */ + if( Htop > Hmax * DANGER_ZONE ) { + workspace->realloc.Htop = Htop; + if( Htop > Hmax ) { + fprintf( stderr, + "step%d - ran out of space on H matrix: Htop=%d, max = %d", + step, Htop, Hmax ); + exit(INSUFFICIENT_SPACE); + } + } + + /* bond list */ + flag = -1; + workspace->realloc.num_bonds = num_bonds; + for( i = 0; i < n-1; ++i ) + if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) { + workspace->realloc.bonds = 1; + if( End_Index(i, bonds) > Start_Index(i+1, bonds) ) + flag = i; + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", + step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) ); + exit(INSUFFICIENT_SPACE); + } + + if( End_Index(i, bonds) >= bonds->num_intrs-2 ) { + workspace->realloc.bonds = 1; + + if( End_Index(i, bonds) > bonds->num_intrs ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", + step, flag, End_Index(i,bonds), bonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + + + /* hbonds list */ + if( workspace->num_H > 0 ) { + flag = -1; + workspace->realloc.num_hbonds = num_hbonds; + for( i = 0; i < workspace->num_H-1; ++i ) + 
if( Num_Entries(i, hbonds) >= + (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) ) + flag = i; + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n", + step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) ); + exit(INSUFFICIENT_SPACE); + } + + if( Num_Entries(i,hbonds) >= + (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + + if( End_Index(i, hbonds) > hbonds->num_intrs ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", + step, flag, End_Index(i,hbonds), hbonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + } + } + + + void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n, + int num_bonds, int num_hbonds ) + { + int i, flag; + list *bonds, *hbonds, *thblist; + int *bonds_start, *bonds_end; + int *hbonds_start, *hbonds_end; + int *mat_start, *mat_end; + int max_sparse_entries = 0; + + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + bonds_start = (int *) calloc (bonds->n, INT_SIZE); + bonds_end = (int *) calloc (bonds->n, INT_SIZE); + + hbonds_start = (int *) calloc (hbonds->n, INT_SIZE ); + hbonds_end = (int *) calloc (hbonds->n, INT_SIZE ); + + mat_start = (int *) calloc (workspace->H.n, INT_SIZE ); + mat_end = (int *) calloc (workspace->H.n, INT_SIZE ); + + copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, 
cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + /* Sparse Matrix entries */ #ifdef __CUDA_TEST__ - /* - workspace->realloc.Htop = 0; - for (i = 0; i < workspace->H.n-1; i++) { - if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){ - workspace->realloc.Htop = mat_end[i] - mat_start[i]; - } - } - */ + /* + workspace->realloc.Htop = 0; + for (i = 0; i < workspace->H.n-1; i++) { + if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){ + workspace->realloc.Htop = mat_end[i] - mat_start[i]; + } + } + */ #endif - flag = -1; - workspace->realloc.Htop = 0; - for ( i = 0; i < n-1; i ++){ - - if( (mat_end[i] - mat_start[i]) > - (system->max_sparse_matrix_entries * DANGER_ZONE )) { - //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", - // step, i, mat_start[i], mat_end[i]); - if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) - workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; - } - - if ( (mat_end[i] > mat_start[i+1]) ){ - fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n", - step, flag, mat_end[i], mat_start[i+1]); - exit(INSUFFICIENT_SPACE); - } - } - - if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) { - if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) - workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; - //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d) -- %d \n", - // step, i, mat_start[i], mat_end[i], - // (int) (system->max_sparse_matrix_entries * DANGER_ZONE)); - - if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) { - fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n", - step, flag, mat_end[i], system->N * system->max_sparse_matrix_entries); - exit(INSUFFICIENT_SPACE); - } - } - - - /* bond list */ + flag = -1; + workspace->realloc.Htop = 0; + for 
( i = 0; i < n-1; i ++){ + + if( (mat_end[i] - mat_start[i]) > + (system->max_sparse_matrix_entries * DANGER_ZONE )) { + //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", + // step, i, mat_start[i], mat_end[i]); + if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) + workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; + } + + if ( (mat_end[i] > mat_start[i+1]) ){ + fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n", + step, flag, mat_end[i], mat_start[i+1]); + exit(INSUFFICIENT_SPACE); + } + } + + if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) { + if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) + workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; + //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d) -- %d \n", + // step, i, mat_start[i], mat_end[i], + // (int) (system->max_sparse_matrix_entries * DANGER_ZONE)); + + if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) { + fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n", + step, flag, mat_end[i], system->N * system->max_sparse_matrix_entries); + exit(INSUFFICIENT_SPACE); + } + } + + + /* bond list */ #ifdef __CUDA_TEST__ - //workspace->realloc.bonds = 1; + //workspace->realloc.bonds = 1; #endif - flag = -1; - workspace->realloc.num_bonds = 0; - for( i = 0; i < n-1; ++i ) { - workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); - if( bonds_end[i] >= bonds_start[i+1]-2 ) { - workspace->realloc.bonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", - // step, i, bonds_start [i], bonds_end[i]); - if( bonds_end[i] > bonds_start[i+1] ) - flag = i; - } - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", - step, flag, bonds_end[flag], bonds_start[flag+1] ); - exit(INSUFFICIENT_SPACE); - } - - 
workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); - if( bonds_end[i] >= bonds->num_intrs-2 ) { - workspace->realloc.bonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", - // step, i, bonds_start [i], bonds_end[i]); - - if( bonds_end[i] > bonds->num_intrs ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", - step, flag, bonds_end[i], bonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - - //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds); - - /* hbonds list */ - if( workspace->num_H > 0 ) { + flag = -1; + workspace->realloc.num_bonds = 0; + for( i = 0; i < n-1; ++i ) { + workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); + if( bonds_end[i] >= bonds_start[i+1]-2 ) { + workspace->realloc.bonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", + // step, i, bonds_start [i], bonds_end[i]); + if( bonds_end[i] > bonds_start[i+1] ) + flag = i; + } + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", + step, flag, bonds_end[flag], bonds_start[flag+1] ); + exit(INSUFFICIENT_SPACE); + } + + workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); + if( bonds_end[i] >= bonds->num_intrs-2 ) { + workspace->realloc.bonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", + // step, i, bonds_start [i], bonds_end[i]); + + if( bonds_end[i] > bonds->num_intrs ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", + step, flag, bonds_end[i], bonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + + //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds); + + /* hbonds list */ + if( workspace->num_H > 0 ) { #ifdef __CUDA_TEST__ - //workspace->realloc.hbonds = 1; + 
//workspace->realloc.hbonds = 1; #endif - flag = -1; - workspace->realloc.num_hbonds = 0; - for( i = 0; i < workspace->num_H-1; ++i ) { - workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); - - if( (hbonds_end[i] - hbonds_start[i]) >= - (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", - // step, i, hbonds_start [i], hbonds_end[i]); - if( hbonds_end[i] > hbonds_start[i+1] ) - flag = i; - } - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n", - step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] ); - exit(INSUFFICIENT_SPACE); - } - - workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); - if( (hbonds_end[i] - hbonds_start[i]) >= - (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", - // step, i, hbonds_start [i], hbonds_end[i]); - - if( hbonds_end[i] > hbonds->num_intrs ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", - step, flag, hbonds_end[i], hbonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - } - - //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds); - - free (bonds_start); - free (bonds_end ); - - free (hbonds_start ); - free (hbonds_end ); - - free (mat_start ); - free (mat_end ); - } - - void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step ) - { - int *thb_start, *thb_end; - int i, flag; - - thb_start = (int *) calloc (thblist->n, INT_SIZE); - thb_end = (int *) calloc (thblist->n, INT_SIZE ); - - copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - 
copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - - /*three_body list*/ - flag = -1; - workspace->realloc.num_3body = 0; - for( i = 0; i < thblist->n-1; ++i ){ - if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) { - workspace->realloc.thbody = 1; - if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) { - flag = i; - break; - } - } - } - - if( flag > -1 ) { - //fprintf( stderr, "step%d-thbchk failed: i=%d end(i)=%d str(i+1)=%d\n", - // step, flag, thb_end[flag], thb_start[flag+1] ); - fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", - step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs ); - fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", - step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - - if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) { - workspace->realloc.thbody = 1; - - if( thb_end[i] > thblist->num_intrs ) { - fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", - step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs ); - fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", - step, i, thb_start[i], thb_end[i], thblist->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - - free (thb_start); - free (thb_end); - } - - - void Init_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real dr3gamij_1, dr3gamij_3, Tap; - //real val, dif, base; - real C12, C34, C56; - real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, 
p_boc2; - sparse_matrix *H; - list *far_nbrs, *bonds, *hbonds; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - //LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - far_nbrs = *lists + FAR_NBRS; - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - H = &workspace->H; - Htop = 0; - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = system->reaxprm.gp.l[0]; - p_boc2 = system->reaxprm.gp.l[1]; - - for( i = 0; i < system->N; ++i ) { - atom_i = &(system->atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - H->start[i] = Htop; - btop_i = End_Index( i, bonds ); - sbp_i = &(system->reaxprm.sbp[type_i]); - ihb = ihb_top = -1; - if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) - ihb_top = End_Index( workspace->hbond_index[i], hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(system->atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if( nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), - nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - - if( flag ){ - type_j = system->atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(system->reaxprm.sbp[type_j]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - self_coef = (i == j) ? 
0.5 : 1.0; - - /* H matrix entry */ - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; - ++Htop; - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) { - hbonds->select.hbond_list[ihb_top].nbr = j; - hbonds->select.hbond_list[ihb_top].scl = 1; - hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; - ++ihb_top; - ++num_hbonds; - } - else if( ihb == 2 && jhb == 1 ) { - jhb_top = End_Index( workspace->hbond_index[j], hbonds ); - hbonds->select.hbond_list[jhb_top].nbr = i; - hbonds->select.hbond_list[jhb_top].scl = -1; - hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; - Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) 
{ - num_bonds += 2; - /****** bonds i-j and j-i ******/ - ibond = &( bonds->select.bond_list[btop_i] ); - btop_j = End_Index( j, bonds ); - jbond = &(bonds->select.bond_list[btop_j]); - - ibond->nbr = j; - jbond->nbr = i; - ibond->d = r_ij; - jbond->d = r_ij; - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - ibond->dbond_index = btop_i; - jbond->dbond_index = btop_i; - ibond->sym_index = btop_j; - jbond->sym_index = btop_i; - ++btop_i; - Set_End_Index( j, btop_j+1, bonds ); - - bo_ij = &( ibond->bo_data ); - bo_ji = &( jbond->bo_data ); - bo_ji->BO = bo_ij->BO = BO; - bo_ji->BO_s = bo_ij->BO_s = BO_s; - bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; - bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - /* Only dln_BOp_xx wrt. dr_i is stored here, note that - dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); - - /* Only dBOp wrt. 
dr_i is stored here, note that - dBOp/dr_i = -dBOp/dr_j and all others are 0 */ - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); - - rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); - rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp - workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - - /*fprintf( stderr, "%d %d %g %g %g\n", - i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/ - - /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", - Cln_BOp_s, twbp->p_bo2, C12 ); - fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", - Cln_BOp_pi, twbp->p_bo4, C34 ); - fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n", - Cln_BOp_pi2, twbp->p_bo6, C56 );*/ - /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2); - fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4); - fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6); - fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", - twbp->r_s, twbp->r_p, twbp->r_pp ); - fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/ - - /*fprintf( stderr, "\tfactors: %g %g %g\n", - -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pp), - -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/ - /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", - bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] ); - fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", - bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], - bo_ij->dln_BOp_pi[2] ); - fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n", - bo_ij->dln_BOp_pi2[0], 
bo_ij->dln_BOp_pi2[1], - bo_ij->dln_BOp_pi2[2] );*/ - - Set_End_Index( j, btop_j+1, bonds ); - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; - ++Htop; - - Set_End_Index( i, btop_i, bonds ); - if( ihb == 1 ) - Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); - //fprintf( stderr, "%d bonds start: %d, end: %d\n", - // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); - } - - // mark the end of j list - H->start[i] = Htop; - /* validate lists - decide if reallocation is required! */ - Validate_Lists( workspace, lists, - data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); + flag = -1; + workspace->realloc.num_hbonds = 0; + for( i = 0; i < workspace->num_H-1; ++i ) { + workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); + + if( (hbonds_end[i] - hbonds_start[i]) >= + (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", + // step, i, hbonds_start [i], hbonds_end[i]); + if( hbonds_end[i] > hbonds_start[i+1] ) + flag = i; + } + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n", + step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] ); + exit(INSUFFICIENT_SPACE); + } + + workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); + if( (hbonds_end[i] - hbonds_start[i]) >= + (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", + // step, i, hbonds_start [i], hbonds_end[i]); + + if( hbonds_end[i] > hbonds->num_intrs ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", + step, flag, hbonds_end[i], hbonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + } 
+ + //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds); + + free (bonds_start); + free (bonds_end ); + + free (hbonds_start ); + free (hbonds_end ); + + free (mat_start ); + free (mat_end ); + } + + void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step ) + { + int *thb_start, *thb_end; + int i, flag; + + thb_start = (int *) calloc (thblist->n, INT_SIZE); + thb_end = (int *) calloc (thblist->n, INT_SIZE ); + + copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + /*three_body list*/ + flag = -1; + workspace->realloc.num_3body = 0; + for( i = 0; i < thblist->n-1; ++i ){ + if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) { + workspace->realloc.thbody = 1; + if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) { + flag = i; + break; + } + } + } + + if( flag > -1 ) { + //fprintf( stderr, "step%d-thbchk failed: i=%d end(i)=%d str(i+1)=%d\n", + // step, flag, thb_end[flag], thb_start[flag+1] ); + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs ); + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + + if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) { + workspace->realloc.thbody = 1; + + if( thb_end[i] > thblist->num_intrs ) { + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs ); + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, i, thb_start[i], thb_end[i], thblist->num_intrs ); + 
exit(INSUFFICIENT_SPACE); + } + } + + free (thb_start); + free (thb_end); + } + + + void Init_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real dr3gamij_1, dr3gamij_3, Tap; + //real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + list *far_nbrs, *bonds, *hbonds; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + //LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + far_nbrs = *lists + FAR_NBRS; + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + H = &workspace->H; + Htop = 0; + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + for( i = 0; i < system->N; ++i ) { + atom_i = &(system->atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + H->start[i] = Htop; + btop_i = End_Index( i, bonds ); + sbp_i = &(system->reaxprm.sbp[type_i]); + ihb = ihb_top = -1; + if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) + ihb_top = End_Index( workspace->hbond_index[i], hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(system->atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if( nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), + nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + 
+ if( flag ){ + type_j = system->atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(system->reaxprm.sbp[type_j]); + twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); + self_coef = (i == j) ? 0.5 : 1.0; + + /* H matrix entry */ + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; + ++Htop; + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) { + hbonds->select.hbond_list[ihb_top].nbr = j; + hbonds->select.hbond_list[ihb_top].scl = 1; + hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; + ++ihb_top; + ++num_hbonds; + } + else if( ihb == 2 && jhb == 1 ) { + jhb_top = End_Index( workspace->hbond_index[j], hbonds ); + hbonds->select.hbond_list[jhb_top].nbr = i; + hbonds->select.hbond_list[jhb_top].scl = -1; + hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; + Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * 
POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + num_bonds += 2; + /****** bonds i-j and j-i ******/ + ibond = &( bonds->select.bond_list[btop_i] ); + btop_j = End_Index( j, bonds ); + jbond = &(bonds->select.bond_list[btop_j]); + + ibond->nbr = j; + jbond->nbr = i; + ibond->d = r_ij; + jbond->d = r_ij; + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + ibond->dbond_index = btop_i; + jbond->dbond_index = btop_i; + ibond->sym_index = btop_j; + jbond->sym_index = btop_i; + ++btop_i; + Set_End_Index( j, btop_j+1, bonds ); + + bo_ij = &( ibond->bo_data ); + bo_ji = &( jbond->bo_data ); + bo_ji->BO = bo_ij->BO = BO; + bo_ji->BO_s = bo_ij->BO_s = BO_s; + bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; + bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + /* Only dln_BOp_xx wrt. dr_i is stored here, note that + dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); + + /* Only dBOp wrt. 
dr_i is stored here, note that + dBOp/dr_i = -dBOp/dr_j and all others are 0 */ + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); + + rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); + rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp + workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + + /*fprintf( stderr, "%d %d %g %g %g\n", + i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/ + + /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", + Cln_BOp_s, twbp->p_bo2, C12 ); + fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", + Cln_BOp_pi, twbp->p_bo4, C34 ); + fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n", + Cln_BOp_pi2, twbp->p_bo6, C56 );*/ + /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2); + fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4); + fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6); + fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", + twbp->r_s, twbp->r_p, twbp->r_pp ); + fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/ + + /*fprintf( stderr, "\tfactors: %g %g %g\n", + -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pp), + -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/ + /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", + bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] ); + fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", + bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], + bo_ij->dln_BOp_pi[2] ); + fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n", + bo_ij->dln_BOp_pi2[0], 
bo_ij->dln_BOp_pi2[1], + bo_ij->dln_BOp_pi2[2] );*/ + + Set_End_Index( j, btop_j+1, bonds ); + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; + ++Htop; + + Set_End_Index( i, btop_i, bonds ); + if( ihb == 1 ) + Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); + //fprintf( stderr, "%d bonds start: %d, end: %d\n", + // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); + } + + // mark the end of j list + H->start[i] = Htop; + /* validate lists - decide if reallocation is required! */ + Validate_Lists( workspace, lists, + data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", - data->step, Htop, num_bonds, num_hbonds ); + fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", + data->step, Htop, num_bonds, num_hbonds ); #endif - } - - - GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, - simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) { - - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop; - int flag; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - int temp; - - Htop = 0; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - indices[i] = Htop; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(atoms[j]); - - //CHANGE ORIGINAL - //if (i < j) continue; - //CHANGE ORIGINAL - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if( nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec)) <= - SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - 
- if( flag ){ - ++Htop; - } - } - - ++Htop; - - // mark the end of j list - indices[i] = Htop; - } - - - - - GLOBAL void Init_Forces( reax_atom *atoms, global_parameters g_params, control_params *control, - single_body_parameters *sbp, two_body_parameters *tbp, - simulation_data *data, simulation_box *box, static_storage workspace, - list far_nbrs, list bonds, list hbonds, - int N, int max_sparse_entries, int num_atom_types ) - { - - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real dr3gamij_1, dr3gamij_3, Tap; - //real val, dif, base; - real C12, C34, C56; - real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - sparse_matrix *H; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - //LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - H = &( workspace.H ); - //CHANGE ORIGINAL - //Htop = 0; - Htop = i * max_sparse_entries; - //CHANGE ORIGINAL - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - //for( i = 0; i < system->N; ++i ) - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - - H->start[i] = Htop; - H->end[i] = Htop; - - btop_i = End_Index( i, &bonds ); - sbp_i = &(sbp[type_i]); - ihb = ihb_top = -1; - - ihb = sbp_i->p_hbond; - - if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) - ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if( nbr_pj->d <= 
control->r_cut) - flag = 1; - else flag = 0; - } - else if (i > j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } else if (i < j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } - - if( flag ){ - - type_j = atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(sbp[type_j]); - twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]); - self_coef = (i == j) ? 0.5 : 1.0; - - /* H matrix entry */ - - //CHANGE ORIGINAL - //if (i > j) { - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; - - ++Htop; - //} - //CHANGE ORIGINAL - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - - if (ihb == 1 && jhb == 2) { - if (i > j) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } else { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - 
++num_hbonds; - } - } else if (ihb == 2 && jhb == 1) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - //TODO - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - - if( BO >= control->bo_cut ) { - //CHANGE ORIGINAL - num_bonds += 1; - //CHANGE ORIGINAL - - /****** bonds i-j and j-i ******/ - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - - if (i > j) - { - ibond = &( bonds.select.bond_list[btop_i] ); - ibond->nbr = j; - ibond->d = r_ij; - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - - //ibond->dbond_index = btop_i; - //ibond->sym_index = btop_j; - ++btop_i; - - bo_ij = &( ibond->bo_data ); - bo_ij->BO = BO; - bo_ij->BO_s = BO_s; - bo_ij->BO_pi = BO_pi; - bo_ij->BO_pi2 = BO_pi2; - - //Auxilary data structures - ibond->scratch = 0; - ibond->CdDelta_ij = 0; - rvec_MakeZero (ibond->f); - - ibond->l = -1; - ibond->CdDelta_jk = 0; - ibond->Cdbo_kl = 0; - rvec_MakeZero (ibond->i_f); - 
rvec_MakeZero (ibond->k_f); - - rvec_MakeZero (ibond->h_f); - - rvec_MakeZero (ibond->t_f); - - // Only dln_BOp_xx wrt. dr_i is stored here, note that - // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - - // Only dBOp wrt. dr_i is stored here, note that - // dBOp/dr_i = -dBOp/dr_j and all others are 0 - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - - rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp - - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - - - } else if ( i < j ) - { - rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; - rvec dBOp; - - btop_j = btop_i; - - jbond = &(bonds.select.bond_list[btop_j]); - jbond->nbr = j; - jbond->d = r_ij; - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - - btop_i ++; - //jbond->dbond_index = btop_i; - //jbond->sym_index = btop_i; - - bo_ji = &( jbond->bo_data ); - bo_ji->BO = BO; - bo_ji->BO_s = BO_s; - bo_ji->BO_pi = BO_pi; - bo_ji->BO_pi2 = BO_pi2; - - //Auxilary data structures - jbond->scratch = 0; - jbond->CdDelta_ij = 0; - rvec_MakeZero (jbond->f); - - jbond->l = -1; - jbond->CdDelta_jk = 0; - jbond->Cdbo_kl = 0; - rvec_MakeZero (jbond->i_f); - rvec_MakeZero (jbond->k_f); - - rvec_MakeZero (jbond->h_f); - - rvec_MakeZero (jbond->t_f); - - // Only dln_BOp_xx wrt. 
dr_i is stored here, note that - // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 - rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi2, - -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); - - rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); - - // Only dBOp wrt. dr_i is stored here, note that - // dBOp/dr_i = -dBOp/dr_j and all others are 0 - rvec_Scale( dBOp, - -(BO_s * Cln_BOp_s + - BO_pi * Cln_BOp_pi + - BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec ); - rvec_Scale( bo_ji->dBOp, -1., dBOp ); - - rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp ); - - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp - - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - - } - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = sbp[type_i].eta; - ++Htop; - - H->end[i] = Htop; - - Set_End_Index( i, btop_i, &bonds ); - if( ihb == 1 || ihb == 2) - Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); - - //fprintf( stderr, "%d bonds start: %d, end: %d\n", - // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); - //} - - // mark the end of j list - //H->start[i] = Htop; - /* validate lists - decide if reallocation is required! 
*/ - //Validate_Lists( workspace, lists, - // data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); + } + + + GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, + simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) { + + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop; + int flag; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + int temp; + + Htop = 0; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + indices[i] = Htop; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(atoms[j]); + + //CHANGE ORIGINAL + //if (i < j) continue; + //CHANGE ORIGINAL + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if( nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec)) <= + SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + + if( flag ){ + ++Htop; + } + } + + ++Htop; + + // mark the end of j list + indices[i] = Htop; + } + + + + + GLOBAL void Init_Forces( reax_atom *atoms, global_parameters g_params, control_params *control, + single_body_parameters *sbp, two_body_parameters *tbp, + simulation_data *data, simulation_box *box, static_storage workspace, + list far_nbrs, list bonds, list hbonds, + int N, int max_sparse_entries, int num_atom_types ) + { + + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real dr3gamij_1, dr3gamij_3, Tap; + //real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix 
*H; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + //LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + H = &( workspace.H ); + //CHANGE ORIGINAL + //Htop = 0; + Htop = i * max_sparse_entries; + //CHANGE ORIGINAL + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + //for( i = 0; i < system->N; ++i ) + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + + H->start[i] = Htop; + H->end[i] = Htop; + + btop_i = End_Index( i, &bonds ); + sbp_i = &(sbp[type_i]); + ihb = ihb_top = -1; + + ihb = sbp_i->p_hbond; + + if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) + ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if( nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if (i > j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } else if (i < j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } + + if( flag ){ + + type_j = atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(sbp[type_j]); + twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]); + self_coef = (i == j) ? 
0.5 : 1.0; + + /* H matrix entry */ + + //CHANGE ORIGINAL + //if (i > j) { + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; + + ++Htop; + //} + //CHANGE ORIGINAL + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + + if (ihb == 1 && jhb == 2) { + if (i > j) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } else { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } else if (ihb == 2 && jhb == 1) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + //TODO + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, 
twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + + if( BO >= control->bo_cut ) { + //CHANGE ORIGINAL + num_bonds += 1; + //CHANGE ORIGINAL + + /****** bonds i-j and j-i ******/ + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + + if (i > j) + { + ibond = &( bonds.select.bond_list[btop_i] ); + ibond->nbr = j; + ibond->d = r_ij; + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + + //ibond->dbond_index = btop_i; + //ibond->sym_index = btop_j; + ++btop_i; + + bo_ij = &( ibond->bo_data ); + bo_ij->BO = BO; + bo_ij->BO_s = BO_s; + bo_ij->BO_pi = BO_pi; + bo_ij->BO_pi2 = BO_pi2; + + //Auxilary data structures + ibond->scratch = 0; + ibond->CdDelta_ij = 0; + rvec_MakeZero (ibond->f); + + ibond->l = -1; + ibond->CdDelta_jk = 0; + ibond->Cdbo_kl = 0; + rvec_MakeZero (ibond->i_f); + rvec_MakeZero (ibond->k_f); + + rvec_MakeZero (ibond->h_f); + + rvec_MakeZero (ibond->t_f); + + // Only dln_BOp_xx wrt. dr_i is stored here, note that + // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + + // Only dBOp wrt. 
dr_i is stored here, note that + // dBOp/dr_i = -dBOp/dr_j and all others are 0 + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + + rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp + + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + + + } else if ( i < j ) + { + rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; + rvec dBOp; + + btop_j = btop_i; + + jbond = &(bonds.select.bond_list[btop_j]); + jbond->nbr = j; + jbond->d = r_ij; + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + + btop_i ++; + //jbond->dbond_index = btop_i; + //jbond->sym_index = btop_i; + + bo_ji = &( jbond->bo_data ); + bo_ji->BO = BO; + bo_ji->BO_s = BO_s; + bo_ji->BO_pi = BO_pi; + bo_ji->BO_pi2 = BO_pi2; + + //Auxilary data structures + jbond->scratch = 0; + jbond->CdDelta_ij = 0; + rvec_MakeZero (jbond->f); + + jbond->l = -1; + jbond->CdDelta_jk = 0; + jbond->Cdbo_kl = 0; + rvec_MakeZero (jbond->i_f); + rvec_MakeZero (jbond->k_f); + + rvec_MakeZero (jbond->h_f); + + rvec_MakeZero (jbond->t_f); + + // Only dln_BOp_xx wrt. dr_i is stored here, note that + // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 + rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi2, + -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); + + rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); + + // Only dBOp wrt. 
dr_i is stored here, note that + // dBOp/dr_i = -dBOp/dr_j and all others are 0 + rvec_Scale( dBOp, + -(BO_s * Cln_BOp_s + + BO_pi * Cln_BOp_pi + + BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec ); + rvec_Scale( bo_ji->dBOp, -1., dBOp ); + + rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp ); + + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp + + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + + } + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = sbp[type_i].eta; + ++Htop; + + H->end[i] = Htop; + + Set_End_Index( i, btop_i, &bonds ); + if( ihb == 1 || ihb == 2) + Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); + + //fprintf( stderr, "%d bonds start: %d, end: %d\n", + // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); + //} + + // mark the end of j list + //H->start[i] = Htop; + /* validate lists - decide if reallocation is required! */ + //Validate_Lists( workspace, lists, + // data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); } -GLOBAL void Init_Forces_Tab ( reax_atom *atoms, global_parameters g_params, control_params *control, - single_body_parameters *sbp, two_body_parameters *tbp, - simulation_data *data, simulation_box *box, static_storage workspace, - list far_nbrs, list bonds, list hbonds, - int N, int max_sparse_entries, int num_atom_types, - LR_lookup_table *d_LR) +GLOBAL void Init_Forces_Tab ( reax_atom *atoms, global_parameters g_params, control_params *control, + single_body_parameters *sbp, two_body_parameters *tbp, + simulation_data *data, simulation_box *box, static_storage workspace, + list far_nbrs, list bonds, list hbonds, + int N, int max_sparse_entries, int num_atom_types, + LR_lookup_table *d_LR) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int tmin, tmax, r; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real val, dif, 
base; - real C12, C34, C56; - real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - sparse_matrix *H; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - H = &(workspace.H); - //CHANGE ORIGINAL - Htop = i * max_sparse_entries; - //CHANGE ORIGINAL - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - //for( i = 0; i < system->N; ++i ) - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - H->start[i] = Htop; - H->end[i] = Htop; - btop_i = End_Index( i, &bonds ); - sbp_i = &(sbp[type_i]); - ihb = ihb_top = -1; - - ihb = sbp_i->p_hbond; - - if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) - ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if(nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if (i > j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } - else if ( i < j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } - - if( flag ){ - type_j = atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(sbp[type_j]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]); - - /* cubic spline interpolation */ - //CHANGE ORIGINAL - //if (i > j) { - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - val *= EV_to_KCALpMOL / C_ele; - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * val; - //H->j [Htop] = j; - //H->val [Htop] = self_coef * val; - ++Htop; - //} - //CHANGE ORIGINAL - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - - if ( ihb == 1 && jhb == 2 ) { - if (i > j) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } else { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } else if (ihb == 2 && jhb == 1) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * 
POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - - //CHANGE ORIGINAL - num_bonds += 1; - //CHANGE ORIGINAL - - /****** bonds i-j and j-i ******/ - if ( i > j ) - { - ibond = &( bonds.select.bond_list[btop_i] ); - ibond->nbr = j; - ibond->d = r_ij; - - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - - //ibond->dbond_index = btop_i; - //ibond->sym_index = btop_j; - - ++btop_i; - - bo_ij = &( ibond->bo_data ); - bo_ij->BO = BO; - bo_ij->BO_s = BO_s; - bo_ij->BO_pi = BO_pi; - bo_ij->BO_pi2 = BO_pi2; - - //Auxilary data strucutres to resolve dependencies - ibond->scratch = 0; - ibond->CdDelta_ij = 0; - rvec_MakeZero (ibond->f); - - ibond->l = -1; - ibond->CdDelta_jk = 0; - ibond->Cdbo_kl = 0; - rvec_MakeZero (ibond->i_f); - rvec_MakeZero (ibond->k_f); - - rvec_MakeZero (ibond->h_f); - - rvec_MakeZero (ibond->t_f); - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - /* Only dln_BOp_xx wrt. dr_i is stored here, note that - dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - - /* Only dBOp wrt. 
dr_i is stored here, note that - dBOp/dr_i = -dBOp/dr_j and all others are 0 */ - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - - rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - - workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp - - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - } - else { - rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; - rvec dBOp; + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int tmin, tmax, r; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + H = &(workspace.H); + //CHANGE ORIGINAL + Htop = i * max_sparse_entries; + //CHANGE ORIGINAL + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + //for( i = 0; i < system->N; ++i ) + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + H->start[i] = Htop; + H->end[i] = Htop; + btop_i = End_Index( i, &bonds ); + sbp_i = &(sbp[type_i]); + ihb = ihb_top = -1; + + ihb = sbp_i->p_hbond; + + if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) + ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(atoms[j]); + + flag = 0; + 
if((data->step-data->prev_steps) % control->reneighbor == 0) { + if(nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if (i > j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } + else if ( i < j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } + + if( flag ){ + type_j = atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(sbp[type_j]); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]); + + /* cubic spline interpolation */ + //CHANGE ORIGINAL + //if (i > j) { + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + val *= EV_to_KCALpMOL / C_ele; + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * val; + //H->j [Htop] = j; + //H->val [Htop] = self_coef * val; + ++Htop; + //} + //CHANGE ORIGINAL + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + + if ( ihb == 1 && jhb == 2 ) { + if (i > j) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } else { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero 
(hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } else if (ihb == 2 && jhb == 1) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + + //CHANGE ORIGINAL + num_bonds += 1; + //CHANGE ORIGINAL + + /****** bonds i-j and j-i ******/ + if ( i > j ) + { + ibond = &( bonds.select.bond_list[btop_i] ); + ibond->nbr = j; + ibond->d = r_ij; + + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + + //ibond->dbond_index = btop_i; + //ibond->sym_index = btop_j; + + ++btop_i; + + bo_ij = &( ibond->bo_data ); + bo_ij->BO = BO; + bo_ij->BO_s = BO_s; + bo_ij->BO_pi = BO_pi; + bo_ij->BO_pi2 = BO_pi2; + + //Auxilary data strucutres to resolve dependencies + ibond->scratch = 0; + ibond->CdDelta_ij = 0; + rvec_MakeZero (ibond->f); + + ibond->l = -1; + ibond->CdDelta_jk = 0; + ibond->Cdbo_kl = 0; + rvec_MakeZero (ibond->i_f); + rvec_MakeZero (ibond->k_f); + + 
rvec_MakeZero (ibond->h_f); + + rvec_MakeZero (ibond->t_f); + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + /* Only dln_BOp_xx wrt. dr_i is stored here, note that + dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + + /* Only dBOp wrt. dr_i is stored here, note that + dBOp/dr_i = -dBOp/dr_j and all others are 0 */ + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + + rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + + workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp + + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + } + else { + rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; + rvec dBOp; - btop_j = btop_i; + btop_j = btop_i; - jbond = &( bonds.select.bond_list[btop_j] ); - jbond->nbr = j; - jbond->d = r_ij; + jbond = &( bonds.select.bond_list[btop_j] ); + jbond->nbr = j; + jbond->d = r_ij; - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - //jbond->dbond_index = btop_i; - //jbond->sym_index = btop_i; + //jbond->dbond_index = btop_i; + //jbond->sym_index = btop_i; - ++btop_i; + ++btop_i; - bo_ji = &( jbond->bo_data ); + bo_ji = &( jbond->bo_data ); - bo_ji->BO = BO; - bo_ji->BO_s = BO_s; - bo_ji->BO_pi = BO_pi; - bo_ji->BO_pi2 = BO_pi2; + bo_ji->BO = BO; + bo_ji->BO_s = BO_s; + bo_ji->BO_pi = BO_pi; + bo_ji->BO_pi2 = BO_pi2; - // Auxilary data structures to resolve dependencies - 
jbond->scratch = 0; - jbond->CdDelta_ij = 0; - rvec_MakeZero (jbond->f); + // Auxilary data structures to resolve dependencies + jbond->scratch = 0; + jbond->CdDelta_ij = 0; + rvec_MakeZero (jbond->f); - jbond->l = -1; - jbond->CdDelta_jk = 0; - jbond->Cdbo_kl = 0; - rvec_MakeZero (jbond->i_f); - rvec_MakeZero (jbond->k_f); + jbond->l = -1; + jbond->CdDelta_jk = 0; + jbond->Cdbo_kl = 0; + rvec_MakeZero (jbond->i_f); + rvec_MakeZero (jbond->k_f); - rvec_MakeZero (jbond->h_f); + rvec_MakeZero (jbond->h_f); - rvec_MakeZero (jbond->t_f); + rvec_MakeZero (jbond->t_f); - // Bond Order page2-3, derivative of total bond order prime - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + // Bond Order page2-3, derivative of total bond order prime + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - // Only dln_BOp_xx wrt. dr_i is stored here, note that - // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 + // Only dln_BOp_xx wrt. dr_i is stored here, note that + // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 - rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); + rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); - rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); + rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); - // Only dBOp wrt. 
dr_i is stored here, note that - // dBOp/dr_i = -dBOp/dr_j and all others are 0 - //CHANGE ORIGINAL - rvec_Scale( dBOp, - -(BO_s * Cln_BOp_s + - BO_pi * Cln_BOp_pi + - BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec); - rvec_Scale( bo_ji->dBOp, -1., dBOp); - //CHANGE ORIGINAL + // Only dBOp wrt. dr_i is stored here, note that + // dBOp/dr_i = -dBOp/dr_j and all others are 0 + //CHANGE ORIGINAL + rvec_Scale( dBOp, + -(BO_s * Cln_BOp_s + + BO_pi * Cln_BOp_pi + + BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec); + rvec_Scale( bo_ji->dBOp, -1., dBOp); + //CHANGE ORIGINAL - rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp ); + rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp ); - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; - workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp + workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - } - } - } - } - } + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + } + } + } + } + } - H->entries[Htop].j = i; - H->entries[Htop].val = sbp[type_i].eta; + H->entries[Htop].j = i; + H->entries[Htop].val = sbp[type_i].eta; - //H->j [Htop] = i; - //H->val [Htop] = sbp[type_i].eta; + //H->j [Htop] = i; + //H->val [Htop] = sbp[type_i].eta; - ++Htop; + ++Htop; - H->end[i] = Htop; - Set_End_Index( i, btop_i, &bonds ); - if( ihb == 1 || ihb == 2) - Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); + H->end[i] = Htop; + Set_End_Index( i, btop_i, &bonds ); + if( ihb == 1 || ihb == 2) + Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); } GLOBAL void fix_sym_dbond_indices (list pbonds, int N) { - int i, nbr; - bond_data *ibond, *jbond; - int atom_j; - - list *bonds = &pbonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) - { - ibond = &( bonds->select.bond_list [j] ); - nbr = ibond->nbr; - - 
for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++) - { - jbond = &( bonds->select.bond_list[ k ] ); - atom_j = jbond->nbr; - - if ( (atom_j == i) ) - { - if (i > nbr) { - ibond->dbond_index = j; - jbond->dbond_index = j; - - ibond->sym_index = k; - jbond->sym_index = j; - } - } - } - } + int i, nbr; + bond_data *ibond, *jbond; + int atom_j; + + list *bonds = &pbonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) + { + ibond = &( bonds->select.bond_list [j] ); + nbr = ibond->nbr; + + for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++) + { + jbond = &( bonds->select.bond_list[ k ] ); + atom_j = jbond->nbr; + + if ( (atom_j == i) ) + { + if (i > nbr) { + ibond->dbond_index = j; + jbond->dbond_index = j; + + ibond->sym_index = k; + jbond->sym_index = j; + } + } + } + } } GLOBAL void fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N) { - static_storage *workspace = &p_workspace; - hbond_data *ihbond, *jhbond; - int nbr; - - //int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4; - int i = (blockIdx.x); - int start = Start_Index (workspace->hbond_index[i], &hbonds); - int end = End_Index (workspace->hbond_index[i], &hbonds); - //int j = start + threadIdx.x; - //int j = start + (threadIdx.x % 16); - - //for (int j = Start_Index (workspace->hbond_index[i], &hbonds); - // j < End_Index (workspace->hbond_index[i], &hbonds); j++) - int j = start + threadIdx.x; - while (j < end) - //for (int j = start; j < end; j++) - { - ihbond = &( hbonds.select.hbond_list [j] ); - nbr = ihbond->nbr; - - int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); - int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); - - for (int k = nbrstart; k < nbrend; k++) - //k = nbrstart + threadIdx.x; - //while (k < nbrend) - { - jhbond = &( hbonds.select.hbond_list [k] ); - - if (jhbond->nbr == i){ - ihbond->sym_index = k; - 
jhbond->sym_index = j; - break; - } - - //k += blockDim.x; - } - - j += 32; - } + static_storage *workspace = &p_workspace; + hbond_data *ihbond, *jhbond; + int nbr; + + //int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4; + int i = (blockIdx.x); + int start = Start_Index (workspace->hbond_index[i], &hbonds); + int end = End_Index (workspace->hbond_index[i], &hbonds); + //int j = start + threadIdx.x; + //int j = start + (threadIdx.x % 16); + + //for (int j = Start_Index (workspace->hbond_index[i], &hbonds); + // j < End_Index (workspace->hbond_index[i], &hbonds); j++) + int j = start + threadIdx.x; + while (j < end) + //for (int j = start; j < end; j++) + { + ihbond = &( hbonds.select.hbond_list [j] ); + nbr = ihbond->nbr; + + int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); + int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); + + for (int k = nbrstart; k < nbrend; k++) + //k = nbrstart + threadIdx.x; + //while (k < nbrend) + { + jhbond = &( hbonds.select.hbond_list [k] ); + + if (jhbond->nbr == i){ + ihbond->sym_index = k; + jhbond->sym_index = j; + break; + } + + //k += blockDim.x; + } + + j += 32; + } } GLOBAL void New_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N ) { - static_storage *workspace = &p_workspace; - hbond_data *ihbond, *jhbond; - - int __THREADS_PER_ATOM__ = HBONDS_SYM_THREADS_PER_ATOM; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id >= N) return; - - int i = warp_id; - int nbr; - int k; - int start = Start_Index (workspace->hbond_index[i], &hbonds); - int end = End_Index (workspace->hbond_index[i], &hbonds); - int j = start + lane_id; - //for (int j = start; j < end; j++) - while (j < end) - { - ihbond = &( hbonds.select.hbond_list [j] ); - nbr = ihbond->nbr; - - int nbrstart = Start_Index 
(workspace->hbond_index[nbr], &hbonds); - int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); - - //k = nbrstart + lane_id; - //if (lane_id == 0) found [my_bucket] = 0; - //while (k < nbrend) - for (k = nbrstart; k < nbrend; k++) - { - jhbond = &( hbonds.select.hbond_list [k] ); - - if (jhbond->nbr == i){ - ihbond->sym_index = k; - jhbond->sym_index = j; - break; - } - } - - j += __THREADS_PER_ATOM__; - } + static_storage *workspace = &p_workspace; + hbond_data *ihbond, *jhbond; + + int __THREADS_PER_ATOM__ = HBONDS_SYM_THREADS_PER_ATOM; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N) return; + + int i = warp_id; + int nbr; + int k; + int start = Start_Index (workspace->hbond_index[i], &hbonds); + int end = End_Index (workspace->hbond_index[i], &hbonds); + int j = start + lane_id; + //for (int j = start; j < end; j++) + while (j < end) + { + ihbond = &( hbonds.select.hbond_list [j] ); + nbr = ihbond->nbr; + + int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); + int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); + + //k = nbrstart + lane_id; + //if (lane_id == 0) found [my_bucket] = 0; + //while (k < nbrend) + for (k = nbrstart; k < nbrend; k++) + { + jhbond = &( hbonds.select.hbond_list [k] ); + + if (jhbond->nbr == i){ + ihbond->sym_index = k; + jhbond->sym_index = j; + break; + } + } + + j += __THREADS_PER_ATOM__; + } } void Init_Forces_Tab( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int tmin, tmax, r; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real val, dif, base; - real C12, C34, C56; - 
real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - sparse_matrix *H; - list *far_nbrs, *bonds, *hbonds; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - far_nbrs = *lists + FAR_NBRS; - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - H = &workspace->H; - Htop = 0; - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = system->reaxprm.gp.l[0]; - p_boc2 = system->reaxprm.gp.l[1]; - - for( i = 0; i < system->N; ++i ) { - atom_i = &(system->atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - H->start[i] = Htop; - btop_i = End_Index( i, bonds ); - sbp_i = &(system->reaxprm.sbp[type_i]); - ihb = ihb_top = -1; - if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) - ihb_top = End_Index( workspace->hbond_index[i], hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(system->atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if(nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), - nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - - if( flag ){ - type_j = system->atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(system->reaxprm.sbp[type_j]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); - - /* cubic spline interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - val *= EV_to_KCALpMOL / C_ele; - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * val; - ++Htop; - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) { - hbonds->select.hbond_list[ihb_top].nbr = j; - hbonds->select.hbond_list[ihb_top].scl = 1; - hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; - ++ihb_top; - ++num_hbonds; - } - else if( ihb == 2 && jhb == 1 ) { - jhb_top = End_Index( workspace->hbond_index[j], hbonds ); - hbonds->select.hbond_list[jhb_top].nbr = i; - hbonds->select.hbond_list[jhb_top].scl = -1; - hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; - Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - num_bonds += 2; - /****** bonds 
i-j and j-i ******/ - ibond = &( bonds->select.bond_list[btop_i] ); - btop_j = End_Index( j, bonds ); - jbond = &(bonds->select.bond_list[btop_j]); - - ibond->nbr = j; - jbond->nbr = i; - ibond->d = r_ij; - jbond->d = r_ij; - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - ibond->dbond_index = btop_i; - jbond->dbond_index = btop_i; - ibond->sym_index = btop_j; - jbond->sym_index = btop_i; - ++btop_i; - Set_End_Index( j, btop_j+1, bonds ); - - bo_ij = &( ibond->bo_data ); - bo_ji = &( jbond->bo_data ); - bo_ji->BO = bo_ij->BO = BO; - bo_ji->BO_s = bo_ij->BO_s = BO_s; - bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; - bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - /* Only dln_BOp_xx wrt. dr_i is stored here, note that - dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); - - /* Only dBOp wrt. 
dr_i is stored here, note that - dBOp/dr_i = -dBOp/dr_j and all others are 0 */ - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); - - rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); - rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp - workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - - Set_End_Index( j, btop_j+1, bonds ); - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; - ++Htop; - - Set_End_Index( i, btop_i, bonds ); - if( ihb == 1 ) - Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); - } - - // mark the end of j list - H->start[i] = Htop; - /* validate lists - decide if reallocation is required! 
*/ - Validate_Lists( workspace, lists, - data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int tmin, tmax, r; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + list *far_nbrs, *bonds, *hbonds; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + far_nbrs = *lists + FAR_NBRS; + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + H = &workspace->H; + Htop = 0; + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + for( i = 0; i < system->N; ++i ) { + atom_i = &(system->atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + H->start[i] = Htop; + btop_i = End_Index( i, bonds ); + sbp_i = &(system->reaxprm.sbp[type_i]); + ihb = ihb_top = -1; + if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) + ihb_top = End_Index( workspace->hbond_index[i], hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(system->atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if(nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), + nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + + if( flag ){ + type_j = system->atoms[j].type; + r_ij = 
nbr_pj->d; + sbp_j = &(system->reaxprm.sbp[type_j]); + twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); + + /* cubic spline interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + val *= EV_to_KCALpMOL / C_ele; + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * val; + ++Htop; + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) { + hbonds->select.hbond_list[ihb_top].nbr = j; + hbonds->select.hbond_list[ihb_top].scl = 1; + hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; + ++ihb_top; + ++num_hbonds; + } + else if( ihb == 2 && jhb == 1 ) { + jhb_top = End_Index( workspace->hbond_index[j], hbonds ); + hbonds->select.hbond_list[jhb_top].nbr = i; + hbonds->select.hbond_list[jhb_top].scl = -1; + hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; + Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + 
/* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + num_bonds += 2; + /****** bonds i-j and j-i ******/ + ibond = &( bonds->select.bond_list[btop_i] ); + btop_j = End_Index( j, bonds ); + jbond = &(bonds->select.bond_list[btop_j]); + + ibond->nbr = j; + jbond->nbr = i; + ibond->d = r_ij; + jbond->d = r_ij; + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + ibond->dbond_index = btop_i; + jbond->dbond_index = btop_i; + ibond->sym_index = btop_j; + jbond->sym_index = btop_i; + ++btop_i; + Set_End_Index( j, btop_j+1, bonds ); + + bo_ij = &( ibond->bo_data ); + bo_ji = &( jbond->bo_data ); + bo_ji->BO = bo_ij->BO = BO; + bo_ji->BO_s = bo_ij->BO_s = BO_s; + bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; + bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + /* Only dln_BOp_xx wrt. dr_i is stored here, note that + dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); + + /* Only dBOp wrt. 
dr_i is stored here, note that + dBOp/dr_i = -dBOp/dr_j and all others are 0 */ + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); + + rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); + rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp + workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + + Set_End_Index( j, btop_j+1, bonds ); + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; + ++Htop; + + Set_End_Index( i, btop_i, bonds ); + if( ihb == 1 ) + Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); + } + + // mark the end of j list + H->start[i] = Htop; + /* validate lists - decide if reallocation is required! 
*/ + Validate_Lists( workspace, lists, + data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", - data->step, Htop, num_bonds, num_hbonds ); - //Print_Bonds( system, bonds, "sbonds.out" ); - //Print_Bond_List2( system, bonds, "sbonds.out" ); - //Print_Sparse_Matrix2( H, "H.out" ); + fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", + data->step, Htop, num_bonds, num_hbonds ); + //Print_Bonds( system, bonds, "sbonds.out" ); + //Print_Bond_List2( system, bonds, "sbonds.out" ); + //Print_Sparse_Matrix2( H, "H.out" ); #endif } void Estimate_Storage_Sizes( reax_system *system, control_params *control, - list **lists, int *Htop, int *hb_top, - int *bond_top, int *num_3body ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int ihb, jhb; - real r_ij, r2; - real C12, C34, C56; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - list *far_nbrs; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - far_nbrs = *lists + FAR_NBRS; - p_boc1 = system->reaxprm.gp.l[0]; - p_boc2 = system->reaxprm.gp.l[1]; - - for( i = 0; i < system->N; ++i ) { - atom_i = &(system->atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - sbp_i = &(system->reaxprm.sbp[type_i]); - ihb = sbp_i->p_hbond; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(system->atoms[j]); - type_j = atom_j->type; - sbp_j = &(system->reaxprm.sbp[type_j]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - - if( nbr_pj->d <= control->r_cut ) { - ++(*Htop); - - /* hydrogen bond lists */ - if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) - ++hb_top[i]; - 
else if( ihb == 2 && jhb == 1 ) - ++hb_top[j]; - } - - /* uncorrected bond orders */ - if( nbr_pj->d <= control->nbr_cut ) { - r_ij = nbr_pj->d; - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - ++bond_top[i]; - ++bond_top[j]; - } - } - } - } - } - - *Htop += system->N; - *Htop *= SAFE_ZONE; - - for( i = 0; i < system->N; ++i ) { - hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); - *num_3body += SQR(bond_top[i]); - bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); - } - *num_3body *= SAFE_ZONE; + list **lists, int *Htop, int *hb_top, + int *bond_top, int *num_3body ) { + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int ihb, jhb; + real r_ij, r2; + real C12, C34, C56; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + list *far_nbrs; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + far_nbrs = *lists + FAR_NBRS; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + for( i = 0; i < system->N; ++i ) { + atom_i = &(system->atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + sbp_i = &(system->reaxprm.sbp[type_i]); + ihb = sbp_i->p_hbond; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(system->atoms[j]); + 
type_j = atom_j->type; + sbp_j = &(system->reaxprm.sbp[type_j]); + twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); + + if( nbr_pj->d <= control->r_cut ) { + ++(*Htop); + + /* hydrogen bond lists */ + if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) + ++hb_top[i]; + else if( ihb == 2 && jhb == 1 ) + ++hb_top[j]; + } + + /* uncorrected bond orders */ + if( nbr_pj->d <= control->nbr_cut ) { + r_ij = nbr_pj->d; + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + ++bond_top[i]; + ++bond_top[j]; + } + } + } + } + } + + *Htop += system->N; + *Htop *= SAFE_ZONE; + + for( i = 0; i < system->N; ++i ) { + hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); + *num_3body += SQR(bond_top[i]); + bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); + } + *num_3body *= SAFE_ZONE; } void Cuda_Estimate_Storage_Sizes (reax_system *system, control_params *control, int *output) { - int *Htop, *num_3body, input_size; - int *hb_top, *bond_top; - int *input = (int *) scratch; - int max_3body = 0; + int *Htop, *num_3body, input_size; + int *hb_top, *bond_top; + int *input = (int *) scratch; + int max_3body = 0; - Htop = 0; - num_3body = 0; - input_size = INT_SIZE * (2 * system->N + 1 + 1); + Htop = 0; + num_3body = 0; + input_size = INT_SIZE * (2 * system->N + 1 + 1); - 
//cuda_malloc ((void **) &input, input_size, 1, __LINE__); - cuda_memset (input, 0, input_size, RES_SCRATCH ); + //cuda_malloc ((void **) &input, input_size, 1, __LINE__); + cuda_memset (input, 0, input_size, RES_SCRATCH ); - Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>> - (system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, - system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), - system->reaxprm.num_atom_types, input); - cudaThreadSynchronize (); - cudaCheckError (); + Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>> + (system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, + system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), + system->reaxprm.num_atom_types, input); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ ); - Htop = &output[0]; - num_3body = &output[1]; - hb_top = &output[ 2 ]; - bond_top = &output[ 2 + system->N ]; + Htop = &output[0]; + num_3body = &output[1]; + hb_top = &output[ 2 ]; + bond_top = &output[ 2 + system->N ]; - *Htop += system->N; - *Htop *= SAFE_ZONE; + *Htop += system->N; + *Htop *= SAFE_ZONE; - for( int i = 0; i < system->N; ++i ) { - hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); + for( int i = 0; i < system->N; ++i ) { + hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); - if (max_3body <= SQR (bond_top[i])) - max_3body = SQR (bond_top[i]); + if (max_3body <= SQR (bond_top[i])) + max_3body = SQR (bond_top[i]); - *num_3body += SQR(bond_top[i]); - bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); - } + *num_3body += SQR(bond_top[i]); + bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); + } - *num_3body = max_3body * SAFE_ZONE; + *num_3body = max_3body * SAFE_ZONE; } -GLOBAL void Estimate_Storage_Sizes (reax_atom *atoms, - int N, - 
single_body_parameters *sbp, - two_body_parameters *tbp, - global_parameters gp, - control_params *control, - list far_nbrs, - int num_atom_types, int *results) +GLOBAL void Estimate_Storage_Sizes (reax_atom *atoms, + int N, + single_body_parameters *sbp, + two_body_parameters *tbp, + global_parameters gp, + control_params *control, + list far_nbrs, + int num_atom_types, int *results) { - int *Htop = &results[0]; - int *num_3body = &results[1]; - int *hb_top = &results [ 2 ]; - int *bond_top = &results [ 2 + N ]; - - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int ihb, jhb; - real r_ij, r2; - real C12, C34, C56; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - p_boc1 = gp.l[0]; - p_boc2 = gp.l[1]; - - //for( i = 0; i < N; ++i ) { - i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= N ) return ; - - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - sbp_i = &(sbp[type_i]); - ihb = sbp_i->p_hbond; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &( atoms[j] ); - type_j = atom_j->type; - sbp_j = &( sbp[type_j] ); - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); - - - if( nbr_pj->d <= control->r_cut ) { - //++(*Htop); - atomicAdd (Htop, 1); - - /* hydrogen bond lists */ - //TODO - CHANGE ORIGINAL - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) - //++hb_top[i]; - atomicAdd (&hb_top[i], 1); - else if( ihb == 2 && jhb == 1 ) - //++hb_top[j]; - //atomicAdd (&hb_top[j], 1); - atomicAdd (&hb_top[i], 1); - } - //TODO -- CHANGE ORIGINAL - - //CHANGE ORIGINAL - if (i < j) continue; - //CHANGE ORIGINAL - - - /* uncorrected bond orders */ - if( nbr_pj->d <= control->nbr_cut ) 
{ - r_ij = nbr_pj->d; - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - //++bond_top[i]; - //++bond_top[j]; - atomicAdd (&bond_top[i], 1); - atomicAdd (&bond_top[j], 1); - } - } - } - } - //} + int *Htop = &results[0]; + int *num_3body = &results[1]; + int *hb_top = &results [ 2 ]; + int *bond_top = &results [ 2 + N ]; + + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int ihb, jhb; + real r_ij, r2; + real C12, C34, C56; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + p_boc1 = gp.l[0]; + p_boc2 = gp.l[1]; + + //for( i = 0; i < N; ++i ) { + i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= N ) return ; + + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + sbp_i = &(sbp[type_i]); + ihb = sbp_i->p_hbond; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &( atoms[j] ); + type_j = atom_j->type; + sbp_j = &( sbp[type_j] ); + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); + + + if( nbr_pj->d <= control->r_cut ) { + //++(*Htop); + atomicAdd (Htop, 1); + + /* hydrogen bond lists */ + //TODO - CHANGE ORIGINAL + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+ nbr_pj->d <= control->hb_cut ) { + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) + //++hb_top[i]; + atomicAdd (&hb_top[i], 1); + else if( ihb == 2 && jhb == 1 ) + //++hb_top[j]; + //atomicAdd (&hb_top[j], 1); + atomicAdd (&hb_top[i], 1); + } + //TODO -- CHANGE ORIGINAL + + //CHANGE ORIGINAL + if (i < j) continue; + //CHANGE ORIGINAL + + + /* uncorrected bond orders */ + if( nbr_pj->d <= control->nbr_cut ) { + r_ij = nbr_pj->d; + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + //++bond_top[i]; + //++bond_top[j]; + atomicAdd (&bond_top[i], 1); + atomicAdd (&bond_top[j], 1); + } + } + } + } + //} } void Cuda_Compute_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list** lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list** lists, output_controls *out_control ) { - real t_start, t_elapsed; - real t_1, t_2; - int *indices; - int *Htop; - int max_sparse_entries = 0; - list *far_nbrs = dev_lists + FAR_NBRS; - int hblocks; - - t_start = Get_Time (); - if ( !control->tabulate ) { - Init_Forces <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, - *(dev_lists + FAR_NBRS), 
*(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); - cudaThreadSynchronize (); - cudaCheckError (); - } - else - { - Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, - *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, - d_LR ); - cudaThreadSynchronize (); - cudaCheckError (); - } - - /*This is for bonds processing to fix dbond and sym_indexes */ - t_1 = Get_Time (); - fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - t_2 = Get_Timing_Info ( t_1 ); - - //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds. - if ((control->hb_cut > 0) && (dev_workspace->num_H > 0)) - { - - hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + - ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 
0 : 1); - t_1 = Get_Time (); - /* - int bs = system->N; - int ss = 32; - fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); - */ - New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - } - t_2 = Get_Timing_Info ( t_1 ); - - t_elapsed = Get_Timing_Info (t_start); - d_timing.init_forces+= t_elapsed; - - Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N, - system->num_bonds, system->num_hbonds ); + real t_start, t_elapsed; + real t_1, t_2; + int *indices; + int *Htop; + int max_sparse_entries = 0; + list *far_nbrs = dev_lists + FAR_NBRS; + int hblocks; + + t_start = Get_Time (); + if ( !control->tabulate ) { + Init_Forces <<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, + *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); + cudaThreadSynchronize (); + cudaCheckError (); + } + else + { + Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, + *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, + d_LR ); + cudaThreadSynchronize (); + cudaCheckError (); + } + + /*This is for bonds processing to fix dbond and sym_indexes */ + t_1 = Get_Time (); + fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + t_2 = Get_Timing_Info ( 
t_1 ); + + //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds. + if ((control->hb_cut > 0) && (dev_workspace->num_H > 0)) + { + + hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + + ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1); + t_1 = Get_Time (); + /* + int bs = system->N; + int ss = 32; + fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); + */ + New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + } + t_2 = Get_Timing_Info ( t_1 ); + + t_elapsed = Get_Timing_Info (t_start); + d_timing.init_forces+= t_elapsed; + + Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N, + system->num_bonds, system->num_hbonds ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done with Cuda List Validation \n"); + fprintf (stderr, "Done with Cuda List Validation \n"); #endif - //Bonded Force Calculations here. - t_start = Get_Time (); - Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info (t_start); - d_timing.bonded += t_elapsed; - - //Compute the Non Bonded Forces here. 
- t_start = Get_Time (); - Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info (t_start); - d_timing.nonb += t_elapsed; - - //Compute Total Forces here - Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, - *(dev_lists + BONDS), control->ensemble, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, - *(dev_lists + BONDS), control->ensemble, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + //Bonded Force Calculations here. + t_start = Get_Time (); + Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); + t_elapsed = Get_Timing_Info (t_start); + d_timing.bonded += t_elapsed; + + //Compute the Non Bonded Forces here. + t_start = Get_Time (); + Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control ); + t_elapsed = Get_Timing_Info (t_start); + d_timing.nonb += t_elapsed; + + //Compute Total Forces here + Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, + *(dev_lists + BONDS), control->ensemble, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, + *(dev_lists + BONDS), control->ensemble, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } void Compute_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list** lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list** lists, output_controls *out_control ) { - real t_start, t_elapsed; + real t_start, t_elapsed; - 
t_start = Get_Time( ); - if( !control->tabulate ) - Init_Forces( system, control, data, workspace, lists, out_control ); - else Init_Forces_Tab( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.init_forces += t_elapsed; + t_start = Get_Time( ); + if( !control->tabulate ) + Init_Forces( system, control, data, workspace, lists, out_control ); + else Init_Forces_Tab( system, control, data, workspace, lists, out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.init_forces += t_elapsed; #if defined(DEBUG_FOCUS) - print_sparse_matrix (system, workspace); - fprintf( stderr, "init_forces - "); + print_sparse_matrix (system, workspace); + fprintf( stderr, "init_forces - "); #endif - //analyze_hbonds (system, workspace, lists); + //analyze_hbonds (system, workspace, lists); - t_start = Get_Time( ); - Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.bonded += t_elapsed; + t_start = Get_Time( ); + Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.bonded += t_elapsed; - //print_bond_list (system, workspace, lists); - //exit (0); + //print_bond_list (system, workspace, lists); + //exit (0); #if defined(DEBUG_FOCUS) - fprintf( stderr, "bonded_forces - "); + fprintf( stderr, "bonded_forces - "); #endif - t_start = Get_Time( ); - Compute_NonBonded_Forces( system, control, data, workspace, - lists, out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.nonb += t_elapsed; + t_start = Get_Time( ); + Compute_NonBonded_Forces( system, control, data, workspace, + lists, out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.nonb += t_elapsed; #ifdef __DEBUG_CUDA__ - fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed); + fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed); #endif #if 
defined(DEBUG_FOCUS) - fprintf( stderr, "nonbondeds - "); + fprintf( stderr, "nonbondeds - "); #endif - Compute_Total_Force( system, control, data, workspace, lists ); - //Print_Total_Force( system, control, data, workspace, lists, out_control ); + Compute_Total_Force( system, control, data, workspace, lists ); + //Print_Total_Force( system, control, data, workspace, lists, out_control ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "totalforces - "); - //Print_Total_Force( system, control, data, workspace, lists, out_control ); + fprintf( stderr, "totalforces - "); + //Print_Total_Force( system, control, data, workspace, lists, out_control ); #endif #ifdef TEST_FORCES - Print_Total_Force( system, control, data, workspace, lists, out_control ); - Compare_Total_Forces( system, control, data, workspace, lists, out_control ); + Print_Total_Force( system, control, data, workspace, lists, out_control ); + Compare_Total_Forces( system, control, data, workspace, lists, out_control ); #endif #if defined(DEBUG_FOCUS) - fprintf( stderr, "forces - "); + fprintf( stderr, "forces - "); #endif } bool validate_device (reax_system *system, simulation_data *data, static_storage *workspace, list **lists ) { - bool retval = false; + bool retval = false; #ifdef __BUILD_DEBUG__ - retval |= validate_neighbors (system, lists); - retval |= validate_sym_dbond_indices (system, workspace, lists); - retval |= validate_bonds (system, workspace, lists); - retval |= validate_sparse_matrix (system, workspace); - retval |= validate_three_bodies (system, workspace, lists ); - retval |= validate_hbonds (system, workspace, lists); - retval |= validate_workspace (system, workspace, lists); - retval |= validate_data (system, data); - retval |= validate_atoms (system, lists); - //analyze_hbonds (system, workspace, lists); - - if (!retval) { - fprintf (stderr, "Results *DOES NOT* mattch between device and host \n"); - } + retval |= validate_neighbors (system, lists); + retval |= validate_sym_dbond_indices 
(system, workspace, lists); + retval |= validate_bonds (system, workspace, lists); + retval |= validate_sparse_matrix (system, workspace); + retval |= validate_three_bodies (system, workspace, lists ); + retval |= validate_hbonds (system, workspace, lists); + retval |= validate_workspace (system, workspace, lists); + retval |= validate_data (system, data); + retval |= validate_atoms (system, lists); + //analyze_hbonds (system, workspace, lists); + + if (!retval) { + fprintf (stderr, "Results *DOES NOT* mattch between device and host \n"); + } #endif - return retval; + return retval; } diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/four_body_interactions.cu index da72bff7..d7bf757e 100644 --- a/PuReMD-GPU/src/four_body_interactions.cu +++ b/PuReMD-GPU/src/four_body_interactions.cu @@ -32,116 +32,116 @@ #define MIN_SINE 1e-10 HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk, - rvec dvec_kl, real r_kl, rvec dvec_li, real r_li, - three_body_interaction_data *p_ijk, - three_body_interaction_data *p_jkl, - rvec dcos_omega_di, rvec dcos_omega_dj, - rvec dcos_omega_dk, rvec dcos_omega_dl, - output_controls *out_control ) + rvec dvec_kl, real r_kl, rvec dvec_li, real r_li, + three_body_interaction_data *p_ijk, + three_body_interaction_data *p_jkl, + rvec dcos_omega_di, rvec dcos_omega_dj, + rvec dcos_omega_dk, rvec dcos_omega_dl, + output_controls *out_control ) { - real unnorm_cos_omega, unnorm_sin_omega, omega; - real sin_ijk, cos_ijk, sin_jkl, cos_jkl; - real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe; - real arg, poem, tel; - rvec cross_jk_kl; - - sin_ijk = SIN( p_ijk->theta ); - cos_ijk = COS( p_ijk->theta ); - sin_jkl = SIN( p_jkl->theta ); - cos_jkl = COS( p_jkl->theta ); - - /* omega */ - unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) + - SQR( r_jk ) * rvec_Dot( dvec_ij,dvec_kl ); - rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl ); - unnorm_sin_omega = -r_jk * rvec_Dot( 
dvec_ij, cross_jk_kl ); - omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); - - /* derivatives */ - /* coef for adjusments to cos_theta's */ - /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li; - coshd = cos_ijk, coshe = cos_jkl; - sinhd = sin_ijk, sinhe = sin_jkl; */ - htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk ); - htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl; - htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk ); - hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl ); - hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk ); - hnra = r_kl * sin_ijk * sin_jkl; - hnrc = r_ij * sin_ijk * sin_jkl; - hnhd = r_ij * r_kl * cos_ijk * sin_jkl; - hnhe = r_ij * r_kl * sin_ijk * cos_jkl; - - - poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl; - if( poem < 1e-20 ) poem = 1e-20; - - tel = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - - 2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + - r_jk * r_kl * cos_jkl ); - - arg = tel / poem; - if( arg > 1.0 ) arg = 1.0; - if( arg < -1.0 ) arg = -1.0; - - - /*fprintf( out_control->etor, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", - dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", - -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", - -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", - r_li, dvec_li[0], dvec_li[1], dvec_li[2] ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", - r_ij, r_jk, r_kl, r_li ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", - cos_ijk, cos_jkl, sin_ijk, sin_jkl ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", - poem, tel, arg );*/ - /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", - -p_ijk->dcos_dk[0]/sin_ijk, - 
-p_ijk->dcos_dk[1]/sin_ijk, - -p_ijk->dcos_dk[2]/sin_ijk ); - fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", - -p_jkl->dcos_dk[0]/sin_jkl, - -p_jkl->dcos_dk[1]/sin_jkl, - -p_jkl->dcos_dk[2]/sin_jkl );*/ - - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE; - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE; - - // dcos_omega_di - rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li ); - rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk ); - rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di ); - - // dcos_omega_dj - rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, - -htrb / r_jk, dvec_jk ); - rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj ); - rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di ); - rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj ); - - // dcos_omega_dk - rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl, - htrb / r_jk, dvec_jk ); - rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di ); - rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj ); - rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk ); - - // dcos_omega_dl - rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li ); - rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk ); - rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl ); - - return omega; - //return arg; + real unnorm_cos_omega, unnorm_sin_omega, omega; + real sin_ijk, cos_ijk, sin_jkl, cos_jkl; + real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe; + real arg, poem, tel; + rvec cross_jk_kl; + + sin_ijk = SIN( p_ijk->theta ); + cos_ijk = COS( p_ijk->theta ); + sin_jkl = SIN( p_jkl->theta ); + cos_jkl = COS( p_jkl->theta ); + + /* omega */ + 
unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) + + SQR( r_jk ) * rvec_Dot( dvec_ij,dvec_kl ); + rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl ); + unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl ); + omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); + + /* derivatives */ + /* coef for adjusments to cos_theta's */ + /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li; + coshd = cos_ijk, coshe = cos_jkl; + sinhd = sin_ijk, sinhe = sin_jkl; */ + htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk ); + htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl; + htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk ); + hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl ); + hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk ); + hnra = r_kl * sin_ijk * sin_jkl; + hnrc = r_ij * sin_ijk * sin_jkl; + hnhd = r_ij * r_kl * cos_ijk * sin_jkl; + hnhe = r_ij * r_kl * sin_ijk * cos_jkl; + + + poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl; + if( poem < 1e-20 ) poem = 1e-20; + + tel = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - + 2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + + r_jk * r_kl * cos_jkl ); + + arg = tel / poem; + if( arg > 1.0 ) arg = 1.0; + if( arg < -1.0 ) arg = -1.0; + + + /*fprintf( out_control->etor, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", + r_li, dvec_li[0], dvec_li[1], dvec_li[2] ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", + r_ij, r_jk, r_kl, r_li ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", + 
cos_ijk, cos_jkl, sin_ijk, sin_jkl ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + poem, tel, arg );*/ + /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -p_ijk->dcos_dk[0]/sin_ijk, + -p_ijk->dcos_dk[1]/sin_ijk, + -p_ijk->dcos_dk[2]/sin_ijk ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -p_jkl->dcos_dk[0]/sin_jkl, + -p_jkl->dcos_dk[1]/sin_jkl, + -p_jkl->dcos_dk[2]/sin_jkl );*/ + + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE; + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE; + + // dcos_omega_di + rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li ); + rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk ); + rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di ); + + // dcos_omega_dj + rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, + -htrb / r_jk, dvec_jk ); + rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj ); + rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di ); + rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj ); + + // dcos_omega_dk + rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl, + htrb / r_jk, dvec_jk ); + rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di ); + rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj ); + rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk ); + + // dcos_omega_dl + rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li ); + rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk ); + rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl ); + + return omega; + //return arg; } @@ -149,519 +149,519 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_ void Four_Body_Interactions( 
reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, k, l, pi, pj, pk, pl, pij, plk; - int type_i, type_j, type_k, type_l; - int start_j, end_j, start_k, end_k; - int start_pj, end_pj, start_pk, end_pk; - int num_frb_intrs = 0; - - real Delta_j, Delta_k; - real r_ij, r_jk, r_kl, r_li; - real BOA_ij, BOA_jk, BOA_kl; - - real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; - real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; - real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; - real fn10, f11_DjDk, dfn11, fn12; - - real theta_ijk, theta_jkl; - real sin_ijk, sin_jkl; - real cos_ijk, cos_jkl; - real tan_ijk_i, tan_jkl_i; - - real omega, cos_omega, cos2omega, cos3omega; - rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; - - real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; - real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; - real Cconj, CEconj1, CEconj2, CEconj3; - real CEconj4, CEconj5, CEconj6; - - real e_tor, e_con; - rvec dvec_li; - rvec force, ext_press; - ivec rel_box_jl; - // rtensor total_rtensor, temp_rtensor; - - four_body_header *fbh; - four_body_parameters *fbp; - bond_data *pbond_ij, *pbond_jk, *pbond_kl; - bond_order_data *bo_ij, *bo_jk, *bo_kl; - three_body_interaction_data *p_ijk, *p_jkl; - - real p_tor2 = system->reaxprm.gp.l[23]; - real p_tor3 = system->reaxprm.gp.l[24]; - real p_tor4 = system->reaxprm.gp.l[25]; - real p_cot2 = system->reaxprm.gp.l[27]; - - list *bonds = (*lists) + BONDS; - list *thb_intrs = (*lists) + THREE_BODIES; - - - for( j = 0; j < system->N; ++j ) { - type_j = system->atoms[j].type; - Delta_j = workspace->Delta_boc[j]; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - - for( pk = start_j; pk < end_j; ++pk ) { - pbond_jk = &( bonds->select.bond_list[pk] ); - k = pbond_jk->nbr; - bo_jk = &( 
pbond_jk->bo_data ); - BOA_jk = bo_jk->BO - control->thb_cut; - - /* see if there are any 3-body interactions involving j&k - where j is the central atom. Otherwise there is no point in - trying to form a 4-body interaction out of this neighborhood */ - if( j < k && bo_jk->BO > control->thb_cut/*0*/ && - Num_Entries(pk, thb_intrs) ) { - start_k = Start_Index(k, bonds); - end_k = End_Index(k, bonds); - pj = pbond_jk->sym_index; // pj points to j on k's list - - /* do the same check as above: are there any 3-body interactions - involving k&j where k is the central atom */ - if( Num_Entries(pj, thb_intrs) ) { - type_k = system->atoms[k].type; - Delta_k = workspace->Delta_boc[k]; - r_jk = pbond_jk->d; - - start_pk = Start_Index(pk, thb_intrs ); - end_pk = End_Index(pk, thb_intrs ); - start_pj = Start_Index(pj, thb_intrs ); - end_pj = End_Index(pj, thb_intrs ); - - exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); - exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); - exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); - exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); - exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); - f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; - - - /* pick i up from j-k interaction where j is the centre atom */ - for( pi = start_pk; pi < end_pk; ++pi ) { - p_ijk = &( thb_intrs->select.three_body_list[pi] ); - pij = p_ijk->pthb; // pij is pointer to i on j's bond_list - pbond_ij = &( bonds->select.bond_list[pij] ); - bo_ij = &( pbond_ij->bo_data ); - - - if( bo_ij->BO > control->thb_cut/*0*/ ) { - i = p_ijk->thb; - type_i = system->atoms[i].type; - r_ij = pbond_ij->d; - BOA_ij = bo_ij->BO - control->thb_cut; - - theta_ijk = p_ijk->theta; - sin_ijk = SIN( theta_ijk ); - cos_ijk = COS( theta_ijk ); - //tan_ijk_i = 1. 
/ TAN( theta_ijk ); - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) - tan_ijk_i = cos_ijk / MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) - tan_ijk_i = cos_ijk / -MIN_SINE; - else tan_ijk_i = cos_ijk / sin_ijk; - - exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); - exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); - - /* pick l up from j-k intr. where k is the centre */ - for( pl = start_pj; pl < end_pj; ++pl ) { - p_jkl = &( thb_intrs->select.three_body_list[pl] ); - l = p_jkl->thb; - plk = p_jkl->pthb; //pointer to l on k's bond_list! - pbond_kl = &( bonds->select.bond_list[plk] ); - bo_kl = &( pbond_kl->bo_data ); - type_l = system->atoms[l].type; - fbh = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm ) ]); - fbp = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm )].prm[0]); - - if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && - bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ - ++num_frb_intrs; - r_kl = pbond_kl->d; - BOA_kl = bo_kl->BO - control->thb_cut; - - theta_jkl = p_jkl->theta; - sin_jkl = SIN( theta_jkl ); - cos_jkl = COS( theta_jkl ); - //tan_jkl_i = 1. / TAN( theta_jkl ); - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) - tan_jkl_i = cos_jkl / MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) - tan_jkl_i = cos_jkl / -MIN_SINE; - else tan_jkl_i = cos_jkl /sin_jkl; - - Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, - &(system->box), dvec_li ); - r_li = rvec_Norm( dvec_li ); - - - /* omega and its derivative */ - //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, - omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, - r_jk, pbond_kl->dvec, r_kl, - dvec_li, r_li, p_ijk, p_jkl, - dcos_omega_di, dcos_omega_dj, - dcos_omega_dk, dcos_omega_dl, - out_control); - cos_omega = COS( omega ); - cos2omega = COS( 2. * omega ); - cos3omega = COS( 3. 
* omega ); - /* end omega calculations */ - - /* torsion energy */ - exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk)); - exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); - exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) ); - fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * - (1.0 - exp_tor2_kl); - - CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + - fbp->V2 * exp_tor1 * (1.0 - cos2omega) + - fbp->V3 * (1.0 + cos3omega) ); - //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + - // fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) + - // fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega); - - data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV; - - dfn11 = (-p_tor3 * exp_tor3_DjDk + - (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * - (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv; - - CEtors1 = sin_ijk * sin_jkl * CV; - - CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * - (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * - sin_ijk * sin_jkl; - - CEtors3 = CEtors2 * dfn11; - - CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * - (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); - - CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl); - - CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk); - - cmn = -fn10 * CV; - CEtors7 = cmn * sin_jkl * tan_ijk_i; - CEtors8 = cmn * sin_ijk * tan_jkl_i; - CEtors9 = fn10 * sin_ijk * sin_jkl * - (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - 1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega))); - //cmn = -fn10 * CV; - //CEtors7 = cmn * sin_jkl * cos_ijk; - //CEtors8 = cmn * sin_ijk * cos_jkl; - //CEtors9 = fn10 * sin_ijk * sin_jkl * - // (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - // fbp->V3 * (6*SQR(cos_omega) - 1.50)); - /* end of torsion energy */ - - - /* 4-body conjugation energy */ - fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; - data->E_Con += e_con = fbp->p_cot1 * fn12 * - (1. + (SQR(cos_omega)-1.) 
* sin_ijk*sin_jkl); - - Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * - (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); - - CEconj1 = Cconj * (BOA_ij - 1.5e0); - CEconj2 = Cconj * (BOA_jk - 1.5e0); - CEconj3 = Cconj * (BOA_kl - 1.5e0); - - CEconj4 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; - CEconj5 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; - //CEconj4 = -fbp->p_cot1 * fn12 * - // (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk; - //CEconj5 = -fbp->p_cot1 * fn12 * - // (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl; - CEconj6 = 2.0 * fbp->p_cot1 * fn12 * - cos_omega * sin_ijk * sin_jkl; - /* end 4-body conjugation energy */ - - //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ", - // workspace->orig_id[i], workspace->orig_id[j], - // workspace->orig_id[k], workspace->orig_id[l], - // omega, cos_omega, cos2omega, cos3omega ); - //fprintf(stdout, - // "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - // CEtors2, CEtors3, CEtors4, CEtors5, - // CEtors6, CEtors7, CEtors8, CEtors9 ); - //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - // theta_ijk, theta_jkl, sin_ijk, - // sin_jkl, cos_jkl, tan_jkl_i ); - - /* forces */ - bo_jk->Cdbopi += CEtors2; - workspace->CdDelta[j] += CEtors3; - workspace->CdDelta[k] += CEtors3; - bo_ij->Cdbo += (CEtors4 + CEconj1); - bo_jk->Cdbo += (CEtors5 + CEconj2); - - bo_kl->Cdbo += (CEtors6 + CEconj3); - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - /* dcos_theta_ijk */ - rvec_ScaledAdd( system->atoms[i].f, - CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( system->atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - rvec_ScaledAdd( system->atoms[k].f, - CEtors7 + CEconj4, p_ijk->dcos_di ); - - /* dcos_theta_jkl */ - rvec_ScaledAdd( system->atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( system->atoms[k].f, - CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( system->atoms[l].f, - CEtors8 + 
CEconj5, p_jkl->dcos_dk ); - - /* dcos_omega */ - rvec_ScaledAdd( system->atoms[i].f, - CEtors9 + CEconj6, dcos_omega_di ); - rvec_ScaledAdd( system->atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( system->atoms[k].f, - CEtors9 + CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( system->atoms[l].f, - CEtors9 + CEconj6, dcos_omega_dl ); - } - else { - ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); - - /* dcos_theta_ijk */ - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_ScaledAdd( system->atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* dcos_theta_jkl */ - rvec_ScaledAdd( system->atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); - rvec_Add( system->atoms[l].f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* dcos_omega */ - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_ScaledAdd( system->atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); - rvec_Add( 
system->atoms[l].f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* This part is intended for a fully-flexible box */ - /* rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_dk, // i - CEtors9 + CEconj6, dcos_omega_di ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[i].x ); - rtensor_Copy( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_dj, // j - CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( temp_rvec, - CEtors9 + CEconj6, dcos_omega_dj ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[j].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_di, // k - CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( temp_rvec, - CEtors9 + CEconj6, dcos_omega_dk ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[k].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors8 + CEconj5, p_jkl->dcos_dk, // l - CEtors9 + CEconj6, dcos_omega_dl ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[l].x ); - rtensor_Copy( total_rtensor, temp_rtensor ); - - if( pbond_ij->imaginary || pbond_jk->imaginary || - pbond_kl->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } + int i, j, k, l, pi, pj, pk, pl, pij, plk; + int type_i, type_j, type_k, type_l; + int start_j, end_j, start_k, end_k; + int start_pj, end_pj, start_pk, end_pk; + int num_frb_intrs = 0; + + real Delta_j, Delta_k; + real r_ij, r_jk, r_kl, r_li; + real BOA_ij, BOA_jk, BOA_kl; + + real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; + real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; + real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; + real fn10, f11_DjDk, dfn11, fn12; + + real theta_ijk, theta_jkl; + real sin_ijk, sin_jkl; + real cos_ijk, cos_jkl; + real tan_ijk_i, 
tan_jkl_i; + + real omega, cos_omega, cos2omega, cos3omega; + rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; + + real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; + real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; + real Cconj, CEconj1, CEconj2, CEconj3; + real CEconj4, CEconj5, CEconj6; + + real e_tor, e_con; + rvec dvec_li; + rvec force, ext_press; + ivec rel_box_jl; + // rtensor total_rtensor, temp_rtensor; + + four_body_header *fbh; + four_body_parameters *fbp; + bond_data *pbond_ij, *pbond_jk, *pbond_kl; + bond_order_data *bo_ij, *bo_jk, *bo_kl; + three_body_interaction_data *p_ijk, *p_jkl; + + real p_tor2 = system->reaxprm.gp.l[23]; + real p_tor3 = system->reaxprm.gp.l[24]; + real p_tor4 = system->reaxprm.gp.l[25]; + real p_cot2 = system->reaxprm.gp.l[27]; + + list *bonds = (*lists) + BONDS; + list *thb_intrs = (*lists) + THREE_BODIES; + + + for( j = 0; j < system->N; ++j ) { + type_j = system->atoms[j].type; + Delta_j = workspace->Delta_boc[j]; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + + for( pk = start_j; pk < end_j; ++pk ) { + pbond_jk = &( bonds->select.bond_list[pk] ); + k = pbond_jk->nbr; + bo_jk = &( pbond_jk->bo_data ); + BOA_jk = bo_jk->BO - control->thb_cut; + + /* see if there are any 3-body interactions involving j&k + where j is the central atom. 
Otherwise there is no point in + trying to form a 4-body interaction out of this neighborhood */ + if( j < k && bo_jk->BO > control->thb_cut/*0*/ && + Num_Entries(pk, thb_intrs) ) { + start_k = Start_Index(k, bonds); + end_k = End_Index(k, bonds); + pj = pbond_jk->sym_index; // pj points to j on k's list + + /* do the same check as above: are there any 3-body interactions + involving k&j where k is the central atom */ + if( Num_Entries(pj, thb_intrs) ) { + type_k = system->atoms[k].type; + Delta_k = workspace->Delta_boc[k]; + r_jk = pbond_jk->d; + + start_pk = Start_Index(pk, thb_intrs ); + end_pk = End_Index(pk, thb_intrs ); + start_pj = Start_Index(pj, thb_intrs ); + end_pj = End_Index(pj, thb_intrs ); + + exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); + exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); + exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); + exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); + exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); + f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; + + + /* pick i up from j-k interaction where j is the centre atom */ + for( pi = start_pk; pi < end_pk; ++pi ) { + p_ijk = &( thb_intrs->select.three_body_list[pi] ); + pij = p_ijk->pthb; // pij is pointer to i on j's bond_list + pbond_ij = &( bonds->select.bond_list[pij] ); + bo_ij = &( pbond_ij->bo_data ); + + + if( bo_ij->BO > control->thb_cut/*0*/ ) { + i = p_ijk->thb; + type_i = system->atoms[i].type; + r_ij = pbond_ij->d; + BOA_ij = bo_ij->BO - control->thb_cut; + + theta_ijk = p_ijk->theta; + sin_ijk = SIN( theta_ijk ); + cos_ijk = COS( theta_ijk ); + //tan_ijk_i = 1. / TAN( theta_ijk ); + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) + tan_ijk_i = cos_ijk / MIN_SINE; + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) + tan_ijk_i = cos_ijk / -MIN_SINE; + else tan_ijk_i = cos_ijk / sin_ijk; + + exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); + exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); + + /* pick l up from j-k intr. 
where k is the centre */ + for( pl = start_pj; pl < end_pj; ++pl ) { + p_jkl = &( thb_intrs->select.three_body_list[pl] ); + l = p_jkl->thb; + plk = p_jkl->pthb; //pointer to l on k's bond_list! + pbond_kl = &( bonds->select.bond_list[plk] ); + bo_kl = &( pbond_kl->bo_data ); + type_l = system->atoms[l].type; + fbh = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm ) ]); + fbp = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm )].prm[0]); + + if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && + bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ + ++num_frb_intrs; + r_kl = pbond_kl->d; + BOA_kl = bo_kl->BO - control->thb_cut; + + theta_jkl = p_jkl->theta; + sin_jkl = SIN( theta_jkl ); + cos_jkl = COS( theta_jkl ); + //tan_jkl_i = 1. / TAN( theta_jkl ); + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) + tan_jkl_i = cos_jkl / MIN_SINE; + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) + tan_jkl_i = cos_jkl / -MIN_SINE; + else tan_jkl_i = cos_jkl /sin_jkl; + + Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, + &(system->box), dvec_li ); + r_li = rvec_Norm( dvec_li ); + + + /* omega and its derivative */ + //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, + omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, + r_jk, pbond_kl->dvec, r_kl, + dvec_li, r_li, p_ijk, p_jkl, + dcos_omega_di, dcos_omega_dj, + dcos_omega_dk, dcos_omega_dl, + out_control); + cos_omega = COS( omega ); + cos2omega = COS( 2. * omega ); + cos3omega = COS( 3. 
* omega ); + /* end omega calculations */ + + /* torsion energy */ + exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk)); + exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); + exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) ); + fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * + (1.0 - exp_tor2_kl); + + CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + + fbp->V2 * exp_tor1 * (1.0 - cos2omega) + + fbp->V3 * (1.0 + cos3omega) ); + //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + + // fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) + + // fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega); + + data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV; + + dfn11 = (-p_tor3 * exp_tor3_DjDk + + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * + (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv; + + CEtors1 = sin_ijk * sin_jkl * CV; + + CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * + (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * + sin_ijk * sin_jkl; + + CEtors3 = CEtors2 * dfn11; + + CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * + (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); + + CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl); + + CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk); + + cmn = -fn10 * CV; + CEtors7 = cmn * sin_jkl * tan_ijk_i; + CEtors8 = cmn * sin_ijk * tan_jkl_i; + CEtors9 = fn10 * sin_ijk * sin_jkl * + (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + 1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega))); + //cmn = -fn10 * CV; + //CEtors7 = cmn * sin_jkl * cos_ijk; + //CEtors8 = cmn * sin_ijk * cos_jkl; + //CEtors9 = fn10 * sin_ijk * sin_jkl * + // (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + // fbp->V3 * (6*SQR(cos_omega) - 1.50)); + /* end of torsion energy */ + + + /* 4-body conjugation energy */ + fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; + data->E_Con += e_con = fbp->p_cot1 * fn12 * + (1. + (SQR(cos_omega)-1.) 
* sin_ijk*sin_jkl); + + Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * + (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); + + CEconj1 = Cconj * (BOA_ij - 1.5e0); + CEconj2 = Cconj * (BOA_jk - 1.5e0); + CEconj3 = Cconj * (BOA_kl - 1.5e0); + + CEconj4 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; + CEconj5 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; + //CEconj4 = -fbp->p_cot1 * fn12 * + // (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk; + //CEconj5 = -fbp->p_cot1 * fn12 * + // (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl; + CEconj6 = 2.0 * fbp->p_cot1 * fn12 * + cos_omega * sin_ijk * sin_jkl; + /* end 4-body conjugation energy */ + + //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ", + // workspace->orig_id[i], workspace->orig_id[j], + // workspace->orig_id[k], workspace->orig_id[l], + // omega, cos_omega, cos2omega, cos3omega ); + //fprintf(stdout, + // "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + // CEtors2, CEtors3, CEtors4, CEtors5, + // CEtors6, CEtors7, CEtors8, CEtors9 ); + //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + // theta_ijk, theta_jkl, sin_ijk, + // sin_jkl, cos_jkl, tan_jkl_i ); + + /* forces */ + bo_jk->Cdbopi += CEtors2; + workspace->CdDelta[j] += CEtors3; + workspace->CdDelta[k] += CEtors3; + bo_ij->Cdbo += (CEtors4 + CEconj1); + bo_jk->Cdbo += (CEtors5 + CEconj2); + + bo_kl->Cdbo += (CEtors6 + CEconj3); + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + /* dcos_theta_ijk */ + rvec_ScaledAdd( system->atoms[i].f, + CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_ScaledAdd( system->atoms[j].f, + CEtors7 + CEconj4, p_ijk->dcos_dj ); + rvec_ScaledAdd( system->atoms[k].f, + CEtors7 + CEconj4, p_ijk->dcos_di ); + + /* dcos_theta_jkl */ + rvec_ScaledAdd( system->atoms[j].f, + CEtors8 + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( system->atoms[k].f, + CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_ScaledAdd( system->atoms[l].f, + CEtors8 + 
CEconj5, p_jkl->dcos_dk ); + + /* dcos_omega */ + rvec_ScaledAdd( system->atoms[i].f, + CEtors9 + CEconj6, dcos_omega_di ); + rvec_ScaledAdd( system->atoms[j].f, + CEtors9 + CEconj6, dcos_omega_dj ); + rvec_ScaledAdd( system->atoms[k].f, + CEtors9 + CEconj6, dcos_omega_dk ); + rvec_ScaledAdd( system->atoms[l].f, + CEtors9 + CEconj6, dcos_omega_dl ); + } + else { + ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); + + /* dcos_theta_ijk */ + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, + CEtors7 + CEconj4, p_ijk->dcos_dj ); + + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* dcos_theta_jkl */ + rvec_ScaledAdd( system->atoms[j].f, + CEtors8 + CEconj5, p_jkl->dcos_di ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); + rvec_Add( system->atoms[l].f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* dcos_omega */ + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, + CEtors9 + CEconj6, dcos_omega_dj ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); + rvec_Add( 
system->atoms[l].f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* This part is intended for a fully-flexible box */ + /* rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_dk, // i + CEtors9 + CEconj6, dcos_omega_di ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[i].x ); + rtensor_Copy( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_dj, // j + CEtors8 + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( temp_rvec, + CEtors9 + CEconj6, dcos_omega_dj ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[j].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_di, // k + CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_ScaledAdd( temp_rvec, + CEtors9 + CEconj6, dcos_omega_dk ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[k].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors8 + CEconj5, p_jkl->dcos_dk, // l + CEtors9 + CEconj6, dcos_omega_dl ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[l].x ); + rtensor_Copy( total_rtensor, temp_rtensor ); + + if( pbond_ij->imaginary || pbond_jk->imaginary || + pbond_kl->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } #ifdef TEST_ENERGY - /*fprintf( out_control->etor, - //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - //r_ij, r_jk, r_kl, - "%12.8f%12.8f%12.8f%12.8f\n", - cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/ - // fprintf( out_control->etor, "%12.8f\n", dfn11 ); - fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", - fn10, cos_omega, CV ); - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEtors2, CEtors3, CEtors4, CEtors5, - CEtors6, CEtors7, CEtors8, CEtors9 ); - - /* fprintf( out_control->etor, - 
"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); - /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n", - fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ - - fprintf( out_control->etor, - //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", - "%6d%6d%6d%6d%12.8f%12.8f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], workspace->orig_id[l], - e_tor, e_con ); - //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor ); - - fprintf( out_control->econ, - "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], workspace->orig_id[l], - RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, - e_con,data->E_Con ); - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], - (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], - (CEtors7 + CEconj4)*p_ijk->dcos_dk[2], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[2], - (CEtors7 + CEconj4)*p_ijk->dcos_di[0], - (CEtors7 + CEconj4)*p_ijk->dcos_di[1], - (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */ - - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - (CEtors8 + CEconj5)*p_jkl->dcos_di[0], - (CEtors8 + CEconj5)*p_jkl->dcos_di[1], - (CEtors8 + CEconj5)*p_jkl->dcos_di[2], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */ - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - dcos_omega_di[0], 
dcos_omega_di[1], dcos_omega_di[2], - dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], - dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2], - dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] ); + /*fprintf( out_control->etor, + //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + //r_ij, r_jk, r_kl, + "%12.8f%12.8f%12.8f%12.8f\n", + cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/ + // fprintf( out_control->etor, "%12.8f\n", dfn11 ); + fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", + fn10, cos_omega, CV ); + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEtors2, CEtors3, CEtors4, CEtors5, + CEtors6, CEtors7, CEtors8, CEtors9 ); + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); + /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n", + fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ + + fprintf( out_control->etor, + //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", + "%6d%6d%6d%6d%12.8f%12.8f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], workspace->orig_id[l], + e_tor, e_con ); + //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor ); + + fprintf( out_control->econ, + "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], workspace->orig_id[l], + RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, + e_con,data->E_Con ); + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], + (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], + (CEtors7 + CEconj4)*p_ijk->dcos_dk[2], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[2], + (CEtors7 + 
CEconj4)*p_ijk->dcos_di[0], + (CEtors7 + CEconj4)*p_ijk->dcos_di[1], + (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */ + + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + (CEtors8 + CEconj5)*p_jkl->dcos_di[0], + (CEtors8 + CEconj5)*p_jkl->dcos_di[1], + (CEtors8 + CEconj5)*p_jkl->dcos_di[2], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */ + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], + dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], + dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2], + dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] ); #endif #ifdef TEST_FORCES - // Torsion Forces - Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., - workspace->f_tor, workspace->f_tor); - Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); - Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); - Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); - Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); - Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); - - rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk); - rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj); - rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di); - - rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di); - rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj); - rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk); - - rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk ); - rvec_ScaledAdd( 
workspace->f_tor[l], CEtors9, dcos_omega_dl ); - - // Conjugation Forces - Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); - Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); - Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); - - rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk); - rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj); - rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di); - - rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di); - rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj); - rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk); - - rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl ); + // Torsion Forces + Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., + workspace->f_tor, workspace->f_tor); + Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); + Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); + Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); + Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); + Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); + + rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk); + rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj); + rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di); + + rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di); + rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj); + rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk); + + rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl ); + + // Conjugation 
Forces + Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); + Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); + Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); + + rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk); + rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj); + rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di); + + rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di); + rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj); + rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk); + + rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl ); #endif - } // pl check ends - } // pl loop ends - } // pi check ends - } // pi loop ends - } // k-j neighbor check ends - } // j<k && j-k neighbor check ends - } // pk loop ends - } // j loop - - /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ + } // pl check ends + } // pl loop ends + } // pi check ends + } // pi loop ends + } // k-j neighbor check ends + } // j<k && j-k neighbor check ends + } // pk loop ends + } // j loop + + /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ #ifdef TEST_FORCES - fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs ); - fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", - data->E_Tor, data->E_Con ); + fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs ); + fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", + data->E_Tor, data->E_Con ); #endif } @@ -671,692 +671,692 @@ void Four_Body_Interactions( reax_system *system, control_params *control, 
//////////////////////////////////////////////////////////////////////// GLOBAL void Four_Body_Interactions ( reax_atom *atoms, - global_parameters g_params, - four_body_header *d_fbp, - control_params *control, - list p_bonds, list p_thb_intrs, - simulation_box *box, - simulation_data *data, - static_storage p_workspace, - int N, int num_atom_types, - real *E_Tor, real *E_Con, rvec *aux_ext_press) + global_parameters g_params, + four_body_header *d_fbp, + control_params *control, + list p_bonds, list p_thb_intrs, + simulation_box *box, + simulation_data *data, + static_storage p_workspace, + int N, int num_atom_types, + real *E_Tor, real *E_Con, rvec *aux_ext_press) { - /* - extern __shared__ real _tor[]; - extern __shared__ real _con []; - extern __shared__ rvec _press[]; - real *sh_tor, *sh_con; rvec *sh_press; - */ - - int i, j, k, l, pi, pj, pk, pl, pij, plk; - int type_i, type_j, type_k, type_l; - int start_j, end_j, start_k, end_k; - int start_pj, end_pj, start_pk, end_pk; - int num_frb_intrs = 0; - - real Delta_j, Delta_k; - real r_ij, r_jk, r_kl, r_li; - real BOA_ij, BOA_jk, BOA_kl; - - real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; - real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; - real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; - real fn10, f11_DjDk, dfn11, fn12; - - real theta_ijk, theta_jkl; - real sin_ijk, sin_jkl; - real cos_ijk, cos_jkl; - real tan_ijk_i, tan_jkl_i; - - real omega, cos_omega, cos2omega, cos3omega; - rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; - - real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; - real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; - real Cconj, CEconj1, CEconj2, CEconj3; - real CEconj4, CEconj5, CEconj6; - - real e_tor, e_con; - rvec dvec_li; - rvec force, ext_press; - ivec rel_box_jl; - // rtensor total_rtensor, temp_rtensor; - - four_body_header *fbh; - four_body_parameters *fbp; - bond_data *pbond_ij, *pbond_jk, *pbond_kl; - bond_order_data *bo_ij, *bo_jk, *bo_kl; - 
three_body_interaction_data *p_ijk, *p_jkl; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - // j = blockIdx.x; - - real p_tor2 = g_params.l[23]; - real p_tor3 = g_params.l[24]; - real p_tor4 = g_params.l[25]; - real p_cot2 = g_params.l[27]; - - list *bonds = &p_bonds; - list *thb_intrs = &p_thb_intrs; - static_storage *workspace = &p_workspace; - - - //for( j = 0; j < system->N; ++j ) { - type_j = atoms[j].type; - Delta_j = workspace->Delta_boc[j]; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - /* - sh_tor = _tor; - sh_con = sh_tor + blockDim.x; - sh_press = (rvec *) (sh_tor + 2*blockDim.x); - - sh_tor[threadIdx.x] = 0; - sh_con [threadIdx.x] = 0; - rvec_MakeZero (sh_press [threadIdx.x] ); - pk = threadIdx.x + start_j; - */ - - E_Tor [j] = 0; - E_Con [j] = 0; - rvec_MakeZero (aux_ext_press [j]); - - - for( pk = start_j; pk < end_j; ++pk ) - //while (pk < end_j) - { - pbond_jk = &( bonds->select.bond_list[pk] ); - k = pbond_jk->nbr; - bo_jk = &( pbond_jk->bo_data ); - BOA_jk = bo_jk->BO - control->thb_cut; - - /* see if there are any 3-body interactions involving j&k - where j is the central atom. 
Otherwise there is no point in - trying to form a 4-body interaction out of this neighborhood */ - if( j < k && bo_jk->BO > control->thb_cut/*0*/ && - Num_Entries(pk, thb_intrs) ) { - start_k = Start_Index(k, bonds); - end_k = End_Index(k, bonds); - pj = pbond_jk->sym_index; // pj points to j on k's list - - /* do the same check as above: are there any 3-body interactions - involving k&j where k is the central atom */ - if( Num_Entries(pj, thb_intrs) ) { - type_k = atoms[k].type; - Delta_k = workspace->Delta_boc[k]; - r_jk = pbond_jk->d; - - start_pk = Start_Index(pk, thb_intrs ); - end_pk = End_Index(pk, thb_intrs ); - start_pj = Start_Index(pj, thb_intrs ); - end_pj = End_Index(pj, thb_intrs ); - - exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); - exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); - exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); - exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); - exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); - f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; - - - /* pick i up from j-k interaction where j is the centre atom */ - for( pi = start_pk; pi < end_pk; ++pi ) { - p_ijk = &( thb_intrs->select.three_body_list[pi] ); - pij = p_ijk->pthb; // pij is pointer to i on j's bond_list - pbond_ij = &( bonds->select.bond_list[pij] ); - bo_ij = &( pbond_ij->bo_data ); - - - if( bo_ij->BO > control->thb_cut/*0*/ ) { - i = p_ijk->thb; - type_i = atoms[i].type; - r_ij = pbond_ij->d; - BOA_ij = bo_ij->BO - control->thb_cut; - - theta_ijk = p_ijk->theta; - sin_ijk = SIN( theta_ijk ); - cos_ijk = COS( theta_ijk ); - //tan_ijk_i = 1. / TAN( theta_ijk ); - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) - tan_ijk_i = cos_ijk / MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) - tan_ijk_i = cos_ijk / -MIN_SINE; - else tan_ijk_i = cos_ijk / sin_ijk; - - exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); - exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); - - /* pick l up from j-k intr. 
where k is the centre */ - for( pl = start_pj; pl < end_pj; ++pl ) { - p_jkl = &( thb_intrs->select.three_body_list[pl] ); - l = p_jkl->thb; - plk = p_jkl->pthb; //pointer to l on k's bond_list! - pbond_kl = &( bonds->select.bond_list[plk] ); - bo_kl = &( pbond_kl->bo_data ); - type_l = atoms[l].type; - fbh = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types) ]); - fbp = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); - - if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && - bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ - ++num_frb_intrs; - r_kl = pbond_kl->d; - BOA_kl = bo_kl->BO - control->thb_cut; - - theta_jkl = p_jkl->theta; - sin_jkl = SIN( theta_jkl ); - cos_jkl = COS( theta_jkl ); - //tan_jkl_i = 1. / TAN( theta_jkl ); - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) - tan_jkl_i = cos_jkl / MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) - tan_jkl_i = cos_jkl / -MIN_SINE; - else tan_jkl_i = cos_jkl /sin_jkl; - - Sq_Distance_on_T3( atoms[l].x, atoms[i].x, - box, dvec_li ); - r_li = rvec_Norm( dvec_li ); - - - /* omega and its derivative */ - //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, - omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, - r_jk, pbond_kl->dvec, r_kl, - dvec_li, r_li, p_ijk, p_jkl, - dcos_omega_di, dcos_omega_dj, - dcos_omega_dk, dcos_omega_dl, - NULL); //TODO *check* - cos_omega = COS( omega ); - cos2omega = COS( 2. * omega ); - cos3omega = COS( 3. 
* omega ); - /* end omega calculations */ - - /* torsion energy */ - exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk)); - exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); - exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) ); - fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * - (1.0 - exp_tor2_kl); - - CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + - fbp->V2 * exp_tor1 * (1.0 - cos2omega) + - fbp->V3 * (1.0 + cos3omega) ); - //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + - // fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) + - // fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega); - - //PERFORMANCE IMPACT - e_tor = fn10 * sin_ijk * sin_jkl * CV; - //atomicAdd (&data->E_Tor ,e_tor ); - E_Tor [j] += e_tor; - //sh_tor [threadIdx.x] += e_tor; - - dfn11 = (-p_tor3 * exp_tor3_DjDk + - (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * - (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv; - - CEtors1 = sin_ijk * sin_jkl * CV; - - CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * - (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * - sin_ijk * sin_jkl; - - CEtors3 = CEtors2 * dfn11; - - CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * - (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); - - CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl); - - CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk); - - cmn = -fn10 * CV; - CEtors7 = cmn * sin_jkl * tan_ijk_i; - CEtors8 = cmn * sin_ijk * tan_jkl_i; - CEtors9 = fn10 * sin_ijk * sin_jkl * - (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - 1.5 * fbp->V3 * (cos2omega + 2. 
* SQR(cos_omega))); - //cmn = -fn10 * CV; - //CEtors7 = cmn * sin_jkl * cos_ijk; - //CEtors8 = cmn * sin_ijk * cos_jkl; - //CEtors9 = fn10 * sin_ijk * sin_jkl * - // (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - // fbp->V3 * (6*SQR(cos_omega) - 1.50)); - /* end of torsion energy */ - - - /* 4-body conjugation energy */ - fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; - //PERFORMANCE IMPACT - e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); - //atomicAdd (&data->E_Con ,e_con ); - E_Con [j] += e_con ; - //sh_con [threadIdx.x] += e_con; - - Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * - (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); - - CEconj1 = Cconj * (BOA_ij - 1.5e0); - CEconj2 = Cconj * (BOA_jk - 1.5e0); - CEconj3 = Cconj * (BOA_kl - 1.5e0); - - CEconj4 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; - CEconj5 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; - //CEconj4 = -fbp->p_cot1 * fn12 * - // (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk; - //CEconj5 = -fbp->p_cot1 * fn12 * - // (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl; - CEconj6 = 2.0 * fbp->p_cot1 * fn12 * - cos_omega * sin_ijk * sin_jkl; - /* end 4-body conjugation energy */ - - //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ", - // workspace->orig_id[i], workspace->orig_id[j], - // workspace->orig_id[k], workspace->orig_id[l], - // omega, cos_omega, cos2omega, cos3omega ); - //fprintf(stdout, - // "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - // CEtors2, CEtors3, CEtors4, CEtors5, - // CEtors6, CEtors7, CEtors8, CEtors9 ); - //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - // theta_ijk, theta_jkl, sin_ijk, - // sin_jkl, cos_jkl, tan_jkl_i ); - - /* forces */ - //PERFORMANCE IMPACT - /* - atomicAdd ( &bo_jk->Cdbopi, CEtors2 ); - atomicAdd ( &workspace->CdDelta[j], CEtors3 ); - atomicAdd ( &workspace->CdDelta[k], CEtors3 ); - atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) ); - atomicAdd ( 
&bo_jk->Cdbo, (CEtors5 + CEconj2) ); - atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) ); - */ - - //PERFORMANCE IMPACT - bo_jk->Cdbopi += CEtors2; - workspace->CdDelta[j] += CEtors3; - pbond_jk->CdDelta_jk += CEtors3; - bo_ij->Cdbo += CEtors4 + CEconj1; - bo_jk->Cdbo += CEtors5 + CEconj2; - - //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE - atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 ); - //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE - - if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) { - /* dcos_theta_ijk */ - //PERFORMANCE IMPACT - atomic_rvecScaledAdd (pbond_ij->i_f, - CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - atomic_rvecScaledAdd( pbond_jk->k_f, - CEtors7 + CEconj4, p_ijk->dcos_di ); - - - /* dcos_theta_jkl */ - //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - atomic_rvecScaledAdd( pbond_jk->i_f, - CEtors8 + CEconj5, p_jkl->dcos_dj ); - atomic_rvecScaledAdd( pbond_kl->k_f, - CEtors8 + CEconj5, p_jkl->dcos_dk ); - - /* dcos_omega */ - //PERFORMANCE IMPACT - atomic_rvecScaledAdd( pbond_ij->i_f, - CEtors9 + CEconj6, dcos_omega_di ); - rvec_ScaledAdd( atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - atomic_rvecScaledAdd( pbond_jk->i_f, - CEtors9 + CEconj6, dcos_omega_dk ); - atomic_rvecScaledAdd( pbond_kl->k_f, - CEtors9 + CEconj6, dcos_omega_dl ); - } - else { - ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); - - /* dcos_theta_ijk */ - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); - //PERFORMANCE IMPACT - atomic_rvecAdd( pbond_ij->i_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); - //PERFORMANCE IMPACT - atomic_rvecAdd( 
pbond_jk->i_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - //PERFORMANCE IMPACT - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - - /* dcos_theta_jkl */ - //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); - //PERFORMANCE IMPACT - atomic_rvecAdd( pbond_jk->i_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); - //PERFORMANCE IMPACT - atomic_rvecAdd( pbond_kl->k_f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - - /* dcos_omega */ - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); - //PERFORMANCE IMPACT - atomic_rvecAdd( pbond_ij->i_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); - //PERFORMANCE IMPACT - atomic_rvecAdd( pbond_jk->i_f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); - //PERFORMANCE IMPACT - atomic_rvecAdd( pbond_kl->k_f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( aux_ext_press [j], ext_press ); - //rvec_Add (sh_press [threadIdx.x], ext_press); - - - /* This part is intended for a fully-flexible box */ - /* rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_dk, // i - CEtors9 + CEconj6, dcos_omega_di ); - rvec_OuterProduct( temp_rtensor, 
- temp_rvec, system->atoms[i].x ); - rtensor_Copy( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_dj, // j - CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( temp_rvec, - CEtors9 + CEconj6, dcos_omega_dj ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[j].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_di, // k - CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( temp_rvec, - CEtors9 + CEconj6, dcos_omega_dk ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[k].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors8 + CEconj5, p_jkl->dcos_dk, // l - CEtors9 + CEconj6, dcos_omega_dl ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[l].x ); - rtensor_Copy( total_rtensor, temp_rtensor ); - - if( pbond_ij->imaginary || pbond_jk->imaginary || - pbond_kl->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } + /* + extern __shared__ real _tor[]; + extern __shared__ real _con []; + extern __shared__ rvec _press[]; + real *sh_tor, *sh_con; rvec *sh_press; + */ + + int i, j, k, l, pi, pj, pk, pl, pij, plk; + int type_i, type_j, type_k, type_l; + int start_j, end_j, start_k, end_k; + int start_pj, end_pj, start_pk, end_pk; + int num_frb_intrs = 0; + + real Delta_j, Delta_k; + real r_ij, r_jk, r_kl, r_li; + real BOA_ij, BOA_jk, BOA_kl; + + real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; + real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; + real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; + real fn10, f11_DjDk, dfn11, fn12; + + real theta_ijk, theta_jkl; + real sin_ijk, sin_jkl; + real cos_ijk, cos_jkl; + real tan_ijk_i, tan_jkl_i; + + real omega, cos_omega, cos2omega, cos3omega; + rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; + + real CV, cmn, CEtors1, CEtors2, CEtors3, 
CEtors4; + real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; + real Cconj, CEconj1, CEconj2, CEconj3; + real CEconj4, CEconj5, CEconj6; + + real e_tor, e_con; + rvec dvec_li; + rvec force, ext_press; + ivec rel_box_jl; + // rtensor total_rtensor, temp_rtensor; + + four_body_header *fbh; + four_body_parameters *fbp; + bond_data *pbond_ij, *pbond_jk, *pbond_kl; + bond_order_data *bo_ij, *bo_jk, *bo_kl; + three_body_interaction_data *p_ijk, *p_jkl; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + // j = blockIdx.x; + + real p_tor2 = g_params.l[23]; + real p_tor3 = g_params.l[24]; + real p_tor4 = g_params.l[25]; + real p_cot2 = g_params.l[27]; + + list *bonds = &p_bonds; + list *thb_intrs = &p_thb_intrs; + static_storage *workspace = &p_workspace; + + + //for( j = 0; j < system->N; ++j ) { + type_j = atoms[j].type; + Delta_j = workspace->Delta_boc[j]; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + /* + sh_tor = _tor; + sh_con = sh_tor + blockDim.x; + sh_press = (rvec *) (sh_tor + 2*blockDim.x); + + sh_tor[threadIdx.x] = 0; + sh_con [threadIdx.x] = 0; + rvec_MakeZero (sh_press [threadIdx.x] ); + pk = threadIdx.x + start_j; + */ + + E_Tor [j] = 0; + E_Con [j] = 0; + rvec_MakeZero (aux_ext_press [j]); + + + for( pk = start_j; pk < end_j; ++pk ) + //while (pk < end_j) + { + pbond_jk = &( bonds->select.bond_list[pk] ); + k = pbond_jk->nbr; + bo_jk = &( pbond_jk->bo_data ); + BOA_jk = bo_jk->BO - control->thb_cut; + + /* see if there are any 3-body interactions involving j&k + where j is the central atom. 
Otherwise there is no point in + trying to form a 4-body interaction out of this neighborhood */ + if( j < k && bo_jk->BO > control->thb_cut/*0*/ && + Num_Entries(pk, thb_intrs) ) { + start_k = Start_Index(k, bonds); + end_k = End_Index(k, bonds); + pj = pbond_jk->sym_index; // pj points to j on k's list + + /* do the same check as above: are there any 3-body interactions + involving k&j where k is the central atom */ + if( Num_Entries(pj, thb_intrs) ) { + type_k = atoms[k].type; + Delta_k = workspace->Delta_boc[k]; + r_jk = pbond_jk->d; + + start_pk = Start_Index(pk, thb_intrs ); + end_pk = End_Index(pk, thb_intrs ); + start_pj = Start_Index(pj, thb_intrs ); + end_pj = End_Index(pj, thb_intrs ); + + exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); + exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); + exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); + exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); + exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); + f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; + + + /* pick i up from j-k interaction where j is the centre atom */ + for( pi = start_pk; pi < end_pk; ++pi ) { + p_ijk = &( thb_intrs->select.three_body_list[pi] ); + pij = p_ijk->pthb; // pij is pointer to i on j's bond_list + pbond_ij = &( bonds->select.bond_list[pij] ); + bo_ij = &( pbond_ij->bo_data ); + + + if( bo_ij->BO > control->thb_cut/*0*/ ) { + i = p_ijk->thb; + type_i = atoms[i].type; + r_ij = pbond_ij->d; + BOA_ij = bo_ij->BO - control->thb_cut; + + theta_ijk = p_ijk->theta; + sin_ijk = SIN( theta_ijk ); + cos_ijk = COS( theta_ijk ); + //tan_ijk_i = 1. / TAN( theta_ijk ); + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) + tan_ijk_i = cos_ijk / MIN_SINE; + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) + tan_ijk_i = cos_ijk / -MIN_SINE; + else tan_ijk_i = cos_ijk / sin_ijk; + + exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); + exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); + + /* pick l up from j-k intr. 
where k is the centre */ + for( pl = start_pj; pl < end_pj; ++pl ) { + p_jkl = &( thb_intrs->select.three_body_list[pl] ); + l = p_jkl->thb; + plk = p_jkl->pthb; //pointer to l on k's bond_list! + pbond_kl = &( bonds->select.bond_list[plk] ); + bo_kl = &( pbond_kl->bo_data ); + type_l = atoms[l].type; + fbh = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types) ]); + fbp = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); + + if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && + bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ + ++num_frb_intrs; + r_kl = pbond_kl->d; + BOA_kl = bo_kl->BO - control->thb_cut; + + theta_jkl = p_jkl->theta; + sin_jkl = SIN( theta_jkl ); + cos_jkl = COS( theta_jkl ); + //tan_jkl_i = 1. / TAN( theta_jkl ); + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) + tan_jkl_i = cos_jkl / MIN_SINE; + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) + tan_jkl_i = cos_jkl / -MIN_SINE; + else tan_jkl_i = cos_jkl /sin_jkl; + + Sq_Distance_on_T3( atoms[l].x, atoms[i].x, + box, dvec_li ); + r_li = rvec_Norm( dvec_li ); + + + /* omega and its derivative */ + //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, + omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, + r_jk, pbond_kl->dvec, r_kl, + dvec_li, r_li, p_ijk, p_jkl, + dcos_omega_di, dcos_omega_dj, + dcos_omega_dk, dcos_omega_dl, + NULL); //TODO *check* + cos_omega = COS( omega ); + cos2omega = COS( 2. * omega ); + cos3omega = COS( 3. 
* omega ); + /* end omega calculations */ + + /* torsion energy */ + exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk)); + exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); + exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) ); + fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * + (1.0 - exp_tor2_kl); + + CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + + fbp->V2 * exp_tor1 * (1.0 - cos2omega) + + fbp->V3 * (1.0 + cos3omega) ); + //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + + // fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) + + // fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega); + + //PERFORMANCE IMPACT + e_tor = fn10 * sin_ijk * sin_jkl * CV; + //atomicAdd (&data->E_Tor ,e_tor ); + E_Tor [j] += e_tor; + //sh_tor [threadIdx.x] += e_tor; + + dfn11 = (-p_tor3 * exp_tor3_DjDk + + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * + (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv; + + CEtors1 = sin_ijk * sin_jkl * CV; + + CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * + (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * + sin_ijk * sin_jkl; + + CEtors3 = CEtors2 * dfn11; + + CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * + (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); + + CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl); + + CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk); + + cmn = -fn10 * CV; + CEtors7 = cmn * sin_jkl * tan_ijk_i; + CEtors8 = cmn * sin_ijk * tan_jkl_i; + CEtors9 = fn10 * sin_ijk * sin_jkl * + (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + 1.5 * fbp->V3 * (cos2omega + 2. 
* SQR(cos_omega))); + //cmn = -fn10 * CV; + //CEtors7 = cmn * sin_jkl * cos_ijk; + //CEtors8 = cmn * sin_ijk * cos_jkl; + //CEtors9 = fn10 * sin_ijk * sin_jkl * + // (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + // fbp->V3 * (6*SQR(cos_omega) - 1.50)); + /* end of torsion energy */ + + + /* 4-body conjugation energy */ + fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; + //PERFORMANCE IMPACT + e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); + //atomicAdd (&data->E_Con ,e_con ); + E_Con [j] += e_con ; + //sh_con [threadIdx.x] += e_con; + + Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * + (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); + + CEconj1 = Cconj * (BOA_ij - 1.5e0); + CEconj2 = Cconj * (BOA_jk - 1.5e0); + CEconj3 = Cconj * (BOA_kl - 1.5e0); + + CEconj4 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; + CEconj5 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; + //CEconj4 = -fbp->p_cot1 * fn12 * + // (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk; + //CEconj5 = -fbp->p_cot1 * fn12 * + // (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl; + CEconj6 = 2.0 * fbp->p_cot1 * fn12 * + cos_omega * sin_ijk * sin_jkl; + /* end 4-body conjugation energy */ + + //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ", + // workspace->orig_id[i], workspace->orig_id[j], + // workspace->orig_id[k], workspace->orig_id[l], + // omega, cos_omega, cos2omega, cos3omega ); + //fprintf(stdout, + // "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + // CEtors2, CEtors3, CEtors4, CEtors5, + // CEtors6, CEtors7, CEtors8, CEtors9 ); + //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + // theta_ijk, theta_jkl, sin_ijk, + // sin_jkl, cos_jkl, tan_jkl_i ); + + /* forces */ + //PERFORMANCE IMPACT + /* + atomicAdd ( &bo_jk->Cdbopi, CEtors2 ); + atomicAdd ( &workspace->CdDelta[j], CEtors3 ); + atomicAdd ( &workspace->CdDelta[k], CEtors3 ); + atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) ); + atomicAdd ( 
&bo_jk->Cdbo, (CEtors5 + CEconj2) ); + atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) ); + */ + + //PERFORMANCE IMPACT + bo_jk->Cdbopi += CEtors2; + workspace->CdDelta[j] += CEtors3; + pbond_jk->CdDelta_jk += CEtors3; + bo_ij->Cdbo += CEtors4 + CEconj1; + bo_jk->Cdbo += CEtors5 + CEconj2; + + //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE + atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 ); + //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE + + if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) { + /* dcos_theta_ijk */ + //PERFORMANCE IMPACT + atomic_rvecScaledAdd (pbond_ij->i_f, + CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_ScaledAdd( atoms[j].f, + CEtors7 + CEconj4, p_ijk->dcos_dj ); + atomic_rvecScaledAdd( pbond_jk->k_f, + CEtors7 + CEconj4, p_ijk->dcos_di ); + + + /* dcos_theta_jkl */ + //PERFORMANCE IMPACT + rvec_ScaledAdd( atoms[j].f, + CEtors8 + CEconj5, p_jkl->dcos_di ); + atomic_rvecScaledAdd( pbond_jk->i_f, + CEtors8 + CEconj5, p_jkl->dcos_dj ); + atomic_rvecScaledAdd( pbond_kl->k_f, + CEtors8 + CEconj5, p_jkl->dcos_dk ); + + /* dcos_omega */ + //PERFORMANCE IMPACT + atomic_rvecScaledAdd( pbond_ij->i_f, + CEtors9 + CEconj6, dcos_omega_di ); + rvec_ScaledAdd( atoms[j].f, + CEtors9 + CEconj6, dcos_omega_dj ); + atomic_rvecScaledAdd( pbond_jk->i_f, + CEtors9 + CEconj6, dcos_omega_dk ); + atomic_rvecScaledAdd( pbond_kl->k_f, + CEtors9 + CEconj6, dcos_omega_dl ); + } + else { + ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); + + /* dcos_theta_ijk */ + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); + //PERFORMANCE IMPACT + atomic_rvecAdd( pbond_ij->i_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + //PERFORMANCE IMPACT + rvec_ScaledAdd( atoms[j].f, + CEtors7 + CEconj4, p_ijk->dcos_dj ); + + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); + //PERFORMANCE IMPACT + atomic_rvecAdd( 
pbond_jk->i_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + //PERFORMANCE IMPACT + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + + /* dcos_theta_jkl */ + //PERFORMANCE IMPACT + rvec_ScaledAdd( atoms[j].f, + CEtors8 + CEconj5, p_jkl->dcos_di ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); + //PERFORMANCE IMPACT + atomic_rvecAdd( pbond_jk->i_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); + //PERFORMANCE IMPACT + atomic_rvecAdd( pbond_kl->k_f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + + /* dcos_omega */ + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); + //PERFORMANCE IMPACT + atomic_rvecAdd( pbond_ij->i_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + //PERFORMANCE IMPACT + rvec_ScaledAdd( atoms[j].f, + CEtors9 + CEconj6, dcos_omega_dj ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); + //PERFORMANCE IMPACT + atomic_rvecAdd( pbond_jk->i_f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); + //PERFORMANCE IMPACT + atomic_rvecAdd( pbond_kl->k_f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( aux_ext_press [j], ext_press ); + //rvec_Add (sh_press [threadIdx.x], ext_press); + + + /* This part is intended for a fully-flexible box */ + /* rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_dk, // i + CEtors9 + CEconj6, dcos_omega_di ); + rvec_OuterProduct( temp_rtensor, 
+ temp_rvec, system->atoms[i].x ); + rtensor_Copy( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_dj, // j + CEtors8 + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( temp_rvec, + CEtors9 + CEconj6, dcos_omega_dj ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[j].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_di, // k + CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_ScaledAdd( temp_rvec, + CEtors9 + CEconj6, dcos_omega_dk ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[k].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors8 + CEconj5, p_jkl->dcos_dk, // l + CEtors9 + CEconj6, dcos_omega_dl ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[l].x ); + rtensor_Copy( total_rtensor, temp_rtensor ); + + if( pbond_ij->imaginary || pbond_jk->imaginary || + pbond_kl->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } #ifdef TEST_ENERGY - /*fprintf( out_control->etor, - //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - //r_ij, r_jk, r_kl, - "%12.8f%12.8f%12.8f%12.8f\n", - cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/ - // fprintf( out_control->etor, "%12.8f\n", dfn11 ); - /* - fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", - fn10, cos_omega, CV ); - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEtors2, CEtors3, CEtors4, CEtors5, - CEtors6, CEtors7, CEtors8, CEtors9 ); - */ - //end - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ - - /* - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); - */ - //end - /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n", - fbp->V1, 
fbp->V2, fbp->V3, fbp->p_tor1 );*/ - - /* - - fprintf( out_control->etor, - //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", - "%6d%6d%6d%6d%12.8f%12.8f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], workspace->orig_id[l], - e_tor, e_con ); - //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor ); - - fprintf( out_control->econ, - "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], workspace->orig_id[l], - RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, - e_con,data->E_Con ); - */ - //end - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], - (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], - (CEtors7 + CEconj4)*p_ijk->dcos_dk[2], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[2], - (CEtors7 + CEconj4)*p_ijk->dcos_di[0], - (CEtors7 + CEconj4)*p_ijk->dcos_di[1], - (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */ - - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - (CEtors8 + CEconj5)*p_jkl->dcos_di[0], - (CEtors8 + CEconj5)*p_jkl->dcos_di[1], - (CEtors8 + CEconj5)*p_jkl->dcos_di[2], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */ - - /* - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], - dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], - dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2], - dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] ); - */ - //end + /*fprintf( out_control->etor, + //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + //r_ij, 
r_jk, r_kl, + "%12.8f%12.8f%12.8f%12.8f\n", + cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/ + // fprintf( out_control->etor, "%12.8f\n", dfn11 ); + /* + fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", + fn10, cos_omega, CV ); + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEtors2, CEtors3, CEtors4, CEtors5, + CEtors6, CEtors7, CEtors8, CEtors9 ); + */ + //end + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ + + /* + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); + */ + //end + /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n", + fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ + + /* + + fprintf( out_control->etor, + //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", + "%6d%6d%6d%6d%12.8f%12.8f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], workspace->orig_id[l], + e_tor, e_con ); + //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor ); + + fprintf( out_control->econ, + "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], workspace->orig_id[l], + RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, + e_con,data->E_Con ); + */ + //end + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], + (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], + (CEtors7 + CEconj4)*p_ijk->dcos_dk[2], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[2], + (CEtors7 + CEconj4)*p_ijk->dcos_di[0], + (CEtors7 + CEconj4)*p_ijk->dcos_di[1], + (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */ + + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + (CEtors8 + CEconj5)*p_jkl->dcos_di[0], + 
(CEtors8 + CEconj5)*p_jkl->dcos_di[1], + (CEtors8 + CEconj5)*p_jkl->dcos_di[2], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */ + + /* + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], + dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], + dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2], + dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] ); + */ + //end #endif #ifdef TEST_FORCES - /* - // Torsion Forces - Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., - workspace->f_tor, workspace->f_tor); - Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); - Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); - Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); - Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); - Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); - - rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk); - rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj); - rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di); - - rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di); - rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj); - rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk); - - rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl ); - - // Conjugation Forces - Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); - Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); - Add_dBO( system, lists, k, plk, CEconj3, 
workspace->f_con ); - - rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk); - rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj); - rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di); - - rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di); - rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj); - rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk); - - rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl ); - */ - //end + /* + // Torsion Forces + Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., + workspace->f_tor, workspace->f_tor); + Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); + Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); + Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); + Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); + Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); + + rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk); + rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj); + rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di); + + rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di); + rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj); + rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk); + + rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl ); + + // Conjugation Forces + Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); + Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); + Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); + + 
rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk); + rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj); + rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di); + + rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di); + rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj); + rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk); + + rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl ); + */ + //end #endif - } // pl check ends - } // pl loop ends - } // pi check ends - } // pi loop ends - } // k-j neighbor check ends - } // j<k && j-k neighbor check ends + } // pl check ends + } // pl loop ends + } // pi check ends + } // pi loop ends + } // k-j neighbor check ends + } // j<k && j-k neighbor check ends - //pk += blockDim.x; + //pk += blockDim.x; - } // pk loop ends - //} // j loop -- REMOVED FOR CUDA + } // pk loop ends + //} // j loop -- REMOVED FOR CUDA - /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ + /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ #ifdef TEST_FORCES - /* - fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs ); - fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", - data->E_Tor, data->E_Con ); - */ + /* + fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs ); + fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", + data->E_Tor, data->E_Con ); + */ #endif - /* - //do the reduction for the shared memory variables - // now do a reduce inside the warp for E_vdW, E_Ele and force. 
- if (threadIdx.x < 16) { - sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 16]; - sh_con [threadIdx.x] += sh_con [threadIdx.x + 16]; - rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 16]); - } - if (threadIdx.x < 8) { - sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 8]; - sh_con [threadIdx.x] += sh_con [threadIdx.x + 8]; - rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 8]); - } - if (threadIdx.x < 4) { - sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 4]; - sh_con [threadIdx.x] += sh_con [threadIdx.x + 4]; - rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 4]); - } - if (threadIdx.x < 2) { - sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 2]; - sh_con [threadIdx.x] += sh_con [threadIdx.x + 2]; - rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 2]); - } - if (threadIdx.x < 1) { - sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 1]; - sh_con [threadIdx.x] += sh_con [threadIdx.x + 1]; - rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 1]); - } - - if (threadIdx.x == 0) { - E_Tor[j] = sh_tor [threadIdx.x]; - E_Con[j] = sh_con [threadIdx.x]; - rvec_Copy (aux_ext_press[j], sh_press[threadIdx.x]); - } - */ + /* + //do the reduction for the shared memory variables + // now do a reduce inside the warp for E_vdW, E_Ele and force. 
+ if (threadIdx.x < 16) { + sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 16]; + sh_con [threadIdx.x] += sh_con [threadIdx.x + 16]; + rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 16]); + } + if (threadIdx.x < 8) { + sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 8]; + sh_con [threadIdx.x] += sh_con [threadIdx.x + 8]; + rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 8]); + } + if (threadIdx.x < 4) { + sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 4]; + sh_con [threadIdx.x] += sh_con [threadIdx.x + 4]; + rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 4]); + } + if (threadIdx.x < 2) { + sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 2]; + sh_con [threadIdx.x] += sh_con [threadIdx.x + 2]; + rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 2]); + } + if (threadIdx.x < 1) { + sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 1]; + sh_con [threadIdx.x] += sh_con [threadIdx.x + 1]; + rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 1]); + } + + if (threadIdx.x == 0) { + E_Tor[j] = sh_tor [threadIdx.x]; + E_Con[j] = sh_con [threadIdx.x]; + rvec_Copy (aux_ext_press[j], sh_press[threadIdx.x]); + } + */ } GLOBAL void Four_Body_Postprocess ( reax_atom *atoms, - static_storage p_workspace, - list p_bonds, int N ) + static_storage p_workspace, + list p_bonds, int N ) { - int i, pj; + int i, pj; - bond_data *pbond; - bond_data *sym_index_bond; - bond_order_data *bo_data; + bond_data *pbond; + bond_data *sym_index_bond; + bond_order_data *bo_data; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; - i = blockIdx.x * blockDim.x + threadIdx.x; + i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + if ( i >= N) return; - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - bo_data = &pbond->bo_data; - sym_index_bond 
= &( bonds->select.bond_list[ pbond->sym_index ] ); + pbond = &(bonds->select.bond_list[pj]); + bo_data = &pbond->bo_data; + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - workspace->CdDelta [i] += sym_index_bond->CdDelta_jk; + workspace->CdDelta [i] += sym_index_bond->CdDelta_jk; - //bo_data->Cdbo += sym_index_bond->Cdbo_kl; - bo_data->Cdbo += pbond->Cdbo_kl; + //bo_data->Cdbo += sym_index_bond->Cdbo_kl; + bo_data->Cdbo += pbond->Cdbo_kl; - //update f vector - rvec_Add (atoms [i].f, sym_index_bond->i_f ); - rvec_Add (atoms [i].f, sym_index_bond->k_f ); - } + //update f vector + rvec_Add (atoms [i].f, sym_index_bond->i_f ); + rvec_Add (atoms [i].f, sym_index_bond->k_f ); + } } diff --git a/PuReMD-GPU/src/grid.cu b/PuReMD-GPU/src/grid.cu index 27447a1a..00e638f4 100644 --- a/PuReMD-GPU/src/grid.cu +++ b/PuReMD-GPU/src/grid.cu @@ -28,459 +28,459 @@ int Estimate_GCell_Population( reax_system* system ) { - int i, j, k, l; - int max_atoms; - grid *g; - - g = &( system->g ); - Reset_Grid( g ); - - for( l = 0; l < system->N; l++ ) { - i = (int)(system->atoms[l].x[0] * g->inv_len[0]); - j = (int)(system->atoms[l].x[1] * g->inv_len[1]); - k = (int)(system->atoms[l].x[2] * g->inv_len[2]); - g->top[index_grid_3d (i, j, k, g)]++; - // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", - // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2], - // i, j, k ); - } - - max_atoms = 0; - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) - if( max_atoms < g->top[index_grid_3d (i, j, k, g)] ) - max_atoms = g->top[index_grid_3d (i, j, k, g)]; - - return MAX(max_atoms*SAFE_ZONE, MIN_GCELL_POPL); + int i, j, k, l; + int max_atoms; + grid *g; + + g = &( system->g ); + Reset_Grid( g ); + + for( l = 0; l < system->N; l++ ) { + i = (int)(system->atoms[l].x[0] * g->inv_len[0]); + j = (int)(system->atoms[l].x[1] * g->inv_len[1]); + k = (int)(system->atoms[l].x[2] * 
g->inv_len[2]); + g->top[index_grid_3d (i, j, k, g)]++; + // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", + // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2], + // i, j, k ); + } + + max_atoms = 0; + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) + if( max_atoms < g->top[index_grid_3d (i, j, k, g)] ) + max_atoms = g->top[index_grid_3d (i, j, k, g)]; + + return MAX(max_atoms*SAFE_ZONE, MIN_GCELL_POPL); } void Allocate_Space_for_Grid( reax_system *system ) { - int i, j, k, l; - grid *g = &(system->g); - - int total = g->ncell[0] * g->ncell[1] * g->ncell[2]; - - g = &(system->g); - g->max_nbrs = (2*g->spread[0]+1) * (2*g->spread[1]+1) * (2*g->spread[2]+1)+3; - - /* allocate space for the new grid */ - g->top = (int*) calloc( total, sizeof( int )); - g->mark = (int*) calloc( total, sizeof( int )); - g->start = (int*) calloc( total, sizeof( int )); - g->end = (int*) calloc( total, sizeof( int )); - g->nbrs = (ivec*) calloc( total * g->max_nbrs, sizeof( ivec )); - g->nbrs_cp = (rvec*) calloc( total * g->max_nbrs, sizeof( rvec )); - - for( i = 0; i < g->ncell[0]; i++ ) { - for( j = 0; j < g->ncell[1]; j++ ) { - for( k = 0; k < g->ncell[2]; k++ ) { - for( l = 0; l < g->max_nbrs; ++l ){ - g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][0] = -1; - g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][1] = -1; - g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][2] = -1; - - g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][0] = -1; - g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][1] = -1; - g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][2] = -1; - } - } - } - } - - g->max_atoms = Estimate_GCell_Population( system ); - - g->atoms = (int*) calloc( total * g->max_atoms, sizeof( int )); + int i, j, k, l; + grid *g = &(system->g); + + int total = g->ncell[0] * g->ncell[1] * g->ncell[2]; + + g = &(system->g); + g->max_nbrs = (2*g->spread[0]+1) * (2*g->spread[1]+1) * (2*g->spread[2]+1)+3; + + /* allocate 
space for the new grid */ + g->top = (int*) calloc( total, sizeof( int )); + g->mark = (int*) calloc( total, sizeof( int )); + g->start = (int*) calloc( total, sizeof( int )); + g->end = (int*) calloc( total, sizeof( int )); + g->nbrs = (ivec*) calloc( total * g->max_nbrs, sizeof( ivec )); + g->nbrs_cp = (rvec*) calloc( total * g->max_nbrs, sizeof( rvec )); + + for( i = 0; i < g->ncell[0]; i++ ) { + for( j = 0; j < g->ncell[1]; j++ ) { + for( k = 0; k < g->ncell[2]; k++ ) { + for( l = 0; l < g->max_nbrs; ++l ){ + g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][0] = -1; + g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][1] = -1; + g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][2] = -1; + + g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][0] = -1; + g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][1] = -1; + g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][2] = -1; + } + } + } + } + + g->max_atoms = Estimate_GCell_Population( system ); + + g->atoms = (int*) calloc( total * g->max_atoms, sizeof( int )); } void Deallocate_Grid_Space( grid *g ) { - free( g->atoms ); - free( g->top ); - free( g->mark ); - free( g->nbrs ); - free( g->nbrs_cp ); + free( g->atoms ); + free( g->top ); + free( g->mark ); + free( g->nbrs ); + free( g->nbrs_cp ); } int Shift(int p, int dp, int dim, grid *g ) { - int dim_len = 0; - int newp = p + dp; - - switch( dim ) { - case 0: dim_len = g->ncell[0]; - break; - case 1: dim_len = g->ncell[1]; - break; - case 2: dim_len = g->ncell[2]; - } - - while( newp < 0 ) newp = newp + dim_len; - while( newp >= dim_len ) newp = newp - dim_len; - return newp; + int dim_len = 0; + int newp = p + dp; + + switch( dim ) { + case 0: dim_len = g->ncell[0]; + break; + case 1: dim_len = g->ncell[1]; + break; + case 2: dim_len = g->ncell[2]; + } + + while( newp < 0 ) newp = newp + dim_len; + while( newp >= dim_len ) newp = newp - dim_len; + return newp; } /* finds the closest point between two grid cells denoted by c1 and c2. 
periodic boundary conditions are taken into consideration as well. */ void Find_Closest_Point( grid *g, int c1x, int c1y, int c1z, - int c2x, int c2y, int c2z, rvec closest_point ) + int c2x, int c2y, int c2z, rvec closest_point ) { - int i, d; - ivec c1 = { c1x, c1y, c1z }; - ivec c2 = { c2x, c2y, c2z }; - - for( i = 0; i < 3; i++ ) { - if( g->ncell[i] < 5 ) { - closest_point[i] = NEG_INF - 1.; - continue; - } - - d = c2[i] - c1[i]; - if( abs(d) <= g->ncell[i] / 2 ) { - if( d > 0 ) - closest_point[i] = c2[i] * g->len[i]; - else if ( d == 0 ) - closest_point[i] = NEG_INF - 1.; - else - closest_point[i] = ( c2[i] + 1 ) * g->len[i]; - } - else { - if( d > 0 ) - closest_point[i] = ( c2[i] - g->ncell[i] + 1 ) * g->len[i]; - else - closest_point[i] = ( c2[i] + g->ncell[i] ) * g->len[i]; - } - } + int i, d; + ivec c1 = { c1x, c1y, c1z }; + ivec c2 = { c2x, c2y, c2z }; + + for( i = 0; i < 3; i++ ) { + if( g->ncell[i] < 5 ) { + closest_point[i] = NEG_INF - 1.; + continue; + } + + d = c2[i] - c1[i]; + if( abs(d) <= g->ncell[i] / 2 ) { + if( d > 0 ) + closest_point[i] = c2[i] * g->len[i]; + else if ( d == 0 ) + closest_point[i] = NEG_INF - 1.; + else + closest_point[i] = ( c2[i] + 1 ) * g->len[i]; + } + else { + if( d > 0 ) + closest_point[i] = ( c2[i] - g->ncell[i] + 1 ) * g->len[i]; + else + closest_point[i] = ( c2[i] + g->ncell[i] ) * g->len[i]; + } + } } void Find_Neighbor_GridCells( grid *g ) { - int i, j, k; - int di, dj, dk; - int x, y, z; - int stack_top; - ivec *nbrs_stack; - rvec *cp_stack; - - /* pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs_stack = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] ); - cp_stack = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] ); - stack_top = 0; - //fprintf( stderr, "grid1: %d %d %d\n", i, j, k ); - - /* choose an unmarked neighbor cell*/ - for( di = -g->spread[0]; di <= g->spread[0]; di++ ) { - x = Shift( i, di, 0, g ); - - for( 
dj = -g->spread[1]; dj <= g->spread[1]; dj++ ) { - y = Shift( j, dj, 1, g ); - - for( dk = -g->spread[2]; dk <= g->spread[2]; dk++ ) { - z = Shift( k, dk, 2, g ); - //fprintf( stderr, "\tgrid2: %d %d %d\n", x, y, z ); - - if( !g->mark[ index_grid_3d (x, y, z, g) ] ) { - /*(di < 0 || // 9 combinations - (di == 0 && dj < 0) || // 3 combinations - (di == 0 && dj == 0 && dk < 0) ) )*/ - /* put the neighbor cell into the stack and mark it */ - nbrs_stack[stack_top][0] = x; - nbrs_stack[stack_top][1] = y; - nbrs_stack[stack_top][2] = z; - g->mark[ index_grid_3d(x,y,z,g) ] = 1; - - Find_Closest_Point( g, i, j, k, x, y, z, cp_stack[stack_top] ); - //fprintf( stderr, "\tcp: %lf %lf %lf\n", - // cp_stack[stack_top][0], cp_stack[stack_top][1], - // cp_stack[stack_top][2]); - stack_top++; - } - } - } - } - - /*nbrs_stack[stack_top][0] = i; - nbrs_stack[stack_top][1] = j; - nbrs_stack[stack_top][2] = k; - Find_Closest_Point( g, i, j, k, i, j, k, cp_stack[stack_top] ); - nbrs_stack[stack_top+1][0] = -1; - nbrs_stack[stack_top+1][1] = -1; - nbrs_stack[stack_top+1][2] = -1; - Reset_Marks( g, nbrs_stack, stack_top+1 );*/ - nbrs_stack[stack_top][0] = -1; - nbrs_stack[stack_top][1] = -1; - nbrs_stack[stack_top][2] = -1; - Reset_Marks( g, nbrs_stack, stack_top ); - } + int i, j, k; + int di, dj, dk; + int x, y, z; + int stack_top; + ivec *nbrs_stack; + rvec *cp_stack; + + /* pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs_stack = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] ); + cp_stack = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] ); + stack_top = 0; + //fprintf( stderr, "grid1: %d %d %d\n", i, j, k ); + + /* choose an unmarked neighbor cell*/ + for( di = -g->spread[0]; di <= g->spread[0]; di++ ) { + x = Shift( i, di, 0, g ); + + for( dj = -g->spread[1]; dj <= g->spread[1]; dj++ ) { + y = Shift( j, dj, 1, g ); + + for( dk = -g->spread[2]; dk <= g->spread[2]; dk++ ) { + z = 
Shift( k, dk, 2, g ); + //fprintf( stderr, "\tgrid2: %d %d %d\n", x, y, z ); + + if( !g->mark[ index_grid_3d (x, y, z, g) ] ) { + /*(di < 0 || // 9 combinations + (di == 0 && dj < 0) || // 3 combinations + (di == 0 && dj == 0 && dk < 0) ) )*/ + /* put the neighbor cell into the stack and mark it */ + nbrs_stack[stack_top][0] = x; + nbrs_stack[stack_top][1] = y; + nbrs_stack[stack_top][2] = z; + g->mark[ index_grid_3d(x,y,z,g) ] = 1; + + Find_Closest_Point( g, i, j, k, x, y, z, cp_stack[stack_top] ); + //fprintf( stderr, "\tcp: %lf %lf %lf\n", + // cp_stack[stack_top][0], cp_stack[stack_top][1], + // cp_stack[stack_top][2]); + stack_top++; + } + } + } + } + + /*nbrs_stack[stack_top][0] = i; + nbrs_stack[stack_top][1] = j; + nbrs_stack[stack_top][2] = k; + Find_Closest_Point( g, i, j, k, i, j, k, cp_stack[stack_top] ); + nbrs_stack[stack_top+1][0] = -1; + nbrs_stack[stack_top+1][1] = -1; + nbrs_stack[stack_top+1][2] = -1; + Reset_Marks( g, nbrs_stack, stack_top+1 );*/ + nbrs_stack[stack_top][0] = -1; + nbrs_stack[stack_top][1] = -1; + nbrs_stack[stack_top][2] = -1; + Reset_Marks( g, nbrs_stack, stack_top ); + } } void Setup_Grid( reax_system* system ) { - int d; - ivec ncell; - grid *g = &( system->g ); - simulation_box *my_box = &( system->box ); + int d; + ivec ncell; + grid *g = &( system->g ); + simulation_box *my_box = &( system->box ); - /* determine number of grid cells in each direction */ - ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms ); + /* determine number of grid cells in each direction */ + ivec_rScale( ncell, 1. 
/ g->cell_size, my_box->box_norms ); - for( d = 0; d < 3; ++d ) - if( ncell[d] <= 0 ) - ncell[d] = 1; + for( d = 0; d < 3; ++d ) + if( ncell[d] <= 0 ) + ncell[d] = 1; - /* find the number of grid cells */ - g->total = ncell[0] * ncell[1] * ncell[2]; - ivec_Copy( g->ncell, ncell ); + /* find the number of grid cells */ + g->total = ncell[0] * ncell[1] * ncell[2]; + ivec_Copy( g->ncell, ncell ); - /* compute cell lengths */ - rvec_iDivide( g->len, my_box->box_norms, g->ncell ); - rvec_Invert( g->inv_len, g->len ); + /* compute cell lengths */ + rvec_iDivide( g->len, my_box->box_norms, g->ncell ); + rvec_Invert( g->inv_len, g->len ); - Allocate_Space_for_Grid( system ); - Find_Neighbor_GridCells( g ); + Allocate_Space_for_Grid( system ); + Find_Neighbor_GridCells( g ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "setting up the grid: " ); - fprintf( stderr, "ncell[%d %d %d] ", g->ncell[0], g->ncell[1], g->ncell[2] ); - fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", g->len[0], g->len[1], g->len[2] ); - fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms ); + fprintf( stderr, "setting up the grid: " ); + fprintf( stderr, "ncell[%d %d %d] ", g->ncell[0], g->ncell[1], g->ncell[2] ); + fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", g->len[0], g->len[1], g->len[2] ); + fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms ); #endif } void Update_Grid( reax_system* system ) { - int d, i, j, k, x, y, z, itr; - ivec ncell; - ivec *nbrs; - rvec *nbrs_cp; - grid *g = &( system->g ); - simulation_box *my_box = &( system->box ); - - /* determine number of grid cells in each direction */ - ivec_rScale( ncell, 1. 
/ g->cell_size, my_box->box_norms ); - - for( d = 0; d < 3; ++d ) - if( ncell[d] == 0 ) - ncell[d] = 1; - - if( ivec_isEqual( ncell, g->ncell ) ) {/* ncell are unchanged */ - /* update cell lengths */ - rvec_iDivide( g->len, my_box->box_norms, g->ncell ); - rvec_Invert( g->inv_len, g->len ); - - /* update closest point distances between gcells */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] ); - nbrs_cp = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] ); - //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - itr = 0; - while( nbrs[itr][0] >= 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - - Find_Closest_Point( g, i, j, k, x, y, z, nbrs_cp[itr] ); - ++itr; - } - } - } - else{ /* at least one of ncell has changed */ - Deallocate_Grid_Space( g ); - /* update number of grid cells */ - g->total = ncell[0] * ncell[1] * ncell[2]; - ivec_Copy( g->ncell, ncell ); - /* update cell lengths */ - rvec_iDivide( g->len, my_box->box_norms, g->ncell ); - rvec_Invert( g->inv_len, g->len ); - - Allocate_Space_for_Grid( system ); - Find_Neighbor_GridCells( g ); + int d, i, j, k, x, y, z, itr; + ivec ncell; + ivec *nbrs; + rvec *nbrs_cp; + grid *g = &( system->g ); + simulation_box *my_box = &( system->box ); + + /* determine number of grid cells in each direction */ + ivec_rScale( ncell, 1. 
/ g->cell_size, my_box->box_norms ); + + for( d = 0; d < 3; ++d ) + if( ncell[d] == 0 ) + ncell[d] = 1; + + if( ivec_isEqual( ncell, g->ncell ) ) {/* ncell are unchanged */ + /* update cell lengths */ + rvec_iDivide( g->len, my_box->box_norms, g->ncell ); + rvec_Invert( g->inv_len, g->len ); + + /* update closest point distances between gcells */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] ); + nbrs_cp = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] ); + //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + itr = 0; + while( nbrs[itr][0] >= 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + + Find_Closest_Point( g, i, j, k, x, y, z, nbrs_cp[itr] ); + ++itr; + } + } + } + else{ /* at least one of ncell has changed */ + Deallocate_Grid_Space( g ); + /* update number of grid cells */ + g->total = ncell[0] * ncell[1] * ncell[2]; + ivec_Copy( g->ncell, ncell ); + /* update cell lengths */ + rvec_iDivide( g->len, my_box->box_norms, g->ncell ); + rvec_Invert( g->inv_len, g->len ); + + Allocate_Space_for_Grid( system ); + Find_Neighbor_GridCells( g ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "updated grid: " ); - fprintf( stderr, "ncell[%d %d %d] ", - g->ncell[0], g->ncell[1], g->ncell[2] ); - fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", - g->len[0], g->len[1], g->len[2] ); - fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms ); + fprintf( stderr, "updated grid: " ); + fprintf( stderr, "ncell[%d %d %d] ", + g->ncell[0], g->ncell[1], g->ncell[2] ); + fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", + g->len[0], g->len[1], g->len[2] ); + fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms ); #endif - } + } } void Bin_Atoms( reax_system* system, static_storage *workspace ) { - int i, j, k, l; - int max_atoms; - grid *g = &( system->g ); + int i, j, k, l; + int max_atoms; + grid *g = &( system->g ); - Reset_Grid( g ); + Reset_Grid( g 
); - for( l = 0; l < system->N; l++ ) { - i = (int)(system->atoms[l].x[0] * g->inv_len[0]); - j = (int)(system->atoms[l].x[1] * g->inv_len[1]); - k = (int)(system->atoms[l].x[2] * g->inv_len[2]); + for( l = 0; l < system->N; l++ ) { + i = (int)(system->atoms[l].x[0] * g->inv_len[0]); + j = (int)(system->atoms[l].x[1] * g->inv_len[1]); + k = (int)(system->atoms[l].x[2] * g->inv_len[2]); #ifdef __BNVT_FIX__ - if (i >= g->ncell[0]) i = g->ncell[0]-1; - if (j >= g->ncell[1]) j = g->ncell[1]-1; - if (k >= g->ncell[2]) k = g->ncell[2]-1; + if (i >= g->ncell[0]) i = g->ncell[0]-1; + if (j >= g->ncell[1]) j = g->ncell[1]-1; + if (k >= g->ncell[2]) k = g->ncell[2]-1; #endif - g->atoms[ index_grid_atoms (i,j,k,g->top[ index_grid_3d (i,j,k,g) ], g) ] = l; - g->top[index_grid_3d (i,j,k,g) ]++; + g->atoms[ index_grid_atoms (i,j,k,g->top[ index_grid_3d (i,j,k,g) ], g) ] = l; + g->top[index_grid_3d (i,j,k,g) ]++; - //fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", - //l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2], - //i, j, k ); - } + //fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", + //l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2], + //i, j, k ); + } - max_atoms = 0; - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) - if( max_atoms < g->top[ index_grid_3d (i, j, k, g) ] ) - max_atoms = g->top[ index_grid_3d (i, j, k, g) ]; + max_atoms = 0; + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) + if( max_atoms < g->top[ index_grid_3d (i, j, k, g) ] ) + max_atoms = g->top[ index_grid_3d (i, j, k, g) ]; - /* check if current gcell->max_atoms is safe */ - if( max_atoms >= g->max_atoms * SAFE_ZONE ) - workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); + /* check if current gcell->max_atoms is safe */ + if( max_atoms >= g->max_atoms * SAFE_ZONE ) + 
workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); } void Cuda_Bin_Atoms (reax_system *system, static_storage *workspace ) { - Cuda_Reset_Grid ( &system->d_g); + Cuda_Reset_Grid ( &system->d_g); - Bin_Atoms ( system, workspace ); + Bin_Atoms ( system, workspace ); - dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms; + dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms; } void Cuda_Bin_Atoms_Sync (reax_system *system) { - copy_host_device (system->g.top, system->d_g.top, - INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP); + copy_host_device (system->g.top, system->d_g.top, + INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP); - copy_host_device (system->g.atoms, system->d_g.atoms, - INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS); + copy_host_device (system->g.atoms, system->d_g.atoms, + INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS); } inline void reax_atom_Copy( reax_atom *dest, reax_atom *src ) { - dest->type = src->type; - rvec_Copy( dest->x, src->x ); - rvec_Copy( dest->v, src->v ); - strcpy( dest->name, src->name ); + dest->type = src->type; + rvec_Copy( dest->x, src->x ); + rvec_Copy( dest->v, src->v ); + strcpy( dest->name, src->name ); } void Copy_Storage( reax_system *system, static_storage *workspace, - int top, int old_id, int old_type, - int *num_H, real *v, real *s, real *t, - int *orig_id, rvec *f_old ) + int top, int old_id, int old_type, + int *num_H, real *v, real *s, real *t, + int *orig_id, rvec *f_old ) { - int i; + int i; - for( i = 0; i < RESTART+1; ++i ) - v[ index_wkspace_sys (i,top, system) ] = workspace->v[ index_wkspace_sys (i,old_id, system) ]; + for( i = 0; i < RESTART+1; ++i ) + v[ index_wkspace_sys (i,top, 
system) ] = workspace->v[ index_wkspace_sys (i,old_id, system) ]; - for( i = 0; i < 3; ++i ) { - s[ index_wkspace_sys (i,top, system) ] = workspace->s[ index_wkspace_sys (i,old_id, system) ]; - t[ index_wkspace_sys (i,top, system) ] = workspace->t[ index_wkspace_sys (i,old_id, system) ]; - } + for( i = 0; i < 3; ++i ) { + s[ index_wkspace_sys (i,top, system) ] = workspace->s[ index_wkspace_sys (i,old_id, system) ]; + t[ index_wkspace_sys (i,top, system) ] = workspace->t[ index_wkspace_sys (i,old_id, system) ]; + } - orig_id[top] = workspace->orig_id[old_id]; + orig_id[top] = workspace->orig_id[old_id]; - workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta; - workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi; - workspace->b_t[top] = -1.0; + workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta; + workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi; + workspace->b_t[top] = -1.0; - if( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom - workspace->hbond_index[top] = (*num_H)++; - else workspace->hbond_index[top] = -1; + if( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom + workspace->hbond_index[top] = (*num_H)++; + else workspace->hbond_index[top] = -1; - rvec_Copy( f_old[top], workspace->f_old[old_id] ); + rvec_Copy( f_old[top], workspace->f_old[old_id] ); } void Free_Storage( static_storage *workspace ) { - free( workspace->v ); - free( workspace->s ); - free( workspace->t ); - free( workspace->orig_id ); + free( workspace->v ); + free( workspace->s ); + free( workspace->t ); + free( workspace->orig_id ); } void Assign_New_Storage( static_storage *workspace, - real *v, real *s, real *t, - int *orig_id, rvec *f_old ) + real *v, real *s, real *t, + int *orig_id, rvec *f_old ) { - workspace->v = v; + workspace->v = v; - workspace->s = s; - workspace->t = t; + workspace->s = s; + workspace->t = t; - workspace->orig_id = orig_id; + workspace->orig_id = orig_id; - workspace->f_old = f_old; + workspace->f_old = 
f_old; } void Cluster_Atoms( reax_system *system, static_storage *workspace ) { - int i, j, k, l, top, old_id, num_H = 0; - reax_atom *old_atom; - grid *g = &( system->g ); - reax_atom *new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) ); - int *orig_id = (int *) calloc( system->N, sizeof( int ) ); - real *v; - real *s, *t; - rvec *f_old = (rvec*) calloc( system->N, sizeof(rvec) ); - - s = (real*) calloc( 3, sizeof( real ) * system->N ); - t = (real*) calloc( 3, sizeof( real ) * system->N ); - v = (real*) calloc( RESTART+1, sizeof( real ) * system->N ); - - top = 0; - - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - g->start[ index_grid_3d (i, j, k, g) ] = top; - - for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l ) { - old_id = g->atoms[ index_grid_atoms (i, j, k, l, g) ]; - old_atom = &( system->atoms[old_id] ); - // fprintf( stderr, "%d <-- %d\n", top, old_id ); - - reax_atom_Copy( &(new_atoms[top]), old_atom ); - Copy_Storage( system, workspace, top, old_id, old_atom->type, - &num_H, v, s, t, orig_id, f_old ); - ++top; - } - - g->end[ index_grid_3d (i, j, k, g) ] = top; - } - - - free( system->atoms ); - Free_Storage( workspace ); - - system->atoms = new_atoms; - Assign_New_Storage( workspace, v, s, t, orig_id, f_old ); + int i, j, k, l, top, old_id, num_H = 0; + reax_atom *old_atom; + grid *g = &( system->g ); + reax_atom *new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) ); + int *orig_id = (int *) calloc( system->N, sizeof( int ) ); + real *v; + real *s, *t; + rvec *f_old = (rvec*) calloc( system->N, sizeof(rvec) ); + + s = (real*) calloc( 3, sizeof( real ) * system->N ); + t = (real*) calloc( 3, sizeof( real ) * system->N ); + v = (real*) calloc( RESTART+1, sizeof( real ) * system->N ); + + top = 0; + + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + g->start[ index_grid_3d (i, j, k, g) ] = top; + 
+ for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l ) { + old_id = g->atoms[ index_grid_atoms (i, j, k, l, g) ]; + old_atom = &( system->atoms[old_id] ); + // fprintf( stderr, "%d <-- %d\n", top, old_id ); + + reax_atom_Copy( &(new_atoms[top]), old_atom ); + Copy_Storage( system, workspace, top, old_id, old_atom->type, + &num_H, v, s, t, orig_id, f_old ); + ++top; + } + + g->end[ index_grid_3d (i, j, k, g) ] = top; + } + + + free( system->atoms ); + Free_Storage( workspace ); + + system->atoms = new_atoms; + Assign_New_Storage( workspace, v, s, t, orig_id, f_old ); } diff --git a/PuReMD-GPU/src/helpers.cu b/PuReMD-GPU/src/helpers.cu index 82c8e248..29ae31e3 100644 --- a/PuReMD-GPU/src/helpers.cu +++ b/PuReMD-GPU/src/helpers.cu @@ -24,12 +24,12 @@ GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box *box, real d1, real d2, real d3) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - rvec dx; - dx[0] = d1; - dx[1] = d2; - dx[2] = d3; + int index = blockIdx.x * blockDim.x + threadIdx.x; + rvec dx; + dx[0] = d1; + dx[1] = d2; + dx[2] = d3; - if (index < N ) - Inc_on_T3( atoms[index].x, dx, box ); + if (index < N ) + Inc_on_T3( atoms[index].x, dx, box ); } diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu index 3c8ace27..e1912d3c 100644 --- a/PuReMD-GPU/src/init_md.cu +++ b/PuReMD-GPU/src/init_md.cu @@ -41,1321 +41,1321 @@ #include "helpers.h" #include "reduction.h" -#include "index_utils.h" +#include "index_utils.h" #include "validation.h" void Generate_Initial_Velocities(reax_system *system, real T ) { - int i; - real scale, norm; + int i; + real scale, norm; - if( T <= 0.1 ) { - for (i=0; i < system->N; i++) - rvec_MakeZero( system->atoms[i].v ); + if( T <= 0.1 ) { + for (i=0; i < system->N; i++) + rvec_MakeZero( system->atoms[i].v ); #if defined(DEBUG) - fprintf( stderr, "no random velocities...\n" ); + fprintf( stderr, "no random velocities...\n" ); #endif - } - else { - for( i = 0; i < system->N; i++ ) { - 
rvec_Random( system->atoms[i].v ); - - norm = rvec_Norm_Sqr( system->atoms[i].v ); - scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * - norm / (3.0 * K_B * T) ); - - rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v ); - - /* - fprintf( stderr, "v = %f %f %f\n", - system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); - fprintf( stderr, "scale = %f\n", scale ); - fprintf( stderr, "v = %f %f %f\n", - system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); - */ - } - } + } + else { + for( i = 0; i < system->N; i++ ) { + rvec_Random( system->atoms[i].v ); + + norm = rvec_Norm_Sqr( system->atoms[i].v ); + scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * + norm / (3.0 * K_B * T) ); + + rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v ); + + /* + fprintf( stderr, "v = %f %f %f\n", + system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); + fprintf( stderr, "scale = %f\n", scale ); + fprintf( stderr, "v = %f %f %f\n", + system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); + */ + } + } } void Init_System( reax_system *system, control_params *control, - simulation_data *data ) + simulation_data *data ) { - int i; - rvec dx; - - if( !control->restart ) - Reset_Atoms( system ); - - Compute_Total_Mass( system, data ); - - Compute_Center_of_Mass( system, data, stderr ); - - /* reposition atoms */ - // just fit the atoms to the periodic box - if( control->reposition_atoms == 0 ) { - rvec_MakeZero( dx ); - } - // put the center of mass to the center of the box - else if( control->reposition_atoms == 1 ) { - rvec_Scale( dx, 0.5, system->box.box_norms ); - rvec_ScaledAdd( dx, -1., data->xcm ); - } - // put the center of mass to the origin - else if( control->reposition_atoms == 2 ) { - rvec_Scale( dx, -1., data->xcm ); - } - else { - fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. 
Terminating...\n" ); - exit( UNKNOWN_OPTION ); - } - - for( i = 0; i < system->N; ++i ) { - Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); - /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", - i, system->atoms[i].type, - system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/ - } - - /* Initialize velocities so that desired init T can be attained */ - if( !control->restart || (control->restart && control->random_vel) ) { - Generate_Initial_Velocities( system, control->T_init ); - } - - Setup_Grid( system ); + int i; + rvec dx; + + if( !control->restart ) + Reset_Atoms( system ); + + Compute_Total_Mass( system, data ); + + Compute_Center_of_Mass( system, data, stderr ); + + /* reposition atoms */ + // just fit the atoms to the periodic box + if( control->reposition_atoms == 0 ) { + rvec_MakeZero( dx ); + } + // put the center of mass to the center of the box + else if( control->reposition_atoms == 1 ) { + rvec_Scale( dx, 0.5, system->box.box_norms ); + rvec_ScaledAdd( dx, -1., data->xcm ); + } + // put the center of mass to the origin + else if( control->reposition_atoms == 2 ) { + rvec_Scale( dx, -1., data->xcm ); + } + else { + fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. 
Terminating...\n" ); + exit( UNKNOWN_OPTION ); + } + + for( i = 0; i < system->N; ++i ) { + Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); + /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", + i, system->atoms[i].type, + system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/ + } + + /* Initialize velocities so that desired init T can be attained */ + if( !control->restart || (control->restart && control->random_vel) ) { + Generate_Initial_Velocities( system, control->T_init ); + } + + Setup_Grid( system ); } void Cuda_Init_System( reax_system *system, control_params *control, - simulation_data *data ) + simulation_data *data ) { - int i; - rvec dx; - - if( !control->restart ) - Cuda_Reset_Atoms( system ); - - Cuda_Compute_Total_Mass( system, data ); - - Cuda_Compute_Center_of_Mass( system, data, stderr ); - - /* reposition atoms */ - // just fit the atoms to the periodic box - if( control->reposition_atoms == 0 ) { - rvec_MakeZero( dx ); - } - // put the center of mass to the center of the box - else if( control->reposition_atoms == 1 ) { - rvec_Scale( dx, 0.5, system->box.box_norms ); - rvec_ScaledAdd( dx, -1., data->xcm ); - } - // put the center of mass to the origin - else if( control->reposition_atoms == 2 ) { - rvec_Scale( dx, -1., data->xcm ); - } - else { - fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. 
Terminating...\n" ); - exit( UNKNOWN_OPTION ); - } - - compute_Inc_on_T3 <<<BLOCKS_POW_2, BLOCK_SIZE>>> - (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]); - cudaThreadSynchronize (); - cudaCheckError (); - - //copy back the atoms from device to the host - copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , - cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); - - /* Initialize velocities so that desired init T can be attained */ - if( !control->restart || (control->restart && control->random_vel) ) { - Generate_Initial_Velocities( system, control->T_init ); - } - - Setup_Grid( system ); + int i; + rvec dx; + + if( !control->restart ) + Cuda_Reset_Atoms( system ); + + Cuda_Compute_Total_Mass( system, data ); + + Cuda_Compute_Center_of_Mass( system, data, stderr ); + + /* reposition atoms */ + // just fit the atoms to the periodic box + if( control->reposition_atoms == 0 ) { + rvec_MakeZero( dx ); + } + // put the center of mass to the center of the box + else if( control->reposition_atoms == 1 ) { + rvec_Scale( dx, 0.5, system->box.box_norms ); + rvec_ScaledAdd( dx, -1., data->xcm ); + } + // put the center of mass to the origin + else if( control->reposition_atoms == 2 ) { + rvec_Scale( dx, -1., data->xcm ); + } + else { + fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. 
Terminating...\n" ); + exit( UNKNOWN_OPTION ); + } + + compute_Inc_on_T3 <<<BLOCKS_POW_2, BLOCK_SIZE>>> + (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]); + cudaThreadSynchronize (); + cudaCheckError (); + + //copy back the atoms from device to the host + copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , + cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); + + /* Initialize velocities so that desired init T can be attained */ + if( !control->restart || (control->restart && control->random_vel) ) { + Generate_Initial_Velocities( system, control->T_init ); + } + + Setup_Grid( system ); } void Init_Simulation_Data( reax_system *system, control_params *control, - simulation_data *data, output_controls *out_control, - evolve_function *Evolve ) + simulation_data *data, output_controls *out_control, + evolve_function *Evolve ) { - Reset_Simulation_Data( data ); + Reset_Simulation_Data( data ); - if( !control->restart ) - data->step = data->prev_steps = 0; + if( !control->restart ) + data->step = data->prev_steps = 0; - switch( control->ensemble ) { - case NVE: - data->N_f = 3 * system->N; - *Evolve = Velocity_Verlet_NVE; - break; + switch( control->ensemble ) { + case NVE: + data->N_f = 3 * system->N; + *Evolve = Velocity_Verlet_NVE; + break; - case NVT: - data->N_f = 3 * system->N + 1; - //control->Tau_T = 100 * data->N_f * K_B * control->T_final; - if( !control->restart || (control->restart && control->random_vel) ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->therm.v_xi_old = 0; - data->therm.xi = 0; + case NVT: + data->N_f = 3 * system->N + 1; + //control->Tau_T = 100 * data->N_f * K_B * control->T_final; + if( !control->restart || (control->restart && control->random_vel) ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; 
+ data->therm.v_xi_old = 0; + data->therm.xi = 0; #if defined(DEBUG_FOCUS) - fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", - data->therm.G_xi, control->Tau_T, data->E_Kin, - data->N_f, data->therm.v_xi ); + fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", + data->therm.G_xi, control->Tau_T, data->E_Kin, + data->N_f, data->therm.v_xi ); #endif - } - - *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein; - break; - - - case NPT: // Anisotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 9; - if( !control->restart ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->iso_bar.eps = 0.33333 * log(system->box.volume); - //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P)); - //Compute_Pressure( system, data, workspace ); - } - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - - case sNPT: // Semi-Isotropic NPT - data->N_f = 3 * system->N + 4; - *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; - break; - - - case iNPT: // Isotropic NPT - data->N_f = 3 * system->N + 2; - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - case bNVT: //berendensen NVT - data->N_f = 3 * system->N + 1; - *Evolve = Velocity_Verlet_Berendsen_NVT; - break; - - default: - break; - } - - Compute_Kinetic_Energy( system, data ); - - /* init timing info for the host*/ - data->timing.start = Get_Time( ); - data->timing.total = data->timing.start; - data->timing.nbrs = 0; - data->timing.init_forces = 0; - data->timing.bonded = 0; - data->timing.nonb = 0; - data->timing.QEq = 0; - data->timing.matvecs = 0; + } + + *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein; + break; + + + case NPT: // Anisotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! 
TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 9; + if( !control->restart ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->iso_bar.eps = 0.33333 * log(system->box.volume); + //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P)); + //Compute_Pressure( system, data, workspace ); + } + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + + case sNPT: // Semi-Isotropic NPT + data->N_f = 3 * system->N + 4; + *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; + break; + + + case iNPT: // Isotropic NPT + data->N_f = 3 * system->N + 2; + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + case bNVT: //berendensen NVT + data->N_f = 3 * system->N + 1; + *Evolve = Velocity_Verlet_Berendsen_NVT; + break; + + default: + break; + } + + Compute_Kinetic_Energy( system, data ); + + /* init timing info for the host*/ + data->timing.start = Get_Time( ); + data->timing.total = data->timing.start; + data->timing.nbrs = 0; + data->timing.init_forces = 0; + data->timing.bonded = 0; + data->timing.nonb = 0; + data->timing.QEq = 0; + data->timing.matvecs = 0; } void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, - simulation_data *data, output_controls *out_control, - evolve_function *Evolve ) + simulation_data *data, output_controls *out_control, + evolve_function *Evolve ) { - Reset_Simulation_Data( data ); + Reset_Simulation_Data( data ); - if( !control->restart ) - data->step = data->prev_steps = 0; + if( !control->restart ) + data->step = data->prev_steps = 0; - switch( control->ensemble ) { - case NVE: - data->N_f = 3 * system->N; - *Evolve = Cuda_Velocity_Verlet_NVE; - break; + switch( control->ensemble ) { + case NVE: + data->N_f = 3 * system->N; + *Evolve = Cuda_Velocity_Verlet_NVE; + break; - case NVT: - data->N_f = 3 * system->N + 1; - //control->Tau_T = 100 * data->N_f * K_B 
* control->T_final; - if( !control->restart || (control->restart && control->random_vel) ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->therm.v_xi_old = 0; - data->therm.xi = 0; + case NVT: + data->N_f = 3 * system->N + 1; + //control->Tau_T = 100 * data->N_f * K_B * control->T_final; + if( !control->restart || (control->restart && control->random_vel) ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->therm.v_xi_old = 0; + data->therm.xi = 0; #if defined(DEBUG_FOCUS) - fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", - data->therm.G_xi, control->Tau_T, data->E_Kin, - data->N_f, data->therm.v_xi ); + fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", + data->therm.G_xi, control->Tau_T, data->E_Kin, + data->N_f, data->therm.v_xi ); #endif - } - - *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein; - break; - - - case NPT: // Anisotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 9; - if( !control->restart ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->iso_bar.eps = 0.33333 * log(system->box.volume); - //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P)); - //Compute_Pressure( system, data, workspace ); - } - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - - case sNPT: // Semi-Isotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 4; - *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; - break; - - - case iNPT: // Isotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! 
TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 2; - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - case bNVT: //berendensen NVT - data->N_f = 3 * system->N + 1; - *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT; - break; - - default: - break; - } - - Cuda_Compute_Kinetic_Energy (system, data); + } + + *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein; + break; + + + case NPT: // Anisotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 9; + if( !control->restart ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->iso_bar.eps = 0.33333 * log(system->box.volume); + //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P)); + //Compute_Pressure( system, data, workspace ); + } + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + + case sNPT: // Semi-Isotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 4; + *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; + break; + + + case iNPT: // Isotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 2; + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + case bNVT: //berendensen NVT + data->N_f = 3 * system->N + 1; + *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT; + break; + + default: + break; + } + + Cuda_Compute_Kinetic_Energy (system, data); #ifdef __BUILD_DEBUG__ - real t_E_Kin = 0; - t_E_Kin = data->E_Kin; + real t_E_Kin = 0; + t_E_Kin = data->E_Kin; #endif - copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, - REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - data->therm.T = (2. 
* data->E_Kin) / (data->N_f * K_B); - if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! - data->therm.T = ALMOST_ZERO; + copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! + data->therm.T = ALMOST_ZERO; #ifdef __BUILD_DEBUG__ - if (check_zero (t_E_Kin, data->E_Kin)){ - fprintf (stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin ); - exit (1); - } - //validate_data ( system, data ); + if (check_zero (t_E_Kin, data->E_Kin)){ + fprintf (stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin ); + exit (1); + } + //validate_data ( system, data ); #endif - /* init timing info for the host*/ - data->timing.start = Get_Time( ); - data->timing.total = data->timing.start; - data->timing.nbrs = 0; - data->timing.init_forces = 0; - data->timing.bonded = 0; - data->timing.nonb = 0; - data->timing.QEq = 0; - data->timing.matvecs = 0; - - /* init timing info for the device */ - d_timing.start = Get_Time( ); - d_timing.total = data->timing.start; - d_timing.nbrs = 0; - d_timing.init_forces = 0; - d_timing.bonded = 0; - d_timing.nonb = 0; - d_timing.QEq = 0; - d_timing.matvecs = 0; + /* init timing info for the host*/ + data->timing.start = Get_Time( ); + data->timing.total = data->timing.start; + data->timing.nbrs = 0; + data->timing.init_forces = 0; + data->timing.bonded = 0; + data->timing.nonb = 0; + data->timing.QEq = 0; + data->timing.matvecs = 0; + + /* init timing info for the device */ + d_timing.start = Get_Time( ); + d_timing.total = data->timing.start; + d_timing.nbrs = 0; + d_timing.init_forces = 0; + d_timing.bonded = 0; + d_timing.nonb = 0; + d_timing.QEq = 0; + d_timing.matvecs = 0; } void Init_Workspace( 
reax_system *system, control_params *control, - static_storage *workspace ) + static_storage *workspace ) { - int i; - - /* Allocate space for hydrogen bond list */ - workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) ); - - /* bond order related storage */ - workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) ); - workspace->Deltap = (real *) malloc( system->N * sizeof( real ) ); - workspace->Deltap_boc = (real *) malloc( system->N * sizeof( real ) ); - workspace->dDeltap_self = (rvec *) malloc( system->N * sizeof( rvec ) ); - - workspace->Delta = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_lp = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); - workspace->dDelta_lp = (real *) malloc( system->N * sizeof( real ) ); - workspace->dDelta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_e = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_boc = (real *) malloc( system->N * sizeof( real ) ); - workspace->nlp = (real *) malloc( system->N * sizeof( real ) ); - workspace->nlp_temp = (real *) malloc( system->N * sizeof( real ) ); - workspace->Clp = (real *) malloc( system->N * sizeof( real ) ); - workspace->CdDelta = (real *) malloc( system->N * sizeof( real ) ); - workspace->vlpex = (real *) malloc( system->N * sizeof( real ) ); - - /* QEq storage */ - //workspace->H = NULL; - //workspace->L = NULL; - //workspace->U = NULL; - // - workspace->H.start = NULL; - workspace->L.start = NULL; - workspace->U.start = NULL; - - workspace->H.entries = NULL; - workspace->L.entries = NULL; - workspace->U.entries = NULL; - - workspace->droptol = (real *) calloc( system->N, sizeof( real ) ); - workspace->w = (real *) calloc( system->N, sizeof( real ) ); - workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ); - workspace->b = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->b_s = (real 
*) calloc( system->N, sizeof( real ) ); - workspace->b_t = (real *) calloc( system->N, sizeof( real ) ); - workspace->b_prc = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->b_prm = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->s_t = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->s = (real *) calloc( 5 * system->N, sizeof( real ) ); - workspace->t = (real *) calloc( 5 * system->N, sizeof( real ) ); - // workspace->s_old = (real *) calloc( system->N, sizeof( real ) ); - // workspace->t_old = (real *) calloc( system->N, sizeof( real ) ); - // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) ); - // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) ); - - for( i = 0; i < system->N; ++i ) { - workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta; - workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; - workspace->b_t[i] = -1.0; - - workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; - workspace->b[i+system->N] = -1.0; - } - - /* GMRES storage */ - workspace->y = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->z = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->g = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->hs = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->hc = (real *) calloc( RESTART+1, sizeof( real ) ); - - workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) ); - workspace->v = (real *) calloc( (RESTART+1)*system->N, sizeof( real) ); - workspace->h = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) ); - - /* CG storage */ - workspace->r = (real *) calloc( system->N, sizeof( real ) ); - workspace->d = (real *) calloc( system->N, sizeof( real ) ); - workspace->q = (real *) calloc( system->N, sizeof( real ) ); - workspace->p = (real *) calloc( system->N, sizeof( real ) ); - - - /* integrator storage */ - workspace->a = (rvec *) malloc( system->N * sizeof( rvec 
) ); - workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) ); - - - /* storage for analysis */ - if( control->molec_anal || control->diffusion_coef ) - { - workspace->mark = (int *) calloc( system->N, sizeof(int) ); - workspace->old_mark = (int *) calloc( system->N, sizeof(int) ); - } - else - workspace->mark = workspace->old_mark = NULL; - - if( control->diffusion_coef ) - workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) ); - else workspace->x_old = NULL; + int i; + + /* Allocate space for hydrogen bond list */ + workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) ); + + /* bond order related storage */ + workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) ); + workspace->Deltap = (real *) malloc( system->N * sizeof( real ) ); + workspace->Deltap_boc = (real *) malloc( system->N * sizeof( real ) ); + workspace->dDeltap_self = (rvec *) malloc( system->N * sizeof( rvec ) ); + + workspace->Delta = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_lp = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); + workspace->dDelta_lp = (real *) malloc( system->N * sizeof( real ) ); + workspace->dDelta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_e = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_boc = (real *) malloc( system->N * sizeof( real ) ); + workspace->nlp = (real *) malloc( system->N * sizeof( real ) ); + workspace->nlp_temp = (real *) malloc( system->N * sizeof( real ) ); + workspace->Clp = (real *) malloc( system->N * sizeof( real ) ); + workspace->CdDelta = (real *) malloc( system->N * sizeof( real ) ); + workspace->vlpex = (real *) malloc( system->N * sizeof( real ) ); + + /* QEq storage */ + //workspace->H = NULL; + //workspace->L = NULL; + //workspace->U = NULL; + // + workspace->H.start = NULL; + 
workspace->L.start = NULL; + workspace->U.start = NULL; + + workspace->H.entries = NULL; + workspace->L.entries = NULL; + workspace->U.entries = NULL; + + workspace->droptol = (real *) calloc( system->N, sizeof( real ) ); + workspace->w = (real *) calloc( system->N, sizeof( real ) ); + workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ); + workspace->b = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->b_s = (real *) calloc( system->N, sizeof( real ) ); + workspace->b_t = (real *) calloc( system->N, sizeof( real ) ); + workspace->b_prc = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->b_prm = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->s_t = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->s = (real *) calloc( 5 * system->N, sizeof( real ) ); + workspace->t = (real *) calloc( 5 * system->N, sizeof( real ) ); + // workspace->s_old = (real *) calloc( system->N, sizeof( real ) ); + // workspace->t_old = (real *) calloc( system->N, sizeof( real ) ); + // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) ); + // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) ); + + for( i = 0; i < system->N; ++i ) { + workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta; + workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; + workspace->b_t[i] = -1.0; + + workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; + workspace->b[i+system->N] = -1.0; + } + + /* GMRES storage */ + workspace->y = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->z = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->g = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->hs = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->hc = (real *) calloc( RESTART+1, sizeof( real ) ); + + workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) ); + workspace->v = (real *) calloc( (RESTART+1)*system->N, sizeof( 
real) ); + workspace->h = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) ); + + /* CG storage */ + workspace->r = (real *) calloc( system->N, sizeof( real ) ); + workspace->d = (real *) calloc( system->N, sizeof( real ) ); + workspace->q = (real *) calloc( system->N, sizeof( real ) ); + workspace->p = (real *) calloc( system->N, sizeof( real ) ); + + + /* integrator storage */ + workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) ); + + + /* storage for analysis */ + if( control->molec_anal || control->diffusion_coef ) + { + workspace->mark = (int *) calloc( system->N, sizeof(int) ); + workspace->old_mark = (int *) calloc( system->N, sizeof(int) ); + } + else + workspace->mark = workspace->old_mark = NULL; + + if( control->diffusion_coef ) + workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) ); + else workspace->x_old = NULL; #ifdef TEST_FORCES - workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->dDelta = (rvec *) 
malloc( system->N * sizeof( rvec ) ); + workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) ); #endif - workspace->realloc.num_far = -1; - workspace->realloc.Htop = -1; - workspace->realloc.hbonds = -1; - workspace->realloc.bonds = -1; - workspace->realloc.num_3body = -1; - workspace->realloc.gcell_atoms = -1; + workspace->realloc.num_far = -1; + workspace->realloc.Htop = -1; + workspace->realloc.hbonds = -1; + workspace->realloc.bonds = -1; + workspace->realloc.num_3body = -1; + workspace->realloc.gcell_atoms = -1; - Reset_Workspace( system, workspace ); + Reset_Workspace( system, workspace ); } void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N) { - int index = 0; - int count = 0; - int jicount = 0; - int end_index, gpu_index, gpu_end, k; - far_neighbor_data gpu, cpu; - - /* - for (int i = 0; i < N ; i++ ) - { - if (test[i] != start[i]) { - fprintf (stderr, "start index does not match \n"); - exit (0); - } - - if (test[i+1] != (end[i]) ){ - fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]); - exit (0); - } - } - */ - - - for (int i = 0; i < N; i++){ - 
index = Start_Index (i, slist); - //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]); - - - for (int j = start[i]; j < end[i]; j++){ - gpu = data[j]; - - if (i < data[j].nbr) continue; - /* - if (i < data[j].nbr) { - //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j); - int src = data[j].nbr; - int dest = i; - int x; - - - for (x = start[src]; x < end[src]; x++) { - if (data[x].nbr != dest) continue; - - gpu = data[x]; - cpu = data[j]; - - if ( (gpu.d != cpu.d) || - (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, - data[j].d, - data[j].rel_box[0], - data[j].rel_box[1], - data[j].rel_box[2], - data[j].dvec[0], - data[j].dvec[1], - data[j].dvec[2] - ); - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr, - data[x].d, - data[x].rel_box[0], - data[x].rel_box[1], - data[x].rel_box[2], - data[x].dvec[0], - data[x].dvec[1], - data[x].dvec[2] - ); - jicount++; - } - break; - } - - if (x >= end[src]) { - fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); - exit (0); - } - - continue; - } - */ - - cpu = slist->select.far_nbr_list[index]; - //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){ - //if ( (gpu->d != cpu->d) ){ - if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) || - (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) || - // (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) { - //if (memcmp (&gpu, &cpu, 
FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){ - - fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); - fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index); - - /* - fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); - fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); - fprintf (stdout, "d %f , %f \n", slist->select.far_nbr_list[index].d, data[j].d); - fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", - cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], - gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); - - fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", - cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], - gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); - - */ - count ++; - } - - //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d); - index ++; - } - - if (index != End_Index (i, slist)) - { - fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, slist), End_Index(i, slist), - start[i], end[i]); - exit (10); - } - } - - fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d reverse %d \n", count, jicount); - - /* - for (int i = 0; i < N; i++) - { - index = Start_Index (i, slist); - end_index = End_Index (i, slist); - - gpu_index = start[i]; - gpu_end = end[i]; - for (int j = index; j < end_index; j++) - { - far_neighbor_data *cpu = &slist->select.far_nbr_list[j]; - far_neighbor_data *gpu; - - for (k = gpu_index; k < gpu_end; k++) { - gpu = &data[k]; - if (gpu->nbr == cpu->nbr) break; - } - - if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); } - - if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) || - ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) || - ((cpu->rel_box[0] || gpu->rel_box[0]) || 
(cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) { - - fprintf (stderr, "Far neighbors does not match atom: %d \n", i ); - fprintf (stderr, "neighbor %d , %d \n", cpu->nbr, gpu->nbr); - fprintf (stderr, "d %d , %d \n", cpu->d, gpu->d); - fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", - cpu->dvec[0], cpu->dvec[1], cpu->dvec[2], - gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] ); - - fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", - cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2], - gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] ); - fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end ); - - exit (1); - } - } - } - - */ - } - - int Estimate_Device_Matrix (reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int *indices, *Htop; - list *far_nbrs = dev_lists + FAR_NBRS; - int max_sparse_entries = 0; - real t1, t2; - - indices = (int *) scratch; - cuda_memset ( indices, 0, INT_SIZE * system->N, RES_SCRATCH ); - - t1 = Get_Time (); - - Estimate_Sparse_Matrix_Entries <<<BLOCKS, BLOCK_SIZE>>> - ( system->d_atoms, (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, - *far_nbrs, system->N, indices ); - cudaThreadSynchronize (); - cudaCheckError (); - - t2 = Get_Timing_Info ( t1 ); - - //fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 ); - - Htop = (int *) malloc (INT_SIZE * (system->N + 1)); - memset (Htop, 0, INT_SIZE * (system->N + 1)); - copy_host_device (Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - for (int i = 0; i < system->N; i++) - { - if (max_sparse_entries < Htop[i]) { - max_sparse_entries = Htop[i]; - } - } + int index = 0; + int count = 0; + int jicount = 0; + int end_index, gpu_index, gpu_end, k; + far_neighbor_data gpu, cpu; + + /* + for (int i = 0; i < N ; i++ ) + { + if (test[i] != start[i]) { 
+ fprintf (stderr, "start index does not match \n"); + exit (0); + } + + if (test[i+1] != (end[i]) ){ + fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]); + exit (0); + } + } + */ + + + for (int i = 0; i < N; i++){ + index = Start_Index (i, slist); + //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]); + + + for (int j = start[i]; j < end[i]; j++){ + gpu = data[j]; + + if (i < data[j].nbr) continue; + /* + if (i < data[j].nbr) { + //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j); + int src = data[j].nbr; + int dest = i; + int x; + + + for (x = start[src]; x < end[src]; x++) { + if (data[x].nbr != dest) continue; + + gpu = data[x]; + cpu = data[j]; + + if ( (gpu.d != cpu.d) || + (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, + data[j].d, + data[j].rel_box[0], + data[j].rel_box[1], + data[j].rel_box[2], + data[j].dvec[0], + data[j].dvec[1], + data[j].dvec[2] + ); + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr, + data[x].d, + data[x].rel_box[0], + data[x].rel_box[1], + data[x].rel_box[2], + data[x].dvec[0], + data[x].dvec[1], + data[x].dvec[2] + ); + jicount++; + } + break; + } + + if (x >= end[src]) { + fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); + exit (0); + } + + continue; + } + */ + + cpu = slist->select.far_nbr_list[index]; + //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){ + //if ( (gpu->d != cpu->d) ){ + if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) || + (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + (cpu.rel_box[0] != gpu.rel_box[0]) || 
(cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) || + // (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) { + //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){ + + fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); + fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index); + + /* + fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); + fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); + fprintf (stdout, "d %f , %f \n", slist->select.far_nbr_list[index].d, data[j].d); + fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", + cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], + gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); + + fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", + cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], + gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); + + */ + count ++; + } + + //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d); + index ++; + } + + if (index != End_Index (i, slist)) + { + fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, slist), End_Index(i, slist), + start[i], end[i]); + exit (10); + } + } + + fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d reverse %d \n", count, jicount); + + /* + for (int i = 0; i < N; i++) + { + index = Start_Index (i, slist); + end_index = End_Index (i, slist); + + gpu_index = start[i]; + gpu_end = end[i]; + for (int j = index; j < end_index; j++) + { + far_neighbor_data *cpu = &slist->select.far_nbr_list[j]; + far_neighbor_data *gpu; + + for (k = gpu_index; k < gpu_end; k++) { + gpu = &data[k]; + if (gpu->nbr == cpu->nbr) break; + } + + if (k == gpu_end) { fprintf (stderr, " could not find 
neighbor for atom %d \n", i); exit (1); } + + if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) || + ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) || + ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) { + + fprintf (stderr, "Far neighbors does not match atom: %d \n", i ); + fprintf (stderr, "neighbor %d , %d \n", cpu->nbr, gpu->nbr); + fprintf (stderr, "d %d , %d \n", cpu->d, gpu->d); + fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", + cpu->dvec[0], cpu->dvec[1], cpu->dvec[2], + gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] ); + + fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", + cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2], + gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] ); + fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end ); + + exit (1); + } + } + } + + */ + } + + int Estimate_Device_Matrix (reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { + int *indices, *Htop; + list *far_nbrs = dev_lists + FAR_NBRS; + int max_sparse_entries = 0; + real t1, t2; + + indices = (int *) scratch; + cuda_memset ( indices, 0, INT_SIZE * system->N, RES_SCRATCH ); + + t1 = Get_Time (); + + Estimate_Sparse_Matrix_Entries <<<BLOCKS, BLOCK_SIZE>>> + ( system->d_atoms, (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, + *far_nbrs, system->N, indices ); + cudaThreadSynchronize (); + cudaCheckError (); + + t2 = Get_Timing_Info ( t1 ); + + //fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 ); + + Htop = (int *) malloc (INT_SIZE * (system->N + 1)); + memset (Htop, 0, INT_SIZE * (system->N + 1)); + copy_host_device (Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + for (int i = 0; i < system->N; i++) + { + if (max_sparse_entries 
< Htop[i]) { + max_sparse_entries = Htop[i]; + } + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Max sparse entries for this run are ---> %d \n", max_sparse_entries ); + fprintf (stderr, " Max sparse entries for this run are ---> %d \n", max_sparse_entries ); #endif - return max_sparse_entries * SAFE_ZONE; - //return max_sparse_entries; - } + return max_sparse_entries * SAFE_ZONE; + //return max_sparse_entries; + } - void Allocate_Device_Matrix (reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { + void Allocate_Device_Matrix (reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { - //Allocate space for the sparse Matrix entries here. - system->max_sparse_matrix_entries = - Estimate_Device_Matrix (system, control, data, workspace, lists, out_control ); - dev_workspace->H.n = system->N ; - dev_workspace->H.m = system->N * system->max_sparse_matrix_entries; - Cuda_Init_Sparse_Matrix (&dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N ); + //Allocate space for the sparse Matrix entries here. 
+ system->max_sparse_matrix_entries = + Estimate_Device_Matrix (system, control, data, workspace, lists, out_control ); + dev_workspace->H.n = system->N ; + dev_workspace->H.m = system->N * system->max_sparse_matrix_entries; + Cuda_Init_Sparse_Matrix (&dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N ); #ifdef __CUDA_MEM__ - fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", - system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) ); + fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", + system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) ); #endif - } + } - void Cuda_Init_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; - int *hb_top, *bond_top; + void Cuda_Init_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { + int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; + int *hb_top, *bond_top; - real t_start, t_elapsed; + real t_start, t_elapsed; - grid *g = &( system->g ); - int *d_indices = (int *) scratch; - int total = g->ncell[0] * g->ncell[1] * g->ncell[2]; + grid *g = &( system->g ); + int *d_indices = (int *) scratch; + int total = g->ncell[0] * g->ncell[1] * g->ncell[2]; - cuda_memset ( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); + cuda_memset ( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); #ifdef __BUILD_DEBUG__ - for (int i = 0; i < g->max_nbrs; i ++) { - if ((g->nbrs[i][0] >= g->ncell[0]) || - (g->nbrs[i][1] >= g->ncell[1]) || - (g->nbrs[i][2] >= g->ncell[2]) ) { - fprintf (stderr, " Grid Incorrectly built.... 
\n"); - exit (1); - } - - } + for (int i = 0; i < g->max_nbrs; i ++) { + if ((g->nbrs[i][0] >= g->ncell[0]) || + (g->nbrs[i][1] >= g->ncell[1]) || + (g->nbrs[i][2] >= g->ncell[2]) ) { + fprintf (stderr, " Grid Incorrectly built.... \n"); + exit (1); + } + + } #endif - dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); - dim3 threadsperblock (system->g.max_atoms); + dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); + dim3 threadsperblock (system->g.max_atoms); #ifdef __BUILD_DEBUG__ - fprintf (stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); - fprintf (stderr, "Estimate Num Neighbors with threads per block as %d \n", system->d_g.max_atoms); - fprintf (stderr, "Max nbrs %d \n", system->d_g.max_nbrs); + fprintf (stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); + fprintf (stderr, "Estimate Num Neighbors with threads per block as %d \n", system->d_g.max_atoms); + fprintf (stderr, "Max nbrs %d \n", system->d_g.max_nbrs); #endif - //First Bin atoms and they sync the host and the device for the grid. - //This will copy the atoms from host to device. - Cuda_Bin_Atoms (system, workspace); - Sync_Host_Device (&system->g, &system->d_g, cudaMemcpyHostToDevice ); + //First Bin atoms and they sync the host and the device for the grid. + //This will copy the atoms from host to device. 
+ Cuda_Bin_Atoms (system, workspace); + Sync_Host_Device (&system->g, &system->d_g, cudaMemcpyHostToDevice ); - Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, d_indices); - cudaThreadSynchronize (); - cudaCheckError (); + Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, d_indices); + cudaThreadSynchronize (); + cudaCheckError (); - int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); - memset (nbrs_indices , 0, INT_SIZE * (system->N + 1)); + int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); + memset (nbrs_indices , 0, INT_SIZE * (system->N + 1)); - nbrs_indices [0] = 0; - copy_host_device (&nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + nbrs_indices [0] = 0; + copy_host_device (&nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 1; i <= system->N; i++) - nbrs_indices [i] += nbrs_indices [i-1]; + for (int i = 1; i <= system->N; i++) + nbrs_indices [i] += nbrs_indices [i-1]; - num_nbrs = nbrs_indices [system->N] ; - system->num_nbrs = num_nbrs; + num_nbrs = nbrs_indices [system->N] ; + system->num_nbrs = num_nbrs; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]); - fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs); + fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]); + fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs); #endif - list *far_nbrs = (dev_lists + FAR_NBRS); - if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE) ) { - fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); - exit( INIT_ERR ); - } + list *far_nbrs = (dev_lists + FAR_NBRS); + if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE) ) { + fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); + exit( INIT_ERR ); + } #ifdef __CUDA_MEM__ - fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", - num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); + fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", + num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); #endif - copy_host_device (nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ ); - copy_host_device (nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ ); - Cuda_Generate_Neighbor_Lists (system, workspace, control, false); + copy_host_device (nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ ); + copy_host_device (nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ ); + Cuda_Generate_Neighbor_Lists (system, workspace, control, false); #ifdef __BUILD_DEBUG__ - int *end = (int *)malloc (sizeof (int) * system->N); - int *start = (int *) malloc (sizeof (int) * system->N ); + int *end = (int *)malloc (sizeof (int) * system->N); + int *start = (int *) malloc (sizeof (int) * system->N ); - copy_host_device (start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0); - copy_host_device (end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0); + copy_host_device (start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0); + copy_host_device (end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0); - far_neighbor_data *far_data = (far_neighbor_data *) - malloc (FAR_NEIGHBOR_SIZE * num_nbrs); - copy_host_device (far_data, far_nbrs->select.far_nbr_list, - FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0); + far_neighbor_data 
*far_data = (far_neighbor_data *) + malloc (FAR_NEIGHBOR_SIZE * num_nbrs); + copy_host_device (far_data, far_nbrs->select.far_nbr_list, + FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0); - compare_far_neighbors (nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N); + compare_far_neighbors (nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N); - free (start); - free (end); + free (start); + free (end); #endif - int *output, size; - size = INT_SIZE * 2 * system->N + 2; - output = (int *) malloc (size); - Cuda_Estimate_Storage_Sizes (system, control, output); + int *output, size; + size = INT_SIZE * 2 * system->N + 2; + output = (int *) malloc (size); + Cuda_Estimate_Storage_Sizes (system, control, output); - Htop = output[0]; - num_3body = output[1]; - hb_top = &output[ 2 ]; - bond_top = &output[ 2 + system->N ]; + Htop = output[0]; + num_3body = output[1]; + hb_top = &output[ 2 ]; + bond_top = &output[ 2 + system->N ]; #ifdef __DEBUG_CUDA__ - int max_hbonds = 0; - int min_hbonds = 1000; - int max_bonds = 0; - int min_bonds = 1000; - for (int i = 0; i < system->N; i++) { - if ( max_hbonds < hb_top[i]) - max_hbonds = hb_top[i]; - if (min_hbonds > hb_top[i]) - min_hbonds = hb_top[i]; - - if (max_bonds < bond_top [i]) - max_bonds = bond_top[i]; - if (min_bonds > bond_top[i]) - min_bonds = bond_top[i]; - } - - fprintf (stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds ); - fprintf (stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds ); - fprintf (stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body ); + int max_hbonds = 0; + int min_hbonds = 1000; + int max_bonds = 0; + int min_bonds = 1000; + for (int i = 0; i < system->N; i++) { + if ( max_hbonds < hb_top[i]) + max_hbonds = hb_top[i]; + if (min_hbonds > hb_top[i]) + min_hbonds = hb_top[i]; + + if (max_bonds < bond_top [i]) + max_bonds = bond_top[i]; + if (min_bonds > bond_top[i]) + min_bonds = bond_top[i]; + } + + fprintf (stderr, "Max Hbonds 
%d min Hbonds %d \n", max_hbonds, min_hbonds ); + fprintf (stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds ); + fprintf (stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body ); #endif - Allocate_Device_Matrix (system, control, data, workspace, lists, out_control ); + Allocate_Device_Matrix (system, control, data, workspace, lists, out_control ); - dev_workspace->num_H = 0; + dev_workspace->num_H = 0; - if( control->hb_cut > 0 ) { + if( control->hb_cut > 0 ) { - int *hbond_index = (int *) malloc ( INT_SIZE * system->N ); - // init H indexes - num_hbonds = 0; - for( i = 0; i < system->N; ++i ) - if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || - system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2 ) // H atom - //hbond_index[i] = workspace->num_H++; - hbond_index[i] = num_hbonds ++; - else - hbond_index[i] = -1; + int *hbond_index = (int *) malloc ( INT_SIZE * system->N ); + // init H indexes + num_hbonds = 0; + for( i = 0; i < system->N; ++i ) + if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || + system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2 ) // H atom + //hbond_index[i] = workspace->num_H++; + hbond_index[i] = num_hbonds ++; + else + hbond_index[i] = -1; - copy_host_device (hbond_index, dev_workspace->hbond_index, - system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX ); - dev_workspace->num_H = num_hbonds; + copy_host_device (hbond_index, dev_workspace->hbond_index, + system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX ); + dev_workspace->num_H = num_hbonds; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Device num_H --> %d \n", dev_workspace->num_H ); + fprintf (stderr, "Device num_H --> %d \n", dev_workspace->num_H ); #endif - Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index, - hb_top, (dev_lists+HBONDS) ); - num_hbonds = hb_top[system->N-1]; - system->num_hbonds = num_hbonds; + Cuda_Allocate_HBond_List( system->N, 
dev_workspace->num_H, dev_workspace->hbond_index, + hb_top, (dev_lists+HBONDS) ); + num_hbonds = hb_top[system->N-1]; + system->num_hbonds = num_hbonds; #ifdef __CUDA_MEM__ - fprintf (stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", - sizeof (hbond_data) * num_hbonds / (1024*1024)); + fprintf (stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", + sizeof (hbond_data) * num_hbonds / (1024*1024)); #endif #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Device Total number of HBonds --> %d \n", num_hbonds ); + fprintf (stderr, "Device Total number of HBonds --> %d \n", num_hbonds ); #endif - free (hbond_index); - } + free (hbond_index); + } - // bonds list - Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS ); - num_bonds = bond_top[system->N-1]; - system->num_bonds = num_bonds; + // bonds list + Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS ); + num_bonds = bond_top[system->N-1]; + system->num_bonds = num_bonds; #ifdef __CUDA_MEM__ - fprintf (stderr, "Device memory allocated: Bonds list: %ld (MB) \n", - sizeof (bond_data) * num_bonds / (1024*1024)); + fprintf (stderr, "Device memory allocated: Bonds list: %ld (MB) \n", + sizeof (bond_data) * num_bonds / (1024*1024)); #endif #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Device Total Bonds --> %d \n", num_bonds ); + fprintf (stderr, "Device Total Bonds --> %d \n", num_bonds ); #endif - // system->max_thb_intrs = num_3body; - // 3bodies list - //if(!Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { - // fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); - // exit( INIT_ERR ); - //} + // system->max_thb_intrs = num_3body; + // 3bodies list + //if(!Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { + // fprintf( stderr, "Problem in initializing angles list. 
Terminating!\n" ); + // exit( INIT_ERR ); + //} - //fprintf( stderr, "***memory allocated: three_body = %ldMB\n", - // num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) ); - //fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data)); + //fprintf( stderr, "***memory allocated: three_body = %ldMB\n", + // num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) ); + //fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data)); - //Free local resources - free (output); - free (nbrs_indices); - } + //Free local resources + free (output); + free (nbrs_indices); + } - void Init_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; - int *hb_top, *bond_top; + void Init_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { + int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; + int *hb_top, *bond_top; - real t_start, t_elapsed; + real t_start, t_elapsed; - num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists ); + num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs); + fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs); #endif - if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) { - fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); - exit( INIT_ERR ); - } + if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) { + fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); + exit( INIT_ERR ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", - num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); + fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", + num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); #endif - t_start = Get_Time (); - Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control); - t_elapsed = Get_Timing_Info ( t_start ); + t_start = Get_Time (); + Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control); + t_elapsed = Get_Timing_Info ( t_start ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed ); + fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed ); #endif - Htop = 0; - hb_top = (int*) calloc( system->N, sizeof(int) ); - bond_top = (int*) calloc( system->N, sizeof(int) ); - num_3body = 0; - Estimate_Storage_Sizes( system, control, lists, - &Htop, hb_top, bond_top, &num_3body ); + Htop = 0; + hb_top = (int*) calloc( system->N, sizeof(int) ); + bond_top = (int*) calloc( system->N, sizeof(int) ); + num_3body = 0; + Estimate_Storage_Sizes( system, control, lists, + &Htop, hb_top, bond_top, &num_3body ); - Allocate_Matrix( &(workspace->H), system->N, Htop ); + Allocate_Matrix( &(workspace->H), system->N, Htop ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - Htop: %d\n", Htop ); - fprintf( stderr, "memory allocated: H = %ldMB\n", - Htop * sizeof(sparse_matrix_entry) / (1024*1024) ); + fprintf( stderr, "estimated storage - Htop: %d\n", Htop ); + fprintf( stderr, "memory allocated: H = %ldMB\n", + Htop * sizeof(sparse_matrix_entry) / (1024*1024) ); #endif - workspace->num_H = 0; - if( control->hb_cut > 0 ) { - /* init H indexes */ - for( i = 0; i < system->N; ++i ) - if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom - workspace->hbond_index[i] = workspace->num_H++; - else workspace->hbond_index[i] = -1; + workspace->num_H = 0; + if( 
control->hb_cut > 0 ) { + /* init H indexes */ + for( i = 0; i < system->N; ++i ) + if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom + workspace->hbond_index[i] = workspace->num_H++; + else workspace->hbond_index[i] = -1; - Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, - hb_top, (*lists)+HBONDS ); - num_hbonds = hb_top[system->N-1]; + Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, + hb_top, (*lists)+HBONDS ); + num_hbonds = hb_top[system->N-1]; #ifdef __DEBUG_CUDA__ - fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds ); + fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds ); #endif #if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds ); - fprintf( stderr, "memory allocated: hbonds = %ldMB\n", - num_hbonds * sizeof(hbond_data) / (1024*1024) ); + fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds ); + fprintf( stderr, "memory allocated: hbonds = %ldMB\n", + num_hbonds * sizeof(hbond_data) / (1024*1024) ); #endif - } + } - /* bonds list */ - Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS ); - num_bonds = bond_top[system->N-1]; + /* bonds list */ + Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS ); + num_bonds = bond_top[system->N-1]; #if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "memory allocated: bonds = %ldMB\n", - num_bonds * sizeof(bond_data) / (1024*1024) ); + fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "memory allocated: bonds = %ldMB\n", + num_bonds * sizeof(bond_data) / (1024*1024) ); #endif #ifdef __DEBUG_CUDA__ - fprintf (stderr, " host num_3body : %d \n", num_3body); - fprintf (stderr, " host num_bonds : %d \n", num_bonds); + fprintf (stderr, " host num_3body : %d \n", num_3body); + fprintf (stderr, " host num_bonds : %d \n", num_bonds); #endif - /* 3bodies list */ - 
if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) { - fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); - exit( INIT_ERR ); - } + /* 3bodies list */ + if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) { + fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); + exit( INIT_ERR ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body ); - fprintf( stderr, "memory allocated: 3-body = %ldMB\n", - num_3body * sizeof(three_body_interaction_data) / (1024*1024) ); + fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body ); + fprintf( stderr, "memory allocated: 3-body = %ldMB\n", + num_3body * sizeof(three_body_interaction_data) / (1024*1024) ); #endif #ifdef TEST_FORCES - if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) { - fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" ); - exit( INIT_ERR ); - } - - if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) { - fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" ); - exit( INIT_ERR ); - } + if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) { + fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" ); + exit( INIT_ERR ); + } + + if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) { + fprintf( stderr, "Problem in initializing dBO list. 
Terminating!\n" ); + exit( INIT_ERR ); + } #endif - free( hb_top ); - free( bond_top ); - } - - - void Init_Out_Controls(reax_system *system, control_params *control, - static_storage *workspace, output_controls *out_control) - { - char temp[1000]; - - /* Init trajectory file */ - if( out_control->write_steps > 0 ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".trj" ); - out_control->trj = fopen( temp, "w" ); - out_control->write_header( system, control, workspace, out_control ); - } - - if( out_control->energy_update_freq > 0 ) { - /* Init out file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".out" ); - out_control->out = fopen( temp, "w" ); - fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n", - "step", "total energy", "poten. energy", "kin. energy", - "temp.", "target", "volume", "press.", "target" ); - fflush( out_control->out ); - - /* Init potentials file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".pot" ); - out_control->pot = fopen( temp, "w" ); - fprintf( out_control->pot, - "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n", - "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", - "etor", "econj", "evdw","ecoul", "epol" ); - fflush( out_control->pot ); - - /* Init log file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".log" ); - out_control->log = fopen( temp, "w" ); - fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", - "step", "total", "neighbors", "init", "bonded", - "nonbonded", "QEq", "matvec" ); - } - - /* Init pressure file */ - if( control->ensemble == NPT || - control->ensemble == iNPT || - control->ensemble == sNPT ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".prs" ); - out_control->prs = fopen( temp, "w" ); - fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n", - "step", "norm_x", "norm_y", "norm_z", - "press_x", "press_y", "press_z", "target_p", "volume" ); - fflush( out_control->prs ); - } - - /* Init molecular analysis file */ - if( 
control->molec_anal ) { - sprintf( temp, "%s.mol", control->sim_name ); - out_control->mol = fopen( temp, "w" ); - if( control->num_ignored ) { - sprintf( temp, "%s.ign", control->sim_name ); - out_control->ign = fopen( temp, "w" ); - } - } - - /* Init electric dipole moment analysis file */ - if( control->dipole_anal ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".dpl" ); - out_control->dpl = fopen( temp, "w" ); - fprintf( out_control->dpl, - "Step Molecule Count Avg. Dipole Moment Norm\n" ); - fflush( out_control->dpl ); - } - - /* Init diffusion coef analysis file */ - if( control->diffusion_coef ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".drft" ); - out_control->drft = fopen( temp, "w" ); - fprintf( out_control->drft, "Step Type Count Avg Squared Disp\n" ); - fflush( out_control->drft ); - } + free( hb_top ); + free( bond_top ); + } + + + void Init_Out_Controls(reax_system *system, control_params *control, + static_storage *workspace, output_controls *out_control) + { + char temp[1000]; + + /* Init trajectory file */ + if( out_control->write_steps > 0 ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".trj" ); + out_control->trj = fopen( temp, "w" ); + out_control->write_header( system, control, workspace, out_control ); + } + + if( out_control->energy_update_freq > 0 ) { + /* Init out file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".out" ); + out_control->out = fopen( temp, "w" ); + fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n", + "step", "total energy", "poten. energy", "kin. 
energy", + "temp.", "target", "volume", "press.", "target" ); + fflush( out_control->out ); + + /* Init potentials file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".pot" ); + out_control->pot = fopen( temp, "w" ); + fprintf( out_control->pot, + "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n", + "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", + "etor", "econj", "evdw","ecoul", "epol" ); + fflush( out_control->pot ); + + /* Init log file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".log" ); + out_control->log = fopen( temp, "w" ); + fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", + "step", "total", "neighbors", "init", "bonded", + "nonbonded", "QEq", "matvec" ); + } + + /* Init pressure file */ + if( control->ensemble == NPT || + control->ensemble == iNPT || + control->ensemble == sNPT ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".prs" ); + out_control->prs = fopen( temp, "w" ); + fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n", + "step", "norm_x", "norm_y", "norm_z", + "press_x", "press_y", "press_z", "target_p", "volume" ); + fflush( out_control->prs ); + } + + /* Init molecular analysis file */ + if( control->molec_anal ) { + sprintf( temp, "%s.mol", control->sim_name ); + out_control->mol = fopen( temp, "w" ); + if( control->num_ignored ) { + sprintf( temp, "%s.ign", control->sim_name ); + out_control->ign = fopen( temp, "w" ); + } + } + + /* Init electric dipole moment analysis file */ + if( control->dipole_anal ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".dpl" ); + out_control->dpl = fopen( temp, "w" ); + fprintf( out_control->dpl, + "Step Molecule Count Avg. 
Dipole Moment Norm\n" ); + fflush( out_control->dpl ); + } + + /* Init diffusion coef analysis file */ + if( control->diffusion_coef ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".drft" ); + out_control->drft = fopen( temp, "w" ); + fprintf( out_control->drft, "Step Type Count Avg Squared Disp\n" ); + fflush( out_control->drft ); + } #ifdef TEST_ENERGY - /* open bond energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ebond" ); - out_control->ebond = fopen( temp, "w" ); - - /* open lone-pair energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".elp" ); - out_control->elp = fopen( temp, "w" ); - - /* open overcoordination energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".eov" ); - out_control->eov = fopen( temp, "w" ); - - /* open undercoordination energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".eun" ); - out_control->eun = fopen( temp, "w" ); - - /* open angle energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".eval" ); - out_control->eval = fopen( temp, "w" ); - - /* open penalty energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".epen" ); - out_control->epen = fopen( temp, "w" ); - - /* open coalition energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ecoa" ); - out_control->ecoa = fopen( temp, "w" ); - - /* open hydrogen bond energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ehb" ); - out_control->ehb = fopen( temp, "w" ); - - /* open torsion energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".etor" ); - out_control->etor = fopen( temp, "w" ); - - /* open conjugation energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".econ" ); - out_control->econ = fopen( temp, "w" ); - - /* open vdWaals energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".evdw" ); - out_control->evdw = fopen( temp, "w" ); - - /* open coulomb energy file */ - strcpy( temp, 
control->sim_name ); - strcat( temp, ".ecou" ); - out_control->ecou = fopen( temp, "w" ); + /* open bond energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ebond" ); + out_control->ebond = fopen( temp, "w" ); + + /* open lone-pair energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".elp" ); + out_control->elp = fopen( temp, "w" ); + + /* open overcoordination energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".eov" ); + out_control->eov = fopen( temp, "w" ); + + /* open undercoordination energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".eun" ); + out_control->eun = fopen( temp, "w" ); + + /* open angle energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".eval" ); + out_control->eval = fopen( temp, "w" ); + + /* open penalty energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".epen" ); + out_control->epen = fopen( temp, "w" ); + + /* open coalition energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ecoa" ); + out_control->ecoa = fopen( temp, "w" ); + + /* open hydrogen bond energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ehb" ); + out_control->ehb = fopen( temp, "w" ); + + /* open torsion energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".etor" ); + out_control->etor = fopen( temp, "w" ); + + /* open conjugation energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".econ" ); + out_control->econ = fopen( temp, "w" ); + + /* open vdWaals energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".evdw" ); + out_control->evdw = fopen( temp, "w" ); + + /* open coulomb energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ecou" ); + out_control->ecou = fopen( temp, "w" ); #endif #ifdef TEST_FORCES - /* open bond orders file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fbo" ); - out_control->fbo = fopen( temp, "w" ); - - /* open bond orders derivatives 
file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fdbo" ); - out_control->fdbo = fopen( temp, "w" ); - - /* open bond forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fbond" ); - out_control->fbond = fopen( temp, "w" ); - - /* open lone-pair forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".flp" ); - out_control->flp = fopen( temp, "w" ); - - /* open overcoordination forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fatom" ); - out_control->fatom = fopen( temp, "w" ); - - /* open angle forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".f3body" ); - out_control->f3body = fopen( temp, "w" ); - - /* open hydrogen bond forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fhb" ); - out_control->fhb = fopen( temp, "w" ); - - /* open torsion forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".f4body" ); - out_control->f4body = fopen( temp, "w" ); - - /* open nonbonded forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fnonb" ); - out_control->fnonb = fopen( temp, "w" ); - - /* open total force file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ftot" ); - out_control->ftot = fopen( temp, "w" ); - - /* open coulomb forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ftot2" ); - out_control->ftot2 = fopen( temp, "w" ); + /* open bond orders file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fbo" ); + out_control->fbo = fopen( temp, "w" ); + + /* open bond orders derivatives file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fdbo" ); + out_control->fdbo = fopen( temp, "w" ); + + /* open bond forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fbond" ); + out_control->fbond = fopen( temp, "w" ); + + /* open lone-pair forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".flp" ); + out_control->flp = fopen( temp, "w" ); + + /* open overcoordination 
forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fatom" ); + out_control->fatom = fopen( temp, "w" ); + + /* open angle forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".f3body" ); + out_control->f3body = fopen( temp, "w" ); + + /* open hydrogen bond forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fhb" ); + out_control->fhb = fopen( temp, "w" ); + + /* open torsion forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".f4body" ); + out_control->f4body = fopen( temp, "w" ); + + /* open nonbonded forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fnonb" ); + out_control->fnonb = fopen( temp, "w" ); + + /* open total force file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ftot" ); + out_control->ftot = fopen( temp, "w" ); + + /* open coulomb forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ftot2" ); + out_control->ftot2 = fopen( temp, "w" ); #endif - /* Error handling */ - /* if ( out_control->out == NULL || out_control->pot == NULL || - out_control->log == NULL || out_control->mol == NULL || - out_control->dpl == NULL || out_control->drft == NULL || - out_control->pdb == NULL ) - { - fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." ); - exit( CANNOT_OPEN_OUTFILE ); - }*/ - } + /* Error handling */ + /* if ( out_control->out == NULL || out_control->pot == NULL || + out_control->log == NULL || out_control->mol == NULL || + out_control->dpl == NULL || out_control->drft == NULL || + out_control->pdb == NULL ) + { + fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." 
); + exit( CANNOT_OPEN_OUTFILE ); + }*/ + } - void Initialize(reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists, - output_controls *out_control, evolve_function *Evolve) - { - Randomize(); + void Initialize(reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, list **lists, + output_controls *out_control, evolve_function *Evolve) + { + Randomize(); - Init_System( system, control, data ); + Init_System( system, control, data ); - Init_Simulation_Data( system, control, data, out_control, Evolve ); + Init_Simulation_Data( system, control, data, out_control, Evolve ); - Init_Workspace( system, control, workspace ); + Init_Workspace( system, control, workspace ); - Init_Lists( system, control, data, workspace, lists, out_control ); + Init_Lists( system, control, data, workspace, lists, out_control ); - Init_Out_Controls( system, control, workspace, out_control ); + Init_Out_Controls( system, control, workspace, out_control ); - /* These are done in forces.c, only forces.c can see all those functions */ - Init_Bonded_Force_Functions( control ); + /* These are done in forces.c, only forces.c can see all those functions */ + Init_Bonded_Force_Functions( control ); #ifdef TEST_FORCES - Init_Force_Test_Functions( ); + Init_Force_Test_Functions( ); #endif - if( control->tabulate ) - Make_LR_Lookup_Table( system, control ); + if( control->tabulate ) + Make_LR_Lookup_Table( system, control ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "data structures have been initialized...\n" ); + fprintf( stderr, "data structures have been initialized...\n" ); #endif - } + } - void Cuda_Initialize(reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists, - output_controls *out_control, evolve_function *Evolve) - { - Randomize (); + void Cuda_Initialize(reax_system *system, control_params *control, + simulation_data *data, 
static_storage *workspace, list **lists, + output_controls *out_control, evolve_function *Evolve) + { + Randomize (); - Cuda_Init_Scratch (); + Cuda_Init_Scratch (); - //System - Cuda_Init_System (system); - Sync_Host_Device ( system, cudaMemcpyHostToDevice ); - Cuda_Init_System (system, control, data ); + //System + Cuda_Init_System (system); + Sync_Host_Device ( system, cudaMemcpyHostToDevice ); + Cuda_Init_System (system, control, data ); - //Simulation Data - copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , - cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS ); - Cuda_Init_Simulation_Data (data); - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); - Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve ); - Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); + //Simulation Data + copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , + cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS ); + Cuda_Init_Simulation_Data (data); + //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); + Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve ); + Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); - //static storage - Cuda_Init_Workspace_System ( system, dev_workspace ); - Cuda_Init_Workspace ( system, control, dev_workspace ); - Cuda_Init_Workspace_Device (workspace); + //static storage + Cuda_Init_Workspace_System ( system, dev_workspace ); + Cuda_Init_Workspace ( system, control, dev_workspace ); + Cuda_Init_Workspace_Device (workspace); - //control - Cuda_Init_Control (control); + //control + Cuda_Init_Control (control); - //Grid - Cuda_Init_Grid (&system->g, &system->d_g ); + //Grid + Cuda_Init_Grid (&system->g, &system->d_g ); - //lists - Cuda_Init_Lists (system, control, data, workspace, lists, out_control ); + //lists + 
Cuda_Init_Lists (system, control, data, workspace, lists, out_control ); - Init_Out_Controls( system, control, workspace, out_control ); + Init_Out_Controls( system, control, workspace, out_control ); - if( control->tabulate ) { - real start, end; - start = Get_Time (); - Make_LR_Lookup_Table( system, control ); - copy_LR_table_to_device (system, control ); - end = Get_Timing_Info ( start ); + if( control->tabulate ) { + real start, end; + start = Get_Time (); + Make_LR_Lookup_Table( system, control ); + copy_LR_table_to_device (system, control ); + end = Get_Timing_Info ( start ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done copying the LR table to the device ---> %f \n", end ); + fprintf (stderr, "Done copying the LR table to the device ---> %f \n", end ); #endif - } - } + } + } diff --git a/PuReMD-GPU/src/integrate.cu b/PuReMD-GPU/src/integrate.cu index 5d56d622..d0790286 100644 --- a/PuReMD-GPU/src/integrate.cu +++ b/PuReMD-GPU/src/integrate.cu @@ -38,49 +38,49 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, steps, renbr; - real inv_m, dt, dt_sqr; - rvec dx; - - dt = control->dt; - dt_sqr = SQR(dt); - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); + int i, steps, renbr; + real inv_m, dt, dt_sqr; + rvec dx; + + dt = control->dt; + dt_sqr = SQR(dt); + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: ", data->step ); + fprintf( stderr, "step%d: ", data->step ); #endif - for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_ScaledSum( dx, dt, 
system->atoms[i].v, - 0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f ); - Inc_on_T3( system->atoms[i].x, dx, &( system->box ) ); + rvec_ScaledSum( dx, dt, system->atoms[i].v, + 0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f ); + Inc_on_T3( system->atoms[i].x, dx, &( system->box ) ); - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - } + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - "); + fprintf( stderr, "verlet1 - "); #endif - Reallocate( system, workspace, lists, renbr ); - Reset( system, control, data, workspace, lists ); - if( renbr ) - Generate_Neighbor_Lists( system, control, data, workspace, - lists, out_control ); - Compute_Forces( system, control, data, workspace, lists, out_control ); - - for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - } + Reallocate( system, workspace, lists, renbr ); + Reset( system, control, data, workspace, lists ); + if( renbr ) + Generate_Neighbor_Lists( system, control, data, workspace, + lists, out_control ); + Compute_Forces( system, control, data, workspace, lists, out_control ); + + for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet2\n"); + fprintf( stderr, "verlet2\n"); #endif } @@ -89,209 +89,209 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control, /////////////////////////////////////////////////////////////////// GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, - single_body_parameters *sbp, - simulation_box *box, - int N, real dt) + single_body_parameters *sbp, + simulation_box *box, + int N, real dt) { - 
real inv_m, dt_sqr; - rvec dx; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - dt_sqr = SQR(dt); - //for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - - rvec_ScaledSum( dx, dt, atoms[i].v, - 0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f ); - Inc_on_T3( atoms[i].x, dx, box ); - - rvec_ScaledAdd( atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); - //} + real inv_m, dt_sqr; + rvec dx; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + dt_sqr = SQR(dt); + //for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + + rvec_ScaledSum( dx, dt, atoms[i].v, + 0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f ); + Inc_on_T3( atoms[i].x, dx, box ); + + rvec_ScaledAdd( atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); + //} } GLOBAL void Cuda_Velocity_Verlet_NVE_atoms2 (reax_atom *atoms, single_body_parameters *sbp, int N, real dt) { - real inv_m; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - //for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - rvec_ScaledAdd( atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); - //} + real inv_m; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + //for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + rvec_ScaledAdd( atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); + //} } void Cuda_Velocity_Verlet_NVE(reax_system* system, control_params* control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, steps, renbr; - real inv_m, dt, dt_sqr; - rvec dx; - int blocks, block_size; + int i, steps, renbr; + real inv_m, dt, dt_sqr; + rvec dx; + int blocks, block_size; - dt = control->dt; - dt_sqr = SQR(dt); - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor 
== 0); + dt = control->dt; + dt_sqr = SQR(dt); + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: ", data->step ); + fprintf( stderr, "step%d: ", data->step ); #endif - compute_blocks (&blocks, &block_size, system->N); - Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>> - (system->d_atoms, system->reaxprm.d_sbp, - (simulation_box *)system->d_box, system->N, dt); - cudaThreadSynchronize (); + compute_blocks (&blocks, &block_size, system->N); + Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>> + (system->d_atoms, system->reaxprm.d_sbp, + (simulation_box *)system->d_box, system->N, dt); + cudaThreadSynchronize (); #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - "); + fprintf( stderr, "verlet1 - "); #endif - Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); - Cuda_Reset( system, control, data, workspace, lists ); + Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); + Cuda_Reset( system, control, data, workspace, lists ); - if( renbr ) { - Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true); - } + if( renbr ) { + Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true); + } - Cuda_Compute_Forces( system, control, data, workspace, lists, out_control ); + Cuda_Compute_Forces( system, control, data, workspace, lists, out_control ); - Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>> - (system->d_atoms, system->reaxprm.d_sbp, system->N, dt); - cudaThreadSynchronize (); + Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>> + (system->d_atoms, system->reaxprm.d_sbp, system->N, dt); + cudaThreadSynchronize (); #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet2\n"); + fprintf( stderr, "verlet2\n"); #endif } void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls 
*out_control ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i, itr, steps, renbr; - real inv_m, coef_v, dt, dt_sqr; - real E_kin_new, G_xi_new, v_xi_new, v_xi_old; - rvec dx; - thermostat *therm; - - dt = control->dt; - dt_sqr = SQR( dt ); - therm = &( data->therm ); - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); + int i, itr, steps, renbr; + real inv_m, coef_v, dt, dt_sqr; + real E_kin_new, G_xi_new, v_xi_new, v_xi_old; + rvec dx; + thermostat *therm; + + dt = control->dt; + dt_sqr = SQR( dt ); + therm = &( data->therm ); + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: ", data->step ); + fprintf( stderr, "step%d: ", data->step ); #endif #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Entering Velocity_Verlet_Nose_Hoover_NVT_Klein: coef to update velocity --> %6.10f\n", therm->v_xi_old); + fprintf (stderr, " Entering Velocity_Verlet_Nose_Hoover_NVT_Klein: coef to update velocity --> %6.10f\n", therm->v_xi_old); #endif - /* Compute x(t + dt) and copy old forces */ - for (i=0; i < system->N; i++) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + /* Compute x(t + dt) and copy old forces */ + for (i=0; i < system->N; i++) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v, - 0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f ); + rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v, + 0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f ); - Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); + Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); - rvec_Copy( workspace->f_old[i], system->atoms[i].f ); - } - /* Compute xi(t + dt) */ - therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); + rvec_Copy( 
workspace->f_old[i], system->atoms[i].f ); + } + /* Compute xi(t + dt) */ + therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - " ); + fprintf( stderr, "verlet1 - " ); #endif - Reallocate( system, workspace, lists, renbr ); - Reset( system, control, data, workspace, lists ); + Reallocate( system, workspace, lists, renbr ); + Reset( system, control, data, workspace, lists ); - if( renbr ) - Generate_Neighbor_Lists( system, control, data, workspace, - lists, out_control ); + if( renbr ) + Generate_Neighbor_Lists( system, control, data, workspace, + lists, out_control ); - /* Calculate Forces at time (t + dt) */ - Compute_Forces( system,control,data, workspace, lists, out_control ); + /* Calculate Forces at time (t + dt) */ + Compute_Forces( system,control,data, workspace, lists, out_control ); - /* Compute iteration constants for each atom's velocity */ - for( i = 0; i < system->N; ++i ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + /* Compute iteration constants for each atom's velocity */ + for( i = 0; i < system->N; ++i ) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_Scale( workspace->v_const[i], - 1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v ); - rvec_ScaledAdd( workspace->v_const[i], - 0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] ); - rvec_ScaledAdd( workspace->v_const[i], - 0.5 * dt * inv_m * -F_CONV, system->atoms[i].f ); + rvec_Scale( workspace->v_const[i], + 1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v ); + rvec_ScaledAdd( workspace->v_const[i], + 0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] ); + rvec_ScaledAdd( workspace->v_const[i], + 0.5 * dt * inv_m * -F_CONV, system->atoms[i].f ); #if defined(DEBUG) - fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n", - i, inv_m, 1.0 - 0.5 * dt * therm->v_xi, - 0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0], - workspace->v_const[i][1], workspace->v_const[i][2] ); + 
fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n", + i, inv_m, 1.0 - 0.5 * dt * therm->v_xi, + 0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0], + workspace->v_const[i][1], workspace->v_const[i][2] ); #endif - } + } - v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; - E_kin_new = G_xi_new = v_xi_old = 0; - itr = 0; - do { - itr++; + v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; + E_kin_new = G_xi_new = v_xi_old = 0; + itr = 0; + do { + itr++; - /* new values become old in this iteration */ - v_xi_old = v_xi_new; - coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old); - E_kin_new = 0; + /* new values become old in this iteration */ + v_xi_old = v_xi_new; + coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old); + E_kin_new = 0; #ifdef __DEBUG_CUDA__ - fprintf (stderr, " *********** coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old); - //print_sys_atoms (system); + fprintf (stderr, " *********** coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old); + //print_sys_atoms (system); #endif - for( i = 0; i < system->N; ++i ) { - rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] ); + for( i = 0; i < system->N; ++i ) { + rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] ); - E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * - rvec_Dot( system->atoms[i].v, system->atoms[i].v ) ); + E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * + rvec_Dot( system->atoms[i].v, system->atoms[i].v ) ); #if defined(DEBUG) - fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n", - itr, i, coef_v, v_xi_old ); + fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n", + itr, i, coef_v, v_xi_old ); #endif - } + } - G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - - data->N_f * K_B * control->T ); - v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); + G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - + data->N_f * K_B * control->T ); + 
v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); #if defined(DEBUG) - fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", - itr, G_xi_new, v_xi_new, v_xi_old ); + fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", + itr, G_xi_new, v_xi_new, v_xi_old ); #endif - } - while( fabs(v_xi_new - v_xi_old ) > 1e-5 ); + } + while( fabs(v_xi_new - v_xi_old ) > 1e-5 ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Iteration Count in NVE --> %d \n", itr ); + fprintf (stderr, " Iteration Count in NVE --> %d \n", itr ); #endif #ifndef __BUILD_DEBUG__ - therm->v_xi_old = therm->v_xi; - therm->v_xi = v_xi_new; - therm->G_xi = G_xi_new; + therm->v_xi_old = therm->v_xi; + therm->v_xi = v_xi_new; + therm->G_xi = G_xi_new; #endif #if defined(DEBUG_FOCUS) - fprintf( stderr,"vel scale\n" ); + fprintf( stderr,"vel scale\n" ); #endif } @@ -303,200 +303,200 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, /////////////////////////////////////////////////////////////////// GLOBAL void Compute_X_t_dt (real dt, real dt_sqr, thermostat p_therm, - reax_atom *atoms, single_body_parameters *sbp, - simulation_box *box, - static_storage p_workspace, int N) + reax_atom *atoms, single_body_parameters *sbp, + simulation_box *box, + static_storage p_workspace, int N) { - real inv_m; - rvec dx; - int i = blockIdx.x * blockDim.x + threadIdx.x; + real inv_m; + rvec dx; + int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + if (i >= N) return; - static_storage *workspace = &p_workspace; - thermostat *therm = &p_therm; + static_storage *workspace = &p_workspace; + thermostat *therm = &p_therm; - /* Compute x(t + dt) and copy old forces */ - //for (i=0; i < system->N; i++) { - inv_m = 1.0 / sbp[atoms[i].type].mass; + /* Compute x(t + dt) and copy old forces */ + //for (i=0; i < system->N; i++) { + inv_m = 1.0 / sbp[atoms[i].type].mass; - rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v, - 0.5 * dt_sqr * 
inv_m * -F_CONV, atoms[i].f ); + rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v, + 0.5 * dt_sqr * inv_m * -F_CONV, atoms[i].f ); - Inc_on_T3( atoms[i].x, dx, box ); + Inc_on_T3( atoms[i].x, dx, box ); - rvec_Copy( workspace->f_old[i], atoms[i].f ); - //} + rvec_Copy( workspace->f_old[i], atoms[i].f ); + //} } GLOBAL void Update_Velocity (reax_atom *atoms, single_body_parameters *sbp, - static_storage p_workspace, real dt, thermostat p_therm, - int N) + static_storage p_workspace, real dt, thermostat p_therm, + int N) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - real inv_m; - static_storage *workspace = &p_workspace; - thermostat *therm = &p_therm; - - //for( i = 0; i < system->N; ++i ) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - - rvec_Scale( workspace->v_const[i], - 1.0 - 0.5 * dt * therm->v_xi, atoms[i].v ); - rvec_ScaledAdd( workspace->v_const[i], - 0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] ); - rvec_ScaledAdd( workspace->v_const[i], - 0.5 * dt * inv_m * -F_CONV, atoms[i].f ); - //} + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + real inv_m; + static_storage *workspace = &p_workspace; + thermostat *therm = &p_therm; + + //for( i = 0; i < system->N; ++i ) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + + rvec_Scale( workspace->v_const[i], + 1.0 - 0.5 * dt * therm->v_xi, atoms[i].v ); + rvec_ScaledAdd( workspace->v_const[i], + 0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] ); + rvec_ScaledAdd( workspace->v_const[i], + 0.5 * dt * inv_m * -F_CONV, atoms[i].f ); + //} } GLOBAL void E_Kin_Reduction (reax_atom *atoms, static_storage p_workspace, - single_body_parameters *sbp, - real *per_block_results, real coef_v, const size_t n) + single_body_parameters *sbp, + real *per_block_results, real coef_v, const size_t n) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - static_storage *workspace = &p_workspace; - - if(i < n) - { - 
rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] ); - x = ( 0.5 * sbp[atoms[i].type].mass * - rvec_Dot( atoms[i].v, atoms[i].v ) ); - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + static_storage *workspace = &p_workspace; + + if(i < n) + { + rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] ); + x = ( 0.5 * sbp[atoms[i].type].mass * + rvec_Dot( atoms[i].v, atoms[i].v ) ); + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i, itr, steps, renbr; - real inv_m, coef_v, dt, dt_sqr; - real E_kin_new, G_xi_new, v_xi_new, v_xi_old; - rvec dx; - thermostat *therm; + int i, itr, steps, renbr; + real inv_m, coef_v, dt, dt_sqr; + real E_kin_new, G_xi_new, v_xi_new, v_xi_old; + rvec dx; + thermostat *therm; - real *results = (real *)scratch; + real *results = (real *)scratch; - dt = control->dt; - dt_sqr = SQR( dt ); - therm = &( data->therm ); - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); + dt = control->dt; + dt_sqr = SQR( dt ); + therm = &( data->therm ); + steps = data->step - 
data->prev_steps; + renbr = (steps % control->reneighbor == 0); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old); + fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old); #endif #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: ", data->step ); + fprintf( stderr, "step%d: ", data->step ); #endif - Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>> - (dt, dt_sqr, data->therm, system->d_atoms, - system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>> + (dt, dt_sqr, data->therm, system->d_atoms, + system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); - /* Compute xi(t + dt) */ - therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); + /* Compute xi(t + dt) */ + therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - " ); + fprintf( stderr, "verlet1 - " ); #endif - Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); - Cuda_Reset( system, control, data, workspace, lists ); + Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); + Cuda_Reset( system, control, data, workspace, lists ); - if( renbr ) { - //generate_neighbor_lists here - Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true); - } + if( renbr ) { + //generate_neighbor_lists here + Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true); + } - /* Calculate Forces at time (t + dt) */ - Cuda_Compute_Forces( system,control,data, workspace, lists, out_control ); + /* Calculate Forces at time (t + dt) */ + Cuda_Compute_Forces( system,control,data, workspace, lists, out_control ); - /* Compute iteration constants for each atom's velocity */ - 
Update_Velocity <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace, - dt, *therm, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); + /* Compute iteration constants for each atom's velocity */ + Update_Velocity <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace, + dt, *therm, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); - v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; - E_kin_new = G_xi_new = v_xi_old = 0; - itr = 0; - do { - itr++; + v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; + E_kin_new = G_xi_new = v_xi_old = 0; + itr = 0; + do { + itr++; - /* new values become old in this iteration */ - v_xi_old = v_xi_new; - coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old); - E_kin_new = 0; + /* new values become old in this iteration */ + v_xi_old = v_xi_new; + coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old); + E_kin_new = 0; - /*reduction for the E_Kin_new here*/ + /*reduction for the E_Kin_new here*/ #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old); + fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old); #endif - cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH ); - E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, - results, coef_v, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (results, results + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); - - G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - - data->N_f * K_B * control->T ); - v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); + cuda_memset (results, 
0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH ); + E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, + results, coef_v, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (results, results + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); + + G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - + data->N_f * K_B * control->T ); + v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); #if defined(DEBUG) - fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", - itr, G_xi_new, v_xi_new, v_xi_old ); + fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", + itr, G_xi_new, v_xi_new, v_xi_old ); #endif - } - while( fabs(v_xi_new - v_xi_old ) > 1e-5 ); + } + while( fabs(v_xi_new - v_xi_old ) > 1e-5 ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Iteration Count in NVE --> %d \n", itr ); + fprintf (stderr, " Iteration Count in NVE --> %d \n", itr ); #endif - therm->v_xi_old = therm->v_xi; - therm->v_xi = v_xi_new; - therm->G_xi = G_xi_new; + therm->v_xi_old = therm->v_xi; + therm->v_xi = v_xi_new; + therm->G_xi = G_xi_new; #if defined(DEBUG_FOCUS) - fprintf( stderr,"vel scale\n" ); + fprintf( stderr,"vel scale\n" ); #endif } @@ -509,109 +509,109 @@ void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, All box dimensions are scaled by the same amount, there is no change in the angles between axes. 
*/ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i, steps, renbr; - real inv_m, dt, lambda, mu; - rvec dx; + int i, steps, renbr; + real inv_m, dt, lambda, mu; + rvec dx; - dt = control->dt; - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); + dt = control->dt; + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); #if defined(DEBUG_FOCUS) - //fprintf( out_control->prs, - // "tau_t: %g tau_p: %g dt/tau_t: %g dt/tau_p: %g\n", - //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P ); - fprintf( stderr, "step %d: ", data->step ); + //fprintf( out_control->prs, + // "tau_t: %g tau_p: %g dt/tau_t: %g dt/tau_p: %g\n", + //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P ); + fprintf( stderr, "step %d: ", data->step ); #endif - /* velocity verlet, 1st part */ - for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, system->atoms[i].v, - 0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f ); - Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * -F_CONV * inv_m * dt, system->atoms[i].f ); - /*fprintf( stderr, "%6d %15.8f %15.8f %15.8f %15.8f %15.8f %15.8f\n", - workspace->orig_id[i], - system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], - 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], - 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], - 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */ - } + /* velocity verlet, 1st part */ + for( i = 0; i < system->N; i++ ) { + 
inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, system->atoms[i].v, + 0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f ); + Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * -F_CONV * inv_m * dt, system->atoms[i].f ); + /*fprintf( stderr, "%6d %15.8f %15.8f %15.8f %15.8f %15.8f %15.8f\n", + workspace->orig_id[i], + system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], + 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], + 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], + 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */ + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - " ); + fprintf( stderr, "verlet1 - " ); #endif - Reallocate( system, workspace, lists, renbr ); - Reset( system, control, data, workspace, lists ); - if( renbr ) { - Update_Grid( system ); - Generate_Neighbor_Lists( system, control, data, workspace, - lists, out_control ); - } - Compute_Forces( system, control, data, workspace, lists, out_control ); - - /* velocity verlet, 2nd part */ - for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - /* fprintf( stderr, "%6d %15f %15f %15f %15.8f %15.8f %15.8f\n", - workspace->orig_id[i], - system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], - 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], - 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], - 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/ - } - //Compute_Kinetic_Energy( system, data ); - Compute_Pressure_Isotropic( system, control, data, out_control ); + Reallocate( system, workspace, lists, renbr ); + Reset( system, control, data, workspace, lists ); + if( renbr ) { + Update_Grid( system ); + Generate_Neighbor_Lists( 
system, control, data, workspace, + lists, out_control ); + } + Compute_Forces( system, control, data, workspace, lists, out_control ); + + /* velocity verlet, 2nd part */ + for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + /* fprintf( stderr, "%6d %15f %15f %15f %15.8f %15.8f %15.8f\n", + workspace->orig_id[i], + system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], + 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], + 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], + 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/ + } + //Compute_Kinetic_Energy( system, data ); + Compute_Pressure_Isotropic( system, control, data, out_control ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet2 - " ); + fprintf( stderr, "verlet2 - " ); #endif - /* pressure scaler */ - mu = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]), - 1.0 / 3 ); - if( mu < MIN_dV ) - mu = MIN_dV; - else if( mu > MAX_dV ) - mu = MAX_dV; - - /* temperature scaler */ - lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); - if( lambda < MIN_dT ) - lambda = MIN_dT; - else if (lambda > MAX_dT ) - lambda = MAX_dT; - lambda = SQRT( lambda ); - - /* Scale velocities and positions at t+dt */ - for( i = 0; i < system->N; ++i ) { - rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v ); - /* IMPORTANT: What Adri does with scaling positions first to - unit coordinates and then back to cartesian coordinates essentially - is scaling the coordinates with mu^2. However, this causes unphysical - modifications on the system because box dimensions - are being scaled with mu! We need to discuss this with Adri! 
*/ - rvec_Scale( system->atoms[i].x, mu, system->atoms[i].x ); - } - //Compute_Kinetic_Energy( system, data ); + /* pressure scaler */ + mu = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]), + 1.0 / 3 ); + if( mu < MIN_dV ) + mu = MIN_dV; + else if( mu > MAX_dV ) + mu = MAX_dV; + + /* temperature scaler */ + lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); + if( lambda < MIN_dT ) + lambda = MIN_dT; + else if (lambda > MAX_dT ) + lambda = MAX_dT; + lambda = SQRT( lambda ); + + /* Scale velocities and positions at t+dt */ + for( i = 0; i < system->N; ++i ) { + rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v ); + /* IMPORTANT: What Adri does with scaling positions first to + unit coordinates and then back to cartesian coordinates essentially + is scaling the coordinates with mu^2. However, this causes unphysical + modifications on the system because box dimensions + are being scaled with mu! We need to discuss this with Adri! */ + rvec_Scale( system->atoms[i].x, mu, system->atoms[i].x ); + } + //Compute_Kinetic_Energy( system, data ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "scaling - " ); + fprintf( stderr, "scaling - " ); #endif - Update_Box_Isotropic( &(system->box), mu ); + Update_Box_Isotropic( &(system->box), mu ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "updated box\n" ); + fprintf( stderr, "updated box\n" ); #endif } @@ -620,112 +620,112 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, All box dimensions are scaled by the same amount, there is no change in the angles between axes. 
*/ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i, d, steps, renbr; - real dt, inv_m, lambda; - rvec dx, mu; + int i, d, steps, renbr; + real dt, inv_m, lambda; + rvec dx, mu; - dt = control->dt; - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); + dt = control->dt; + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); #if defined(DEBUG_FOCUS) - //fprintf( out_control->prs, - // "tau_t: %g tau_p: %g dt/tau_t: %g dt/tau_p: %g\n", - //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P ); - fprintf( stderr, "step %d: ", data->step ); + //fprintf( out_control->prs, + // "tau_t: %g tau_p: %g dt/tau_t: %g dt/tau_p: %g\n", + //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P ); + fprintf( stderr, "step %d: ", data->step ); #endif - /* velocity verlet, 1st part */ - for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, system->atoms[i].v, - 0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f ); - Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * -F_CONV * inv_m * dt, system->atoms[i].f ); - /*fprintf( stderr, "%6d %15.8f %15.8f %15.8f %15.8f %15.8f %15.8f\n", - workspace->orig_id[i], - system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], - 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], - 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], - 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */ - } + /* velocity verlet, 1st part */ + for( i = 0; i < system->N; 
i++ ) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, system->atoms[i].v, + 0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f ); + Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * -F_CONV * inv_m * dt, system->atoms[i].f ); + /*fprintf( stderr, "%6d %15.8f %15.8f %15.8f %15.8f %15.8f %15.8f\n", + workspace->orig_id[i], + system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], + 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], + 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], + 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */ + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - " ); + fprintf( stderr, "verlet1 - " ); #endif - Reallocate( system, workspace, lists, renbr ); - Reset( system, control, data, workspace, lists ); - if( renbr ) { - Update_Grid( system ); - Generate_Neighbor_Lists( system, control, data, workspace, - lists, out_control ); - } - Compute_Forces( system, control, data, workspace, lists, out_control ); - - /* velocity verlet, 2nd part */ - for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - /* fprintf( stderr, "%6d %15f %15f %15f %15.8f %15.8f %15.8f\n", - workspace->orig_id[i], - system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], - 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], - 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], - 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/ - } - //Compute_Kinetic_Energy( system, data ); - Compute_Pressure_Isotropic( system, control, data, out_control ); + Reallocate( system, workspace, lists, renbr ); + Reset( system, control, data, workspace, lists ); + if( renbr ) { + Update_Grid( system ); + 
Generate_Neighbor_Lists( system, control, data, workspace, + lists, out_control ); + } + Compute_Forces( system, control, data, workspace, lists, out_control ); + + /* velocity verlet, 2nd part */ + for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + /* fprintf( stderr, "%6d %15f %15f %15f %15.8f %15.8f %15.8f\n", + workspace->orig_id[i], + system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], + 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], + 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], + 0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/ + } + //Compute_Kinetic_Energy( system, data ); + Compute_Pressure_Isotropic( system, control, data, out_control ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet2 - " ); + fprintf( stderr, "verlet2 - " ); #endif - /* pressure scaler */ - for( d = 0; d < 3; ++d ){ - mu[d] = POW( 1.0+(dt/control->Tau_P[d])*(data->tot_press[d]-control->P[d]), - 1.0 / 3 ); - if( mu[d] < MIN_dV ) - mu[d] = MIN_dV; - else if( mu[d] > MAX_dV ) - mu[d] = MAX_dV; - } - - /* temperature scaler */ - lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); - if( lambda < MIN_dT ) - lambda = MIN_dT; - else if (lambda > MAX_dT ) - lambda = MAX_dT; - lambda = SQRT( lambda ); - - /* Scale velocities and positions at t+dt */ - for( i = 0; i < system->N; ++i ) { - rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v ); - /* IMPORTANT: What Adri does with scaling positions first to - unit coordinates and then back to cartesian coordinates essentially - is scaling the coordinates with mu^2. However, this causes unphysical - modifications on the system because box dimensions - are being scaled with mu! We need to discuss this with Adri! 
*/ - for( d = 0; d < 3; ++d ) - system->atoms[i].x[d] = system->atoms[i].x[d] * mu[d]; - } - //Compute_Kinetic_Energy( system, data ); + /* pressure scaler */ + for( d = 0; d < 3; ++d ){ + mu[d] = POW( 1.0+(dt/control->Tau_P[d])*(data->tot_press[d]-control->P[d]), + 1.0 / 3 ); + if( mu[d] < MIN_dV ) + mu[d] = MIN_dV; + else if( mu[d] > MAX_dV ) + mu[d] = MAX_dV; + } + + /* temperature scaler */ + lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); + if( lambda < MIN_dT ) + lambda = MIN_dT; + else if (lambda > MAX_dT ) + lambda = MAX_dT; + lambda = SQRT( lambda ); + + /* Scale velocities and positions at t+dt */ + for( i = 0; i < system->N; ++i ) { + rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v ); + /* IMPORTANT: What Adri does with scaling positions first to + unit coordinates and then back to cartesian coordinates essentially + is scaling the coordinates with mu^2. However, this causes unphysical + modifications on the system because box dimensions + are being scaled with mu! We need to discuss this with Adri! 
*/ + for( d = 0; d < 3; ++d ) + system->atoms[i].x[d] = system->atoms[i].x[d] * mu[d]; + } + //Compute_Kinetic_Energy( system, data ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "scaling - " ); + fprintf( stderr, "scaling - " ); #endif - Update_Box_SemiIsotropic( &(system->box), mu ); + Update_Box_SemiIsotropic( &(system->box), mu ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "updated box & grid\n" ); + fprintf( stderr, "updated box & grid\n" ); #endif } @@ -741,243 +741,243 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, #ifdef ANISOTROPIC void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i; - real inv_m; - real dt = control->dt; - real dt_sqr = SQR(dt); - rvec dx; - - for (i=0; i < system->N; i++) - { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - - // Compute x(t + dt) - rvec_ScaledSum( dx, dt, system->atoms[i].v, - 0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f ); - Inc_on_T3_Gen( system->atoms[i].x, dx, &(system->box) ); - - // Compute v(t + dt/2) - rvec_ScaledAdd( system->atoms[i].v, - -0.5 * dt * data->therm.xi, system->atoms[i].v ); - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - } - - // Compute zeta(t + dt/2), E_Kininetic(t + dt/2) - // IMPORTANT: What will be the initial value of zeta? and what is g? 
- data->therm.xi += 0.5 * dt * control->Tau_T * - ( 2.0 * data->E_Kin - data->N_f * K_B * control->T ); - - Reset( system, control, data, workspace ); - fprintf(out_control->log,"reset-"); fflush( out_control->log ); - - Generate_Neighbor_Lists( system, control, data, workspace, - lists, out_control ); - fprintf(out_control->log,"nbrs-"); fflush( out_control->log ); - - /* QEq( system, control, workspace, lists[FAR_NBRS], out_control ); - fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */ - - Compute_Forces( system, control, data, workspace, lists, out_control ); - fprintf(out_control->log,"forces\n"); fflush( out_control->log ); - - //Compute_Kinetic_Energy( system, data ); - - for( i = 0; i < system->N; i++ ) - { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - - // compute v(t + dt) - rvec_ScaledAdd( system->atoms[i].v, - -0.5 * dt * data->therm.xi, system->atoms[i].v ); - rvec_ScaledAdd( system->atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - } - - // Compute zeta(t + dt) - data->therm.xi += 0.5*dt * control->Tau_T * ( 2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - - fprintf( out_control->log,"Xi: %8.3f %8.3f %8.3f\n", - data->therm.xi, data->E_Kin, data->N_f * K_B * control->T ); - fflush( out_control->log ); + int i; + real inv_m; + real dt = control->dt; + real dt_sqr = SQR(dt); + rvec dx; + + for (i=0; i < system->N; i++) + { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + + // Compute x(t + dt) + rvec_ScaledSum( dx, dt, system->atoms[i].v, + 0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f ); + Inc_on_T3_Gen( system->atoms[i].x, dx, &(system->box) ); + + // Compute v(t + dt/2) + rvec_ScaledAdd( system->atoms[i].v, + -0.5 * dt * data->therm.xi, system->atoms[i].v ); + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + } + + // Compute zeta(t + dt/2), E_Kininetic(t + dt/2) + // IMPORTANT: What will be the initial value of zeta? 
and what is g? + data->therm.xi += 0.5 * dt * control->Tau_T * + ( 2.0 * data->E_Kin - data->N_f * K_B * control->T ); + + Reset( system, control, data, workspace ); + fprintf(out_control->log,"reset-"); fflush( out_control->log ); + + Generate_Neighbor_Lists( system, control, data, workspace, + lists, out_control ); + fprintf(out_control->log,"nbrs-"); fflush( out_control->log ); + + /* QEq( system, control, workspace, lists[FAR_NBRS], out_control ); + fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */ + + Compute_Forces( system, control, data, workspace, lists, out_control ); + fprintf(out_control->log,"forces\n"); fflush( out_control->log ); + + //Compute_Kinetic_Energy( system, data ); + + for( i = 0; i < system->N; i++ ) + { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + + // compute v(t + dt) + rvec_ScaledAdd( system->atoms[i].v, + -0.5 * dt * data->therm.xi, system->atoms[i].v ); + rvec_ScaledAdd( system->atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + } + + // Compute zeta(t + dt) + data->therm.xi += 0.5*dt * control->Tau_T * ( 2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + + fprintf( out_control->log,"Xi: %8.3f %8.3f %8.3f\n", + data->therm.xi, data->E_Kin, data->N_f * K_B * control->T ); + fflush( out_control->log ); } void Velocity_Verlet_Isotropic_NPT( reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i, itr; - real deps, v_eps_new=0, v_eps_old=0, G_xi_new; - real dxi, v_xi_new=0, v_xi_old=0, a_eps_new; - real inv_m, exp_deps, inv_3V; - real E_kin, P_int, P_int_const; - real coef_v, coef_v_eps; - real dt = control->dt; - real dt_sqr = SQR( dt ); - thermostat *therm = &( data->therm ); - isotropic_barostat *iso_bar = &( data->iso_bar ); - simulation_box *box = 
&( system->box ); - rvec dx, dv; - - // Here we just calculate how much to increment eps, xi, v_eps, v_xi. - // Commits are done after positions and velocities of atoms are updated - // because position, velocity updates uses v_eps, v_xi terms; - // yet we need EXP( deps ) to be able to calculate - // positions and velocities accurately. - iso_bar->a_eps = control->Tau_P * - ( 3.0 * box->volume * (iso_bar->P - control->P) + - 6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi; - deps = dt * iso_bar->v_eps + 0.5 * dt_sqr * iso_bar->a_eps; - exp_deps = EXP( deps ); - - therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin + - SQR( iso_bar->v_eps ) / control->Tau_P - - (data->N_f +1) * K_B * control->T ); - dxi = therm->v_xi * dt + 0.5 * therm->G_xi * dt_sqr; - - fprintf(out_control->log, "a: %12.6f eps: %12.6f deps: %12.6f\n", - iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps); - fprintf(out_control->log, "G: %12.6f xi : %12.6f dxi : %12.6f\n", - therm->G_xi, therm->v_xi, therm->xi ); - - // Update positions and velocities - // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values - // in the iteration not the old values at time t or before! 
- for (i=0; i < system->N; i++) - { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - - // Compute x(t + dt) - rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f, - -( (2.0 + 3.0/data->N_f) * iso_bar->v_eps + therm->v_xi ), - system->atoms[i].v ); - rvec_ScaledSum( dx, dt, system->atoms[i].v, - 0.5 * dt_sqr, workspace->a[i] ); - Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); - rvec_Scale( system->atoms[i].x, exp_deps, system->atoms[i].x ); - } - - // Commit updates - therm->xi += dxi; - iso_bar->eps += deps; - //Update_Box_Isotropic( EXP( 3.0 * iso_bar->eps ), &(system->box) ); - Update_Box_Isotropic( &(system->box), EXP( 3.0 * iso_bar->eps ) ); - - - // Calculate new forces, f(t + dt) - Reset( system, control, data, workspace ); - fprintf(out_control->log,"reset-"); fflush( out_control->log ); - - Generate_Neighbor_Lists( system, control, data, workspace, - lists, out_control ); - fprintf(out_control->log,"nbrs-"); fflush( out_control->log ); - - /* QEq( system, control, workspace, lists[FAR_NBRS], out_control ); - fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */ - - Compute_Forces( system, control, data, workspace, lists, out_control ); - fprintf(out_control->log,"forces\n"); fflush( out_control->log ); - - - // Compute iteration constants for each atom's velocity and for P_internal - // Compute kinetic energy for initial velocities of the iteration - P_int_const = E_kin = 0; - for( i = 0; i < system->N; ++i ) - { - inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; - - rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i], - 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); - rvec_Add( dv, system->atoms[i].v ); - rvec_Scale( workspace->v_const[i], exp_deps, dv ); - - P_int_const += ( -F_CONV * - rvec_Dot( system->atoms[i].f, system->atoms[i].x ) ); - - E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * - rvec_Dot( system->atoms[i].v, system->atoms[i].v ) ); - } - - - // Compute initial 
p_int - inv_3V = 1.0 / (3.0 * system->box.volume); - P_int = inv_3V * ( 2.0 * E_kin + P_int_const ); - - v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; - v_eps_new = iso_bar->v_eps_old + 2.0 * dt * iso_bar->a_eps; - - itr = 0; - do - { - itr++; - // new values become old in this iteration - v_xi_old = v_xi_new; - v_eps_old = v_eps_new; - - - for( i = 0; i < system->N; ++i ) - { - coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps * - ( (2.0 + 3.0/data->N_f) * v_eps_old + v_xi_old ) ); - rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] ); - } - - - coef_v_eps = 1.0 / (1.0 + 0.5 * dt * v_xi_old); - a_eps_new = 3.0 * control->Tau_P * - ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f ); - v_eps_new = coef_v_eps * ( iso_bar->v_eps + - 0.5 * dt * ( iso_bar->a_eps + a_eps_new ) ); + int i, itr; + real deps, v_eps_new=0, v_eps_old=0, G_xi_new; + real dxi, v_xi_new=0, v_xi_old=0, a_eps_new; + real inv_m, exp_deps, inv_3V; + real E_kin, P_int, P_int_const; + real coef_v, coef_v_eps; + real dt = control->dt; + real dt_sqr = SQR( dt ); + thermostat *therm = &( data->therm ); + isotropic_barostat *iso_bar = &( data->iso_bar ); + simulation_box *box = &( system->box ); + rvec dx, dv; + + // Here we just calculate how much to increment eps, xi, v_eps, v_xi. + // Commits are done after positions and velocities of atoms are updated + // because position, velocity updates uses v_eps, v_xi terms; + // yet we need EXP( deps ) to be able to calculate + // positions and velocities accurately. 
+ iso_bar->a_eps = control->Tau_P * + ( 3.0 * box->volume * (iso_bar->P - control->P) + + 6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi; + deps = dt * iso_bar->v_eps + 0.5 * dt_sqr * iso_bar->a_eps; + exp_deps = EXP( deps ); + + therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin + + SQR( iso_bar->v_eps ) / control->Tau_P - + (data->N_f +1) * K_B * control->T ); + dxi = therm->v_xi * dt + 0.5 * therm->G_xi * dt_sqr; + + fprintf(out_control->log, "a: %12.6f eps: %12.6f deps: %12.6f\n", + iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps); + fprintf(out_control->log, "G: %12.6f xi : %12.6f dxi : %12.6f\n", + therm->G_xi, therm->v_xi, therm->xi ); + + // Update positions and velocities + // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values + // in the iteration not the old values at time t or before! + for (i=0; i < system->N; i++) + { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + + // Compute x(t + dt) + rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f, + -( (2.0 + 3.0/data->N_f) * iso_bar->v_eps + therm->v_xi ), + system->atoms[i].v ); + rvec_ScaledSum( dx, dt, system->atoms[i].v, + 0.5 * dt_sqr, workspace->a[i] ); + Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); + rvec_Scale( system->atoms[i].x, exp_deps, system->atoms[i].x ); + } + + // Commit updates + therm->xi += dxi; + iso_bar->eps += deps; + //Update_Box_Isotropic( EXP( 3.0 * iso_bar->eps ), &(system->box) ); + Update_Box_Isotropic( &(system->box), EXP( 3.0 * iso_bar->eps ) ); + + + // Calculate new forces, f(t + dt) + Reset( system, control, data, workspace ); + fprintf(out_control->log,"reset-"); fflush( out_control->log ); + + Generate_Neighbor_Lists( system, control, data, workspace, + lists, out_control ); + fprintf(out_control->log,"nbrs-"); fflush( out_control->log ); + + /* QEq( system, control, workspace, lists[FAR_NBRS], out_control ); + fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */ + + Compute_Forces( system, 
control, data, workspace, lists, out_control ); + fprintf(out_control->log,"forces\n"); fflush( out_control->log ); + + + // Compute iteration constants for each atom's velocity and for P_internal + // Compute kinetic energy for initial velocities of the iteration + P_int_const = E_kin = 0; + for( i = 0; i < system->N; ++i ) + { + inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; + + rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i], + 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); + rvec_Add( dv, system->atoms[i].v ); + rvec_Scale( workspace->v_const[i], exp_deps, dv ); + + P_int_const += ( -F_CONV * + rvec_Dot( system->atoms[i].f, system->atoms[i].x ) ); + + E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * + rvec_Dot( system->atoms[i].v, system->atoms[i].v ) ); + } + + + // Compute initial p_int + inv_3V = 1.0 / (3.0 * system->box.volume); + P_int = inv_3V * ( 2.0 * E_kin + P_int_const ); + + v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; + v_eps_new = iso_bar->v_eps_old + 2.0 * dt * iso_bar->a_eps; + + itr = 0; + do + { + itr++; + // new values become old in this iteration + v_xi_old = v_xi_new; + v_eps_old = v_eps_new; + + + for( i = 0; i < system->N; ++i ) + { + coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps * + ( (2.0 + 3.0/data->N_f) * v_eps_old + v_xi_old ) ); + rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] ); + } + + + coef_v_eps = 1.0 / (1.0 + 0.5 * dt * v_xi_old); + a_eps_new = 3.0 * control->Tau_P * + ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f ); + v_eps_new = coef_v_eps * ( iso_bar->v_eps + + 0.5 * dt * ( iso_bar->a_eps + a_eps_new ) ); - G_xi_new = control->Tau_T * ( 2.0 * E_kin + - SQR( v_eps_old ) / control->Tau_P - - (data->N_f + 1) * K_B * control->T ); - v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); + G_xi_new = control->Tau_T * ( 2.0 * E_kin + + SQR( v_eps_old ) / control->Tau_P - + (data->N_f + 1) * K_B * control->T ); + v_xi_new = therm->v_xi + 0.5 * dt 
* ( therm->G_xi + G_xi_new ); - E_kin = 0; - for( i = 0; i < system->N; ++i ) - E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * - rvec_Dot( system->atoms[i].v, system->atoms[i].v ) ); + E_kin = 0; + for( i = 0; i < system->N; ++i ) + E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * + rvec_Dot( system->atoms[i].v, system->atoms[i].v ) ); - P_int = inv_3V * ( 2.0*E_kin + P_int_const ); + P_int = inv_3V * ( 2.0*E_kin + P_int_const ); - fprintf( out_control->log, - "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n", - itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old ); - } - while( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 ); + fprintf( out_control->log, + "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n", + itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old ); + } + while( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 ); - therm->v_xi_old = therm->v_xi; - therm->v_xi = v_xi_new; - therm->G_xi = G_xi_new; + therm->v_xi_old = therm->v_xi; + therm->v_xi = v_xi_new; + therm->G_xi = G_xi_new; - iso_bar->v_eps_old = iso_bar->v_eps; - iso_bar->v_eps = v_eps_new; - iso_bar->a_eps = a_eps_new; + iso_bar->v_eps_old = iso_bar->v_eps; + iso_bar->v_eps = v_eps_new; + iso_bar->a_eps = a_eps_new; - fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n", - system->box.volume, - system->box.box[0][0],system->box.box[1][1],system->box.box[2][2] ); - fprintf(out_control->log,"eps:\ta- %8.3f v- %8.3f eps- %8.3f\n", - iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps); - fprintf(out_control->log,"xi: \tG- %8.3f v- %8.3f xi - %8.3f\n", - therm->G_xi, therm->v_xi, therm->xi); + fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n", + system->box.volume, + system->box.box[0][0],system->box.box[1][1],system->box.box[2][2] ); + fprintf(out_control->log,"eps:\ta- %8.3f v- %8.3f eps- %8.3f\n", + iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps); + 
fprintf(out_control->log,"xi: \tG- %8.3f v- %8.3f xi - %8.3f\n", + therm->G_xi, therm->v_xi, therm->xi); } #endif @@ -989,256 +989,256 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system, All box dimensions are scaled by the same amount, there is no change in the angles between axes. */ void Velocity_Verlet_Berendsen_NVT( reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control - ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control + ) { - int i, steps, renbr; - real inv_m, dt, lambda; - rvec dx; - reax_atom *atom; + int i, steps, renbr; + real inv_m, dt, lambda; + rvec dx; + reax_atom *atom; - fprintf (stderr, " Velocity_Verlet_Berendsen_NVT: step :%d \n", data->step); + fprintf (stderr, " Velocity_Verlet_Berendsen_NVT: step :%d \n", data->step); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d\n", data->step ); + fprintf( stderr, "step%d\n", data->step ); #endif - dt = control->dt; - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); - - /* velocity verlet, 1st part */ - for( i = 0; i < system->N; i++ ) { - atom = &(system->atoms[i]); - inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); - } + dt = control->dt; + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); + + /* velocity verlet, 1st part */ + for( i = 0; i < system->N; i++ ) { + atom = &(system->atoms[i]); + inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( 
atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); + } #if defined(DEBUG_FOCUS) - fprintf(stderr, "step%d: verlet1 done\n", data->step); + fprintf(stderr, "step%d: verlet1 done\n", data->step); #endif - Reallocate( system, workspace, lists, renbr ); - Reset( system, control, data, workspace, lists ); + Reallocate( system, workspace, lists, renbr ); + Reset( system, control, data, workspace, lists ); - if( renbr ) - Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control ); + if( renbr ) + Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control ); - Compute_Forces( system, control, data, workspace, - lists, out_control ); + Compute_Forces( system, control, data, workspace, + lists, out_control ); - /* velocity verlet, 2nd part */ - for( i = 0; i < system->N; i++ ) { - atom = &(system->atoms[i]); - inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); - } + /* velocity verlet, 2nd part */ + for( i = 0; i < system->N; i++ ) { + atom = &(system->atoms[i]); + inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); + } #if defined(DEBUG_FOCUS) - fprintf(stderr, "step%d: verlet2 done\n", data->step); + fprintf(stderr, "step%d: verlet2 done\n", data->step); #endif - /* temperature scaler */ - Compute_Kinetic_Energy( system, data ); - lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); - if( lambda < MIN_dT ) - lambda = MIN_dT; - else if (lambda > MAX_dT ) - lambda = MAX_dT; - lambda = SQRT( lambda ); - - /* Scale velocities and positions at t+dt */ - for( i = 0; i < system->N; ++i ) { - atom = &(system->atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); - } - Compute_Kinetic_Energy( system, data ); + /* temperature scaler */ + Compute_Kinetic_Energy( system, data ); + lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T 
- 1.0); + if( lambda < MIN_dT ) + lambda = MIN_dT; + else if (lambda > MAX_dT ) + lambda = MAX_dT; + lambda = SQRT( lambda ); + + /* Scale velocities and positions at t+dt */ + for( i = 0; i < system->N; ++i ) { + atom = &(system->atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); + } + Compute_Kinetic_Energy( system, data ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: scaled velocities\n", - data->step ); + fprintf( stderr, "step%d: scaled velocities\n", + data->step ); #endif } GLOBAL void ker_update_velocity_1 (reax_atom *atoms, - single_body_parameters *sbp, - real dt, - simulation_box *box, - int N) + single_body_parameters *sbp, + real dt, + simulation_box *box, + int N) { - real inv_m; - rvec dx; - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - /* velocity verlet, 1st part */ - //for( i = 0; i < system->n; i++ ) { - atom = &(atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - - /* Metin's suggestion to rebox the atoms */ - /* bNVT fix */ - Inc_on_T3( atoms[i].x, dx, box ); - /* bNVT fix */ - - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); - //} + real inv_m; + rvec dx; + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + /* velocity verlet, 1st part */ + //for( i = 0; i < system->n; i++ ) { + atom = &(atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + + /* Metin's suggestion to rebox the atoms */ + /* bNVT fix */ + Inc_on_T3( atoms[i].x, dx, box ); + /* bNVT fix */ + + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); + //} } void bNVT_update_velocity_part1 (reax_system *system, simulation_box *box, real dt) { - 
ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } GLOBAL void ker_update_velocity_2 (reax_atom *atoms, - single_body_parameters *sbp, - real dt, - int N) + single_body_parameters *sbp, + real dt, + int N) { - reax_atom *atom; - real inv_m; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - /* velocity verlet, 2nd part */ - //for( i = 0; i < system->n; i++ ) { - atom = &(atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); - //} + reax_atom *atom; + real inv_m; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + /* velocity verlet, 2nd part */ + //for( i = 0; i < system->n; i++ ) { + atom = &(atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); + //} } void bNVT_update_velocity_part2 (reax_system *system, real dt) { - ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, system->reaxprm.d_sbp, dt, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, system->reaxprm.d_sbp, dt, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } GLOBAL void ker_scale_velocities (reax_atom *atoms, real lambda, int N) { - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - /* Scale velocities and positions at t+dt */ - //for( i = 0; i < system->n; ++i ) { - atom = &(atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); - //} + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + /* Scale velocities and 
positions at t+dt */ + //for( i = 0; i < system->n; ++i ) { + atom = &(atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); + //} } void bNVT_scale_velocities (reax_system *system, real lambda) { - ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, lambda, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, lambda, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control - ) + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control + ) { - int i, steps, renbr; - real inv_m, dt, lambda; - rvec dx; - reax_atom *atom; + int i, steps, renbr; + real inv_m, dt, lambda; + rvec dx; + reax_atom *atom; #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d\n", data->step ); + fprintf( stderr, "step%d\n", data->step ); #endif - dt = control->dt; - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); - - /* velocity verlet, 1st part - for( i = 0; i < system->N; i++ ) { - atom = &(system->atoms[i]); - inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; - // Compute x(t + dt) - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - // Compute v(t + dt/2) - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); - } - */ - bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt); + dt = control->dt; + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); + + /* velocity verlet, 1st part + for( i = 0; i < system->N; i++ ) { + atom = &(system->atoms[i]); + inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; + // Compute x(t + dt) + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * 
SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + // Compute v(t + dt/2) + rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); + } + */ + bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt); #if defined(DEBUG_FOCUS) - fprintf(stderr, "step%d: verlet1 done\n", data->step); + fprintf(stderr, "step%d: verlet1 done\n", data->step); #endif - Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); - Cuda_Reset( system, control, data, workspace, lists ); - - if( renbr ) { - Cuda_Generate_Neighbor_Lists( system, workspace, control, true); - } - - Cuda_Compute_Forces( system, control, data, workspace, - lists, out_control ); - - /* velocity verlet, 2nd part - for( i = 0; i < system->N; i++ ) { - atom = &(system->atoms[i]); - inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; - // Compute v(t + dt) - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); - } - */ - bNVT_update_velocity_part2 (system, dt); + Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); + Cuda_Reset( system, control, data, workspace, lists ); + + if( renbr ) { + Cuda_Generate_Neighbor_Lists( system, workspace, control, true); + } + + Cuda_Compute_Forces( system, control, data, workspace, + lists, out_control ); + + /* velocity verlet, 2nd part + for( i = 0; i < system->N; i++ ) { + atom = &(system->atoms[i]); + inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; + // Compute v(t + dt) + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); + } + */ + bNVT_update_velocity_part2 (system, dt); #if defined(DEBUG_FOCUS) - fprintf(stderr, "step%d: verlet2 done\n", data->step); + fprintf(stderr, "step%d: verlet2 done\n", data->step); #endif - /* temperature scaler */ - Cuda_Compute_Kinetic_Energy( system, data ); - //get the latest temperature from the device to the host. 
- copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm, - sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - - lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); - if( lambda < MIN_dT ) - lambda = MIN_dT; - else if (lambda > MAX_dT ) - lambda = MAX_dT; - lambda = SQRT( lambda ); - - //fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda); - - /* Scale velocities and positions at t+dt - for( i = 0; i < system->N; ++i ) { - atom = &(system->atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); - } - */ - bNVT_scale_velocities (system, lambda); - Cuda_Compute_Kinetic_Energy( system, data ); + /* temperature scaler */ + Cuda_Compute_Kinetic_Energy( system, data ); + //get the latest temperature from the device to the host. + copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm, + sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + + lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); + if( lambda < MIN_dT ) + lambda = MIN_dT; + else if (lambda > MAX_dT ) + lambda = MAX_dT; + lambda = SQRT( lambda ); + + //fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda); + + /* Scale velocities and positions at t+dt + for( i = 0; i < system->N; ++i ) { + atom = &(system->atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); + } + */ + bNVT_scale_velocities (system, lambda); + Cuda_Compute_Kinetic_Energy( system, data ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: scaled velocities\n", - data->step ); + fprintf( stderr, "step%d: scaled velocities\n", + data->step ); #endif } diff --git a/PuReMD-GPU/src/list.cu b/PuReMD-GPU/src/list.cu index 5412c951..095409aa 100644 --- a/PuReMD-GPU/src/list.cu +++ b/PuReMD-GPU/src/list.cu @@ -23,213 +23,213 @@ HOST char Make_List(int n, int num_intrs, int type, list* l, int proc) { - char success=1; - - if (proc == TYP_HOST) { - - l->n = n; - l->num_intrs = num_intrs; - - 
l->index = (int*) malloc( n * sizeof(int) ); - l->end_index = (int*) malloc( n * sizeof(int) ); - - if (l->index == NULL) success = 0; - if (l->end_index == NULL) success = 0; - - l->type = type; - - switch(type) - { - case TYP_VOID: - l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); - if (l->select.v == NULL) success = 0; - break; - - case TYP_THREE_BODY: - l->select.three_body_list = (three_body_interaction_data*) - malloc(l->num_intrs*sizeof(three_body_interaction_data)); - if (l->select.three_body_list == NULL) success = 0; - break; - - case TYP_BOND: - l->select.bond_list = (bond_data*) - malloc(l->num_intrs * sizeof(bond_data)); - if (l->select.bond_list == NULL) success = 0; - break; - - case TYP_DBO: - l->select.dbo_list = (dbond_data*) - malloc(l->num_intrs * sizeof(dbond_data)); - if (l->select.dbo_list == NULL) success = 0; - break; - - case TYP_DDELTA: - l->select.dDelta_list = (dDelta_data*) - malloc(l->num_intrs*sizeof(dDelta_data)); - if (l->select.dDelta_list == NULL) success = 0; - break; - - case TYP_FAR_NEIGHBOR: - l->select.far_nbr_list = (far_neighbor_data*) - malloc(l->num_intrs*sizeof(far_neighbor_data)); - if (l->select.far_nbr_list == NULL) success = 0; - break; - - case TYP_NEAR_NEIGHBOR: - l->select.near_nbr_list = (near_neighbor_data*) - malloc(l->num_intrs*sizeof(near_neighbor_data)); - if (l->select.near_nbr_list == NULL) success = 0; - break; - - case TYP_HBOND: - l->select.hbond_list = (hbond_data*) - malloc( l->num_intrs * sizeof(hbond_data) ); - if (l->select.hbond_list == NULL) success = 0; - break; - - default: - l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); - if (l->select.v == NULL) success = 0; - l->type = TYP_VOID; - break; - } - - } - else - { - l->n = n; - l->num_intrs = num_intrs; - - cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX ); - cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX ); - - switch(type) - { - case TYP_FAR_NEIGHBOR: - cuda_malloc ((void **) 
&l->select.far_nbr_list, - l->num_intrs*sizeof(far_neighbor_data), - 1, LIST_FAR_NEIGHBOR_DATA); - /* - cudaHostAlloc ((void **) &l->select.far_nbr_list, - l->num_intrs*sizeof(far_neighbor_data), - cudaHostAllocMapped); - - cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, - (void *)l->select.far_nbr_list, 0); - */ - break; - - case TYP_HBOND: - cuda_malloc ((void **) &l->select.hbond_list, - l->num_intrs * sizeof(hbond_data), - 1, LIST_HBOND_DATA ); - break; - - case TYP_BOND: - cuda_malloc ((void **) &l->select.bond_list, - l->num_intrs * sizeof(bond_data), - 1, LIST_BOND_DATA ); - break; - - case TYP_THREE_BODY: - cuda_malloc ( (void **) &l->select.three_body_list, - l->num_intrs * sizeof(three_body_interaction_data), - 1, LIST_THREE_BODY_DATA ); - break; - - default: - fprintf (stderr, "Unknown list creation \n" ); - exit (1); - } - } - - return success; + char success=1; + + if (proc == TYP_HOST) { + + l->n = n; + l->num_intrs = num_intrs; + + l->index = (int*) malloc( n * sizeof(int) ); + l->end_index = (int*) malloc( n * sizeof(int) ); + + if (l->index == NULL) success = 0; + if (l->end_index == NULL) success = 0; + + l->type = type; + + switch(type) + { + case TYP_VOID: + l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); + if (l->select.v == NULL) success = 0; + break; + + case TYP_THREE_BODY: + l->select.three_body_list = (three_body_interaction_data*) + malloc(l->num_intrs*sizeof(three_body_interaction_data)); + if (l->select.three_body_list == NULL) success = 0; + break; + + case TYP_BOND: + l->select.bond_list = (bond_data*) + malloc(l->num_intrs * sizeof(bond_data)); + if (l->select.bond_list == NULL) success = 0; + break; + + case TYP_DBO: + l->select.dbo_list = (dbond_data*) + malloc(l->num_intrs * sizeof(dbond_data)); + if (l->select.dbo_list == NULL) success = 0; + break; + + case TYP_DDELTA: + l->select.dDelta_list = (dDelta_data*) + malloc(l->num_intrs*sizeof(dDelta_data)); + if (l->select.dDelta_list == NULL) success = 0; + 
break; + + case TYP_FAR_NEIGHBOR: + l->select.far_nbr_list = (far_neighbor_data*) + malloc(l->num_intrs*sizeof(far_neighbor_data)); + if (l->select.far_nbr_list == NULL) success = 0; + break; + + case TYP_NEAR_NEIGHBOR: + l->select.near_nbr_list = (near_neighbor_data*) + malloc(l->num_intrs*sizeof(near_neighbor_data)); + if (l->select.near_nbr_list == NULL) success = 0; + break; + + case TYP_HBOND: + l->select.hbond_list = (hbond_data*) + malloc( l->num_intrs * sizeof(hbond_data) ); + if (l->select.hbond_list == NULL) success = 0; + break; + + default: + l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); + if (l->select.v == NULL) success = 0; + l->type = TYP_VOID; + break; + } + + } + else + { + l->n = n; + l->num_intrs = num_intrs; + + cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX ); + cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX ); + + switch(type) + { + case TYP_FAR_NEIGHBOR: + cuda_malloc ((void **) &l->select.far_nbr_list, + l->num_intrs*sizeof(far_neighbor_data), + 1, LIST_FAR_NEIGHBOR_DATA); + /* + cudaHostAlloc ((void **) &l->select.far_nbr_list, + l->num_intrs*sizeof(far_neighbor_data), + cudaHostAllocMapped); + + cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, + (void *)l->select.far_nbr_list, 0); + */ + break; + + case TYP_HBOND: + cuda_malloc ((void **) &l->select.hbond_list, + l->num_intrs * sizeof(hbond_data), + 1, LIST_HBOND_DATA ); + break; + + case TYP_BOND: + cuda_malloc ((void **) &l->select.bond_list, + l->num_intrs * sizeof(bond_data), + 1, LIST_BOND_DATA ); + break; + + case TYP_THREE_BODY: + cuda_malloc ( (void **) &l->select.three_body_list, + l->num_intrs * sizeof(three_body_interaction_data), + 1, LIST_THREE_BODY_DATA ); + break; + + default: + fprintf (stderr, "Unknown list creation \n" ); + exit (1); + } + } + + return success; } HOST void Delete_List(list* l, int type) { - if (type == TYP_HOST ) - { - if( l->index != NULL ) - free(l->index); - if( l->end_index != NULL ) 
- free(l->end_index); - - switch(l->type) - { - case TYP_VOID: - if( l->select.v != NULL ) - free(l->select.v); - break; - case TYP_THREE_BODY: - if( l->select.three_body_list != NULL ) - free(l->select.three_body_list); - break; - case TYP_BOND: - if( l->select.bond_list != NULL ) - free(l->select.bond_list); - break; - case TYP_DBO: - if( l->select.dbo_list != NULL ) - free(l->select.dbo_list); - break; - case TYP_DDELTA: - if( l->select.dDelta_list != NULL ) - free(l->select.dDelta_list); - break; - case TYP_FAR_NEIGHBOR: - if( l->select.far_nbr_list != NULL ) - free(l->select.far_nbr_list); - break; - case TYP_NEAR_NEIGHBOR: - if( l->select.near_nbr_list != NULL ) - free(l->select.near_nbr_list); - break; - case TYP_HBOND: - if( l->select.hbond_list != NULL ) - free(l->select.hbond_list); - break; - - default: - // Report fatal error - break; - } - } - else - { - if (l->index != NULL) - cuda_free (l->index, LIST_INDEX ); - if (l->end_index != NULL) - cuda_free (l->end_index, LIST_END_INDEX ); - - switch(type) - { - case TYP_FAR_NEIGHBOR: - if (l->select.far_nbr_list != NULL) - cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA); - break; - - case TYP_HBOND: - if (l->select.hbond_list != NULL) - cuda_free (l->select.hbond_list, LIST_HBOND_DATA ); - break; - - case TYP_BOND: - if (l->select.bond_list != NULL) - cuda_free (l->select.bond_list, LIST_BOND_DATA ); - break; - - case TYP_THREE_BODY: - if (l->select.three_body_list != NULL) - cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA ); - break; - - default: - fprintf (stderr, "Unknown list deletion \n" ); - exit (1); - } - } + if (type == TYP_HOST ) + { + if( l->index != NULL ) + free(l->index); + if( l->end_index != NULL ) + free(l->end_index); + + switch(l->type) + { + case TYP_VOID: + if( l->select.v != NULL ) + free(l->select.v); + break; + case TYP_THREE_BODY: + if( l->select.three_body_list != NULL ) + free(l->select.three_body_list); + break; + case TYP_BOND: + if( l->select.bond_list != 
NULL ) + free(l->select.bond_list); + break; + case TYP_DBO: + if( l->select.dbo_list != NULL ) + free(l->select.dbo_list); + break; + case TYP_DDELTA: + if( l->select.dDelta_list != NULL ) + free(l->select.dDelta_list); + break; + case TYP_FAR_NEIGHBOR: + if( l->select.far_nbr_list != NULL ) + free(l->select.far_nbr_list); + break; + case TYP_NEAR_NEIGHBOR: + if( l->select.near_nbr_list != NULL ) + free(l->select.near_nbr_list); + break; + case TYP_HBOND: + if( l->select.hbond_list != NULL ) + free(l->select.hbond_list); + break; + + default: + // Report fatal error + break; + } + } + else + { + if (l->index != NULL) + cuda_free (l->index, LIST_INDEX ); + if (l->end_index != NULL) + cuda_free (l->end_index, LIST_END_INDEX ); + + switch(type) + { + case TYP_FAR_NEIGHBOR: + if (l->select.far_nbr_list != NULL) + cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA); + break; + + case TYP_HBOND: + if (l->select.hbond_list != NULL) + cuda_free (l->select.hbond_list, LIST_HBOND_DATA ); + break; + + case TYP_BOND: + if (l->select.bond_list != NULL) + cuda_free (l->select.bond_list, LIST_BOND_DATA ); + break; + + case TYP_THREE_BODY: + if (l->select.three_body_list != NULL) + cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA ); + break; + + default: + fprintf (stderr, "Unknown list deletion \n" ); + exit (1); + } + } } diff --git a/PuReMD-GPU/src/lookup.cu b/PuReMD-GPU/src/lookup.cu index 95fa5c46..c6cc23cf 100644 --- a/PuReMD-GPU/src/lookup.cu +++ b/PuReMD-GPU/src/lookup.cu @@ -25,53 +25,53 @@ #include "index_utils.h" void Make_Lookup_Table(real xmin, real xmax, int n, - lookup_function f, lookup_table* t) + lookup_function f, lookup_table* t) { - int i; - - t->xmin = xmin; - t->xmax = xmax; - t->n = n; - t->dx = (xmax - xmin)/(n-1); - t->inv_dx = 1.0 / t->dx; - t->a = (n-1)/(xmax-xmin); - t->y = (real*) malloc(n*sizeof(real)); - - for(i=0; i < n; i++) - t->y[i] = f(i*t->dx + t->xmin); - - // //fprintf(stdout,"dx = %lf\n",t->dx); - // for(i=0; i < n; i++) 
- // //fprintf( stdout,"%d %lf %lf %lf\n", - // i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) ); + int i; + + t->xmin = xmin; + t->xmax = xmax; + t->n = n; + t->dx = (xmax - xmin)/(n-1); + t->inv_dx = 1.0 / t->dx; + t->a = (n-1)/(xmax-xmin); + t->y = (real*) malloc(n*sizeof(real)); + + for(i=0; i < n; i++) + t->y[i] = f(i*t->dx + t->xmin); + + // //fprintf(stdout,"dx = %lf\n",t->dx); + // for(i=0; i < n; i++) + // //fprintf( stdout,"%d %lf %lf %lf\n", + // i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) ); } /* Fills solution into x. Warning: will modify c and d! */ HOST_DEVICE void Tridiagonal_Solve( const real *a, const real *b, - real *c, real *d, real *x, unsigned int n){ - int i; - real id; - - /* Modify the coefficients. */ - c[0] /= b[0]; /* Division by zero risk. */ - d[0] /= b[0]; /* Division by zero would imply a singular matrix. */ - for(i = 1; i < n; i++){ - id = (b[i] - c[i-1] * a[i]); /* Division by zero risk. */ - c[i] /= id; /* Last value calculated is redundant. */ - d[i] = (d[i] - d[i-1] * a[i])/id; - } - - /* Now back substitute. */ - x[n - 1] = d[n - 1]; - for(i = n - 2; i >= 0; i--) - x[i] = d[i] - c[i] * x[i + 1]; + real *c, real *d, real *x, unsigned int n){ + int i; + real id; + + /* Modify the coefficients. */ + c[0] /= b[0]; /* Division by zero risk. */ + d[0] /= b[0]; /* Division by zero would imply a singular matrix. */ + for(i = 1; i < n; i++){ + id = (b[i] - c[i-1] * a[i]); /* Division by zero risk. */ + c[i] /= id; /* Last value calculated is redundant. */ + d[i] = (d[i] - d[i-1] * a[i])/id; + } + + /* Now back substitute. 
*/ + x[n - 1] = d[n - 1]; + for(i = n - 2; i >= 0; i--) + x[i] = d[i] - c[i] * x[i + 1]; } GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b, - real *c, real *d, real *x, unsigned int n) + real *c, real *d, real *x, unsigned int n) { - Tridiagonal_Solve ( a, b, c, d, x, n ); + Tridiagonal_Solve ( a, b, c, d, x, n ); } @@ -84,189 +84,189 @@ GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b, void Natural_Cubic_Spline( const real *h, const real *f, - cubic_spline_coef *coef, unsigned int n ) + cubic_spline_coef *coef, unsigned int n ) { - int i; - real *a, *b, *c, *d, *v; - - /* allocate space for the linear system */ - a = (real*) malloc( n * sizeof(real) ); - b = (real*) malloc( n * sizeof(real) ); - c = (real*) malloc( n * sizeof(real) ); - d = (real*) malloc( n * sizeof(real) ); - v = (real*) malloc( n * sizeof(real) ); - - /* build the linear system */ - a[0] = a[1] = a[n-1] = 0; - for( i = 2; i < n-1; ++i ) - a[i] = h[i-1]; - - b[0] = b[n-1] = 0; - for( i = 1; i < n-1; ++i ) - b[i] = 2 * (h[i-1] + h[i]); - - c[0] = c[n-2] = c[n-1] = 0; - for( i = 1; i < n-2; ++i ) - c[i] = h[i]; - - d[0] = d[n-1] = 0; - for( i = 1; i < n-1; ++i ) - d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); - - /*//fprintf( stderr, "i a b c d\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ - v[0] = 0; - v[n-1] = 0; - Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); - - for( i = 1; i < n; ++i ){ - coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); - coef[i-1].c = v[i]/2; - coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; - coef[i-1].a = f[i]; - } - - /*//fprintf( stderr, "i v coef\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f %f\n", - i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ + int i; + real *a, *b, *c, *d, *v; + + /* allocate space for the linear system */ + a = (real*) malloc( n * sizeof(real) ); + b = (real*) malloc( n * 
sizeof(real) ); + c = (real*) malloc( n * sizeof(real) ); + d = (real*) malloc( n * sizeof(real) ); + v = (real*) malloc( n * sizeof(real) ); + + /* build the linear system */ + a[0] = a[1] = a[n-1] = 0; + for( i = 2; i < n-1; ++i ) + a[i] = h[i-1]; + + b[0] = b[n-1] = 0; + for( i = 1; i < n-1; ++i ) + b[i] = 2 * (h[i-1] + h[i]); + + c[0] = c[n-2] = c[n-1] = 0; + for( i = 1; i < n-2; ++i ) + c[i] = h[i]; + + d[0] = d[n-1] = 0; + for( i = 1; i < n-1; ++i ) + d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); + + /*//fprintf( stderr, "i a b c d\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ + v[0] = 0; + v[n-1] = 0; + Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); + + for( i = 1; i < n; ++i ){ + coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); + coef[i-1].c = v[i]/2; + coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; + coef[i-1].a = f[i]; + } + + /*//fprintf( stderr, "i v coef\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f %f\n", + i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ } GLOBAL void cubic_spline_init_a ( real *a, const real *h, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; - - if (i == 0 || i == 1 || i == (n-1)) { - a[i] = 0; - } else { - a[i] = h[i-1]; - } + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; + + if (i == 0 || i == 1 || i == (n-1)) { + a[i] = 0; + } else { + a[i] = h[i-1]; + } } GLOBAL void cubic_spline_init_b (real *b, const real *h, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; - - if (i == 0 || i == (n-1)) { - b[i] = 0; - } else { - b[i] = 2 * (h[i-1] + h[i]); - } + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; + + if (i == 0 || i == (n-1)) { + b[i] = 0; + } else { + b[i] = 2 * (h[i-1] + h[i]); + } } GLOBAL void cubic_spline_init_c (real *c, const real *h, int n ) { - int i = blockIdx.x * 
blockDim.x + threadIdx.x; - if ( i >= n ) return; - - if (i == 0 || i == (n-1) || i == (n-2)) { - c[i] = 0; - } else { - c[i] = h[i]; - } + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; + + if (i == 0 || i == (n-1) || i == (n-2)) { + c[i] = 0; + } else { + c[i] = h[i]; + } } GLOBAL void cubic_spline_init_d (real *d, const real *f, const real *h, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; - - if ( i == 0 || i == (n-1) ) { - d[i] = 0; - } else { - d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); - } + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; + + if ( i == 0 || i == (n-1) ) { + d[i] = 0; + } else { + d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); + } } GLOBAL void calculate_cubic_spline_coef ( const real *f, real *v, const real *h, LR_lookup_table *data, int offset, int n ) { - cubic_spline_coef *coef; - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n || i == 0) return; - - if (offset == SPLINE_H_OFFSET) - coef = &data->H[1]; - else if(offset == SPLINE_CEVD_OFFSET) - coef = &data->CEvd[1]; - else if (offset == SPLINE_CECLMB_OFFSET) - coef = &data->CEclmb[1]; - else if (offset == SPLINE_VDW_OFFSET) - coef = &data->vdW[1]; - else if (offset == SPLINE_ELE_OFFSET) - coef = &data->ele[1]; - else - coef = 0; - - coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); - coef[i-1].c = v[i]/2; - coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; - coef[i-1].a = f[i]; + cubic_spline_coef *coef; + + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n || i == 0) return; + + if (offset == SPLINE_H_OFFSET) + coef = &data->H[1]; + else if(offset == SPLINE_CEVD_OFFSET) + coef = &data->CEvd[1]; + else if (offset == SPLINE_CECLMB_OFFSET) + coef = &data->CEclmb[1]; + else if (offset == SPLINE_VDW_OFFSET) + coef = &data->vdW[1]; + else if (offset == SPLINE_ELE_OFFSET) + coef = &data->ele[1]; + else + coef = 0; + + coef[i-1].d = (v[i] - v[i-1]) / 
(6*h[i-1]); + coef[i-1].c = v[i]/2; + coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; + coef[i-1].a = f[i]; } void Cuda_Natural_Cubic_Spline( const real *h, const real *f, - LR_lookup_table *data, int offset, unsigned int n ) + LR_lookup_table *data, int offset, unsigned int n ) { - int i; - real *a, *b, *c, *d, *v; - int blocks, block_size; - - ////fprintf (stderr, "Entering Cuda_Natural_Cubic_Spline ... \n"); - - /* allocate space for the linear system */ - cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ ); - - ////fprintf (stderr, "Mem allocation done... \n"); - - /* build linear system */ - compute_blocks ( &blocks, &block_size, n); - cubic_spline_init_a <<< blocks, block_size >>> - ( a, h, n ); - cudaThreadSynchronize (); - ////fprintf (stderr, "cubic_spline_init_a done.... -> %d \n", cudaGetLastError ()); - - cubic_spline_init_b <<< blocks, block_size >>> - ( b, h, n ); - cudaThreadSynchronize (); - ////fprintf (stderr, "cubic_spline_init_b done.... -> %d \n", cudaGetLastError ()); - - cubic_spline_init_c <<< blocks, block_size >>> - ( c, h, n ); - cudaThreadSynchronize (); - //fprintf (stderr, "cubic_spline_init_c done.... -> %d \n", cudaGetLastError ()); - - cubic_spline_init_d <<< blocks, block_size >>> - ( d, f, h, n ); - cudaThreadSynchronize (); - //fprintf (stderr, "cubic_spline_init_d done.... -> %d \n", cudaGetLastError ()); - - /*//fprintf( stderr, "i a b c d\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ - - Cuda_Tridiagonal_Solve <<<1, 1>>> - ( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); - cudaThreadSynchronize (); - //fprintf (stderr, "Tridiagonal_Solve done.... 
-> %d \n", cudaGetLastError ()); - - calculate_cubic_spline_coef <<< blocks, block_size >>> - ( f, v, h, data,offset, n ); - cudaThreadSynchronize (); - //fprintf (stderr, "calculate_cubic_spline_coef done.... -> %d \n", cudaGetLastError ()); - - /*//fprintf( stderr, "i v coef\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f %f\n", - i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ + int i; + real *a, *b, *c, *d, *v; + int blocks, block_size; + + ////fprintf (stderr, "Entering Cuda_Natural_Cubic_Spline ... \n"); + + /* allocate space for the linear system */ + cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ ); + + ////fprintf (stderr, "Mem allocation done... \n"); + + /* build linear system */ + compute_blocks ( &blocks, &block_size, n); + cubic_spline_init_a <<< blocks, block_size >>> + ( a, h, n ); + cudaThreadSynchronize (); + ////fprintf (stderr, "cubic_spline_init_a done.... -> %d \n", cudaGetLastError ()); + + cubic_spline_init_b <<< blocks, block_size >>> + ( b, h, n ); + cudaThreadSynchronize (); + ////fprintf (stderr, "cubic_spline_init_b done.... -> %d \n", cudaGetLastError ()); + + cubic_spline_init_c <<< blocks, block_size >>> + ( c, h, n ); + cudaThreadSynchronize (); + //fprintf (stderr, "cubic_spline_init_c done.... -> %d \n", cudaGetLastError ()); + + cubic_spline_init_d <<< blocks, block_size >>> + ( d, f, h, n ); + cudaThreadSynchronize (); + //fprintf (stderr, "cubic_spline_init_d done.... 
-> %d \n", cudaGetLastError ()); + + /*//fprintf( stderr, "i a b c d\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ + + Cuda_Tridiagonal_Solve <<<1, 1>>> + ( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); + cudaThreadSynchronize (); + //fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ()); + + calculate_cubic_spline_coef <<< blocks, block_size >>> + ( f, v, h, data,offset, n ); + cudaThreadSynchronize (); + //fprintf (stderr, "calculate_cubic_spline_coef done.... -> %d \n", cudaGetLastError ()); + + /*//fprintf( stderr, "i v coef\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f %f\n", + i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ } @@ -280,194 +280,194 @@ void Cuda_Natural_Cubic_Spline( const real *h, const real *f, void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast, - cubic_spline_coef *coef, unsigned int n ) + cubic_spline_coef *coef, unsigned int n ) { - int i; - real *a, *b, *c, *d, *v; - - /* allocate space for the linear system */ - a = (real*) malloc( n * sizeof(real) ); - b = (real*) malloc( n * sizeof(real) ); - c = (real*) malloc( n * sizeof(real) ); - d = (real*) malloc( n * sizeof(real) ); - v = (real*) malloc( n * sizeof(real) ); - - /* build the linear system */ - a[0] = 0; - for( i = 1; i < n; ++i ) - a[i] = h[i-1]; - - b[0] = 2*h[0]; - for( i = 1; i < n; ++i ) - b[i] = 2 * (h[i-1] + h[i]); - - c[n-1] = 0; - for( i = 0; i < n-1; ++i ) - c[i] = h[i]; - - d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0; - d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]); - for( i = 1; i < n-1; ++i ) - d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); - - /*//fprintf( stderr, "i a b c d\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ - Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n ); - // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), 
n-2 ); - - for( i = 1; i < n; ++i ){ - coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); - coef[i-1].c = v[i]/2; - coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; - coef[i-1].a = f[i]; - } - - /*//fprintf( stderr, "i v coef\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f %f\n", - i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ + int i; + real *a, *b, *c, *d, *v; + + /* allocate space for the linear system */ + a = (real*) malloc( n * sizeof(real) ); + b = (real*) malloc( n * sizeof(real) ); + c = (real*) malloc( n * sizeof(real) ); + d = (real*) malloc( n * sizeof(real) ); + v = (real*) malloc( n * sizeof(real) ); + + /* build the linear system */ + a[0] = 0; + for( i = 1; i < n; ++i ) + a[i] = h[i-1]; + + b[0] = 2*h[0]; + for( i = 1; i < n; ++i ) + b[i] = 2 * (h[i-1] + h[i]); + + c[n-1] = 0; + for( i = 0; i < n-1; ++i ) + c[i] = h[i]; + + d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0; + d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]); + for( i = 1; i < n-1; ++i ) + d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); + + /*//fprintf( stderr, "i a b c d\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ + Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n ); + // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); + + for( i = 1; i < n; ++i ){ + coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); + coef[i-1].c = v[i]/2; + coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; + coef[i-1].a = f[i]; + } + + /*//fprintf( stderr, "i v coef\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f %f\n", + i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ } GLOBAL void complete_cubic_spline_init_a (real *a, const real *h, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; - if (i == 0) a[0] = 0; - else { - a[i] = h[i]; - } 
+ if (i == 0) a[0] = 0; + else { + a[i] = h[i]; + } } GLOBAL void complete_cubic_spline_init_b (real *b, const real *h, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; - if (i == 0) b[0] = 2 * h[0]; - else { - b[i] = 2 * (h[i-1] + h[i]); - } + if (i == 0) b[0] = 2 * h[0]; + else { + b[i] = 2 * (h[i-1] + h[i]); + } } GLOBAL void complete_cubic_spline_init_c (real *c, const real *h, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; - if (i == (n-1)) c[n-1] = 0; - else { - c[i] = h[i]; - } + if (i == (n-1)) c[n-1] = 0; + else { + c[i] = h[i]; + } } GLOBAL void complete_cubic_spline_init_d (real *d, const real *f, const real *h, int v0_r, int vlast_r, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - real v0, vlast; - if ( i >= n ) return; - - v0 = 0; - vlast = 0; - - if (i == 0) { - d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0; - } - else if (i == (n-1)) { - d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]); - } - else - d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); + int i = blockIdx.x * blockDim.x + threadIdx.x; + real v0, vlast; + if ( i >= n ) return; + + v0 = 0; + vlast = 0; + + if (i == 0) { + d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0; + } + else if (i == (n-1)) { + d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]); + } + else + d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); } GLOBAL void calculate_complete_cubic_spline_coef (LR_lookup_table *data, int offset, real *v, const real *h, const real *f, int n) { - cubic_spline_coef *coef; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= n ) return; - - if (offset == SPLINE_H_OFFSET) - coef = &data->H[1]; - else if(offset == SPLINE_CEVD_OFFSET) - coef = &data->CEvd[1]; - else if (offset == SPLINE_CECLMB_OFFSET) - coef = &data->CEclmb[1]; - else if (offset == SPLINE_VDW_OFFSET) - coef = 
&data->vdW[1]; - else if (offset == SPLINE_ELE_OFFSET) - coef = &data->ele[1]; - else - coef = 0; - - coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); - coef[i-1].c = v[i]/2; - coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; - coef[i-1].a = f[i]; + cubic_spline_coef *coef; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= n ) return; + + if (offset == SPLINE_H_OFFSET) + coef = &data->H[1]; + else if(offset == SPLINE_CEVD_OFFSET) + coef = &data->CEvd[1]; + else if (offset == SPLINE_CECLMB_OFFSET) + coef = &data->CEclmb[1]; + else if (offset == SPLINE_VDW_OFFSET) + coef = &data->vdW[1]; + else if (offset == SPLINE_ELE_OFFSET) + coef = &data->ele[1]; + else + coef = 0; + + coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); + coef[i-1].c = v[i]/2; + coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; + coef[i-1].a = f[i]; } void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vlast_r, - LR_lookup_table *data, int offset, unsigned int n ) + LR_lookup_table *data, int offset, unsigned int n ) { - int i; - real *a, *b, *c, *d, *v; - - int blocks, block_size; - - /* allocate space for the linear system */ - cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ ); - cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ ); - - /* build the linear system */ - compute_blocks ( &blocks, &block_size, n ); - - complete_cubic_spline_init_a <<< blocks, block_size >>> - (a, h, n); - cudaThreadSynchronize (); - //fprintf (stderr, "complete_cubic_spline_init_a done.... -> %d \n", cudaGetLastError ()); - - complete_cubic_spline_init_b <<< blocks, block_size >>> - (b, h, n); - cudaThreadSynchronize (); - //fprintf (stderr, "complete_cubic_spline_init_b done.... 
-> %d \n", cudaGetLastError ()); - - complete_cubic_spline_init_c <<< blocks, block_size >>> - ( c, h, n ); - cudaThreadSynchronize (); - //fprintf (stderr, "complete_cubic_spline_init_c done.... -> %d \n", cudaGetLastError ()); - - complete_cubic_spline_init_d <<< blocks, block_size >>> - (d, f, h, v0_r, vlast_r, n); - cudaThreadSynchronize (); - //fprintf (stderr, "complete_cubic_spline_init_d done.... -> %d \n", cudaGetLastError ()); - - /*//fprintf( stderr, "i a b c d\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ - - - Cuda_Tridiagonal_Solve <<< 1, 1 >>> - ( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n ); - cudaThreadSynchronize (); - //fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ()); - // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); - - - calculate_complete_cubic_spline_coef <<< blocks, block_size >>> - (data, offset, v, h, f, n); - cudaThreadSynchronize (); - //fprintf (stderr, " calculate_complete_cubic_spline_coef done.... -> %d \n", cudaGetLastError ()); - - /*//fprintf( stderr, "i v coef\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f %f\n", - i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ + int i; + real *a, *b, *c, *d, *v; + + int blocks, block_size; + + /* allocate space for the linear system */ + cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ ); + cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ ); + + /* build the linear system */ + compute_blocks ( &blocks, &block_size, n ); + + complete_cubic_spline_init_a <<< blocks, block_size >>> + (a, h, n); + cudaThreadSynchronize (); + //fprintf (stderr, "complete_cubic_spline_init_a done.... 
-> %d \n", cudaGetLastError ()); + + complete_cubic_spline_init_b <<< blocks, block_size >>> + (b, h, n); + cudaThreadSynchronize (); + //fprintf (stderr, "complete_cubic_spline_init_b done.... -> %d \n", cudaGetLastError ()); + + complete_cubic_spline_init_c <<< blocks, block_size >>> + ( c, h, n ); + cudaThreadSynchronize (); + //fprintf (stderr, "complete_cubic_spline_init_c done.... -> %d \n", cudaGetLastError ()); + + complete_cubic_spline_init_d <<< blocks, block_size >>> + (d, f, h, v0_r, vlast_r, n); + cudaThreadSynchronize (); + //fprintf (stderr, "complete_cubic_spline_init_d done.... -> %d \n", cudaGetLastError ()); + + /*//fprintf( stderr, "i a b c d\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ + + + Cuda_Tridiagonal_Solve <<< 1, 1 >>> + ( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n ); + cudaThreadSynchronize (); + //fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ()); + // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); + + + calculate_complete_cubic_spline_coef <<< blocks, block_size >>> + (data, offset, v, h, f, n); + cudaThreadSynchronize (); + //fprintf (stderr, " calculate_complete_cubic_spline_coef done.... 
-> %d \n", cudaGetLastError ()); + + /*//fprintf( stderr, "i v coef\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f %f\n", + i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ } @@ -475,168 +475,168 @@ void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vla void LR_Lookup( LR_lookup_table *t, real r, LR_data *y ) { - int i; - real base, dif; - - i = (int)(r * t->inv_dx); - if( i == 0 ) ++i; - base = (real)(i+1) * t->dx; - dif = r - base; - ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif ); - - y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + - t->vdW[i].a; - y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + - t->CEvd[i].b)*dif + t->CEvd[i].a; - //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b; - - y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + - t->ele[i].a; - y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif + - t->CEclmb[i].a; - - y->H = y->e_ele * EV_to_KCALpMOL / C_ele; - //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a; + int i; + real base, dif; + + i = (int)(r * t->inv_dx); + if( i == 0 ) ++i; + base = (real)(i+1) * t->dx; + dif = r - base; + ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif ); + + y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + + t->vdW[i].a; + y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + + t->CEvd[i].b)*dif + t->CEvd[i].a; + //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b; + + y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + + t->ele[i].a; + y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif + + t->CEclmb[i].a; + + y->H = y->e_ele * EV_to_KCALpMOL / C_ele; + //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a; } void Make_LR_Lookup_Table( reax_system *system, control_params *control ) { - int i, j, r; - int num_atom_types; 
- int existing_types[MAX_ATOM_TYPES]; - real dr; - real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb; - real v0_vdw, v0_ele, vlast_vdw, vlast_ele; - /* real rand_dist; - real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr; - real eele_abserr, eele_relerr, fele_abserr, fele_relerr; - real evdw_maxerr, eele_maxerr; - LR_data y, y_spline; */ - - /* initializations */ - vlast_ele = 0; - vlast_vdw = 0; - v0_ele = 0; - v0_vdw = 0; - - num_atom_types = system->reaxprm.num_atom_types; - dr = control->r_cut / control->tabulate; - h = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fh = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fele = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - - /* allocate Long-Range LookUp Table space based on - number of atom types in the ffield file */ - //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) ); - //for( i = 0; i < num_atom_types; ++i ) - // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table)); - - LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table)); - - /* most atom types in ffield file will not exist in the current - simulation. to avoid unnecessary lookup table space, determine - the atom types that exist in the current simulation */ - for( i = 0; i < MAX_ATOM_TYPES; ++i ) - existing_types[i] = 0; - for( i = 0; i < system->N; ++i ) - existing_types[ system->atoms[i].type ] = 1; - - /* fill in the lookup table entries for existing atom types. - only lower half should be enough. 
*/ - for( i = 0; i < num_atom_types; ++i ) - if( existing_types[i] ) - for( j = i; j < num_atom_types; ++j ) - if( existing_types[j] ) { - LR[ index_lr (i,j,num_atom_types) ].xmin = 0; - LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut; - LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1; - LR[ index_lr (i,j,num_atom_types) ].dx = dr; - LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut; - LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data)); - LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - - for( r = 1; r <= control->tabulate; ++r ) { - LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) ); - h[r] = LR[ index_lr (i,j,num_atom_types) ].dx; - fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H; - fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW; - fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; - fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele; - fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; - - if( r == 1 ){ - v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; - v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; - } - else if( r == control->tabulate ){ - vlast_vdw = LR[ index_lr (i,j,num_atom_types) 
].y[r].CEvd; - vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; - } - } - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fh" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fh[r] ); */ - Natural_Cubic_Spline( &h[1], &fh[1], - &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 ); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fvdw" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fvdw[r] ); - //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw ); - */ - Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, - &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 ); - Natural_Cubic_Spline( &h[1], &fCEvd[1], - &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 ); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fele" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fele[r] ); - //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele ); - */ - Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, - &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 ); - Natural_Cubic_Spline( &h[1], &fCEclmb[1], - &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 ); - } - - /***** //test LR-Lookup table - evdw_maxerr = 0; - eele_maxerr = 0; - for( i = 0; i < num_atom_types; ++i ) - if( existing_types[i] ) - for( j = i; j < num_atom_types; ++j ) - if( existing_types[j] ) { - for( r = 1; r <= 100; ++r ) { - rand_dist = (real)rand()/RAND_MAX * control->r_cut; - LR_vdW_Coulomb( system, control, i, j, rand_dist, &y ); - LR_Lookup( &(LR[i][j]), rand_dist, &y_spline ); - - evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW); - evdw_relerr = fabs(evdw_abserr / y.e_vdW); - fvdw_abserr = fabs(y.CEvd - y_spline.CEvd); - fvdw_relerr = fabs(fvdw_abserr / y.CEvd); - eele_abserr = fabs(y.e_ele - y_spline.e_ele); - 
eele_relerr = fabs(eele_abserr / y.e_ele); - fele_abserr = fabs(y.CEclmb - y_spline.CEclmb); - fele_relerr = fabs(fele_abserr / y.CEclmb); - - if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){ + int i, j, r; + int num_atom_types; + int existing_types[MAX_ATOM_TYPES]; + real dr; + real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb; + real v0_vdw, v0_ele, vlast_vdw, vlast_ele; + /* real rand_dist; + real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr; + real eele_abserr, eele_relerr, fele_abserr, fele_relerr; + real evdw_maxerr, eele_maxerr; + LR_data y, y_spline; */ + + /* initializations */ + vlast_ele = 0; + vlast_vdw = 0; + v0_ele = 0; + v0_vdw = 0; + + num_atom_types = system->reaxprm.num_atom_types; + dr = control->r_cut / control->tabulate; + h = (real*) malloc( (control->tabulate+1) * sizeof(real) ); + fh = (real*) malloc( (control->tabulate+1) * sizeof(real) ); + fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) ); + fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) ); + fele = (real*) malloc( (control->tabulate+1) * sizeof(real) ); + fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) ); + + /* allocate Long-Range LookUp Table space based on + number of atom types in the ffield file */ + //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) ); + //for( i = 0; i < num_atom_types; ++i ) + // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table)); + + LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table)); + + /* most atom types in ffield file will not exist in the current + simulation. to avoid unnecessary lookup table space, determine + the atom types that exist in the current simulation */ + for( i = 0; i < MAX_ATOM_TYPES; ++i ) + existing_types[i] = 0; + for( i = 0; i < system->N; ++i ) + existing_types[ system->atoms[i].type ] = 1; + + /* fill in the lookup table entries for existing atom types. + only lower half should be enough. 
*/ + for( i = 0; i < num_atom_types; ++i ) + if( existing_types[i] ) + for( j = i; j < num_atom_types; ++j ) + if( existing_types[j] ) { + LR[ index_lr (i,j,num_atom_types) ].xmin = 0; + LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut; + LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1; + LR[ index_lr (i,j,num_atom_types) ].dx = dr; + LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut; + LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data)); + LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + + for( r = 1; r <= control->tabulate; ++r ) { + LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) ); + h[r] = LR[ index_lr (i,j,num_atom_types) ].dx; + fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H; + fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW; + fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; + fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele; + fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; + + if( r == 1 ){ + v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; + v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; + } + else if( r == control->tabulate ){ + vlast_vdw = LR[ index_lr (i,j,num_atom_types) 
].y[r].CEvd; + vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; + } + } + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fh" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fh[r] ); */ + Natural_Cubic_Spline( &h[1], &fh[1], + &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 ); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fvdw" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fvdw[r] ); + //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw ); + */ + Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, + &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 ); + Natural_Cubic_Spline( &h[1], &fCEvd[1], + &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 ); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fele" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fele[r] ); + //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele ); + */ + Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, + &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 ); + Natural_Cubic_Spline( &h[1], &fCEclmb[1], + &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 ); + } + + /***** //test LR-Lookup table + evdw_maxerr = 0; + eele_maxerr = 0; + for( i = 0; i < num_atom_types; ++i ) + if( existing_types[i] ) + for( j = i; j < num_atom_types; ++j ) + if( existing_types[j] ) { + for( r = 1; r <= 100; ++r ) { + rand_dist = (real)rand()/RAND_MAX * control->r_cut; + LR_vdW_Coulomb( system, control, i, j, rand_dist, &y ); + LR_Lookup( &(LR[i][j]), rand_dist, &y_spline ); + + evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW); + evdw_relerr = fabs(evdw_abserr / y.e_vdW); + fvdw_abserr = fabs(y.CEvd - y_spline.CEvd); + fvdw_relerr = fabs(fvdw_abserr / y.CEvd); + eele_abserr = fabs(y.e_ele - y_spline.e_ele); + 
eele_relerr = fabs(eele_abserr / y.e_ele); + fele_abserr = fabs(y.CEclmb - y_spline.CEclmb); + fele_relerr = fabs(fele_abserr / y.CEclmb); + + if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){ //fprintf( stderr, "rand_dist = %24.15e\n", rand_dist ); //fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", y.H, y_spline.H, @@ -661,7 +661,7 @@ eele_maxerr = eele_relerr; } //fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr ); //fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr ); - *******/ + *******/ free(h); free(fh); @@ -673,58 +673,58 @@ free(fCEclmb); void copy_LR_table_to_device (reax_system *system, control_params *control) { - int i, j, r; - int num_atom_types; - int existing_types[MAX_ATOM_TYPES]; - LR_data *d_y; - cubic_spline_coef *temp; + int i, j, r; + int num_atom_types; + int existing_types[MAX_ATOM_TYPES]; + LR_data *d_y; + cubic_spline_coef *temp; - num_atom_types = system->reaxprm.num_atom_types; + num_atom_types = system->reaxprm.num_atom_types; - //fprintf (stderr, "Copying the LR Lookyp Table to the device ... \n"); + //fprintf (stderr, "Copying the LR Lookyp Table to the device ... 
\n"); - cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE ); + cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE ); - for( i = 0; i < MAX_ATOM_TYPES; ++i ) - existing_types[i] = 0; + for( i = 0; i < MAX_ATOM_TYPES; ++i ) + existing_types[i] = 0; - for( i = 0; i < system->N; ++i ) - existing_types[ system->atoms[i].type ] = 1; + for( i = 0; i < system->N; ++i ) + existing_types[ system->atoms[i].type ] = 1; - copy_host_device ( LR, d_LR, LR_LOOKUP_TABLE_SIZE * (num_atom_types * num_atom_types), cudaMemcpyHostToDevice, RES_LR_LOOKUP_TABLE ); + copy_host_device ( LR, d_LR, LR_LOOKUP_TABLE_SIZE * (num_atom_types * num_atom_types), cudaMemcpyHostToDevice, RES_LR_LOOKUP_TABLE ); - for( i = 0; i < num_atom_types; ++i ) - if( existing_types[i] ) - for( j = i; j < num_atom_types; ++j ) + for( i = 0; i < num_atom_types; ++i ) + if( existing_types[i] ) + for( j = i; j < num_atom_types; ++j ) - if( existing_types[j] ) { + if( existing_types[j] ) { - cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y ); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, LR_DATA_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y ); - copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y ); + cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y ); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, LR_DATA_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y ); + copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y ); - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H ); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, 
temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_H ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H ); + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H ); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_H ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H ); - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW ); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW ); + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW ); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW ); - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD ); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD ); + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 
0, RES_LR_LOOKUP_CEVD ); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD ); - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE ); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE ); + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE ); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE ); - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB ); - copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB ); - } + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB ); + copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, 
CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB ); + } - //fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n"); + //fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n"); } @@ -753,159 +753,159 @@ void copy_LR_table_to_device (reax_system *system, control_params *control) ////////////////////////////////////////////////////////////////////////// GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real *fvdw, real *fCEvd, real *fele, real *fCEclmb, - global_parameters g_params, two_body_parameters *tbp, - control_params *control, int i, - int j, int num_atom_types, LR_data *data, real dr, int count ) + global_parameters g_params, two_body_parameters *tbp, + control_params *control, int i, + int j, int num_atom_types, LR_data *data, real dr, int count ) { - int r = blockIdx.x * blockDim.x + threadIdx.x; - if ( r == 0 || r > count ) return; + int r = blockIdx.x * blockDim.x + threadIdx.x; + if ( r == 0 || r > count ) return; - LR_vdW_Coulomb ( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types ); + LR_vdW_Coulomb ( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types ); - h[r] = d_LR[ index_lr (i, j, num_atom_types) ].dx; - fh[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].H; - fvdw[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_vdW; - fCEvd[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEvd; - fele[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_ele; - fCEclmb[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEclmb; + h[r] = d_LR[ index_lr (i, j, num_atom_types) ].dx; + fh[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].H; + fvdw[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_vdW; + fCEvd[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEvd; + fele[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_ele; + fCEclmb[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEclmb; } GLOBAL void init_LR_values ( LR_lookup_table 
*d_LR, control_params *control, real dr, int i, int j, int num_atom_types ) { - d_LR[ index_lr (i, j, num_atom_types) ].xmin = 0; - d_LR[ index_lr (i, j, num_atom_types) ].xmax = control->r_cut; - d_LR[ index_lr (i, j, num_atom_types) ].n = control->tabulate + 1; - d_LR[ index_lr (i, j, num_atom_types) ].dx = dr; - d_LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->r_cut; + d_LR[ index_lr (i, j, num_atom_types) ].xmin = 0; + d_LR[ index_lr (i, j, num_atom_types) ].xmax = control->r_cut; + d_LR[ index_lr (i, j, num_atom_types) ].n = control->tabulate + 1; + d_LR[ index_lr (i, j, num_atom_types) ].dx = dr; + d_LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->r_cut; } void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control ) { - int i, j, r; - int num_atom_types; - int existing_types[MAX_ATOM_TYPES]; - real dr; - real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb; - - int v0_vdw_r, v0_ele_r, vlast_vdw_r, vlast_ele_r; - - void *temp; - LR_data *d_y; - int blocks, block_size; - - /* initializations */ - vlast_ele_r = 0; - vlast_vdw_r = 0; - v0_ele_r = 0; - v0_vdw_r = 0; - - num_atom_types = system->reaxprm.num_atom_types; - dr = control->r_cut / control->tabulate; - - cuda_malloc ((void **) &h, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y); - cuda_malloc ((void **) &fh, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H); - cuda_malloc ((void **) &fvdw, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW); - cuda_malloc ((void **) &fCEvd, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD); - cuda_malloc ((void **) &fele, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE); - cuda_malloc ((void **) &fCEclmb, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB); - - /* allocate Long-Range LookUp Table space based on - number of atom types in the ffield file */ - cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, 
RES_LR_LOOKUP_TABLE ); - - /* most atom types in ffield file will not exist in the current - simulation. to avoid unnecessary lookup table space, determine - the atom types that exist in the current simulation */ - for( i = 0; i < MAX_ATOM_TYPES; ++i ) - existing_types[i] = 0; - - for( i = 0; i < system->N; ++i ) - existing_types[ system->atoms[i].type ] = 1; - - /* fill in the lookup table entries for existing atom types. - only lower half should be enough. */ - for( i = 0; i < num_atom_types; ++i ) - if( existing_types[i] ) - for( j = i; j < num_atom_types; ++j ) - if( existing_types[j] ) { - - init_LR_values <<< 1, 1 >>> - ( d_LR, (control_params *)control->d_control, dr, i, j, num_atom_types ); - cudaThreadSynchronize (); - //fprintf (stderr, "Done with init LR Values --> %d \n", cudaGetLastError ()); - - cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y ); - copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y ); - - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H ); - - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW ); - - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD ); - - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, 
CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE ); - - cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB ); - copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB ); - - //TODO check the bounds - compute_blocks ( &blocks, &block_size, control->tabulate ); - calculate_LR_Values <<<blocks, block_size>>> - ( d_LR, h, fh, fvdw, fCEvd, fele, fCEclmb, - system->reaxprm.d_gp, system->reaxprm.d_tbp, - (control_params *)control->d_control, i, j, system->reaxprm.num_atom_types, - d_y, dr, control->tabulate ); - cudaThreadSynchronize (); - - //fprintf (stderr, "Done with LR Values Calculation --> %d \n", cudaGetLastError ()); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fh" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fh[r] ); */ - Cuda_Natural_Cubic_Spline( h+1, fh+1, - d_LR + index_lr (i,j,num_atom_types), SPLINE_H_OFFSET, control->tabulate+1 ); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fvdw" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fvdw[r] ); - //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw ); - */ - - //TODO -- Pass the right v0 and vlast for the cubic spline - //Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, - // &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 ); - //Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], - // &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 ); - Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, - d_LR + index_lr (i,j,num_atom_types) , SPLINE_VDW_OFFSET, control->tabulate+1 ); - Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], - d_LR + index_lr (i,j,num_atom_types) , SPLINE_CEVD_OFFSET, control->tabulate+1 ); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", 
"h", "fele" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fele[r] ); - //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele ); - */ - //Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, - // &(LR[index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 ); - //Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], - // &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 ); - Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, - d_LR + index_lr (i,j,num_atom_types) , SPLINE_ELE_OFFSET, control->tabulate+1 ); - Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], - d_LR + index_lr (i,j,num_atom_types) , SPLINE_CECLMB_OFFSET, control->tabulate+1 ); - } - - cuda_free(h, RES_LR_LOOKUP_Y); - cuda_free(fh, RES_LR_LOOKUP_H); - cuda_free(fvdw, RES_LR_LOOKUP_VDW); - cuda_free(fCEvd, RES_LR_LOOKUP_CEVD); - cuda_free(fele, RES_LR_LOOKUP_ELE); - cuda_free(fCEclmb, RES_LR_LOOKUP_CECLMB); + int i, j, r; + int num_atom_types; + int existing_types[MAX_ATOM_TYPES]; + real dr; + real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb; + + int v0_vdw_r, v0_ele_r, vlast_vdw_r, vlast_ele_r; + + void *temp; + LR_data *d_y; + int blocks, block_size; + + /* initializations */ + vlast_ele_r = 0; + vlast_vdw_r = 0; + v0_ele_r = 0; + v0_vdw_r = 0; + + num_atom_types = system->reaxprm.num_atom_types; + dr = control->r_cut / control->tabulate; + + cuda_malloc ((void **) &h, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y); + cuda_malloc ((void **) &fh, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H); + cuda_malloc ((void **) &fvdw, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW); + cuda_malloc ((void **) &fCEvd, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD); + cuda_malloc ((void **) &fele, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE); + cuda_malloc ((void **) &fCEclmb, REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB); + + /* allocate 
Long-Range LookUp Table space based on + number of atom types in the ffield file */ + cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE ); + + /* most atom types in ffield file will not exist in the current + simulation. to avoid unnecessary lookup table space, determine + the atom types that exist in the current simulation */ + for( i = 0; i < MAX_ATOM_TYPES; ++i ) + existing_types[i] = 0; + + for( i = 0; i < system->N; ++i ) + existing_types[ system->atoms[i].type ] = 1; + + /* fill in the lookup table entries for existing atom types. + only lower half should be enough. */ + for( i = 0; i < num_atom_types; ++i ) + if( existing_types[i] ) + for( j = i; j < num_atom_types; ++j ) + if( existing_types[j] ) { + + init_LR_values <<< 1, 1 >>> + ( d_LR, (control_params *)control->d_control, dr, i, j, num_atom_types ); + cudaThreadSynchronize (); + //fprintf (stderr, "Done with init LR Values --> %d \n", cudaGetLastError ()); + + cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y ); + copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y ); + + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H ); + + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW ); + + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD ); + + 
cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE ); + + cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB ); + copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB ); + + //TODO check the bounds + compute_blocks ( &blocks, &block_size, control->tabulate ); + calculate_LR_Values <<<blocks, block_size>>> + ( d_LR, h, fh, fvdw, fCEvd, fele, fCEclmb, + system->reaxprm.d_gp, system->reaxprm.d_tbp, + (control_params *)control->d_control, i, j, system->reaxprm.num_atom_types, + d_y, dr, control->tabulate ); + cudaThreadSynchronize (); + + //fprintf (stderr, "Done with LR Values Calculation --> %d \n", cudaGetLastError ()); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fh" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fh[r] ); */ + Cuda_Natural_Cubic_Spline( h+1, fh+1, + d_LR + index_lr (i,j,num_atom_types), SPLINE_H_OFFSET, control->tabulate+1 ); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fvdw" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fvdw[r] ); + //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw ); + */ + + //TODO -- Pass the right v0 and vlast for the cubic spline + //Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, + // &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 ); + //Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], + // &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 ); + Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, + d_LR + index_lr (i,j,num_atom_types) , SPLINE_VDW_OFFSET, control->tabulate+1 ); + 
Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], + d_LR + index_lr (i,j,num_atom_types) , SPLINE_CEVD_OFFSET, control->tabulate+1 ); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fele" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fele[r] ); + //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele ); + */ + //Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, + // &(LR[index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 ); + //Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], + // &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 ); + Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, + d_LR + index_lr (i,j,num_atom_types) , SPLINE_ELE_OFFSET, control->tabulate+1 ); + Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], + d_LR + index_lr (i,j,num_atom_types) , SPLINE_CECLMB_OFFSET, control->tabulate+1 ); + } + + cuda_free(h, RES_LR_LOOKUP_Y); + cuda_free(fh, RES_LR_LOOKUP_H); + cuda_free(fvdw, RES_LR_LOOKUP_VDW); + cuda_free(fCEvd, RES_LR_LOOKUP_CEVD); + cuda_free(fele, RES_LR_LOOKUP_ELE); + cuda_free(fCEclmb, RES_LR_LOOKUP_CECLMB); } @@ -923,36 +923,36 @@ void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control ) int Lookup_Index_Of( real x, lookup_table* t ) { - return (int)( t->a * ( x - t->xmin ) ); + return (int)( t->a * ( x - t->xmin ) ); } real Lookup( real x, lookup_table* t ) { - real x1, x2; - real b; - int i; - - /* if ( x < t->xmin) - { - //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x); - exit(0); - } - if ( x > t->xmax) - { - //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x); - exit(0); - } */ - - i = Lookup_Index_Of( x, t ); - x1 = i * t->dx + t->xmin; - x2 = (i+1) * t->dx + t->xmin; - - b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx; - // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n", - // i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x)); - - return t->inv_dx * 
( t->y[i+1] - t->y[i] ) * x + b; + real x1, x2; + real b; + int i; + + /* if ( x < t->xmin) + { + //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x); + exit(0); + } + if ( x > t->xmax) + { + //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x); + exit(0); + } */ + + i = Lookup_Index_Of( x, t ); + x1 = i * t->dx + t->xmin; + x2 = (i+1) * t->dx + t->xmin; + + b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx; + // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n", + // i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x)); + + return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b; } diff --git a/PuReMD-GPU/src/matvec.cu b/PuReMD-GPU/src/matvec.cu index 2f0b7bb0..bf08cdf8 100644 --- a/PuReMD-GPU/src/matvec.cu +++ b/PuReMD-GPU/src/matvec.cu @@ -24,22 +24,22 @@ //one thread per row GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows) { - real results_row = 0; - int col; - real val; + real results_row = 0; + int col; + real val; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= rows) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= rows) return; - for (int c = H.start[i]; c < H.end[i]; c++) - { - col = H.entries [c].j; - val = H.entries[c].val; + for (int c = H.start[i]; c < H.end[i]; c++) + { + col = H.entries [c].j; + val = H.entries[c].val; - results_row += val * vec [col]; - } + results_row += val * vec [col]; + } - results [i] = results_row; + results [i] = results_row; } //32 thread warp per matrix row. 
@@ -47,43 +47,43 @@ GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows) // <<< system->N, 32 >>> GLOBAL void Cuda_Matvec_csr (sparse_matrix H, real *vec, real *results, int num_rows) { - extern __shared__ real vals []; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / 32; - int lane = thread_id & (32 - 1); - - int row_start; - int row_end; - - // one warp per row - //int row = warp_id; - int row = warp_id; - //if (row < num_rows) - { - vals[threadIdx.x] = 0; - - if (row < num_rows) { - row_start = H.start[row]; - row_end = H.end[row]; - - // compute running sum per thread - for(int jj = row_start + lane; jj < row_end; jj += 32) - vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ]; - //vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ]; - } - - __syncthreads (); - - // parallel reduction in shared memory - //SIMD instructions with a WARP are synchronous -- so we do not need to synch here - if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads(); - if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads (); - if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads (); - if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads (); - if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads (); - - // first thread writes the result - if (lane == 0 && row < num_rows) - results[row] = vals[threadIdx.x]; - } + extern __shared__ real vals []; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / 32; + int lane = thread_id & (32 - 1); + + int row_start; + int row_end; + + // one warp per row + //int row = warp_id; + int row = warp_id; + //if (row < num_rows) + { + vals[threadIdx.x] = 0; + + if (row < num_rows) { + row_start = H.start[row]; + row_end = H.end[row]; + + // compute running sum per thread + for(int jj = row_start + lane; jj < row_end; jj += 32) + vals[threadIdx.x] += 
H.entries[jj].val * vec [ H.entries[jj].j ]; + //vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ]; + } + + __syncthreads (); + + // parallel reduction in shared memory + //SIMD instructions with a WARP are synchronous -- so we do not need to synch here + if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads(); + if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads (); + if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads (); + if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads (); + if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads (); + + // first thread writes the result + if (lane == 0 && row < num_rows) + results[row] = vals[threadIdx.x]; + } } diff --git a/PuReMD-GPU/src/neighbors.cu b/PuReMD-GPU/src/neighbors.cu index 7bf8ed2a..90779538 100644 --- a/PuReMD-GPU/src/neighbors.cu +++ b/PuReMD-GPU/src/neighbors.cu @@ -30,1383 +30,1383 @@ extern inline DEVICE int index_grid (int blocksize) { - return blockIdx.x * gridDim.y * gridDim.z * blocksize + - blockIdx.y * gridDim.z * blocksize + - blockIdx.z * blocksize ; + return blockIdx.x * gridDim.y * gridDim.z * blocksize + + blockIdx.y * gridDim.z * blocksize + + blockIdx.z * blocksize ; } extern inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize) { - return x * 8 * 8 * blocksize + - y * 8 * blocksize + - z * blocksize ; + return x * 8 * 8 * blocksize + + y * 8 * blocksize + + z * blocksize ; } inline HOST_DEVICE real DistSqr_to_CP( rvec cp, rvec x ) { - int i; - real d_sqr = 0; + int i; + real d_sqr = 0; - for( i = 0; i < 3; ++i ) - if( cp[i] > NEG_INF ) - d_sqr += SQR( cp[i] - x[i] ); + for( i = 0; i < 3; ++i ) + if( cp[i] > NEG_INF ) + d_sqr += SQR( cp[i] - x[i] ); - return d_sqr; + return d_sqr; } HOST_DEVICE int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, - real cutoff, far_neighbor_data *data ) + real cutoff, far_neighbor_data *data ) { - real norm_sqr, d, tmp; - int i; - 
- norm_sqr = 0; - - for( i = 0; i < 3; i++ ) { - d = x2[i] - x1[i]; - tmp = SQR(d); - - if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) { - if( x2[i] > x1[i] ) { - d -= box->box_norms[i]; - data->rel_box[i] = -1; - } - else { - d += box->box_norms[i]; - data->rel_box[i] = +1; - } - - data->dvec[i] = d; - norm_sqr += SQR(d); - } - else { - data->dvec[i] = d; - norm_sqr += tmp; - data->rel_box[i] = 0; - } - } - - if( norm_sqr <= SQR(cutoff) ){ - data->d = sqrt(norm_sqr); - return 1; - } - - return 0; + real norm_sqr, d, tmp; + int i; + + norm_sqr = 0; + + for( i = 0; i < 3; i++ ) { + d = x2[i] - x1[i]; + tmp = SQR(d); + + if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) { + if( x2[i] > x1[i] ) { + d -= box->box_norms[i]; + data->rel_box[i] = -1; + } + else { + d += box->box_norms[i]; + data->rel_box[i] = +1; + } + + data->dvec[i] = d; + norm_sqr += SQR(d); + } + else { + data->dvec[i] = d; + norm_sqr += tmp; + data->rel_box[i] = 0; + } + } + + if( norm_sqr <= SQR(cutoff) ){ + data->d = sqrt(norm_sqr); + return 1; + } + + return 0; } void Generate_Neighbor_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, k, l, m, itr; - int x, y, z; - int atom1, atom2, max; - int num_far; - int *nbr_atoms; - ivec *nbrs; - rvec *nbrs_cp; - grid *g; - list *far_nbrs; - far_neighbor_data *nbr_data; - real t_start, t_elapsed; - - // fprintf( stderr, "\n\tentered nbrs - " ); - g = &( system->g ); - far_nbrs = (*lists) + FAR_NBRS; - Bin_Atoms( system, workspace ); - - t_start = Get_Time( ); - - // fprintf( stderr, "atoms sorted - " ); - num_far = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ]; - nbrs_cp = &g->nbrs_cp[ 
index_grid_nbrs (i,j,k,0,g) ]; - //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - /* pick up an atom from the current cell */ - for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){ - atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ]; - Set_Start_Index( atom1, num_far, far_nbrs ); - //fprintf( stderr, "\tatom %d\n", atom1 ); - - itr = 0; - while( nbrs[itr][0] >= 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); - - if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - SQR(control->vlist_cut) ) { - nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ]; - max = g->top[ index_grid_3d (x,y,z,g) ]; - //fprintf( stderr, "\t\tmax: %d\n", max ); - - /* pick up another atom from the neighbor cell */ - for( m = 0; m < max; ++m ) { - atom2 = nbr_atoms[m]; - if( atom1 > atom2 ) { - nbr_data = &(far_nbrs->select.far_nbr_list[num_far]); - if(Are_Far_Neighbors(system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control->vlist_cut, - nbr_data)) { - nbr_data->nbr = atom2; - - ++num_far; - } - } - } - } - - ++itr; - } - - Set_End_Index( atom1, num_far, far_nbrs ); - //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", - // atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs), - // itr); - } - } - - fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far); - - if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) { - workspace->realloc.num_far = num_far; - if( num_far > far_nbrs->num_intrs ){ - fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d", - data->step, num_far, far_nbrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - } - - t_elapsed = Get_Timing_Info( t_start ); - data->timing.nbrs += t_elapsed; + int i, j, k, l, m, itr; + int x, y, z; + int atom1, atom2, max; + int num_far; + int *nbr_atoms; + ivec *nbrs; + rvec *nbrs_cp; + grid *g; + list *far_nbrs; + far_neighbor_data *nbr_data; + real t_start, t_elapsed; + + // fprintf( stderr, 
"\n\tentered nbrs - " ); + g = &( system->g ); + far_nbrs = (*lists) + FAR_NBRS; + Bin_Atoms( system, workspace ); + + t_start = Get_Time( ); + + // fprintf( stderr, "atoms sorted - " ); + num_far = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ]; + nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ]; + //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + /* pick up an atom from the current cell */ + for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){ + atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ]; + Set_Start_Index( atom1, num_far, far_nbrs ); + //fprintf( stderr, "\tatom %d\n", atom1 ); + + itr = 0; + while( nbrs[itr][0] >= 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); + + if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + SQR(control->vlist_cut) ) { + nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ]; + max = g->top[ index_grid_3d (x,y,z,g) ]; + //fprintf( stderr, "\t\tmax: %d\n", max ); + + /* pick up another atom from the neighbor cell */ + for( m = 0; m < max; ++m ) { + atom2 = nbr_atoms[m]; + if( atom1 > atom2 ) { + nbr_data = &(far_nbrs->select.far_nbr_list[num_far]); + if(Are_Far_Neighbors(system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control->vlist_cut, + nbr_data)) { + nbr_data->nbr = atom2; + + ++num_far; + } + } + } + } + + ++itr; + } + + Set_End_Index( atom1, num_far, far_nbrs ); + //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", + // atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs), + // itr); + } + } + + fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far); + + if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) { + workspace->realloc.num_far = num_far; + if( num_far > far_nbrs->num_intrs ){ + fprintf( stderr, "step%d-ran out of space on 
far_nbrs: top=%d, max=%d", + data->step, num_far, far_nbrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + } + + t_elapsed = Get_Timing_Info( t_start ); + data->timing.nbrs += t_elapsed; #if defined(DEBUG) - for( i = 0; i < system->N; ++i ) { - qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), - Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), - compare_far_nbrs ); - } + for( i = 0; i < system->N; ++i ) { + qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), + Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), + compare_far_nbrs ); + } #endif #if defined(DEBUG_FOCUS) - //fprintf( stderr, "nbrs - "); - //fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); + //fprintf( stderr, "nbrs - "); + //fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); #endif #if defined(TEST_ENERGY) - //Print_Far_Neighbors( system, control, workspace, lists ); + //Print_Far_Neighbors( system, control, workspace, lists ); #endif } int Estimate_NumNeighbors( reax_system *system, control_params *control, - static_storage *workspace, list **lists ) + static_storage *workspace, list **lists ) { - int i, j, k, l, m, itr; - int x, y, z; - int atom1, atom2, max; - int num_far; - int *nbr_atoms; - ivec *nbrs; - rvec *nbrs_cp; - grid *g; - far_neighbor_data nbr_data; - - int start = 0, finish = 0; - - // fprintf( stderr, "\n\tentered nbrs - " ); - g = &( system->g ); - Bin_Atoms( system, workspace ); - // fprintf( stderr, "atoms sorted - " ); - num_far = 0; - g->max_cuda_nbrs = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ]; - nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ]; - //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - /* pick up an atom from the current cell */ - for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){ - atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ]; - 
start = num_far; - - itr = 0; - while( nbrs[itr][0] >= 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); - - if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - SQR(control->vlist_cut) ) { - nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ]; - max = g->top[index_grid_3d (x,y,z,g) ]; - //fprintf( stderr, "\t\tmax: %d\n", max ); - - /* pick up another atom from the neighbor cell - - we have to compare atom1 with its own periodic images as well, - that's why there is also equality in the if stmt below */ - for( m = 0; m < max; ++m ) { - atom2 = nbr_atoms[m]; - //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) { - if( atom1 > atom2 ) { - if(Are_Far_Neighbors(system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control->vlist_cut, - &nbr_data)) - ++num_far; - } - } - } - - ++itr; - } - - // finish note - finish = num_far; - if (g->max_cuda_nbrs <= (finish - start)){ - g->max_cuda_nbrs = finish - start; - } - } - } + int i, j, k, l, m, itr; + int x, y, z; + int atom1, atom2, max; + int num_far; + int *nbr_atoms; + ivec *nbrs; + rvec *nbrs_cp; + grid *g; + far_neighbor_data nbr_data; + + int start = 0, finish = 0; + + // fprintf( stderr, "\n\tentered nbrs - " ); + g = &( system->g ); + Bin_Atoms( system, workspace ); + // fprintf( stderr, "atoms sorted - " ); + num_far = 0; + g->max_cuda_nbrs = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ]; + nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ]; + //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + /* pick up an atom from the current cell */ + for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){ + atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ]; + start = num_far; + + itr = 0; + while( nbrs[itr][0] >= 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = 
nbrs[itr][2]; + //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); + + if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + SQR(control->vlist_cut) ) { + nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ]; + max = g->top[index_grid_3d (x,y,z,g) ]; + //fprintf( stderr, "\t\tmax: %d\n", max ); + + /* pick up another atom from the neighbor cell - + we have to compare atom1 with its own periodic images as well, + that's why there is also equality in the if stmt below */ + for( m = 0; m < max; ++m ) { + atom2 = nbr_atoms[m]; + //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) { + if( atom1 > atom2 ) { + if(Are_Far_Neighbors(system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control->vlist_cut, + &nbr_data)) + ++num_far; + } + } + } + + ++itr; + } + + // finish note + finish = num_far; + if (g->max_cuda_nbrs <= (finish - start)){ + g->max_cuda_nbrs = finish - start; + } + } + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far ); + fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far ); #endif - return num_far * SAFE_ZONE; - } - - GLOBAL void Estimate_NumNeighbors ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params *control, - int *indices) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - far_neighbor_data nbr_data; - int x, y, z, i; - - if (threadIdx.x >= *(top + index_grid(1))){ - return; - } - - nbrs = nbrs + index_grid (g.max_nbrs); - nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); - atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; - - num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - //condition check for cutoff here - if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = 
top [index_grid_3d(x, y, z, &g)]; - for (m = 0; m < max; m++) { - atom2 = nbr_atoms[m]; - - //CHANGE ORIGINAL - /* - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &nbr_data)){ - ++num_far; - } - } - */ - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &nbr_data)){ - ++num_far; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, - control->vlist_cut, &nbr_data)){ - ++num_far; - } - } - //CHANGE ORIGINAL - } - } - ++iter; - } - - //indices[ atom1 ] = num_far;// * SAFE_ZONE; - indices[ atom1 ] = num_far * SAFE_ZONE; - } - - /*One thread per atom Implementation */ - GLOBAL void New_Estimate_NumNeighbors ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - int N, int *indices) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, iter, max, m, num_far; - int x, y, z, i; - int atom_x, atom_y, atom_z; - far_neighbor_data temp; - rvec atom1_x; - - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index > N) return; - - atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]); - atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]); - atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]); + return num_far * SAFE_ZONE; + } + + GLOBAL void Estimate_NumNeighbors ( reax_atom *sys_atoms, + grid g, + simulation_box *box, + control_params *control, + int *indices) + { + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, l, iter, max, m, num_far; + far_neighbor_data nbr_data; + int x, y, z, i; + + if (threadIdx.x >= *(top + index_grid(1))){ + return; + } + + nbrs = nbrs + index_grid (g.max_nbrs); + nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); + atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; 
+ + num_far = 0; + iter = 0; + + while (nbrs[iter][0] >= 0) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + //condition check for cutoff here + if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + for (m = 0; m < max; m++) { + atom2 = nbr_atoms[m]; + + //CHANGE ORIGINAL + /* + if (atom1 > atom2) { + if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &nbr_data)){ + ++num_far; + } + } + */ + if (atom1 > atom2) { + if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &nbr_data)){ + ++num_far; + } + } + else if (atom1 < atom2) { + if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, + control->vlist_cut, &nbr_data)){ + ++num_far; + } + } + //CHANGE ORIGINAL + } + } + ++iter; + } + + //indices[ atom1 ] = num_far;// * SAFE_ZONE; + indices[ atom1 ] = num_far * SAFE_ZONE; + } + + /*One thread per atom Implementation */ + GLOBAL void New_Estimate_NumNeighbors ( reax_atom *sys_atoms, + grid g, + simulation_box *box, + control_params* control, + int N, int *indices) + { + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, iter, max, m, num_far; + int x, y, z, i; + int atom_x, atom_y, atom_z; + far_neighbor_data temp; + rvec atom1_x; + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index > N) return; + + atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]); + atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]); + atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]); #ifdef __BNVT_FIX__ - if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; - if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; - if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; + if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; + if (atom_y >= g.ncell[1]) atom_y = 
g.ncell[1]-1; + if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; #endif - nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - atom1 = index; - - rvec_Copy (atom1_x, sys_atoms [atom1].x ); - - num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - for (m = 0; m < max; m++) - { - atom2 = nbr_atoms[m]; - if (atom1 > atom2) { - if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - num_far++; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, - control->vlist_cut, &temp)){ - num_far ++; - } - } - } - } - ++iter; - } - indices [atom1] = num_far * SAFE_ZONE; - } - - - - /*One thread per entry in the gcell implementation */ - GLOBAL void Generate_Neighbor_Lists ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - list far_nbrs) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - int x, y, z, i; - far_neighbor_data *nbr_data; - far_neighbor_data temp; - - if (threadIdx.x >= *(top + index_grid(1))){ - return; - } - - nbrs = nbrs + index_grid (g.max_nbrs); - nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); - atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; - - num_far = Start_Index (atom1, &far_nbrs); - //Set_Start_Index (atom1, 0, &far_nbrs); - //num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - //condition check for cutoff here - if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms 
[index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - for (m = 0; m < max; m++) { - atom2 = nbr_atoms[m]; - - //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] ); - - //CHANGE ORIGINAL - /* - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - - nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - ++num_far; - } - } - */ - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - ++num_far; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, - control->vlist_cut, &temp)){ - nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - ++num_far; - } - } - //CHANGE ORIGINAL - } - } - ++iter; - } - - //end the far_neighbor list here - Set_End_Index (atom1, num_far, &far_nbrs); - } - - - /*One thread per atom Implementation */ - GLOBAL void New_Generate_Neighbor_Lists ( reax_atom *sys_atoms, - grid g, - simulation_box 
*box, - control_params* control, - list far_nbrs, int N) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - int x, y, z, i; - far_neighbor_data *nbr_data, *my_start; - far_neighbor_data temp; - int atom_x, atom_y, atom_z; - rvec atom1_x; - - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index > N) return; - - atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]); - atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]); - atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]); + nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + atom1 = index; + + rvec_Copy (atom1_x, sys_atoms [atom1].x ); + + num_far = 0; + iter = 0; + + while (nbrs[iter][0] >= 0) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + for (m = 0; m < max; m++) + { + atom2 = nbr_atoms[m]; + if (atom1 > atom2) { + if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + num_far++; + } + } + else if (atom1 < atom2) { + if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, + control->vlist_cut, &temp)){ + num_far ++; + } + } + } + } + ++iter; + } + indices [atom1] = num_far * SAFE_ZONE; + } + + + + /*One thread per entry in the gcell implementation */ + GLOBAL void Generate_Neighbor_Lists ( reax_atom *sys_atoms, + grid g, + simulation_box *box, + control_params* control, + list far_nbrs) + { + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, l, iter, max, m, num_far; + int x, y, z, i; + far_neighbor_data *nbr_data; + far_neighbor_data temp; + + if (threadIdx.x >= *(top + 
index_grid(1))){ + return; + } + + nbrs = nbrs + index_grid (g.max_nbrs); + nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); + atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; + + num_far = Start_Index (atom1, &far_nbrs); + //Set_Start_Index (atom1, 0, &far_nbrs); + //num_far = 0; + iter = 0; + + while (nbrs[iter][0] >= 0) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + //condition check for cutoff here + if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + for (m = 0; m < max; m++) { + atom2 = nbr_atoms[m]; + + //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] ); + + //CHANGE ORIGINAL + /* + if (atom1 > atom2) { + if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + + nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + ++num_far; + } + } + */ + if (atom1 > atom2) { + if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + ++num_far; + } + } + else if (atom1 < atom2) { + if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, + control->vlist_cut, &temp)){ + nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data->nbr = atom2; + 
nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + ++num_far; + } + } + //CHANGE ORIGINAL + } + } + ++iter; + } + + //end the far_neighbor list here + Set_End_Index (atom1, num_far, &far_nbrs); + } + + + /*One thread per atom Implementation */ + GLOBAL void New_Generate_Neighbor_Lists ( reax_atom *sys_atoms, + grid g, + simulation_box *box, + control_params* control, + list far_nbrs, int N) + { + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, l, iter, max, m, num_far; + int x, y, z, i; + far_neighbor_data *nbr_data, *my_start; + far_neighbor_data temp; + int atom_x, atom_y, atom_z; + rvec atom1_x; + + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index > N) return; + + atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]); + atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]); + atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]); #ifdef __BNVT_FIX__ - if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; - if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; - if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; + if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; + if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; + if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; #endif - nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - atom1 = index; - - rvec_Copy (atom1_x, sys_atoms [atom1].x ); - - num_far = Start_Index (atom1, &far_nbrs); - my_start = & (far_nbrs.select.far_nbr_list [num_far] ); - - //Set_Start_Index (atom1, 0, &far_nbrs); - //num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - //condition check for cutoff here - 
//if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= - if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - for (m = 0; m < max; m++) - { - atom2 = nbr_atoms[m]; - if (atom1 > atom2) { - if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data = my_start; - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - num_far++; - my_start ++; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, - control->vlist_cut, &temp)){ - //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data = my_start; - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - num_far ++; - my_start ++; - } - } - //CHANGE ORIGINAL - } - } - ++iter; - } - - //end the far_neighbor list here - Set_End_Index (atom1, num_far, &far_nbrs); - } - - /*Multiple threads per atom Implementation */ - GLOBAL void Test_Generate_Neighbor_Lists ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - list far_nbrs, int N ) - { - - extern __shared__ int __nbr[]; - extern __shared__ int __sofar []; - bool nbrgen; - - int __THREADS_PER_ATOM__ = NBRS_THREADS_PER_ATOM; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); - int my_bucket = 
threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id >= N ) return; - - int *tnbr = __nbr; - //int *nbrssofar = __nbr + __THREADS_PER_ATOM__; - int *nbrssofar = __nbr + blockDim.x; - - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - int leader = -10; - int x, y, z, i; - far_neighbor_data *nbr_data, *my_start; - far_neighbor_data temp; - int atom_x, atom_y, atom_z; - - - atom1 = warp_id; - atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]); - atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]); - atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]); + nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + atom1 = index; + + rvec_Copy (atom1_x, sys_atoms [atom1].x ); + + num_far = Start_Index (atom1, &far_nbrs); + my_start = & (far_nbrs.select.far_nbr_list [num_far] ); + + //Set_Start_Index (atom1, 0, &far_nbrs); + //num_far = 0; + iter = 0; + + while (nbrs[iter][0] >= 0) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + //condition check for cutoff here + //if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= + if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + for (m = 0; m < max; m++) + { + atom2 = nbr_atoms[m]; + if (atom1 > atom2) { + if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data = my_start; + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + num_far++; + my_start ++; + } + 
} + else if (atom1 < atom2) { + if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, + control->vlist_cut, &temp)){ + //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data = my_start; + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + num_far ++; + my_start ++; + } + } + //CHANGE ORIGINAL + } + } + ++iter; + } + + //end the far_neighbor list here + Set_End_Index (atom1, num_far, &far_nbrs); + } + + /*Multiple threads per atom Implementation */ + GLOBAL void Test_Generate_Neighbor_Lists ( reax_atom *sys_atoms, + grid g, + simulation_box *box, + control_params* control, + list far_nbrs, int N ) + { + + extern __shared__ int __nbr[]; + extern __shared__ int __sofar []; + bool nbrgen; + + int __THREADS_PER_ATOM__ = NBRS_THREADS_PER_ATOM; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N ) return; + + int *tnbr = __nbr; + //int *nbrssofar = __nbr + __THREADS_PER_ATOM__; + int *nbrssofar = __nbr + blockDim.x; + + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, l, iter, max, m, num_far; + int leader = -10; + int x, y, z, i; + far_neighbor_data *nbr_data, *my_start; + far_neighbor_data temp; + int atom_x, atom_y, atom_z; + + + atom1 = warp_id; + atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]); + atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]); + atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]); #ifdef __BNVT_FIX__ - if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; - if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; - if (atom_z >= 
g.ncell[2]) atom_z = g.ncell[2]-1; + if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; + if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; + if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; #endif - nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - - num_far = Start_Index (atom1, &far_nbrs); - my_start = & (far_nbrs.select.far_nbr_list [num_far] ); - - iter = 0; - tnbr[threadIdx.x] = 0; - - if (lane_id == 0) { - //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0; - nbrssofar [my_bucket] = 0; - } - - __syncthreads (); - - while ((nbrs[iter][0] >= 0)) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - tnbr[threadIdx.x] = 0; - nbrgen = false; - - if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - tnbr[threadIdx.x] = 0; - nbrgen = false; - m = lane_id ; //0-31 - int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 
0 : 1); - int iterations = 0; - //while (m < max) - while (iterations < loopcount) - { - tnbr [threadIdx.x] = 0; - nbrgen = false; - - if (m < max) { - atom2 = nbr_atoms[m]; - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)) - { - tnbr [threadIdx.x] = 1; - nbrgen = true; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, - control->vlist_cut, &temp)){ - tnbr [threadIdx.x] = 1; - nbrgen = true; - } - } - } - - if (nbrgen) - { - //do leader selection here - leader = -1; - //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) - for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) - if (tnbr[l]){ - leader = l; - break; - } - - //do the reduction; - if (threadIdx.x == leader) - for (l = 1; l < __THREADS_PER_ATOM__; l++) - //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)]; - tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)]; - } - - //__syncthreads (); - //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1); - //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ; - - if (nbrgen) - { - //got the indices - //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1; - nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - - if (threadIdx.x == leader) - //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += 
tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; - nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; - } - - m += __THREADS_PER_ATOM__; - iterations ++; - - //cleanup - nbrgen = false; - tnbr [threadIdx.x] = 0; - } - } - ++iter; - } - - __syncthreads (); - - //end the far_neighbor list here - if (lane_id == 0) - Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs); - //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs); - } - - void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, bool estimate) - { - real t_start, t_elapsed; - real t_1, t_2; - - list *far_nbrs = dev_lists + FAR_NBRS; - - int *d_indices = (int *) scratch; - int *nbrs_start, *nbrs_end; - int i, max_nbrs = 0; - int nbs; - - t_start = Get_Time (); - - Cuda_Bin_Atoms (system, workspace); - Cuda_Bin_Atoms_Sync ( system ); - - if (dev_workspace->realloc.estimate_nbrs > -1) { - - /*reset the re-neighbor condition */ - dev_workspace->realloc.estimate_nbrs = -1; - - //#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Recomputing the neighbors estimate.... \n"); - //#endif - cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); - /* - dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); - dim3 threadsperblock (system->g.max_atoms); - - Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, d_indices); - cudaThreadSynchronize (); - cudaCheckError (); - */ - nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); - New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> - ( system->d_atoms, system->d_g, - system->d_box, (control_params *)control->d_control, - system->N, d_indices); - cudaThreadSynchronize (); - cudaCheckError (); - - - int *nbrs_indices = NULL; - nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); - if (nbrs_indices == NULL) - { - fprintf (stderr, "Malloc failed for nbrs indices .... \n"); - exit (1); - } - memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); - - copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 1; i <= system->N; i++) - nbrs_indices [i] += nbrs_indices [i-1]; - - copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); - copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); - - free (nbrs_indices); - } - - /* - One thread per atom Implementation - Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, *far_nbrs); - */ - nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + - (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); - - /* Multiple threads per atom Implementation */ - Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, - INT_SIZE * (NBRS_BLOCK_SIZE+ NBRS_BLOCK_SIZE/NBRS_THREADS_PER_ATOM) >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, *far_nbrs, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info (t_start); - d_timing.nbrs += t_elapsed; + nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + + num_far = Start_Index (atom1, &far_nbrs); + my_start = & (far_nbrs.select.far_nbr_list [num_far] ); + + iter = 0; + tnbr[threadIdx.x] = 0; + + if (lane_id == 0) { + //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0; + nbrssofar [my_bucket] = 0; + } + + __syncthreads (); + + while ((nbrs[iter][0] >= 0)) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + tnbr[threadIdx.x] = 0; + nbrgen = false; + + if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + tnbr[threadIdx.x] = 0; + nbrgen = false; + m = lane_id ; //0-31 + int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 
0 : 1); + int iterations = 0; + //while (m < max) + while (iterations < loopcount) + { + tnbr [threadIdx.x] = 0; + nbrgen = false; + + if (m < max) { + atom2 = nbr_atoms[m]; + if (atom1 > atom2) { + if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)) + { + tnbr [threadIdx.x] = 1; + nbrgen = true; + } + } + else if (atom1 < atom2) { + if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, + control->vlist_cut, &temp)){ + tnbr [threadIdx.x] = 1; + nbrgen = true; + } + } + } + + if (nbrgen) + { + //do leader selection here + leader = -1; + //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) + for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) + if (tnbr[l]){ + leader = l; + break; + } + + //do the reduction; + if (threadIdx.x == leader) + for (l = 1; l < __THREADS_PER_ATOM__; l++) + //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)]; + tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)]; + } + + //__syncthreads (); + //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1); + //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ; + + if (nbrgen) + { + //got the indices + //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1; + nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + + if (threadIdx.x == leader) + //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += 
tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; + nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; + } + + m += __THREADS_PER_ATOM__; + iterations ++; + + //cleanup + nbrgen = false; + tnbr [threadIdx.x] = 0; + } + } + ++iter; + } + + __syncthreads (); + + //end the far_neighbor list here + if (lane_id == 0) + Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs); + //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs); + } + + void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, bool estimate) + { + real t_start, t_elapsed; + real t_1, t_2; + + list *far_nbrs = dev_lists + FAR_NBRS; + + int *d_indices = (int *) scratch; + int *nbrs_start, *nbrs_end; + int i, max_nbrs = 0; + int nbs; + + t_start = Get_Time (); + + Cuda_Bin_Atoms (system, workspace); + Cuda_Bin_Atoms_Sync ( system ); + + if (dev_workspace->realloc.estimate_nbrs > -1) { + + /*reset the re-neighbor condition */ + dev_workspace->realloc.estimate_nbrs = -1; + + //#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Recomputing the neighbors estimate.... \n"); + //#endif + cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); + /* + dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); + dim3 threadsperblock (system->g.max_atoms); + + Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, d_indices); + cudaThreadSynchronize (); + cudaCheckError (); + */ + nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); + New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> + ( system->d_atoms, system->d_g, + system->d_box, (control_params *)control->d_control, + system->N, d_indices); + cudaThreadSynchronize (); + cudaCheckError (); + + + int *nbrs_indices = NULL; + nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); + if (nbrs_indices == NULL) + { + fprintf (stderr, "Malloc failed for nbrs indices .... \n"); + exit (1); + } + memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); + + copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + for (int i = 1; i <= system->N; i++) + nbrs_indices [i] += nbrs_indices [i-1]; + + copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); + copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); + + free (nbrs_indices); + } + + /* + One thread per atom Implementation + Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, *far_nbrs); + */ + nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + + (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); + + /* Multiple threads per atom Implementation */ + Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, + INT_SIZE * (NBRS_BLOCK_SIZE+ NBRS_BLOCK_SIZE/NBRS_THREADS_PER_ATOM) >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, *far_nbrs, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info (t_start); + d_timing.nbrs += t_elapsed; #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed); + fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed); #endif - /*validate neighbors list*/ - nbrs_start = (int *) calloc (system->N, INT_SIZE); - nbrs_end = (int *) calloc (system->N, INT_SIZE); + /*validate neighbors list*/ + nbrs_start = (int *) calloc (system->N, INT_SIZE); + nbrs_end = (int *) calloc (system->N, INT_SIZE); - copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); - int device_nbrs = 0; - for(i = 0; i < system->N; i++) - { - if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs) - max_nbrs = nbrs_end[i] - nbrs_start[i]; + int device_nbrs = 0; + for(i = 0; i < system->N; i++) + { + if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs) + max_nbrs = nbrs_end[i] - nbrs_start[i]; - device_nbrs += nbrs_end[i] - nbrs_start[i]; - } + device_nbrs += nbrs_end[i] - nbrs_start[i]; + } #ifdef __CUDA_TEST__ - //fprintf (stderr, " New Device count is : %d \n", device_nbrs); - //dev_workspace->realloc.num_far = device_nbrs; + //fprintf (stderr, " New Device count is : %d \n", device_nbrs); + //dev_workspace->realloc.num_far = device_nbrs; #endif #ifdef __DEBUG_CUDA__ - fprintf 
(stderr, "Max neighbors is ---> %d \n", max_nbrs ); - fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs); + fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs ); + fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs); #endif - //validate check here - //get the num_far from the list here - for (i = 0; i < system->N-1; i++) - { - if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE ) - { - dev_workspace->realloc.num_far = device_nbrs; - //#ifdef __CUDA_MEM__ - //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); - //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", - // i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]); - //#endif - } - - if (nbrs_end[i] > nbrs_start[i+1]) { - fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d", - nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]); - exit( INSUFFICIENT_SPACE ); - } - } - - if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) { - dev_workspace->realloc.num_far = device_nbrs; - //#ifdef __CUDA_MEM__ - //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); - //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n" - // , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs); - //#endif - } - if (nbrs_end[i] > far_nbrs->num_intrs) { - fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d", - nbrs_end[i], far_nbrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - - free (nbrs_start); - free (nbrs_end); - } - - //Code not used anymore + //validate check here + //get the num_far from the list here + for (i = 0; i < system->N-1; i++) + { + if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE ) + { + dev_workspace->realloc.num_far = device_nbrs; + //#ifdef 
__CUDA_MEM__ + //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); + //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", + // i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]); + //#endif + } + + if (nbrs_end[i] > nbrs_start[i+1]) { + fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d", + nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]); + exit( INSUFFICIENT_SPACE ); + } + } + + if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) { + dev_workspace->realloc.num_far = device_nbrs; + //#ifdef __CUDA_MEM__ + //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); + //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n" + // , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs); + //#endif + } + if (nbrs_end[i] > far_nbrs->num_intrs) { + fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d", + nbrs_end[i], far_nbrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + + free (nbrs_start); + free (nbrs_end); + } + + //Code not used anymore #if defined DONE - void Choose_Neighbor_Finder( reax_system *system, control_params *control, - get_far_neighbors_function *Get_Far_Neighbors ) - { - if( control->periodic_boundaries ) - { - if( system->box.box_norms[0] > 2.0 * control->vlist_cut && - system->box.box_norms[1] > 2.0 * control->vlist_cut && - system->box.box_norms[2] > 2.0 * control->vlist_cut ) - (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box; - else (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box; - } - else - (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors; - } - - - int compare_near_nbrs(const void *v1, const void *v2) - { - return ((*(near_neighbor_data *)v1).nbr - (*(near_neighbor_data *)v2).nbr); - } - - - int compare_far_nbrs(const void *v1, const void 
*v2) - { - return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr); - } - - - inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C, - rvec dvec, ivec rel_box/*, rvec ext_factor*/ ) - { - dest->nbr = nbr; - dest->d = d; - rvec_Scale( dest->dvec, C, dvec ); - ivec_Copy( dest->rel_box, rel_box ); - // rvec_Scale( dest->ext_factor, C, ext_factor ); - } - - - inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C, - rvec dvec, ivec rel_box/*, rvec ext_factor*/) - { - dest->nbr = nbr; - dest->d = d; - rvec_Scale( dest->dvec, C, dvec ); - ivec_Scale( dest->rel_box, C, rel_box ); - // rvec_Scale( dest->ext_factor, C, ext_factor ); - } - - - /* In case bond restrictions are applied, this method checks if - atom1 and atom2 are allowed to bond with each other */ - inline int can_Bond( static_storage *workspace, int atom1, int atom2 ) - { - int i; - - // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 ); - - if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] ) - return 1; - - for( i = 0; i < workspace->restricted[ atom1 ]; ++i ) - if( workspace->restricted_list[ atom1 ][i] == atom2 ) - return 1; - - for( i = 0; i < workspace->restricted[ atom2 ]; ++i ) - if( workspace->restricted_list[ atom2 ][i] == atom1 ) - return 1; - - return 0; - } - - - /* check if atom2 is on atom1's near neighbor list */ - inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 ) - { - int i; - - for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i ) - if( near_nbrs->select.near_nbr_list[i].nbr == atom2 ) - { - // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 ); - return 1; - } - - return 0; - } - - void Generate_Neighbor_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, j, k; - int x, y, z; - int *nbr_atoms; - int atom1, atom2, max; - int num_far; - int c, 
count; - int grid_top; - grid *g = &( system->g ); - list *far_nbrs = (*lists) + FAR_NBRS; - //int hb_type1, hb_type2; - //list *hbonds = (*lists) + HBOND; - //int top_hbond1, top_hbond2; - get_far_neighbors_function Get_Far_Neighbors; - far_neighbor_data new_nbrs[125]; + void Choose_Neighbor_Finder( reax_system *system, control_params *control, + get_far_neighbors_function *Get_Far_Neighbors ) + { + if( control->periodic_boundaries ) + { + if( system->box.box_norms[0] > 2.0 * control->vlist_cut && + system->box.box_norms[1] > 2.0 * control->vlist_cut && + system->box.box_norms[2] > 2.0 * control->vlist_cut ) + (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box; + else (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box; + } + else + (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors; + } + + + int compare_near_nbrs(const void *v1, const void *v2) + { + return ((*(near_neighbor_data *)v1).nbr - (*(near_neighbor_data *)v2).nbr); + } + + + int compare_far_nbrs(const void *v1, const void *v2) + { + return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr); + } + + + inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C, + rvec dvec, ivec rel_box/*, rvec ext_factor*/ ) + { + dest->nbr = nbr; + dest->d = d; + rvec_Scale( dest->dvec, C, dvec ); + ivec_Copy( dest->rel_box, rel_box ); + // rvec_Scale( dest->ext_factor, C, ext_factor ); + } + + + inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C, + rvec dvec, ivec rel_box/*, rvec ext_factor*/) + { + dest->nbr = nbr; + dest->d = d; + rvec_Scale( dest->dvec, C, dvec ); + ivec_Scale( dest->rel_box, C, rel_box ); + // rvec_Scale( dest->ext_factor, C, ext_factor ); + } + + + /* In case bond restrictions are applied, this method checks if + atom1 and atom2 are allowed to bond with each other */ + inline int can_Bond( static_storage *workspace, int atom1, int atom2 ) + { + int i; + + // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 ); + + 
if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] ) + return 1; + + for( i = 0; i < workspace->restricted[ atom1 ]; ++i ) + if( workspace->restricted_list[ atom1 ][i] == atom2 ) + return 1; + + for( i = 0; i < workspace->restricted[ atom2 ]; ++i ) + if( workspace->restricted_list[ atom2 ][i] == atom1 ) + return 1; + + return 0; + } + + + /* check if atom2 is on atom1's near neighbor list */ + inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 ) + { + int i; + + for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i ) + if( near_nbrs->select.near_nbr_list[i].nbr == atom2 ) + { + // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 ); + return 1; + } + + return 0; + } + + void Generate_Neighbor_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { + int i, j, k; + int x, y, z; + int *nbr_atoms; + int atom1, atom2, max; + int num_far; + int c, count; + int grid_top; + grid *g = &( system->g ); + list *far_nbrs = (*lists) + FAR_NBRS; + //int hb_type1, hb_type2; + //list *hbonds = (*lists) + HBOND; + //int top_hbond1, top_hbond2; + get_far_neighbors_function Get_Far_Neighbors; + far_neighbor_data new_nbrs[125]; #ifndef REORDER_ATOMS - int l, m; + int l, m; #endif - // fprintf( stderr, "\n\tentered nbrs - " ); - if( control->ensemble == iNPT || control->ensemble == sNPT || - control->ensemble == NPT ) - Update_Grid( system ); - // fprintf( stderr, "grid updated - " ); + // fprintf( stderr, "\n\tentered nbrs - " ); + if( control->ensemble == iNPT || control->ensemble == sNPT || + control->ensemble == NPT ) + Update_Grid( system ); + // fprintf( stderr, "grid updated - " ); - Bin_Atoms( system, out_control ); - // fprintf( stderr, "atoms sorted - " ); + Bin_Atoms( system, out_control ); + // fprintf( stderr, "atoms sorted - " ); #ifdef REORDER_ATOMS - Cluster_Atoms( system, workspace ); - // fprintf( stderr, 
"atoms clustered - " ); + Cluster_Atoms( system, workspace ); + // fprintf( stderr, "atoms clustered - " ); #endif - Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); - // fprintf( stderr, "function chosen - " ); - - Reset_Neighbor_Lists( system, workspace, lists ); - // fprintf( stderr, "lists cleared - " ); - - num_far = 0; - num_near = 0; - c = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = g->nbrs[i][j][k]; - nbrs_cp = g->nbrs_cp[i][j][k]; - - /* pick up an atom from the current cell */ - //#ifdef REORDER_ATOMS - // for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++) - //#else - for(l = 0; l < g->top[i][j][k]; ++l ){ - atom1 = g->atoms[i][j][k][l]; - Set_End_Index( atom1, num_far, far_nbrs ); - // fprintf( stderr, "atom %d:\n", atom1 ); - - itr = 0; - while( nbrs[itr][0] > 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - - // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - // SQR(control->r_cut)) - nbr_atoms = g->atoms[x][y][z]; - max_atoms = g->top[x][y][z]; - - /* pick up another atom from the neighbor cell - - we have to compare atom1 with its own periodic images as well, - that's why there is also equality in the if stmt below */ - //#ifdef REORDER_ATOMS - //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++) - //#else - for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) - if( atom1 >= atom2 ) { - //fprintf( stderr, "\tatom2 %d", atom2 ); - //top_near1 = End_Index( atom1, near_nbrs ); - //Set_Start_Index( atom1, num_far, far_nbrs ); - //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond; - Get_Far_Neighbors( system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control, new_nbrs, &count ); - fprintf( stderr, "\t%d count:%d\n", atom2, count ); - - for( c = 0; c < count; ++c ) - if(atom1 != atom2 || (atom1 == atom2 && 
new_nbrs[c].d>=0.1)){ - Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), - atom2, new_nbrs[c].d, 1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - ++num_far; - - /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", - atom1, atom2, new_nbrs[c].d, - new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], - new_nbrs[c].dvec[2] ); */ - - - /* hydrogen bond lists */ - /*if( control->hb_cut > 0.1 && - new_nbrs[c].d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond; - if( hb_type1 == 1 && hb_type2 == 2 ) { - top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds); - Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]), - atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec, - new_nbrs[c].rel_box ); - Set_End_Index( workspace->hbond_index[atom1], - top_hbond1 + 1, hbonds ); - } - else if( hb_type1 == 2 && hb_type2 == 1 ) { - top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds ); - Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]), - atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, - new_nbrs[c].rel_box ); - Set_End_Index( workspace->hbond_index[atom2], - top_hbond2 + 1, hbonds ); - }*/ - } - } - } - - Set_End_Index( atom1, top_far1, far_nbrs ); - } - } - - - fprintf( stderr, "nbrs done-" ); - - - /* apply restrictions on near neighbors only */ - if( (data->step - data->prev_steps) < control->restrict_bonds ) { - for( atom1 = 0; atom1 < system->N; ++atom1 ) - if( workspace->restricted[ atom1 ] ) { - // fprintf( stderr, "atom1: %d\n", atom1 ); - - top_near1 = End_Index( atom1, near_nbrs ); - - for( j = 0; j < workspace->restricted[ atom1 ]; ++j ) - if(!is_Near_Neighbor(near_nbrs, atom1, - atom2 = workspace->restricted_list[atom1][j])) { - fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n", - atom1, atom2 ); - - top_near2 = End_Index( atom2, near_nbrs ); - - /* we just would like to get the nearest image, so a call to - Get_Periodic_Far_Neighbors_Big_Box 
is good enough. */ - Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, - system->atoms[ atom2 ].x, - &(system->box), control, - new_nbrs, &count ); - - Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]), - atom2, new_nbrs[c].d, 1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - ++top_near1; - - Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]), - atom1, new_nbrs[c].d, -1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - Set_End_Index( atom2, top_near2+1, near_nbrs ); - } - - Set_End_Index( atom1, top_near1, near_nbrs ); - } - } - // fprintf( stderr, "restrictions applied-" ); - - - /* verify nbrlists, count num_intrs, sort nearnbrs */ - near_nbrs->num_intrs = 0; - far_nbrs->num_intrs = 0; - for( i = 0; i < system->N-1; ++i ) { - if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) { - fprintf( stderr, - "step%3d: nearnbr list of atom%d is overwritten by atom%d\n", - data->step, i+1, i ); - exit( 1 ); - } - - near_nbrs->num_intrs += Num_Entries(i, near_nbrs); - - if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) { - fprintf( stderr, - "step%3d: farnbr list of atom%d is overwritten by atom%d\n", - data->step, i+1, i ); - exit( 1 ); - } - - far_nbrs->num_intrs += Num_Entries(i, far_nbrs); - } - - for( i = 0; i < system->N; ++i ) { - qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]), - Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), - compare_near_nbrs ); - } - // fprintf( stderr, "near nbrs sorted\n" ); + Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); + // fprintf( stderr, "function chosen - " ); + + Reset_Neighbor_Lists( system, workspace, lists ); + // fprintf( stderr, "lists cleared - " ); + + num_far = 0; + num_near = 0; + c = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = g->nbrs[i][j][k]; + nbrs_cp = g->nbrs_cp[i][j][k]; + + /* pick up an atom 
from the current cell */ + //#ifdef REORDER_ATOMS + // for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++) + //#else + for(l = 0; l < g->top[i][j][k]; ++l ){ + atom1 = g->atoms[i][j][k][l]; + Set_End_Index( atom1, num_far, far_nbrs ); + // fprintf( stderr, "atom %d:\n", atom1 ); + + itr = 0; + while( nbrs[itr][0] > 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + + // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + // SQR(control->r_cut)) + nbr_atoms = g->atoms[x][y][z]; + max_atoms = g->top[x][y][z]; + + /* pick up another atom from the neighbor cell - + we have to compare atom1 with its own periodic images as well, + that's why there is also equality in the if stmt below */ + //#ifdef REORDER_ATOMS + //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++) + //#else + for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) + if( atom1 >= atom2 ) { + //fprintf( stderr, "\tatom2 %d", atom2 ); + //top_near1 = End_Index( atom1, near_nbrs ); + //Set_Start_Index( atom1, num_far, far_nbrs ); + //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond; + Get_Far_Neighbors( system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control, new_nbrs, &count ); + fprintf( stderr, "\t%d count:%d\n", atom2, count ); + + for( c = 0; c < count; ++c ) + if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ + Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), + atom2, new_nbrs[c].d, 1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + ++num_far; + + /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", + atom1, atom2, new_nbrs[c].d, + new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], + new_nbrs[c].dvec[2] ); */ + + + /* hydrogen bond lists */ + /*if( control->hb_cut > 0.1 && + new_nbrs[c].d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond; + if( hb_type1 == 1 && hb_type2 == 2 ) { + 
top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds); + Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]), + atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec, + new_nbrs[c].rel_box ); + Set_End_Index( workspace->hbond_index[atom1], + top_hbond1 + 1, hbonds ); + } + else if( hb_type1 == 2 && hb_type2 == 1 ) { + top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds ); + Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]), + atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, + new_nbrs[c].rel_box ); + Set_End_Index( workspace->hbond_index[atom2], + top_hbond2 + 1, hbonds ); + }*/ + } + } + } + + Set_End_Index( atom1, top_far1, far_nbrs ); + } + } + + + fprintf( stderr, "nbrs done-" ); + + + /* apply restrictions on near neighbors only */ + if( (data->step - data->prev_steps) < control->restrict_bonds ) { + for( atom1 = 0; atom1 < system->N; ++atom1 ) + if( workspace->restricted[ atom1 ] ) { + // fprintf( stderr, "atom1: %d\n", atom1 ); + + top_near1 = End_Index( atom1, near_nbrs ); + + for( j = 0; j < workspace->restricted[ atom1 ]; ++j ) + if(!is_Near_Neighbor(near_nbrs, atom1, + atom2 = workspace->restricted_list[atom1][j])) { + fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n", + atom1, atom2 ); + + top_near2 = End_Index( atom2, near_nbrs ); + + /* we just would like to get the nearest image, so a call to + Get_Periodic_Far_Neighbors_Big_Box is good enough. 
*/ + Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, + system->atoms[ atom2 ].x, + &(system->box), control, + new_nbrs, &count ); + + Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]), + atom2, new_nbrs[c].d, 1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + ++top_near1; + + Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]), + atom1, new_nbrs[c].d, -1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + Set_End_Index( atom2, top_near2+1, near_nbrs ); + } + + Set_End_Index( atom1, top_near1, near_nbrs ); + } + } + // fprintf( stderr, "restrictions applied-" ); + + + /* verify nbrlists, count num_intrs, sort nearnbrs */ + near_nbrs->num_intrs = 0; + far_nbrs->num_intrs = 0; + for( i = 0; i < system->N-1; ++i ) { + if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) { + fprintf( stderr, + "step%3d: nearnbr list of atom%d is overwritten by atom%d\n", + data->step, i+1, i ); + exit( 1 ); + } + + near_nbrs->num_intrs += Num_Entries(i, near_nbrs); + + if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) { + fprintf( stderr, + "step%3d: farnbr list of atom%d is overwritten by atom%d\n", + data->step, i+1, i ); + exit( 1 ); + } + + far_nbrs->num_intrs += Num_Entries(i, far_nbrs); + } + + for( i = 0; i < system->N; ++i ) { + qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]), + Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), + compare_near_nbrs ); + } + // fprintf( stderr, "near nbrs sorted\n" ); #ifdef TEST_ENERGY - /* for( i = 0; i < system->N; ++i ) { - qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), - Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), - compare_far_nbrs ); - } */ - - fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", - num_near / system->N ); - fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", - num_far / system->N, control->max_far_nbrs ); + /* for( i = 0; i < system->N; ++i ) { + qsort( &(far_nbrs->select.far_nbr_list[ 
Start_Index(i, far_nbrs) ]), + Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), + compare_far_nbrs ); + } */ + + fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", + num_near / system->N ); + fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", + num_far / system->N, control->max_far_nbrs ); #endif - //fprintf( stderr, "step%d: num of nearnbrs = %6d num of farnbrs: %6d\n", - // data->step, num_near, num_far ); - - //fprintf( stderr, "\talloc nearnbrs = %6d alloc farnbrs: %6d\n", - // system->N * near_nbrs->intrs_per_unit, - // system->N * far_nbrs->intrs_per_unit ); - } - - - - void Generate_Neighbor_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, j, k, l, m, itr; - int x, y, z; - int atom1, atom2, max; - int num_far, c, count; - int *nbr_atoms; - ivec *nbrs; - rvec *nbrs_cp; - grid *g; - list *far_nbrs; - get_far_neighbors_function Get_Far_Neighbors; - far_neighbor_data new_nbrs[125]; - - g = &( system->g ); - far_nbrs = (*lists) + FAR_NBRS; - - // fprintf( stderr, "\n\tentered nbrs - " ); - if( control->ensemble == iNPT || - control->ensemble == sNPT || - control->ensemble == NPT ) - Update_Grid( system ); - // fprintf( stderr, "grid updated - " ); - - Bin_Atoms( system, out_control ); - // fprintf( stderr, "atoms sorted - " ); - Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); - // fprintf( stderr, "function chosen - " ); - Reset_Neighbor_Lists( system, workspace, lists ); - // fprintf( stderr, "lists cleared - " ); - - num_far = 0; - c = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = g->nbrs[i][j][k]; - nbrs_cp = g->nbrs_cp[i][j][k]; - fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - /* pick up an atom from the current cell */ - for(l = 0; l < g->top[i][j][k]; ++l ){ - atom1 = 
g->atoms[i][j][k][l]; - Set_Start_Index( atom1, num_far, far_nbrs ); - fprintf( stderr, "\tatom %d\n", atom1 ); - - itr = 0; - while( nbrs[itr][0] > 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); - - // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - // SQR(control->r_cut)) - nbr_atoms = g->atoms[x][y][z]; - max = g->top[x][y][z]; - fprintf( stderr, "\t\tmax: %d\n", max ); - - - /* pick up another atom from the neighbor cell - - we have to compare atom1 with its own periodic images as well, - that's why there is also equality in the if stmt below */ - for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) - if( atom1 >= atom2 ) { - Get_Far_Neighbors( system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control, new_nbrs, &count ); - fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count ); - - for( c = 0; c < count; ++c ) - if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ - Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), - atom2, new_nbrs[c].d, 1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - ++num_far; - - /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", - atom1, atom2, new_nbrs[c].d, - new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], - new_nbrs[c].dvec[2] ); */ - } - } - - ++itr; - } - - Set_End_Index( atom1, num_far, far_nbrs ); - } - } - - far_nbrs->num_intrs = num_far; - fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); + //fprintf( stderr, "step%d: num of nearnbrs = %6d num of farnbrs: %6d\n", + // data->step, num_near, num_far ); + + //fprintf( stderr, "\talloc nearnbrs = %6d alloc farnbrs: %6d\n", + // system->N * near_nbrs->intrs_per_unit, + // system->N * far_nbrs->intrs_per_unit ); + } + + + + void Generate_Neighbor_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) + { + int i, j, k, l, m, itr; + int x, y, z; 
+ int atom1, atom2, max; + int num_far, c, count; + int *nbr_atoms; + ivec *nbrs; + rvec *nbrs_cp; + grid *g; + list *far_nbrs; + get_far_neighbors_function Get_Far_Neighbors; + far_neighbor_data new_nbrs[125]; + + g = &( system->g ); + far_nbrs = (*lists) + FAR_NBRS; + + // fprintf( stderr, "\n\tentered nbrs - " ); + if( control->ensemble == iNPT || + control->ensemble == sNPT || + control->ensemble == NPT ) + Update_Grid( system ); + // fprintf( stderr, "grid updated - " ); + + Bin_Atoms( system, out_control ); + // fprintf( stderr, "atoms sorted - " ); + Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); + // fprintf( stderr, "function chosen - " ); + Reset_Neighbor_Lists( system, workspace, lists ); + // fprintf( stderr, "lists cleared - " ); + + num_far = 0; + c = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = g->nbrs[i][j][k]; + nbrs_cp = g->nbrs_cp[i][j][k]; + fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + /* pick up an atom from the current cell */ + for(l = 0; l < g->top[i][j][k]; ++l ){ + atom1 = g->atoms[i][j][k][l]; + Set_Start_Index( atom1, num_far, far_nbrs ); + fprintf( stderr, "\tatom %d\n", atom1 ); + + itr = 0; + while( nbrs[itr][0] > 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); + + // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + // SQR(control->r_cut)) + nbr_atoms = g->atoms[x][y][z]; + max = g->top[x][y][z]; + fprintf( stderr, "\t\tmax: %d\n", max ); + + + /* pick up another atom from the neighbor cell - + we have to compare atom1 with its own periodic images as well, + that's why there is also equality in the if stmt below */ + for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) + if( atom1 >= atom2 ) { + Get_Far_Neighbors( system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control, 
new_nbrs, &count ); + fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count ); + + for( c = 0; c < count; ++c ) + if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ + Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), + atom2, new_nbrs[c].d, 1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + ++num_far; + + /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", + atom1, atom2, new_nbrs[c].d, + new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], + new_nbrs[c].dvec[2] ); */ + } + } + + ++itr; + } + + Set_End_Index( atom1, num_far, far_nbrs ); + } + } + + far_nbrs->num_intrs = num_far; + fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); #if defined(DEBUG) - for( i = 0; i < system->N; ++i ) { - qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), - Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), - compare_far_nbrs ); - } - - fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far ); - fprintf( stderr, "\tallocated farnbrs: %6d\n", - system->N * far_nbrs->intrs_per_unit ); + for( i = 0; i < system->N; ++i ) { + qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), + Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), + compare_far_nbrs ); + } + + fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far ); + fprintf( stderr, "\tallocated farnbrs: %6d\n", + system->N * far_nbrs->intrs_per_unit ); #endif - } + } diff --git a/PuReMD-GPU/src/reduction.cu b/PuReMD-GPU/src/reduction.cu index 4e2ee5bd..48fb5efc 100644 --- a/PuReMD-GPU/src/reduction.cu +++ b/PuReMD-GPU/src/reduction.cu @@ -25,124 +25,124 @@ GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const size_t n) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = input[i]; - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += 
sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - if (pass == INITIAL) - x = SQR (input[i]); - else - x = input[i]; - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - if (pass == INITIAL) - per_block_results[blockIdx.x] = sdata[0]; - else - per_block_results[blockIdx.x] = SQRT (sdata[0]); - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + if (pass == INITIAL) + x = SQR (input[i]); + else + x = input[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + if (pass == INITIAL) + per_block_results[blockIdx.x] = sdata[0]; + else + per_block_results[blockIdx.x] = SQRT (sdata[0]); + } } GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n ) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * 
blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = a[i] * b[i]; - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = a[i] * b[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results, const size_t n) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = input[i * n + i]; - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i * n + i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } @@ -152,65 +152,65 @@ GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results GLOBAL void Cuda_reduction(const int *input, int 
*per_block_results, const size_t n) { - extern __shared__ int sh_input[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = input[i]; - } - sh_input[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sh_input[0]; - } + extern __shared__ int sh_input[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i]; + } + sh_input[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sh_input[0]; + } } GLOBAL void Cuda_reduction_rvec (rvec *input, rvec *results, size_t n) { - extern __shared__ rvec svec_data[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - rvec x; - - rvec_MakeZero (x); - - if(i < n) - { - rvec_Copy (x, input[i]); - } - - rvec_Copy (svec_data[threadIdx.x], x); - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]); - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - //rvec_Copy (results[blockIdx.x], svec_data[0]); - rvec_Add (results[blockIdx.x], svec_data[0]); - } + extern __shared__ rvec svec_data[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + rvec x; + + rvec_MakeZero (x); + + if(i < n) + { + rvec_Copy (x, input[i]); + } + + rvec_Copy (svec_data[threadIdx.x], x); + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + 
offset]); + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + //rvec_Copy (results[blockIdx.x], svec_data[0]); + rvec_Add (results[blockIdx.x], svec_data[0]); + } } ////////////////////////////////////////////////// @@ -219,24 +219,24 @@ GLOBAL void Cuda_reduction_rvec (rvec *input, rvec *results, size_t n) GLOBAL void Cuda_Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= k) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= k) return; - dest[i] = c * v[i] + d * y[i]; + dest[i] = c * v[i] + d * y[i]; } GLOBAL void Cuda_Vector_Scale( real* dest, real c, real* v, int k ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= k) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= k) return; - dest[i] = c * v[i]; + dest[i] = c * v[i]; } GLOBAL void Cuda_Vector_Add( real* dest, real c, real* v, int k ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= k) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= k) return; - dest[i] += c * v[i]; + dest[i] += c * v[i]; } diff --git a/PuReMD-GPU/src/reset_utils.cu b/PuReMD-GPU/src/reset_utils.cu index 9e5c5075..0c6f852b 100644 --- a/PuReMD-GPU/src/reset_utils.cu +++ b/PuReMD-GPU/src/reset_utils.cu @@ -27,68 +27,68 @@ GLOBAL void Reset_Atoms (reax_atom *atoms, int N) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; - atoms[i].f[0] = 0.0; - atoms[i].f[1] = 0.0; - atoms[i].f[2] = 0.0; + atoms[i].f[0] = 0.0; + atoms[i].f[1] = 0.0; + atoms[i].f[2] = 0.0; } void Cuda_Reset_Atoms (reax_system *system ) { - Reset_Atoms <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + Reset_Atoms <<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->N); + cudaThreadSynchronize (); + cudaCheckError (); } void Reset_Atoms( reax_system* system ) { - 
int i; + int i; - for( i = 0; i < system->N; ++i ) - memset( system->atoms[i].f, 0.0, RVEC_SIZE ); + for( i = 0; i < system->N; ++i ) + memset( system->atoms[i].f, 0.0, RVEC_SIZE ); } void Reset_Pressures( simulation_data *data ) { - rtensor_MakeZero( data->flex_bar.P ); - data->iso_bar.P = 0; - rvec_MakeZero( data->int_press ); - rvec_MakeZero( data->ext_press ); - /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ + rtensor_MakeZero( data->flex_bar.P ); + data->iso_bar.P = 0; + rvec_MakeZero( data->int_press ); + rvec_MakeZero( data->ext_press ); + /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ } void Reset_Simulation_Data( simulation_data* data ) { - data->E_BE = 0; - data->E_Ov = 0; - data->E_Un = 0; - data->E_Lp = 0; - data->E_Ang = 0; - data->E_Pen = 0; - data->E_Coa = 0; - data->E_HB = 0; - data->E_Tor = 0; - data->E_Con = 0; - data->E_vdW = 0; - data->E_Ele = 0; - data->E_Kin = 0; + data->E_BE = 0; + data->E_Ov = 0; + data->E_Un = 0; + data->E_Lp = 0; + data->E_Ang = 0; + data->E_Pen = 0; + data->E_Coa = 0; + data->E_HB = 0; + data->E_Tor = 0; + data->E_Con = 0; + data->E_vdW = 0; + data->E_Ele = 0; + data->E_Kin = 0; } void Cuda_Sync_Simulation_Data (simulation_data *data) { - //copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, - // REAL_SIZE * 12, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - cuda_memset (&((simulation_data *)data->d_simulation_data)->E_BE, 0, REAL_SIZE * 12, RES_SIMULATION_DATA ); + //copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, + // REAL_SIZE * 12, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + cuda_memset (&((simulation_data *)data->d_simulation_data)->E_BE, 0, REAL_SIZE * 12, RES_SIMULATION_DATA ); - //copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, - // 
REAL_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - cuda_memset (&((simulation_data *)data->d_simulation_data)->E_Kin, 0, REAL_SIZE, RES_SIMULATION_DATA ); + //copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, + // REAL_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + cuda_memset (&((simulation_data *)data->d_simulation_data)->E_Kin, 0, REAL_SIZE, RES_SIMULATION_DATA ); } @@ -96,195 +96,195 @@ void Cuda_Sync_Simulation_Data (simulation_data *data) #ifdef TEST_FORCES void Reset_Test_Forces( reax_system *system, static_storage *workspace ) { - memset( workspace->f_ele, 0, system->N * sizeof(rvec) ); - memset( workspace->f_vdw, 0, system->N * sizeof(rvec) ); - memset( workspace->f_bo, 0, system->N * sizeof(rvec) ); - memset( workspace->f_be, 0, system->N * sizeof(rvec) ); - memset( workspace->f_lp, 0, system->N * sizeof(rvec) ); - memset( workspace->f_ov, 0, system->N * sizeof(rvec) ); - memset( workspace->f_un, 0, system->N * sizeof(rvec) ); - memset( workspace->f_ang, 0, system->N * sizeof(rvec) ); - memset( workspace->f_coa, 0, system->N * sizeof(rvec) ); - memset( workspace->f_pen, 0, system->N * sizeof(rvec) ); - memset( workspace->f_hb, 0, system->N * sizeof(rvec) ); - memset( workspace->f_tor, 0, system->N * sizeof(rvec) ); - memset( workspace->f_con, 0, system->N * sizeof(rvec) ); + memset( workspace->f_ele, 0, system->N * sizeof(rvec) ); + memset( workspace->f_vdw, 0, system->N * sizeof(rvec) ); + memset( workspace->f_bo, 0, system->N * sizeof(rvec) ); + memset( workspace->f_be, 0, system->N * sizeof(rvec) ); + memset( workspace->f_lp, 0, system->N * sizeof(rvec) ); + memset( workspace->f_ov, 0, system->N * sizeof(rvec) ); + memset( workspace->f_un, 0, system->N * sizeof(rvec) ); + memset( workspace->f_ang, 0, system->N * sizeof(rvec) ); + memset( workspace->f_coa, 0, system->N * sizeof(rvec) ); + memset( workspace->f_pen, 0, system->N * sizeof(rvec) ); + memset( workspace->f_hb, 0, system->N * sizeof(rvec) ); 
+ memset( workspace->f_tor, 0, system->N * sizeof(rvec) ); + memset( workspace->f_con, 0, system->N * sizeof(rvec) ); } #endif void Reset_Workspace( reax_system *system, static_storage *workspace ) { - memset( workspace->total_bond_order, 0, system->N * sizeof( real ) ); - memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) ); + memset( workspace->total_bond_order, 0, system->N * sizeof( real ) ); + memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) ); - memset( workspace->CdDelta, 0, system->N * sizeof( real ) ); - //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) ); + memset( workspace->CdDelta, 0, system->N * sizeof( real ) ); + //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) ); #ifdef TEST_FORCES - memset( workspace->dDelta, 0, sizeof(rvec) * system->N ); - Reset_Test_Forces( system, workspace ); + memset( workspace->dDelta, 0, sizeof(rvec) * system->N ); + Reset_Test_Forces( system, workspace ); #endif } void Cuda_Reset_Workspace( reax_system *system, static_storage *workspace ) { - cuda_memset( workspace->total_bond_order, 0, system->N * REAL_SIZE, RES_STORAGE_TOTAL_BOND_ORDER ); - cuda_memset( workspace->dDeltap_self, 0, system->N * RVEC_SIZE, RES_STORAGE_DDELTAP_SELF ); - cuda_memset( workspace->CdDelta, 0, system->N * REAL_SIZE, RES_STORAGE_CDDELTA ); + cuda_memset( workspace->total_bond_order, 0, system->N * REAL_SIZE, RES_STORAGE_TOTAL_BOND_ORDER ); + cuda_memset( workspace->dDeltap_self, 0, system->N * RVEC_SIZE, RES_STORAGE_DDELTAP_SELF ); + cuda_memset( workspace->CdDelta, 0, system->N * REAL_SIZE, RES_STORAGE_CDDELTA ); } GLOBAL void Reset_Neighbor_Lists (single_body_parameters *sbp, reax_atom *atoms, - list bonds, list hbonds, control_params *control, - static_storage workspace, int N) + list bonds, list hbonds, control_params *control, + static_storage workspace, int N) { - int tmp; - int index = blockIdx.x * blockDim.x + threadIdx.x; + int tmp; + int index = blockIdx.x * blockDim.x + 
threadIdx.x; - if (index >= N) return; + if (index >= N) return; - tmp = Start_Index (index, &bonds); - Set_End_Index (index, tmp, &bonds); + tmp = Start_Index (index, &bonds); + Set_End_Index (index, tmp, &bonds); - if (control->hb_cut > 0) { - if ((sbp[ atoms[index].type ].p_hbond == 1) || - (sbp[ atoms[index].type ].p_hbond == 2)) { - tmp = Start_Index ( workspace.hbond_index[index], &hbonds ); - Set_End_Index ( workspace.hbond_index[index], tmp, &hbonds ); - } - } + if (control->hb_cut > 0) { + if ((sbp[ atoms[index].type ].p_hbond == 1) || + (sbp[ atoms[index].type ].p_hbond == 2)) { + tmp = Start_Index ( workspace.hbond_index[index], &hbonds ); + Set_End_Index ( workspace.hbond_index[index], tmp, &hbonds ); + } + } } void Cuda_Reset_Neighbor_Lists (reax_system *system, control_params *control, - static_storage *workspace, list **lists ) + static_storage *workspace, list **lists ) { - Reset_Neighbor_Lists <<<BLOCKS, BLOCK_SIZE>>> - ( system->reaxprm.d_sbp, system->d_atoms, *(dev_lists + BONDS), *(dev_lists + HBONDS), - (control_params *)control->d_control, *dev_workspace, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - //reset here - list *bonds = (dev_lists + BONDS ); - //TODO - check if this is needed. - cuda_memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs, LIST_BOND_DATA ); + Reset_Neighbor_Lists <<<BLOCKS, BLOCK_SIZE>>> + ( system->reaxprm.d_sbp, system->d_atoms, *(dev_lists + BONDS), *(dev_lists + HBONDS), + (control_params *)control->d_control, *dev_workspace, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + //reset here + list *bonds = (dev_lists + BONDS ); + //TODO - check if this is needed. 
+ cuda_memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs, LIST_BOND_DATA ); } GLOBAL void Reset_Far_Neighbors_List (list far_nbrs, int N) { - int tmp; - int index = blockIdx.x * blockDim.x + threadIdx.x; + int tmp; + int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= N) return; + if (index >= N) return; - tmp = Start_Index (index, &far_nbrs); - Set_End_Index (index, tmp, &far_nbrs); + tmp = Start_Index (index, &far_nbrs); + Set_End_Index (index, tmp, &far_nbrs); } void Cuda_Reset_Far_Neighbors_List ( reax_system *system ) { - Reset_Far_Neighbors_List <<<BLOCKS, BLOCK_SIZE>>> - (*(dev_lists + FAR_NBRS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); + Reset_Far_Neighbors_List <<<BLOCKS, BLOCK_SIZE>>> + (*(dev_lists + FAR_NBRS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); } void Reset_Neighbor_Lists( reax_system *system, control_params *control, - static_storage *workspace, list **lists ) + static_storage *workspace, list **lists ) { - int i, tmp; - list *bonds = (*lists) + BONDS; - list *hbonds = (*lists) + HBONDS; - - for( i = 0; i < system->N; ++i ) { - tmp = Start_Index( i, bonds ); - Set_End_Index( i, tmp, bonds ); - } - - //TODO check if this is needed - memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs ); - - if( control->hb_cut > 0 ) - for( i = 0; i < system->N; ++i ) - if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) { - tmp = Start_Index( workspace->hbond_index[i], hbonds ); - Set_End_Index( workspace->hbond_index[i], tmp, hbonds ); - /* fprintf( stderr, "i:%d, hbond: %d-%d\n", - i, Start_Index( workspace->hbond_index[i], hbonds ), - End_Index( workspace->hbond_index[i], hbonds ) );*/ - } + int i, tmp; + list *bonds = (*lists) + BONDS; + list *hbonds = (*lists) + HBONDS; + + for( i = 0; i < system->N; ++i ) { + tmp = Start_Index( i, bonds ); + Set_End_Index( i, tmp, bonds ); + } + + //TODO check if this is needed + memset (bonds->select.bond_list, 0, 
BOND_DATA_SIZE * bonds->num_intrs ); + + if( control->hb_cut > 0 ) + for( i = 0; i < system->N; ++i ) + if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) { + tmp = Start_Index( workspace->hbond_index[i], hbonds ); + Set_End_Index( workspace->hbond_index[i], tmp, hbonds ); + /* fprintf( stderr, "i:%d, hbond: %d-%d\n", + i, Start_Index( workspace->hbond_index[i], hbonds ), + End_Index( workspace->hbond_index[i], hbonds ) );*/ + } } void Reset( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists ) + simulation_data *data, static_storage *workspace, list **lists ) { - Reset_Atoms( system ); + Reset_Atoms( system ); - Reset_Simulation_Data( data ); + Reset_Simulation_Data( data ); - if( control->ensemble == NPT || control->ensemble == sNPT || - control->ensemble == iNPT ) - Reset_Pressures( data ); + if( control->ensemble == NPT || control->ensemble == sNPT || + control->ensemble == iNPT ) + Reset_Pressures( data ); - Reset_Workspace( system, workspace ); + Reset_Workspace( system, workspace ); - Reset_Neighbor_Lists( system, control, workspace, lists ); + Reset_Neighbor_Lists( system, control, workspace, lists ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "reset - "); + fprintf( stderr, "reset - "); #endif } void Cuda_Reset_Sparse_Matrix (reax_system *system, static_storage *workspace) { - cuda_memset (workspace->H.j, 0, (system->N + 1) * INT_SIZE, RES_SPARSE_MATRIX_INDEX ); - cuda_memset (workspace->H.val, 0, (system->N * system->max_sparse_matrix_entries) * INT_SIZE, RES_SPARSE_MATRIX_INDEX ); + cuda_memset (workspace->H.j, 0, (system->N + 1) * INT_SIZE, RES_SPARSE_MATRIX_INDEX ); + cuda_memset (workspace->H.val, 0, (system->N * system->max_sparse_matrix_entries) * INT_SIZE, RES_SPARSE_MATRIX_INDEX ); } void Cuda_Reset( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists ) + simulation_data *data, static_storage *workspace, list **lists ) 
{ - Cuda_Reset_Atoms( system ); + Cuda_Reset_Atoms( system ); - //Reset_Simulation_Data( data ); - Cuda_Sync_Simulation_Data ( data ); - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); + //Reset_Simulation_Data( data ); + Cuda_Sync_Simulation_Data ( data ); + //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); - if( control->ensemble == NPT || control->ensemble == sNPT || - control->ensemble == iNPT ) - Reset_Pressures( data ); + if( control->ensemble == NPT || control->ensemble == sNPT || + control->ensemble == iNPT ) + Reset_Pressures( data ); - Cuda_Reset_Workspace( system, dev_workspace ); + Cuda_Reset_Workspace( system, dev_workspace ); - Cuda_Reset_Neighbor_Lists( system, control, workspace, lists ); + Cuda_Reset_Neighbor_Lists( system, control, workspace, lists ); - Cuda_Reset_Far_Neighbors_List (system); + Cuda_Reset_Far_Neighbors_List (system); - Cuda_Reset_Sparse_Matrix (system, dev_workspace); + Cuda_Reset_Sparse_Matrix (system, dev_workspace); } void Reset_Grid( grid *g ) { - memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]); + memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]); } void Cuda_Reset_Grid (grid *g) { - cuda_memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2], RES_GRID_TOP); + cuda_memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2], RES_GRID_TOP); } void Reset_Marks( grid *g, ivec *grid_stack, int grid_top ) { - int i; + int i; - for( i = 0; i < grid_top; ++i ) - g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + - grid_stack[i][1] * g->ncell[2] + - grid_stack[i][2]] = 0; + for( i = 0; i < grid_top; ++i ) + g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + + grid_stack[i][1] * g->ncell[2] + + grid_stack[i][2]] = 0; } diff --git a/PuReMD-GPU/src/single_body_interactions.cu b/PuReMD-GPU/src/single_body_interactions.cu index 2c3fd44f..3c6c0882 100644 --- 
a/PuReMD-GPU/src/single_body_interactions.cu +++ b/PuReMD-GPU/src/single_body_interactions.cu @@ -29,289 +29,289 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system, - control_params *control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) + control_params *control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) { - int i, j, pj, type_i, type_j; - real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; - real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; - real exp_ovun2n, exp_ovun6, exp_ovun8; - real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; - real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; - real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond; - bond_order_data *bo_ij; - list *bonds = (*lists) + BONDS; - - /* Initialize parameters */ - p_lp1 = system->reaxprm.gp.l[15]; - p_lp3 = system->reaxprm.gp.l[5]; - p_ovun3 = system->reaxprm.gp.l[32]; - p_ovun4 = system->reaxprm.gp.l[31]; - p_ovun6 = system->reaxprm.gp.l[6]; - p_ovun7 = system->reaxprm.gp.l[8]; - p_ovun8 = system->reaxprm.gp.l[9]; - - for( i = 0; i < system->N; ++i ) { - /* set the parameter pointer */ - type_i = system->atoms[i].type; - sbp_i = &(system->reaxprm.sbp[ type_i ]); - - /* lone-pair Energy */ - p_lp2 = sbp_i->p_lp2; - expvd2 = EXP( -75 * workspace->Delta_lp[i] ); - inv_expvd2 = 1. / (1. 
+ expvd2 ); - - /* calculate the energy */ - data->E_Lp += e_lp = - p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - - dElp = p_lp2 * inv_expvd2 + - 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); - CElp = dElp * workspace->dDelta_lp[i]; - - workspace->CdDelta[i] += CElp; // lp - 1st term + int i, j, pj, type_i, type_j; + real Delta_lpcorr, dfvl; + real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; + real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; + real e_ov, CEover1, CEover2, CEover3, CEover4; + real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; + real exp_ovun2n, exp_ovun6, exp_ovun8; + real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; + real e_un, CEunder1, CEunder2, CEunder3, CEunder4; + real p_lp1, p_lp2, p_lp3; + real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; + + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond; + bond_order_data *bo_ij; + list *bonds = (*lists) + BONDS; + + /* Initialize parameters */ + p_lp1 = system->reaxprm.gp.l[15]; + p_lp3 = system->reaxprm.gp.l[5]; + p_ovun3 = system->reaxprm.gp.l[32]; + p_ovun4 = system->reaxprm.gp.l[31]; + p_ovun6 = system->reaxprm.gp.l[6]; + p_ovun7 = system->reaxprm.gp.l[8]; + p_ovun8 = system->reaxprm.gp.l[9]; + + for( i = 0; i < system->N; ++i ) { + /* set the parameter pointer */ + type_i = system->atoms[i].type; + sbp_i = &(system->reaxprm.sbp[ type_i ]); + + /* lone-pair Energy */ + p_lp2 = sbp_i->p_lp2; + expvd2 = EXP( -75 * workspace->Delta_lp[i] ); + inv_expvd2 = 1. / (1. 
+ expvd2 ); + + /* calculate the energy */ + data->E_Lp += e_lp = + p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + + dElp = p_lp2 * inv_expvd2 + + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); + CElp = dElp * workspace->dDelta_lp[i]; + + workspace->CdDelta[i] += CElp; // lp - 1st term #ifdef TEST_ENERGY - fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", - p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); - fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n", - workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp ); + fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", + p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); + fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n", + workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp ); #endif #ifdef TEST_FORCES - Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term + Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term #endif - /* correction for C2 */ - if( system->reaxprm.gp.l[5] > 0.001 && - !strcmp( system->reaxprm.sbp[type_i].name, "C" ) ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - j = bonds->select.bond_list[pj].nbr; - type_j = system->atoms[j].type; - - if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) { - twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); - - if( vov3 > 3. ) { - data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0); - //estrain(i) += e_lph; - - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
- 0.16*POW(Di, 3.)); - - bo_ij->Cdbo += deahu2dbo; - workspace->CdDelta[i] += deahu2dsbo; + /* correction for C2 */ + if( system->reaxprm.gp.l[5] > 0.001 && + !strcmp( system->reaxprm.sbp[type_i].name, "C" ) ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + j = bonds->select.bond_list[pj].nbr; + type_j = system->atoms[j].type; + + if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) { + twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + Di = workspace->Delta[i]; + vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + + if( vov3 > 3. ) { + data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0); + //estrain(i) += e_lph; + + deahu2dbo = 2.*p_lp3*(vov3 - 3.); + deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); + + bo_ij->Cdbo += deahu2dbo; + workspace->CdDelta[i] += deahu2dsbo; #ifdef TEST_ENERGY - fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", - // workspace->orig_id[i], workspace->orig_id[j], - i+1, j+1, e_lph, deahu2dbo, deahu2dsbo ); + fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", + // workspace->orig_id[i], workspace->orig_id[j], + i+1, j+1, e_lph, deahu2dbo, deahu2dsbo ); #endif #ifdef TEST_FORCES - Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); - Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); + Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); + Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); #endif - } - } + } + } - } - } + } + } - for( i = 0; i < system->N; ++i ) { - type_i = system->atoms[i].type; - sbp_i = &(system->reaxprm.sbp[ type_i ]); + for( i = 0; i < system->N; ++i ) { + type_i = system->atoms[i].type; + sbp_i = &(system->reaxprm.sbp[ type_i ]); - /* over-coordination energy */ - if( sbp_i->mass > 21.0 ) - dfvl = 0.0; - else dfvl = 1.0; // only for 1st-row elements + /* over-coordination energy */ + if( sbp_i->mass > 21.0 ) + dfvl = 0.0; 
+ else dfvl = 1.0; // only for 1st-row elements - p_ovun2 = sbp_i->p_ovun2; - sum_ovun1 = 0; - sum_ovun2 = 0; + p_ovun2 = sbp_i->p_ovun2; + sum_ovun1 = 0; + sum_ovun2 = 0; - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = system->atoms[j].type; - bo_ij = &(bonds->select.bond_list[pj].bo_data); - sbp_j = &(system->reaxprm.sbp[ type_j ]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = system->atoms[j].type; + bo_ij = &(bonds->select.bond_list[pj].bo_data); + sbp_j = &(system->reaxprm.sbp[ type_j ]); + twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; - sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; + sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* + ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", - i+1, j+1, - dfvl * workspace->Delta_lp_temp[j], - sbp_j->nlp_opt, - workspace->nlp_temp[j] );*/ - } + /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", + i+1, j+1, + dfvl * workspace->Delta_lp_temp[j], + sbp_j->nlp_opt, + workspace->nlp_temp[j] );*/ + } - exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); - inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); - Delta_lpcorr = workspace->Delta[i] - - (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; + exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); + inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); + Delta_lpcorr = workspace->Delta[i] - + (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; - exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); - inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); + exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); + inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); - DlpVi 
= 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); - CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; + DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); + CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - data->E_Ov += e_ov = sum_ovun1 * CEover1; + data->E_Ov += e_ov = sum_ovun1 * CEover1; - CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * - ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); + CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * + ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); - CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); + CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); - CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); + CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); - /* under-coordination potential */ - p_ovun2 = sbp_i->p_ovun2; - p_ovun5 = sbp_i->p_ovun5; + /* under-coordination potential */ + p_ovun2 = sbp_i->p_ovun2; + p_ovun5 = sbp_i->p_ovun5; - exp_ovun2n = 1.0 / exp_ovun2; - exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); - exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); - inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); - inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); + exp_ovun2n = 1.0 / exp_ovun2; + exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); + exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); + inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); + inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); - data->E_Un += e_un = - -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + data->E_Un += e_un = + -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + - p_ovun2 * e_un * exp_ovun2n); - CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; - CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); - CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * 
exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; + CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + + p_ovun2 * e_un * exp_ovun2n); + CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; + CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); + CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; - //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n", - // i+1, sum_ovun2, e_ov, e_un ); + //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n", + // i+1, sum_ovun2, e_ov, e_un ); - /* forces */ - workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term - workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term + /* forces */ + workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term + workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term #ifdef TEST_FORCES - Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor - 2nd - Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st + Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor - 2nd + Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st #endif - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - j = pbond->nbr; - type_j = system->atoms[j].type; - bo_ij = &(pbond->bo_data); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + pbond = &(bonds->select.bond_list[pj]); + j = pbond->nbr; + type_j = system->atoms[j].type; + bo_ij = &(pbond->bo_data); + twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st - workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* - (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a - bo_ij->Cdbopi += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - 
bo_ij->Cdbopi2 += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st + workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* + (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a + bo_ij->Cdbopi += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbopi2 += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a - bo_ij->Cdbopi += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b - bo_ij->Cdbopi2 += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a + bo_ij->Cdbopi += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + bo_ij->Cdbopi2 += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b #ifdef TEST_ENERGY - /* fprintf( out_control->eov, "%6d%23.15e%23.15e" - workspace->orig_id[j]+1, - //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2, - CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */ - - /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[j]+1, - CEover4, - CEover4* - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), - (1.0 - dfvl*workspace->dDelta_lp[j]), - CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ - - /* fprintf( out_control->eun, "%6d%23.15e\n", - workspace->orig_id[j]+1, CEunder3 ); */ - - /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[j]+1, - CEunder4, - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - CEunder4* - 
(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])* - (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ + /* fprintf( out_control->eov, "%6d%23.15e%23.15e" + workspace->orig_id[j]+1, + //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2, + CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */ + + /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[j]+1, + CEover4, + CEover4* + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), + (1.0 - dfvl*workspace->dDelta_lp[j]), + CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ + + /* fprintf( out_control->eun, "%6d%23.15e\n", + workspace->orig_id[j]+1, CEunder3 ); */ + + /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[j]+1, + CEunder4, + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + CEunder4* + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])* + (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ #endif #ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, - workspace->f_ov ); // OvCoor - 1st term - - Add_dDelta( system, lists, j, - CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a - - Add_dBOpinpi2( system, lists, i, pj, - CEover4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - CEover4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - workspace->f_ov, workspace->f_ov ); // OvCoor - 3b - - Add_dDelta( system, lists, j, - CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2), - workspace->f_un ); // UnCoor - 2a - - Add_dBOpinpi2( system, lists, i, pj, - CEunder4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - CEunder4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - 
workspace->f_un, workspace->f_un ); // UnCoor - 2b + Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, + workspace->f_ov ); // OvCoor - 1st term + + Add_dDelta( system, lists, j, + CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a + + Add_dBOpinpi2( system, lists, i, pj, + CEover4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + CEover4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + workspace->f_ov, workspace->f_ov ); // OvCoor - 3b + + Add_dDelta( system, lists, j, + CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2), + workspace->f_un ); // UnCoor - 2a + + Add_dBOpinpi2( system, lists, i, pj, + CEunder4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + CEunder4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + workspace->f_un, workspace->f_un ); // UnCoor - 2b #endif - } + } #ifdef TEST_ENERGY - fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", - i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); + fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", + i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); - fprintf( out_control->eov, "%6d%15.8f%15.8f\n", - i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un ); + fprintf( out_control->eov, "%6d%15.8f%15.8f\n", + i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un ); - fprintf( out_control->eov, "%6d%15.8f%15.8f\n", - i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un ); + fprintf( out_control->eov, "%6d%15.8f%15.8f\n", + i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un ); #endif - } + } } @@ -324,324 +324,324 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system, //CUDA Functions GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, - single_body_parameters *sbp, two_body_parameters *tbp, - static_storage p_workspace, simulation_data *data, - 
list p_bonds, int N, int num_atom_types ) + single_body_parameters *sbp, two_body_parameters *tbp, + static_storage p_workspace, simulation_data *data, + list p_bonds, int N, int num_atom_types ) { - int i, j, pj, type_i, type_j; - real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; - real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; - real exp_ovun2n, exp_ovun6, exp_ovun8; - real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; - real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; - real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond; - bond_order_data *bo_ij; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - //if (i >= N) return; - - /* Initialize parameters */ - p_lp1 = g_params.l[15]; - p_lp3 = g_params.l[5]; - p_ovun3 = g_params.l[32]; - p_ovun4 = g_params.l[31]; - p_ovun6 = g_params.l[6]; - p_ovun7 = g_params.l[8]; - p_ovun8 = g_params.l[9]; - - //for( i = 0; i < system->N; ++i ) { - if (i < N) { - // set the parameter pointer - type_i = atoms[i].type; - sbp_i = &(sbp[ type_i ]); - - // lone-pair Energy - p_lp2 = sbp_i->p_lp2; - expvd2 = EXP( -75 * workspace->Delta_lp[i] ); - inv_expvd2 = 1. / (1. 
+ expvd2 ); - - // calculate the energy - e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - - //PERFORMANCE IMPACT - atomicAdd (&data->E_Lp, e_lp); - - dElp = p_lp2 * inv_expvd2 + - 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); - CElp = dElp * workspace->dDelta_lp[i]; - - //PERFORMANCE IMPACT - //workspace->CdDelta[i] += CElp; // lp - 1st term - atomicAdd (&workspace->CdDelta[i], CElp); + int i, j, pj, type_i, type_j; + real Delta_lpcorr, dfvl; + real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; + real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; + real e_ov, CEover1, CEover2, CEover3, CEover4; + real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; + real exp_ovun2n, exp_ovun6, exp_ovun8; + real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; + real e_un, CEunder1, CEunder2, CEunder3, CEunder4; + real p_lp1, p_lp2, p_lp3; + real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; + + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond; + bond_order_data *bo_ij; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + //if (i >= N) return; + + /* Initialize parameters */ + p_lp1 = g_params.l[15]; + p_lp3 = g_params.l[5]; + p_ovun3 = g_params.l[32]; + p_ovun4 = g_params.l[31]; + p_ovun6 = g_params.l[6]; + p_ovun7 = g_params.l[8]; + p_ovun8 = g_params.l[9]; + + //for( i = 0; i < system->N; ++i ) { + if (i < N) { + // set the parameter pointer + type_i = atoms[i].type; + sbp_i = &(sbp[ type_i ]); + + // lone-pair Energy + p_lp2 = sbp_i->p_lp2; + expvd2 = EXP( -75 * workspace->Delta_lp[i] ); + inv_expvd2 = 1. / (1. 
+ expvd2 ); + + // calculate the energy + e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + + //PERFORMANCE IMPACT + atomicAdd (&data->E_Lp, e_lp); + + dElp = p_lp2 * inv_expvd2 + + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); + CElp = dElp * workspace->dDelta_lp[i]; + + //PERFORMANCE IMPACT + //workspace->CdDelta[i] += CElp; // lp - 1st term + atomicAdd (&workspace->CdDelta[i], CElp); #ifdef TEST_ENERGY - //TODO - //fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", - // p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); - //fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n", - // workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp ); + //TODO + //fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", + // p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); + //fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n", + // workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp ); #endif #ifdef TEST_FORCES - //TODO - //Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term - //TODO + //TODO + //Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term + //TODO #endif - // correction for C2 - if( g_params.l[5] > 0.001 && - !cuda_strcmp( sbp[type_i].name, "C" , 15) ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - j = bonds->select.bond_list[pj].nbr; - type_j = atoms[j].type; + // correction for C2 + if( g_params.l[5] > 0.001 && + !cuda_strcmp( sbp[type_i].name, "C" , 15) ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + j = bonds->select.bond_list[pj].nbr; + type_j = atoms[j].type; - if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + if( !cuda_strcmp( 
sbp[type_j].name, "C", 15 ) ) { + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + Di = workspace->Delta[i]; + vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); - if( vov3 > 3. ) { + if( vov3 > 3. ) { - //PERFORMANCE IMPACT - e_lph = p_lp3 * SQR(vov3-3.0); - atomicAdd (&data->E_Lp, e_lph ); - //estrain(i) += e_lph; + //PERFORMANCE IMPACT + e_lph = p_lp3 * SQR(vov3-3.0); + atomicAdd (&data->E_Lp, e_lph ); + //estrain(i) += e_lph; - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); + deahu2dbo = 2.*p_lp3*(vov3 - 3.); + deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); - bo_ij->Cdbo += deahu2dbo; + bo_ij->Cdbo += deahu2dbo; - //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[i], deahu2dsbo); + //PERFORMANCE IMPACT + atomicAdd (&workspace->CdDelta[i], deahu2dsbo); #ifdef TEST_ENERGY - //TODO - //fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", - // workspace->orig_id[i], workspace->orig_id[j], - // i+1, j+1, e_lph, deahu2dbo, deahu2dsbo ); + //TODO + //fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", + // workspace->orig_id[i], workspace->orig_id[j], + // i+1, j+1, e_lph, deahu2dbo, deahu2dsbo ); #endif #ifdef TEST_FORCES - //TODO - //Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); - //Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); + //TODO + //Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); + //Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); #endif - } - } + } + } - } - } // end of if statement for the all the threads + } + } // end of if statement for the all the threads - __syncthreads (); + __syncthreads (); - //TODO - if (i >= N) return; - //TODO + //TODO + if (i >= N) return; + //TODO - //for( i = 0; i < system->N; ++i ) { - type_i = atoms[i].type; - sbp_i = &(sbp[ type_i ]); + //for( i = 0; i < system->N; ++i ) { + type_i = atoms[i].type; + sbp_i = &(sbp[ type_i ]); - // 
over-coordination energy - if( sbp_i->mass > 21.0 ) - dfvl = 0.0; - else dfvl = 1.0; // only for 1st-row elements + // over-coordination energy + if( sbp_i->mass > 21.0 ) + dfvl = 0.0; + else dfvl = 1.0; // only for 1st-row elements - p_ovun2 = sbp_i->p_ovun2; - sum_ovun1 = 0; - sum_ovun2 = 0; + p_ovun2 = sbp_i->p_ovun2; + sum_ovun1 = 0; + sum_ovun2 = 0; - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = atoms[j].type; - bo_ij = &(bonds->select.bond_list[pj].bo_data); - sbp_j = &(sbp[ type_j ]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = atoms[j].type; + bo_ij = &(bonds->select.bond_list[pj].bo_data); + sbp_j = &(sbp[ type_j ]); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; - sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; + sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* + ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - //fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", - //i+1, j+1, - //dfvl * workspace->Delta_lp_temp[j], - //sbp_j->nlp_opt, - //workspace->nlp_temp[j] ); - } + //fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", + //i+1, j+1, + //dfvl * workspace->Delta_lp_temp[j], + //sbp_j->nlp_opt, + //workspace->nlp_temp[j] ); + } - //__syncthreads (); + //__syncthreads (); - exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); - inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); - Delta_lpcorr = workspace->Delta[i] - - (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; + exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); + inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); + Delta_lpcorr = workspace->Delta[i] - + (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; - exp_ovun2 = EXP( p_ovun2 
* Delta_lpcorr ); - inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); + exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); + inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); - DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); - CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; + DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); + CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - //PERFORMANCE IMPACT - //data->E_Ov += e_ov = sum_ovun1 * CEover1; - e_ov = sum_ovun1 * CEover1; - atomicAdd (&data->E_Ov, e_ov ); + //PERFORMANCE IMPACT + //data->E_Ov += e_ov = sum_ovun1 * CEover1; + e_ov = sum_ovun1 * CEover1; + atomicAdd (&data->E_Ov, e_ov ); - CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * - ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); + CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * + ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); - CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); + CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); - CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); + CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); - // under-coordination potential - p_ovun2 = sbp_i->p_ovun2; - p_ovun5 = sbp_i->p_ovun5; + // under-coordination potential + p_ovun2 = sbp_i->p_ovun2; + p_ovun5 = sbp_i->p_ovun5; - exp_ovun2n = 1.0 / exp_ovun2; - exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); - exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); - inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); - inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); + exp_ovun2n = 1.0 / exp_ovun2; + exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); + exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); + inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); + inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); - //PERFORMANCE IMPACT - e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - atomicAdd (&data->E_Un, e_un ); + //PERFORMANCE IMPACT + e_un = 
-p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + atomicAdd (&data->E_Un, e_un ); - CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + - p_ovun2 * e_un * exp_ovun2n); - CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; - CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); - CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; + CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + + p_ovun2 * e_un * exp_ovun2n); + CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; + CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); + CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; - //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n", - // i+1, sum_ovun2, e_ov, e_un ); + //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n", + // i+1, sum_ovun2, e_ov, e_un ); - // forces - //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[i] , CEover3); // OvCoor - 2nd term - atomicAdd (&workspace->CdDelta[i], CEunder3); // UnCoor - 1st term + // forces + //PERFORMANCE IMPACT + atomicAdd (&workspace->CdDelta[i] , CEover3); // OvCoor - 2nd term + atomicAdd (&workspace->CdDelta[i], CEunder3); // UnCoor - 1st term #ifdef TEST_FORCES - //TODO - //Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor - 2nd - //Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st + //TODO + //Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor - 2nd + //Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st #endif - //__syncthreads (); + //__syncthreads (); - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - j = pbond->nbr; - type_j = atoms[j].type; - bo_ij = &(pbond->bo_data); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + for( pj = 
Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + pbond = &(bonds->select.bond_list[pj]); + j = pbond->nbr; + type_j = atoms[j].type; + bo_ij = &(pbond->bo_data); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st + bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st - //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a + //PERFORMANCE IMPACT + atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a - bo_ij->Cdbopi += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - bo_ij->Cdbopi2 += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbopi += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbopi2 += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ); // UnCoor - 2a + //PERFORMANCE IMPACT + atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ); // UnCoor - 2a - bo_ij->Cdbopi += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b - bo_ij->Cdbopi2 += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + bo_ij->Cdbopi += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + bo_ij->Cdbopi2 += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b #ifdef TEST_ENERGY - // fprintf( out_control->eov, "%6d%23.15e%23.15e" - // workspace->orig_id[j]+1, - //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2, - // 
CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); - - // fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - // workspace->orig_id[j]+1, - // CEover4, - // CEover4* - // (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - // CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), - // (1.0 - dfvl*workspace->dDelta_lp[j]), - // CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - // (bo_ij->BO_pi + bo_ij->BO_pi2) ); - - // fprintf( out_control->eun, "%6d%23.15e\n", - // workspace->orig_id[j]+1, CEunder3 ); - - // fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n", - // workspace->orig_id[j]+1, - // CEunder4, - // (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - // CEunder4* - // (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - // CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])* - // (bo_ij->BO_pi + bo_ij->BO_pi2) ); + // fprintf( out_control->eov, "%6d%23.15e%23.15e" + // workspace->orig_id[j]+1, + //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2, + // CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); + + // fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + // workspace->orig_id[j]+1, + // CEover4, + // CEover4* + // (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + // CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), + // (1.0 - dfvl*workspace->dDelta_lp[j]), + // CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + // (bo_ij->BO_pi + bo_ij->BO_pi2) ); + + // fprintf( out_control->eun, "%6d%23.15e\n", + // workspace->orig_id[j]+1, CEunder3 ); + + // fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n", + // workspace->orig_id[j]+1, + // CEunder4, + // (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + // CEunder4* + // (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + // CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])* + // (bo_ij->BO_pi + bo_ij->BO_pi2) ); #endif #ifdef TEST_FORCES - //TODO - // Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, - // 
workspace->f_ov ); // OvCoor - 1st term - - // Add_dDelta( system, lists, j, - // CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - // (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a - - // Add_dBOpinpi2( system, lists, i, pj, - // CEover4 * (workspace->Delta[j] - - // dfvl * workspace->Delta_lp_temp[j]), - // CEover4 * (workspace->Delta[j] - - // dfvl * workspace->Delta_lp_temp[j]), - // workspace->f_ov, workspace->f_ov ); // OvCoor - 3b - - // Add_dDelta( system, lists, j, - // CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - // (bo_ij->BO_pi + bo_ij->BO_pi2), - // workspace->f_un ); // UnCoor - 2a - - // Add_dBOpinpi2( system, lists, i, pj, - // CEunder4 * (workspace->Delta[j] - - // dfvl * workspace->Delta_lp_temp[j]), - // CEunder4 * (workspace->Delta[j] - - // dfvl * workspace->Delta_lp_temp[j]), - // workspace->f_un, workspace->f_un ); // UnCoor - 2b + //TODO + // Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, + // workspace->f_ov ); // OvCoor - 1st term + + // Add_dDelta( system, lists, j, + // CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + // (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a + + // Add_dBOpinpi2( system, lists, i, pj, + // CEover4 * (workspace->Delta[j] - + // dfvl * workspace->Delta_lp_temp[j]), + // CEover4 * (workspace->Delta[j] - + // dfvl * workspace->Delta_lp_temp[j]), + // workspace->f_ov, workspace->f_ov ); // OvCoor - 3b + + // Add_dDelta( system, lists, j, + // CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + // (bo_ij->BO_pi + bo_ij->BO_pi2), + // workspace->f_un ); // UnCoor - 2a + + // Add_dBOpinpi2( system, lists, i, pj, + // CEunder4 * (workspace->Delta[j] - + // dfvl * workspace->Delta_lp_temp[j]), + // CEunder4 * (workspace->Delta[j] - + // dfvl * workspace->Delta_lp_temp[j]), + // workspace->f_un, workspace->f_un ); // UnCoor - 2b #endif - } + } #ifdef TEST_ENERGY - //TODO - //replace the code here... 
you deleted for compiling - //TODO + //TODO + //replace the code here... you deleted for compiling + //TODO #endif - //} .. end of for loop + //} .. end of for loop } @@ -656,325 +656,325 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //CUDA Functions GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, - single_body_parameters *sbp, two_body_parameters *tbp, - static_storage p_workspace, simulation_data *data, - list p_bonds, int N, int num_atom_types, - real *E_Lp, real *E_Ov, real *E_Un) + single_body_parameters *sbp, two_body_parameters *tbp, + static_storage p_workspace, simulation_data *data, + list p_bonds, int N, int num_atom_types, + real *E_Lp, real *E_Ov, real *E_Un) { - int i, j, pj, type_i, type_j; - real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; - real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; - real exp_ovun2n, exp_ovun6, exp_ovun8; - real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; - real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; - real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond; - bond_order_data *bo_ij; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - /* Initialize parameters */ - p_lp1 = g_params.l[15]; - p_lp3 = g_params.l[5]; - p_ovun3 = g_params.l[32]; - p_ovun4 = g_params.l[31]; - p_ovun6 = g_params.l[6]; - p_ovun7 = g_params.l[8]; - p_ovun8 = g_params.l[9]; - - /* - if (i < N) { - // set the parameter pointer - type_i = atoms[i].type; - sbp_i = &(sbp[ type_i ]); - - // lone-pair Energy - p_lp2 = sbp_i->p_lp2; - expvd2 = EXP( -75 * workspace->Delta_lp[i] ); - inv_expvd2 = 1. / (1. 
+ expvd2 ); - - // calculate the energy - e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - //atomicAdd (&data->E_Lp, e_lp ); - E_Lp [ i ] = e_lp; - - dElp = p_lp2 * inv_expvd2 + - 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); - CElp = dElp * workspace->dDelta_lp[i]; - - workspace->CdDelta[i] += CElp; // lp - 1st term - - // correction for C2 - if( g_params.l[5] > 0.001 && - !cuda_strcmp( sbp[type_i].name, "C" , 15) ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - j = bonds->select.bond_list[pj].nbr; - type_j = atoms[j].type; - - if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); - - if( vov3 > 3. ) { - - e_lph = p_lp3 * SQR(vov3-3.0); - E_Lp [i] += e_lph; - //atomicAdd (&data->E_Lp, e_lph ); - //estrain(i) += e_lph; - - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
- 0.16*POW(Di, 3.)); - - bo_ij->Cdbo += deahu2dbo; - - workspace->CdDelta[i] += deahu2dsbo; - } - } - } - } // end of if statement for the all the threads - - __syncthreads (); - - if (i >= N) return; - - */ - - type_i = atoms[i].type; - sbp_i = &(sbp[ type_i ]); - - // over-coordination energy - if( sbp_i->mass > 21.0 ) - dfvl = 0.0; - else dfvl = 1.0; // only for 1st-row elements - - p_ovun2 = sbp_i->p_ovun2; - sum_ovun1 = 0; - sum_ovun2 = 0; - - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = atoms[j].type; - bo_ij = &(bonds->select.bond_list[pj].bo_data); - sbp_j = &(sbp[ type_j ]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - - sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; - sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - } - - - exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); - inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); - Delta_lpcorr = workspace->Delta[i] - - (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; + int i, j, pj, type_i, type_j; + real Delta_lpcorr, dfvl; + real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; + real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; + real e_ov, CEover1, CEover2, CEover3, CEover4; + real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; + real exp_ovun2n, exp_ovun6, exp_ovun8; + real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; + real e_un, CEunder1, CEunder2, CEunder3, CEunder4; + real p_lp1, p_lp2, p_lp3; + real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; + + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond; + bond_order_data *bo_ij; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + /* Initialize parameters */ + p_lp1 = g_params.l[15]; + p_lp3 = g_params.l[5]; + p_ovun3 = g_params.l[32]; + p_ovun4 = g_params.l[31]; 
+ p_ovun6 = g_params.l[6]; + p_ovun7 = g_params.l[8]; + p_ovun8 = g_params.l[9]; + + /* + if (i < N) { + // set the parameter pointer + type_i = atoms[i].type; + sbp_i = &(sbp[ type_i ]); + + // lone-pair Energy + p_lp2 = sbp_i->p_lp2; + expvd2 = EXP( -75 * workspace->Delta_lp[i] ); + inv_expvd2 = 1. / (1. + expvd2 ); + + // calculate the energy + e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + //atomicAdd (&data->E_Lp, e_lp ); + E_Lp [ i ] = e_lp; + + dElp = p_lp2 * inv_expvd2 + + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); + CElp = dElp * workspace->dDelta_lp[i]; + + workspace->CdDelta[i] += CElp; // lp - 1st term + + // correction for C2 + if( g_params.l[5] > 0.001 && + !cuda_strcmp( sbp[type_i].name, "C" , 15) ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + j = bonds->select.bond_list[pj].nbr; + type_j = atoms[j].type; + + if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + Di = workspace->Delta[i]; + vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + + if( vov3 > 3. ) { + + e_lph = p_lp3 * SQR(vov3-3.0); + E_Lp [i] += e_lph; + //atomicAdd (&data->E_Lp, e_lph ); + //estrain(i) += e_lph; + + deahu2dbo = 2.*p_lp3*(vov3 - 3.); + deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
- 0.16*POW(Di, 3.)); + + bo_ij->Cdbo += deahu2dbo; + + workspace->CdDelta[i] += deahu2dsbo; + } + } + } + } // end of if statement for the all the threads + + __syncthreads (); + + if (i >= N) return; + + */ + + type_i = atoms[i].type; + sbp_i = &(sbp[ type_i ]); + + // over-coordination energy + if( sbp_i->mass > 21.0 ) + dfvl = 0.0; + else dfvl = 1.0; // only for 1st-row elements + + p_ovun2 = sbp_i->p_ovun2; + sum_ovun1 = 0; + sum_ovun2 = 0; + + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = atoms[j].type; + bo_ij = &(bonds->select.bond_list[pj].bo_data); + sbp_j = &(sbp[ type_j ]); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + + sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; + sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* + ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + } + + + exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); + inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); + Delta_lpcorr = workspace->Delta[i] - + (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; - exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); - inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); - - DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); - CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; + exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); + inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); + + DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); + CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - e_ov = sum_ovun1 * CEover1; - E_Ov [ i ] = e_ov; - //atomicAdd ( &data->E_Ov, e_ov ); + e_ov = sum_ovun1 * CEover1; + E_Ov [ i ] = e_ov; + //atomicAdd ( &data->E_Ov, e_ov ); - CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * - ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); + CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * + ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); - CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); + CEover3 = CEover2 * 
(1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); - CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); + CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); - // under-coordination potential - p_ovun2 = sbp_i->p_ovun2; - p_ovun5 = sbp_i->p_ovun5; + // under-coordination potential + p_ovun2 = sbp_i->p_ovun2; + p_ovun5 = sbp_i->p_ovun5; - exp_ovun2n = 1.0 / exp_ovun2; - exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); - exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); - inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); - inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); + exp_ovun2n = 1.0 / exp_ovun2; + exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); + exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); + inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); + inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); - e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - E_Un [i] = e_un; - //atomicAdd ( &data->E_Un, e_un ); + e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + E_Un [i] = e_un; + //atomicAdd ( &data->E_Un, e_un ); - CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + - p_ovun2 * e_un * exp_ovun2n); - CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; - CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); - CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; + CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + + p_ovun2 * e_un * exp_ovun2n); + CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; + CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); + CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; - // forces - workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term - workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term + // forces + workspace->CdDelta[i] 
+= CEover3; // OvCoor - 2nd term + workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - j = pbond->nbr; - type_j = atoms[j].type; - bo_ij = &(pbond->bo_data); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + pbond = &(bonds->select.bond_list[pj]); + j = pbond->nbr; + type_j = atoms[j].type; + bo_ij = &(pbond->bo_data); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st + bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st - //workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a - pbond->scratch += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a + //workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a + pbond->scratch += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a - bo_ij->Cdbopi += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - bo_ij->Cdbopi2 += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbopi += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbopi2 += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - //workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ; // UnCoor - 2a - pbond->scratch += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ; // UnCoor - 2a + //workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ; // UnCoor - 2a + pbond->scratch += 
CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ; // UnCoor - 2a - bo_ij->Cdbopi += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b - bo_ij->Cdbopi2 += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + bo_ij->Cdbopi += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + bo_ij->Cdbopi2 += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b - } + } } /////////////////////////////////////////////////////////// GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, global_parameters g_params, - single_body_parameters *sbp, two_body_parameters *tbp, - static_storage p_workspace, simulation_data *data, - list p_bonds, int N, int num_atom_types, - real *E_Lp, real *E_Ov, real *E_Un) + single_body_parameters *sbp, two_body_parameters *tbp, + static_storage p_workspace, simulation_data *data, + list p_bonds, int N, int num_atom_types, + real *E_Lp, real *E_Ov, real *E_Un) { - int i, j, pj, type_i, type_j; - real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; - real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; - real exp_ovun2n, exp_ovun6, exp_ovun8; - real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; - real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; - real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond; - bond_order_data *bo_ij; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - /* Initialize parameters */ - p_lp1 = g_params.l[15]; - p_lp3 = g_params.l[5]; - p_ovun3 = g_params.l[32]; - p_ovun4 = g_params.l[31]; - p_ovun6 = 
g_params.l[6]; - p_ovun7 = g_params.l[8]; - p_ovun8 = g_params.l[9]; - - // set the parameter pointer - type_i = atoms[i].type; - sbp_i = &(sbp[ type_i ]); - - // lone-pair Energy - p_lp2 = sbp_i->p_lp2; - expvd2 = EXP( -75 * workspace->Delta_lp[i] ); - inv_expvd2 = 1. / (1. + expvd2 ); - - // calculate the energy - e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - //atomicAdd (&data->E_Lp, e_lp ); - E_Lp [ i ] = e_lp; - - dElp = p_lp2 * inv_expvd2 + - 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); - CElp = dElp * workspace->dDelta_lp[i]; - - workspace->CdDelta[i] += CElp; // lp - 1st term - - // correction for C2 - if( g_params.l[5] > 0.001 && - !cuda_strcmp( sbp[type_i].name, "C" , 15) ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - j = bonds->select.bond_list[pj].nbr; - type_j = atoms[j].type; - - if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); - - if( vov3 > 3. ) { - - e_lph = p_lp3 * SQR(vov3-3.0); - E_Lp [i] += e_lph; - //atomicAdd (&data->E_Lp, e_lph ); - //estrain(i) += e_lph; - - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
- 0.16*POW(Di, 3.)); - - bo_ij->Cdbo += deahu2dbo; - - workspace->CdDelta[i] += deahu2dsbo; - } - } - } + int i, j, pj, type_i, type_j; + real Delta_lpcorr, dfvl; + real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; + real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; + real e_ov, CEover1, CEover2, CEover3, CEover4; + real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; + real exp_ovun2n, exp_ovun6, exp_ovun8; + real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; + real e_un, CEunder1, CEunder2, CEunder3, CEunder4; + real p_lp1, p_lp2, p_lp3; + real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; + + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond; + bond_order_data *bo_ij; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + /* Initialize parameters */ + p_lp1 = g_params.l[15]; + p_lp3 = g_params.l[5]; + p_ovun3 = g_params.l[32]; + p_ovun4 = g_params.l[31]; + p_ovun6 = g_params.l[6]; + p_ovun7 = g_params.l[8]; + p_ovun8 = g_params.l[9]; + + // set the parameter pointer + type_i = atoms[i].type; + sbp_i = &(sbp[ type_i ]); + + // lone-pair Energy + p_lp2 = sbp_i->p_lp2; + expvd2 = EXP( -75 * workspace->Delta_lp[i] ); + inv_expvd2 = 1. / (1. 
+ expvd2 ); + + // calculate the energy + e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + //atomicAdd (&data->E_Lp, e_lp ); + E_Lp [ i ] = e_lp; + + dElp = p_lp2 * inv_expvd2 + + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); + CElp = dElp * workspace->dDelta_lp[i]; + + workspace->CdDelta[i] += CElp; // lp - 1st term + + // correction for C2 + if( g_params.l[5] > 0.001 && + !cuda_strcmp( sbp[type_i].name, "C" , 15) ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + j = bonds->select.bond_list[pj].nbr; + type_j = atoms[j].type; + + if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + Di = workspace->Delta[i]; + vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + + if( vov3 > 3. ) { + + e_lph = p_lp3 * SQR(vov3-3.0); + E_Lp [i] += e_lph; + //atomicAdd (&data->E_Lp, e_lph ); + //estrain(i) += e_lph; + + deahu2dbo = 2.*p_lp3*(vov3 - 3.); + deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
- 0.16*POW(Di, 3.)); + + bo_ij->Cdbo += deahu2dbo; + + workspace->CdDelta[i] += deahu2dsbo; + } + } + } } /////////////////////////////////////////////////////////// GLOBAL void test_LonePair_Postprocess ( reax_atom *atoms, global_parameters g_params, - single_body_parameters *sbp, two_body_parameters *tbp, - static_storage p_workspace, simulation_data *data, - list p_bonds, int N, int num_atom_types ) + single_body_parameters *sbp, two_body_parameters *tbp, + static_storage p_workspace, simulation_data *data, + list p_bonds, int N, int num_atom_types ) { - int i, j, pj, type_i, type_j; + int i, j, pj, type_i, type_j; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond, *sbond; - bond_data *dbond_index_bond, *sym_index_bond; - bond_order_data *bo_ij; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond, *sbond; + bond_data *dbond_index_bond, *sym_index_bond; + bond_order_data *bo_ij; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; - i = blockIdx.x * blockDim.x + threadIdx.x; + i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + if ( i >= N) return; - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - /* - pbond = &(bonds->select.bond_list[pj]); - dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] ); - workspace->CdDelta [i] += dbond_index_bond->scratch; - */ + /* + pbond = &(bonds->select.bond_list[pj]); + dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] ); + workspace->CdDelta [i] += dbond_index_bond->scratch; + */ - sbond = &(bonds->select.bond_list [pj]); - sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); - workspace->CdDelta [i] += sym_index_bond->scratch; - } + sbond = &(bonds->select.bond_list [pj]); + sym_index_bond = &( 
bonds->select.bond_list[ sbond->sym_index ]); + workspace->CdDelta [i] += sym_index_bond->scratch; + } } diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/system_props.cu index 9de96916..3ec39134 100644 --- a/PuReMD-GPU/src/system_props.cu +++ b/PuReMD-GPU/src/system_props.cu @@ -31,460 +31,460 @@ real Get_Time( ) { - struct timeval tim; + struct timeval tim; - gettimeofday(&tim, NULL ); - return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); + gettimeofday(&tim, NULL ); + return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); } real Get_Timing_Info( real t_start ) { - struct timeval tim; - real t_end; + struct timeval tim; + real t_end; - gettimeofday(&tim, NULL ); - t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); - return (t_end - t_start); + gettimeofday(&tim, NULL ); + t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); + return (t_end - t_start); } void Temperature_Control( control_params *control, simulation_data *data, - output_controls *out_control ) + output_controls *out_control ) { - real tmp; - - if( control->T_mode == 1 ) { // step-wise temperature control - if( (data->step - data->prev_steps) % - ((int)(control->T_freq / control->dt)) == 0 ) { - if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) ) - control->T += control->T_rate; - else control->T = control->T_final; - } - } - else if( control->T_mode == 2 ) { // constant slope control - tmp = control->T_rate * control->dt / control->T_freq; - - if( fabs( control->T - control->T_final ) >= fabs( tmp ) ) - control->T += tmp; - } + real tmp; + + if( control->T_mode == 1 ) { // step-wise temperature control + if( (data->step - data->prev_steps) % + ((int)(control->T_freq / control->dt)) == 0 ) { + if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) ) + control->T += control->T_rate; + else control->T = control->T_final; + } + } + else if( control->T_mode == 2 ) { // constant slope control + tmp = control->T_rate * control->dt / control->T_freq; + + if( fabs( control->T - 
control->T_final ) >= fabs( tmp ) ) + control->T += tmp; + } } void prep_dev_system (reax_system *system) { - //copy the system atoms to the device - Sync_Host_Device ( system, cudaMemcpyHostToDevice ); + //copy the system atoms to the device + Sync_Host_Device ( system, cudaMemcpyHostToDevice ); } void Compute_Total_Mass( reax_system *system, simulation_data *data ) { - int i; - int blocks; - int block_size; - real *partial_sums = 0; + int i; + int blocks; + int block_size; + real *partial_sums = 0; - data->M = 0; + data->M = 0; - for( i = 0; i < system->N; i++ ) - data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass; + for( i = 0; i < system->N; i++ ) + data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass; - data->inv_M = 1. / data->M; + data->inv_M = 1. / data->M; } void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data ) { - real *partial_sums = (real *) scratch; - //data->M = 0; - - //cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0); - cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH ); - - Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (partial_sums, partial_sums + BLOCKS_POW_2, BLOCKS_POW_2); - //(partial_sums, &((simulation_data *)data->d_simulation_data)->M, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //#ifdef __BUILD_DEBUG__ - // validate_data ( system, data ); - //#endif - - //copy_host_device (&data->M, &((simulation_data *)data->d_simulation_data)->M, - //#ifdef __BUILD_DEBUG__ - // t_data_M = data->M; - //#endif - copy_host_device (&data->M, partial_sums + BLOCKS_POW_2, - REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - //#ifdef __BUILD_DEBUG__ - // if (check_zero (t_data, data->M)) - // { - // fprintf (stderr, "SimulationData:M does 
not match on host and device (%f %f) \n", t_data, data->M ); - // exit (0); - // } - //#endif - data->inv_M = 1. / data->M; + real *partial_sums = (real *) scratch; + //data->M = 0; + + //cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0); + cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH ); + + Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (partial_sums, partial_sums + BLOCKS_POW_2, BLOCKS_POW_2); + //(partial_sums, &((simulation_data *)data->d_simulation_data)->M, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //#ifdef __BUILD_DEBUG__ + // validate_data ( system, data ); + //#endif + + //copy_host_device (&data->M, &((simulation_data *)data->d_simulation_data)->M, + //#ifdef __BUILD_DEBUG__ + // t_data_M = data->M; + //#endif + copy_host_device (&data->M, partial_sums + BLOCKS_POW_2, + REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + //#ifdef __BUILD_DEBUG__ + // if (check_zero (t_data, data->M)) + // { + // fprintf (stderr, "SimulationData:M does not match on host and device (%f %f) \n", t_data, data->M ); + // exit (0); + // } + //#endif + data->inv_M = 1. 
/ data->M; } GLOBAL void Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - x = sbp [ atoms[ i ].type ].mass; - - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + x = sbp [ atoms[ i ].type ].mass; + + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } void Compute_Center_of_Mass( reax_system *system, simulation_data *data, - FILE *fout ) + FILE *fout ) { - int i; - real m, xx, xy, xz, yy, yz, zz, det; - rvec tvec, diff; - rtensor mat, inv; - - int blocks; - int block_size; - rvec *l_xcm, *l_vcm, *l_amcm; - real t_start, t_end; - - rvec_MakeZero( data->xcm ); // position of CoM - rvec_MakeZero( data->vcm ); // velocity of CoM - rvec_MakeZero( data->amcm ); // angular momentum of CoM - rvec_MakeZero( data->avcm ); // angular velocity of CoM - - /* Compute the position, velocity and angular momentum about the CoM */ - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - - rvec_ScaledAdd( data->xcm, m, system->atoms[i].x ); - rvec_ScaledAdd( data->vcm, m, system->atoms[i].v ); - - rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v ); - rvec_ScaledAdd( data->amcm, m, tvec ); - - /*fprintf( fout,"%3d %g %g %g\n", - i+1, - system->atoms[i].v[0], 
system->atoms[i].v[1], system->atoms[i].v[2] ); - fprintf( fout, "vcm: %g %g %g\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - */ - } - - rvec_Scale( data->xcm, data->inv_M, data->xcm ); - rvec_Scale( data->vcm, data->inv_M, data->vcm ); - - rvec_Cross( tvec, data->xcm, data->vcm ); - rvec_ScaledAdd( data->amcm, -data->M, tvec ); - - data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); - - /* Calculate and then invert the inertial tensor */ - xx = xy = xz = yy = yz = zz = 0; - - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - - rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); - xx += diff[0] * diff[0] * m; - xy += diff[0] * diff[1] * m; - xz += diff[0] * diff[2] * m; - yy += diff[1] * diff[1] * m; - yz += diff[1] * diff[2] * m; - zz += diff[2] * diff[2] * m; - } + int i; + real m, xx, xy, xz, yy, yz, zz, det; + rvec tvec, diff; + rtensor mat, inv; + + int blocks; + int block_size; + rvec *l_xcm, *l_vcm, *l_amcm; + real t_start, t_end; + + rvec_MakeZero( data->xcm ); // position of CoM + rvec_MakeZero( data->vcm ); // velocity of CoM + rvec_MakeZero( data->amcm ); // angular momentum of CoM + rvec_MakeZero( data->avcm ); // angular velocity of CoM + + /* Compute the position, velocity and angular momentum about the CoM */ + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + + rvec_ScaledAdd( data->xcm, m, system->atoms[i].x ); + rvec_ScaledAdd( data->vcm, m, system->atoms[i].v ); + + rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v ); + rvec_ScaledAdd( data->amcm, m, tvec ); + + /*fprintf( fout,"%3d %g %g %g\n", + i+1, + system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); + fprintf( fout, "vcm: %g %g %g\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + */ + } + + rvec_Scale( data->xcm, data->inv_M, data->xcm ); + rvec_Scale( data->vcm, data->inv_M, data->vcm ); + + rvec_Cross( tvec, data->xcm, data->vcm ); + 
rvec_ScaledAdd( data->amcm, -data->M, tvec ); + + data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); + + /* Calculate and then invert the inertial tensor */ + xx = xy = xz = yy = yz = zz = 0; + + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + + rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); + xx += diff[0] * diff[0] * m; + xy += diff[0] * diff[1] * m; + xz += diff[0] * diff[2] * m; + yy += diff[1] * diff[1] * m; + yz += diff[1] * diff[2] * m; + zz += diff[2] * diff[2] * m; + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, " xx: %f \n", xx); - fprintf (stderr, " xy: %f \n", xy); - fprintf (stderr, " xz: %f \n", xz); - fprintf (stderr, " yy: %f \n", yy); - fprintf (stderr, " yz: %f \n", yz); - fprintf (stderr, " zz: %f \n", zz); + fprintf (stderr, " xx: %f \n", xx); + fprintf (stderr, " xy: %f \n", xy); + fprintf (stderr, " xz: %f \n", xz); + fprintf (stderr, " yy: %f \n", yy); + fprintf (stderr, " yz: %f \n", yz); + fprintf (stderr, " zz: %f \n", zz); #endif - mat[0][0] = yy + zz; - mat[0][1] = mat[1][0] = -xy; - mat[0][2] = mat[2][0] = -xz; - mat[1][1] = xx + zz; - mat[2][1] = mat[1][2] = -yz; - mat[2][2] = xx + yy; - - /* invert the inertial tensor */ - det = ( mat[0][0] * mat[1][1] * mat[2][2] + - mat[0][1] * mat[1][2] * mat[2][0] + - mat[0][2] * mat[1][0] * mat[2][1] ) - - ( mat[0][0] * mat[1][2] * mat[2][1] + - mat[0][1] * mat[1][0] * mat[2][2] + - mat[0][2] * mat[1][1] * mat[2][0] ); - - inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; - inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; - inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; - inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; - inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; - inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; - inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; - inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; - inv[2][2] = mat[0][0] * 
mat[1][1] - mat[1][0] * mat[0][1]; - - if( fabs(det) > ALMOST_ZERO ) - rtensor_Scale( inv, 1./det, inv ); - else - rtensor_MakeZero( inv ); - - /* Compute the angular velocity about the centre of mass */ - rtensor_MatVec( data->avcm, inv, data->amcm ); - data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); + mat[0][0] = yy + zz; + mat[0][1] = mat[1][0] = -xy; + mat[0][2] = mat[2][0] = -xz; + mat[1][1] = xx + zz; + mat[2][1] = mat[1][2] = -yz; + mat[2][2] = xx + yy; + + /* invert the inertial tensor */ + det = ( mat[0][0] * mat[1][1] * mat[2][2] + + mat[0][1] * mat[1][2] * mat[2][0] + + mat[0][2] * mat[1][0] * mat[2][1] ) - + ( mat[0][0] * mat[1][2] * mat[2][1] + + mat[0][1] * mat[1][0] * mat[2][2] + + mat[0][2] * mat[1][1] * mat[2][0] ); + + inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; + inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; + inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; + inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; + inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; + inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; + inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; + inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; + inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; + + if( fabs(det) > ALMOST_ZERO ) + rtensor_Scale( inv, 1./det, inv ); + else + rtensor_MakeZero( inv ); + + /* Compute the angular velocity about the centre of mass */ + rtensor_MatVec( data->avcm, inv, data->amcm ); + data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); #if defined(DEBUG) - fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", - data->xcm[0], data->xcm[1], data->xcm[2] ); - fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", - data->amcm[0], data->amcm[1], data->amcm[2] ); - /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", - mat[0][0], 
mat[0][1], mat[0][2], - mat[1][0], mat[1][1], mat[1][2], - mat[2][0], mat[2][1], mat[2][2] ); - fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", - inv[0][0], inv[0][1], inv[0][2], - inv[1][0], inv[1][1], inv[1][2], - inv[2][0], inv[2][1], inv[2][2] ); - fflush( fout ); */ - fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", - data->avcm[0], data->avcm[1], data->avcm[2] ); + fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", + data->xcm[0], data->xcm[1], data->xcm[2] ); + fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", + data->amcm[0], data->amcm[1], data->amcm[2] ); + /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", + mat[0][0], mat[0][1], mat[0][2], + mat[1][0], mat[1][1], mat[1][2], + mat[2][0], mat[2][1], mat[2][2] ); + fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", + inv[0][0], inv[0][1], inv[0][2], + inv[1][0], inv[1][1], inv[1][2], + inv[2][0], inv[2][1], inv[2][2] ); + fflush( fout ); */ + fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", + data->avcm[0], data->avcm[1], data->avcm[2] ); #endif } void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, - FILE *fout ) + FILE *fout ) { - int i; - real m, xx, xy, xz, yy, yz, zz, det; - rvec tvec, diff; - rtensor mat, inv; - - int blocks; - int block_size; - rvec *l_xcm, *l_vcm, *l_amcm; - real t_start, t_end; - - rvec t_xcm, t_vcm, t_amcm; - - rvec *r_scratch = (rvec *)scratch; - - //rvec_MakeZero( data->xcm ); // position of CoM - //rvec_MakeZero( data->vcm ); // velocity of CoM - //rvec_MakeZero( data->amcm ); // angular momentum of CoM - //rvec_MakeZero( data->avcm ); // angular velocity of CoM - - //cuda_malloc ((void **)&l_xcm, RVEC_SIZE * (blocks + 1), 1, 0); - //cuda_malloc ((void **)&l_vcm, RVEC_SIZE * (blocks + 1), 1, 0); - //cuda_malloc ((void **)&l_amcm, RVEC_SIZE * (blocks + 1), 1, 0); - - cuda_memset ( scratch, 0, 3 * RVEC_SIZE * (BLOCKS_POW_2 + 1), 
RES_SCRATCH ); - l_xcm = r_scratch; - l_vcm = r_scratch + (BLOCKS_POW_2 + 1); - l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); - - center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> - (system->reaxprm.d_sbp, system->d_atoms, l_xcm, l_vcm, l_amcm, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - center_of_mass <<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> - (l_xcm, l_vcm, l_amcm, - l_xcm + BLOCKS_POW_2, - l_vcm + BLOCKS_POW_2, - l_amcm + BLOCKS_POW_2, - BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //#ifdef __BUILD_DEBUG - // validate_data ( system, data ); - //#endif - - //#ifdef __BUILD_DEBUG__ - // rvec_MakeZero (t_xcm); - // rvec_MakeZero (t_vcm); - // rvec_MakeZero (t_amcm); - // - // rvec_Copy (t_xcm, data->xcm); - // rvec_Copy (t_vcm, data->vcm); - // rvec_Copy (t_amcm, data->amcm); - //#endif - - copy_host_device (data->xcm, l_xcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (data->vcm, l_vcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (data->amcm, l_amcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - rvec_Scale( data->xcm, data->inv_M, data->xcm ); - rvec_Scale( data->vcm, data->inv_M, data->vcm ); - - rvec_Cross( tvec, data->xcm, data->vcm ); - rvec_ScaledAdd( data->amcm, -data->M, tvec ); - - //#ifdef __BUILD_DEBUG__ - // if (check_zero (t_xcm, data->xcm) || - // check_zero (t_vcm, data->vcm) || - // check_zero (t_amcm, data->amcm)){ - // fprintf (stderr, "SimulationData (xcm, vcm, amcm) does not match between device and host \n"); - // exit (0); - // } - //#endif - - data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); - - /* Calculate and then invert the inertial tensor */ - xx = xy = xz = yy = yz = zz = 0; + int i; + real m, xx, xy, xz, yy, yz, zz, det; + rvec tvec, diff; + rtensor mat, inv; + + int blocks; + int block_size; + rvec *l_xcm, *l_vcm, *l_amcm; + real t_start, 
t_end; + + rvec t_xcm, t_vcm, t_amcm; + + rvec *r_scratch = (rvec *)scratch; + + //rvec_MakeZero( data->xcm ); // position of CoM + //rvec_MakeZero( data->vcm ); // velocity of CoM + //rvec_MakeZero( data->amcm ); // angular momentum of CoM + //rvec_MakeZero( data->avcm ); // angular velocity of CoM + + //cuda_malloc ((void **)&l_xcm, RVEC_SIZE * (blocks + 1), 1, 0); + //cuda_malloc ((void **)&l_vcm, RVEC_SIZE * (blocks + 1), 1, 0); + //cuda_malloc ((void **)&l_amcm, RVEC_SIZE * (blocks + 1), 1, 0); + + cuda_memset ( scratch, 0, 3 * RVEC_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH ); + l_xcm = r_scratch; + l_vcm = r_scratch + (BLOCKS_POW_2 + 1); + l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); + + center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> + (system->reaxprm.d_sbp, system->d_atoms, l_xcm, l_vcm, l_amcm, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + center_of_mass <<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> + (l_xcm, l_vcm, l_amcm, + l_xcm + BLOCKS_POW_2, + l_vcm + BLOCKS_POW_2, + l_amcm + BLOCKS_POW_2, + BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //#ifdef __BUILD_DEBUG + // validate_data ( system, data ); + //#endif + + //#ifdef __BUILD_DEBUG__ + // rvec_MakeZero (t_xcm); + // rvec_MakeZero (t_vcm); + // rvec_MakeZero (t_amcm); + // + // rvec_Copy (t_xcm, data->xcm); + // rvec_Copy (t_vcm, data->vcm); + // rvec_Copy (t_amcm, data->amcm); + //#endif + + copy_host_device (data->xcm, l_xcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (data->vcm, l_vcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (data->amcm, l_amcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + rvec_Scale( data->xcm, data->inv_M, data->xcm ); + rvec_Scale( data->vcm, data->inv_M, data->vcm ); + + rvec_Cross( tvec, data->xcm, data->vcm ); + rvec_ScaledAdd( data->amcm, -data->M, tvec ); + + //#ifdef __BUILD_DEBUG__ + // if 
(check_zero (t_xcm, data->xcm) || + // check_zero (t_vcm, data->vcm) || + // check_zero (t_amcm, data->amcm)){ + // fprintf (stderr, "SimulationData (xcm, vcm, amcm) does not match between device and host \n"); + // exit (0); + // } + //#endif + + data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); + + /* Calculate and then invert the inertial tensor */ + xx = xy = xz = yy = yz = zz = 0; #ifdef __BUILD_DEBUG__ - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); - xx += diff[0] * diff[0] * m; - xy += diff[0] * diff[1] * m; - xz += diff[0] * diff[2] * m; - yy += diff[1] * diff[1] * m; - yz += diff[1] * diff[2] * m; - zz += diff[2] * diff[2] * m; - } + rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); + xx += diff[0] * diff[0] * m; + xy += diff[0] * diff[1] * m; + xz += diff[0] * diff[2] * m; + yy += diff[1] * diff[1] * m; + yz += diff[1] * diff[2] * m; + zz += diff[2] * diff[2] * m; + } #endif - real *partial_results = (real *) scratch; - real *local_results; + real *partial_results = (real *) scratch; + real *local_results; - //cuda_malloc ((void **)&partial_results, 6 * sizeof (real) * (blocks + 1), 1, 0); - cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH ); - local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1)); + //cuda_malloc ((void **)&partial_results, 6 * sizeof (real) * (blocks + 1), 1, 0); + cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH ); + local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1)); - compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> - (system->reaxprm.d_sbp, system->d_atoms, partial_results, - data->xcm[0], data->xcm[1], data->xcm[2], system->N); - cudaThreadSynchronize (); - cudaCheckError (); + 
compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> + (system->reaxprm.d_sbp, system->d_atoms, partial_results, + data->xcm[0], data->xcm[1], data->xcm[2], system->N); + cudaThreadSynchronize (); + cudaCheckError (); - compute_center_mass <<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> - (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + compute_center_mass <<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> + (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); - copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__); #ifdef __BUILD_DEBUG__ - if (check_zero (local_results[0],xx) || - check_zero (local_results[1],xy) || - check_zero (local_results[2],xz) || - check_zero (local_results[3],yy) || - check_zero (local_results[4],yz) || - check_zero (local_results[5],zz) ) - { - fprintf (stderr, " xx (%4.15f %4.15f) \n", xx, local_results[0]); - fprintf (stderr, " xy (%4.15f %4.15f) \n", xy, local_results[1]); - fprintf (stderr, " xz (%4.15f %4.15f) \n", xz, local_results[2]); - fprintf (stderr, " yy (%4.15f %4.15f) \n", yy, local_results[3]); - fprintf (stderr, " yz (%4.15f %4.15f) \n", yz, local_results[4]); - fprintf (stderr, " zz (%4.15f %4.15f) \n", zz, local_results[5]); - fprintf (stderr, " Failed to compute the center of mass \n"); - exit (1); - } + if (check_zero (local_results[0],xx) || + check_zero (local_results[1],xy) || + check_zero (local_results[2],xz) || + check_zero (local_results[3],yy) || + check_zero (local_results[4],yz) || + check_zero (local_results[5],zz) ) + { + fprintf (stderr, " xx (%4.15f %4.15f) \n", xx, local_results[0]); + fprintf (stderr, " xy (%4.15f %4.15f) \n", xy, 
local_results[1]); + fprintf (stderr, " xz (%4.15f %4.15f) \n", xz, local_results[2]); + fprintf (stderr, " yy (%4.15f %4.15f) \n", yy, local_results[3]); + fprintf (stderr, " yz (%4.15f %4.15f) \n", yz, local_results[4]); + fprintf (stderr, " zz (%4.15f %4.15f) \n", zz, local_results[5]); + fprintf (stderr, " Failed to compute the center of mass \n"); + exit (1); + } #endif - xx = local_results[0]; - xy = local_results[1]; - xz = local_results[2]; - yy = local_results[3]; - yz = local_results[4]; - zz = local_results[5]; - - mat[0][0] = yy + zz; - mat[0][1] = mat[1][0] = -xy; - mat[0][2] = mat[2][0] = -xz; - mat[1][1] = xx + zz; - mat[2][1] = mat[1][2] = -yz; - mat[2][2] = xx + yy; - - /* invert the inertial tensor */ - det = ( mat[0][0] * mat[1][1] * mat[2][2] + - mat[0][1] * mat[1][2] * mat[2][0] + - mat[0][2] * mat[1][0] * mat[2][1] ) - - ( mat[0][0] * mat[1][2] * mat[2][1] + - mat[0][1] * mat[1][0] * mat[2][2] + - mat[0][2] * mat[1][1] * mat[2][0] ); - - inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; - inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; - inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; - inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; - inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; - inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; - inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; - inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; - inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; - - if( fabs(det) > ALMOST_ZERO ) - rtensor_Scale( inv, 1./det, inv ); - else - rtensor_MakeZero( inv ); - - /* Compute the angular velocity about the centre of mass */ - rtensor_MatVec( data->avcm, inv, data->amcm ); - data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); - - //free the resources - free (local_results); + xx = local_results[0]; + xy = local_results[1]; + xz = local_results[2]; + yy = local_results[3]; + yz = local_results[4]; + zz = 
local_results[5]; + + mat[0][0] = yy + zz; + mat[0][1] = mat[1][0] = -xy; + mat[0][2] = mat[2][0] = -xz; + mat[1][1] = xx + zz; + mat[2][1] = mat[1][2] = -yz; + mat[2][2] = xx + yy; + + /* invert the inertial tensor */ + det = ( mat[0][0] * mat[1][1] * mat[2][2] + + mat[0][1] * mat[1][2] * mat[2][0] + + mat[0][2] * mat[1][0] * mat[2][1] ) - + ( mat[0][0] * mat[1][2] * mat[2][1] + + mat[0][1] * mat[1][0] * mat[2][2] + + mat[0][2] * mat[1][1] * mat[2][0] ); + + inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; + inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; + inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; + inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; + inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; + inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; + inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; + inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; + inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; + + if( fabs(det) > ALMOST_ZERO ) + rtensor_Scale( inv, 1./det, inv ); + else + rtensor_MakeZero( inv ); + + /* Compute the angular velocity about the centre of mass */ + rtensor_MatVec( data->avcm, inv, data->amcm ); + data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); + + //free the resources + free (local_results); #if defined(DEBUG) - fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", - data->xcm[0], data->xcm[1], data->xcm[2] ); - fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", - data->amcm[0], data->amcm[1], data->amcm[2] ); - /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", - mat[0][0], mat[0][1], mat[0][2], - mat[1][0], mat[1][1], mat[1][2], - mat[2][0], mat[2][1], mat[2][2] ); - fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", - inv[0][0], inv[0][1], inv[0][2], - inv[1][0], inv[1][1], inv[1][2], - inv[2][0], inv[2][1], inv[2][2] ); 
- fflush( fout ); */ - fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", - data->avcm[0], data->avcm[1], data->avcm[2] ); + fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", + data->xcm[0], data->xcm[1], data->xcm[2] ); + fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", + data->amcm[0], data->amcm[1], data->amcm[2] ); + /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", + mat[0][0], mat[0][1], mat[0][2], + mat[1][0], mat[1][1], mat[1][2], + mat[2][0], mat[2][1], mat[2][2] ); + fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", + inv[0][0], inv[0][1], inv[0][2], + inv[1][0], inv[1][1], inv[1][2], + inv[2][0], inv[2][1], inv[2][2] ); + fflush( fout ); */ + fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", + data->avcm[0], data->avcm[1], data->avcm[2] ); #endif } @@ -492,109 +492,109 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, void Compute_Kinetic_Energy( reax_system* system, simulation_data* data ) { - int i; - rvec p; - real m; + int i; + rvec p; + real m; - data->E_Kin = 0.0; + data->E_Kin = 0.0; - for (i=0; i < system->N; i++) { - m = system->reaxprm.sbp[system->atoms[i].type].mass; + for (i=0; i < system->N; i++) { + m = system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_Scale( p, m, system->atoms[i].v ); - data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v ); + rvec_Scale( p, m, system->atoms[i].v ); + data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v ); - /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n", - i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], - system->reaxprm.sbp[system->atoms[i].type].mass); */ - } + /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n", + i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], + system->reaxprm.sbp[system->atoms[i].type].mass); */ + } - data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + data->therm.T = (2. 
* data->E_Kin) / (data->N_f * K_B); - if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */ - data->therm.T = ALMOST_ZERO; + if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */ + data->therm.T = ALMOST_ZERO; } GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, - unsigned int N, real *output) + unsigned int N, real *output) { - extern __shared__ real sh_ekin[]; - unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; - rvec p; - real m, tmp; - - tmp = 0; - m = 0; - if (index < N) { - m = sbp[atoms[index].type].mass; - rvec_Scale( p, m, atoms[index].v ); - tmp = 0.5 * rvec_Dot( p, atoms[index].v ); - } - sh_ekin[threadIdx.x] = tmp; - __syncthreads (); - - for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { - if (threadIdx.x < offset ) { - index = threadIdx.x + offset; - sh_ekin[threadIdx.x] += sh_ekin[ index ]; - } - __syncthreads (); - } - - if (threadIdx.x == 0) { - output [ blockIdx.x ] = sh_ekin [ 0 ]; - } + extern __shared__ real sh_ekin[]; + unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; + rvec p; + real m, tmp; + + tmp = 0; + m = 0; + if (index < N) { + m = sbp[atoms[index].type].mass; + rvec_Scale( p, m, atoms[index].v ); + tmp = 0.5 * rvec_Dot( p, atoms[index].v ); + } + sh_ekin[threadIdx.x] = tmp; + __syncthreads (); + + for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset ) { + index = threadIdx.x + offset; + sh_ekin[threadIdx.x] += sh_ekin[ index ]; + } + __syncthreads (); + } + + if (threadIdx.x == 0) { + output [ blockIdx.x ] = sh_ekin [ 0 ]; + } } GLOBAL void Kinetic_Energy_Reduction (simulation_data *data, - real *input, int n) + real *input, int n) { - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = input[i]; - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - 
if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - //per_block_results[blockIdx.x] = sdata[0]; - data->E_Kin = sdata[0]; - data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); - - if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! - data->therm.T = ALMOST_ZERO; - } + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = input[i]; + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + //per_block_results[blockIdx.x] = sdata[0]; + data->E_Kin = sdata[0]; + data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + + if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! + data->therm.T = ALMOST_ZERO; + } } void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data) { - real *results = (real *) scratch; - cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); - Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results); - cudaThreadSynchronize (); - cudaCheckError (); - - Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + real *results = (real *) scratch; + cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); + Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results); + cudaThreadSynchronize (); + cudaCheckError (); + + Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + 
((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); } /* @@ -632,7 +632,7 @@ void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data) __syncthreads (); //if ((blockIdx.x == 0) && (threadIdx.x < gridDim.x)) { -// sh_ekin [ threadIdx.x ] = output [ threadIdx.x ]; +// sh_ekin [ threadIdx.x ] = output [ threadIdx.x ]; //} //__syncthreads (); @@ -669,108 +669,108 @@ data->therm.T = ALMOST_ZERO; * We may want to add that for more accuracy. */ void Compute_Pressure_Isotropic( reax_system* system, control_params *control, - simulation_data* data, - output_controls *out_control ) + simulation_data* data, + output_controls *out_control ) { - int i; - reax_atom *p_atom; - rvec tx; - rvec tmp; - simulation_box *box = &(system->box); - - /* Calculate internal pressure */ - rvec_MakeZero( data->int_press ); - - // 0: both int and ext, 1: ext only, 2: int only - if( control->press_mode == 0 || control->press_mode == 2 ) { - for( i = 0; i < system->N; ++i ) { - p_atom = &( system->atoms[i] ); - - /* transform x into unitbox coordinates */ - Transform_to_UnitBox( p_atom->x, box, 1, tx ); - - /* this atom's contribution to internal pressure */ - rvec_Multiply( tmp, p_atom->f, tx ); - rvec_Add( data->int_press, tmp ); - - if( out_control->debug_level > 0 ) { - fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", - i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] ); - fprintf( out_control->prs, "%8.2f%8.2f%8.2f", - p_atom->f[0], p_atom->f[1], p_atom->f[2] ); - fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", - data->int_press[0],data->int_press[1],data->int_press[2]); - } - } - } - - /* kinetic contribution */ - data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. 
* box->volume * P_CONV ); - - /* Calculate total pressure in each direction */ - data->tot_press[0] = data->kin_press - - ((data->int_press[0] + data->ext_press[0]) / - (box->box_norms[1] * box->box_norms[2] * P_CONV)); - - data->tot_press[1] = data->kin_press - - ((data->int_press[1] + data->ext_press[1])/ - (box->box_norms[0] * box->box_norms[2] * P_CONV)); - - data->tot_press[2] = data->kin_press - - ((data->int_press[2] + data->ext_press[2])/ - (box->box_norms[0] * box->box_norms[1] * P_CONV)); - - /* Average pressure for the whole box */ - data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3; + int i; + reax_atom *p_atom; + rvec tx; + rvec tmp; + simulation_box *box = &(system->box); + + /* Calculate internal pressure */ + rvec_MakeZero( data->int_press ); + + // 0: both int and ext, 1: ext only, 2: int only + if( control->press_mode == 0 || control->press_mode == 2 ) { + for( i = 0; i < system->N; ++i ) { + p_atom = &( system->atoms[i] ); + + /* transform x into unitbox coordinates */ + Transform_to_UnitBox( p_atom->x, box, 1, tx ); + + /* this atom's contribution to internal pressure */ + rvec_Multiply( tmp, p_atom->f, tx ); + rvec_Add( data->int_press, tmp ); + + if( out_control->debug_level > 0 ) { + fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", + i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] ); + fprintf( out_control->prs, "%8.2f%8.2f%8.2f", + p_atom->f[0], p_atom->f[1], p_atom->f[2] ); + fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", + data->int_press[0],data->int_press[1],data->int_press[2]); + } + } + } + + /* kinetic contribution */ + data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. 
* box->volume * P_CONV ); + + /* Calculate total pressure in each direction */ + data->tot_press[0] = data->kin_press - + ((data->int_press[0] + data->ext_press[0]) / + (box->box_norms[1] * box->box_norms[2] * P_CONV)); + + data->tot_press[1] = data->kin_press - + ((data->int_press[1] + data->ext_press[1])/ + (box->box_norms[0] * box->box_norms[2] * P_CONV)); + + data->tot_press[2] = data->kin_press - + ((data->int_press[2] + data->ext_press[2])/ + (box->box_norms[0] * box->box_norms[1] * P_CONV)); + + /* Average pressure for the whole box */ + data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3; } void Compute_Pressure_Isotropic_Klein( reax_system* system, - simulation_data* data ) + simulation_data* data ) { - int i; - reax_atom *p_atom; - rvec dx; - - // IMPORTANT: This function assumes that current kinetic energy and - // the center of mass of the system is already computed before. - data->iso_bar.P = 2.0 * data->E_Kin; - - for( i = 0; i < system->N; ++i ) - { - p_atom = &( system->atoms[i] ); - rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm); - data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) ); - } - - data->iso_bar.P /= (3.0 * system->box.volume); - - // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs - // to be added when there are long-range interactions or long-range - // corrections to short-range interactions present. - // We may want to add that for more accuracy. + int i; + reax_atom *p_atom; + rvec dx; + + // IMPORTANT: This function assumes that current kinetic energy and + // the center of mass of the system is already computed before. 
+ data->iso_bar.P = 2.0 * data->E_Kin; + + for( i = 0; i < system->N; ++i ) + { + p_atom = &( system->atoms[i] ); + rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm); + data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) ); + } + + data->iso_bar.P /= (3.0 * system->box.volume); + + // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs + // to be added when there are long-range interactions or long-range + // corrections to short-range interactions present. + // We may want to add that for more accuracy. } void Compute_Pressure( reax_system* system, simulation_data* data, - static_storage *workspace ) + static_storage *workspace ) { - int i; - reax_atom *p_atom; - rtensor temp; - - rtensor_MakeZero( data->flex_bar.P ); - - for( i = 0; i < system->N; ++i ) { - p_atom = &( system->atoms[i] ); - // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx ); - rvec_OuterProduct( temp, p_atom->v, p_atom->v ); - rtensor_ScaledAdd( data->flex_bar.P, - system->reaxprm.sbp[ p_atom->type ].mass, temp ); - // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); - rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); - } - - rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P ); - data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0; + int i; + reax_atom *p_atom; + rtensor temp; + + rtensor_MakeZero( data->flex_bar.P ); + + for( i = 0; i < system->N; ++i ) { + p_atom = &( system->atoms[i] ); + // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx ); + rvec_OuterProduct( temp, p_atom->v, p_atom->v ); + rtensor_ScaledAdd( data->flex_bar.P, + system->reaxprm.sbp[ p_atom->type ].mass, temp ); + // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); + rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); + } + + rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P ); + data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0; } diff --git a/PuReMD-GPU/src/testmd.cu 
b/PuReMD-GPU/src/testmd.cu index ffca47ff..93f286cc 100644 --- a/PuReMD-GPU/src/testmd.cu +++ b/PuReMD-GPU/src/testmd.cu @@ -48,7 +48,7 @@ print_interaction Print_Interactions[NO_OF_INTERACTIONS]; LR_lookup_table *LR; LR_lookup_table *d_LR; -list *dev_lists; +list *dev_lists; static_storage *dev_workspace; reax_timing d_timing; @@ -70,398 +70,398 @@ cusparseMatDescr_t matdescriptor; void Post_Evolve( reax_system* system, control_params* control, - simulation_data* data, static_storage* workspace, - list** lists, output_controls *out_control ) + simulation_data* data, static_storage* workspace, + list** lists, output_controls *out_control ) { - int i; - rvec diff, cross; - - /* if velocity dependent force then - { - Generate_Neighbor_Lists( &system, &control, &lists ); - QEq(system, control, workspace, lists[FAR_NBRS]); - Introduce compute_force here if we are using velocity dependent forces - Compute_Forces(system,control,data,workspace,lists); - } */ - - /* compute kinetic energy of the system */ - Compute_Kinetic_Energy( system, data ); - - /* remove rotational and translational velocity of the center of mass */ - if( control->ensemble != NVE && - control->remove_CoM_vel && - data->step && data->step % control->remove_CoM_vel == 0 ) { - - /* compute velocity of the center of mass */ - Compute_Center_of_Mass( system, data, out_control->prs ); - - for( i = 0; i < system->N; i++ ) { - // remove translational - rvec_ScaledAdd( system->atoms[i].v, -1., data->vcm ); - - // remove rotational - rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); - rvec_Cross( cross, data->avcm, diff ); - rvec_ScaledAdd( system->atoms[i].v, -1., cross ); - } - } + int i; + rvec diff, cross; + + /* if velocity dependent force then + { + Generate_Neighbor_Lists( &system, &control, &lists ); + QEq(system, control, workspace, lists[FAR_NBRS]); + Introduce compute_force here if we are using velocity dependent forces + Compute_Forces(system,control,data,workspace,lists); + } */ + + 
/* compute kinetic energy of the system */ + Compute_Kinetic_Energy( system, data ); + + /* remove rotational and translational velocity of the center of mass */ + if( control->ensemble != NVE && + control->remove_CoM_vel && + data->step && data->step % control->remove_CoM_vel == 0 ) { + + /* compute velocity of the center of mass */ + Compute_Center_of_Mass( system, data, out_control->prs ); + + for( i = 0; i < system->N; i++ ) { + // remove translational + rvec_ScaledAdd( system->atoms[i].v, -1., data->vcm ); + + // remove rotational + rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); + rvec_Cross( cross, data->avcm, diff ); + rvec_ScaledAdd( system->atoms[i].v, -1., cross ); + } + } } GLOBAL void Update_Atoms_Post_Evolve (reax_atom *atoms, simulation_data *data, int N) { - rvec diff, cross; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - //for( i = 0; i < system->N; i++ ) { - // remove translational - rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); - - // remove rotational - rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm ); - rvec_Cross( cross, data->avcm, diff ); - rvec_ScaledAdd( atoms[i].v, -1., cross ); - //} + rvec diff, cross; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + //for( i = 0; i < system->N; i++ ) { + // remove translational + rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); + + // remove rotational + rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm ); + rvec_Cross( cross, data->avcm, diff ); + rvec_ScaledAdd( atoms[i].v, -1., cross ); + //} } void Cuda_Post_Evolve( reax_system* system, control_params* control, - simulation_data* data, static_storage* workspace, - list** lists, output_controls *out_control ) + simulation_data* data, static_storage* workspace, + list** lists, output_controls *out_control ) { - int i; - rvec diff, cross; - - /* compute kinetic energy of the system */ - /* - real *results = (real *) scratch; - cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, 
RES_SCRATCH); - Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (system->reaxprm.d_sbp, system->d_atoms, system->N, - (simulation_data *)data->d_simulation_data, (real *) results); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - //fprintf (stderr, "Cuda_Post_Evolve: Begin\n"); - Cuda_Compute_Kinetic_Energy (system, data); - //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n"); - - /* remove rotational and translational velocity of the center of mass */ - if( control->ensemble != NVE && - control->remove_CoM_vel && - data->step && data->step % control->remove_CoM_vel == 0 ) { - - /* - rvec t_xcm, t_vcm, t_avcm; - rvec_MakeZero (t_xcm); - rvec_MakeZero (t_vcm); - rvec_MakeZero (t_avcm); - - rvec_Copy (t_xcm, data->xcm); - rvec_Copy (t_vcm, data->vcm); - rvec_Copy (t_avcm, data->avcm); - */ - - /* compute velocity of the center of mass */ - Cuda_Compute_Center_of_Mass( system, data, out_control->prs ); - //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... 
\n"); - /* - fprintf (stderr, "center of mass done on the device \n"); - - fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm ); - fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm ); - fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm ); - - if (check_zero (t_xcm, data->xcm) || - check_zero (t_vcm, data->vcm) || - check_zero (t_avcm, data->avcm)){ - fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n"); - exit (0); - } - */ - - //xcm, avcm, - copy_host_device (data->vcm, ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - copy_host_device (data->xcm, ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - copy_host_device (data->avcm, ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - - //fprintf (stderr, "data copied.... \n"); - - Update_Atoms_Post_Evolve <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - //fprintf (stderr, " Cuda_Post_Evolve:End \n"); - - } + int i; + rvec diff, cross; + + /* compute kinetic energy of the system */ + /* + real *results = (real *) scratch; + cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); + Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (system->reaxprm.d_sbp, system->d_atoms, system->N, + (simulation_data *)data->d_simulation_data, (real *) results); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + //fprintf (stderr, "Cuda_Post_Evolve: Begin\n"); + Cuda_Compute_Kinetic_Energy (system, data); + //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... 
\n"); + + /* remove rotational and translational velocity of the center of mass */ + if( control->ensemble != NVE && + control->remove_CoM_vel && + data->step && data->step % control->remove_CoM_vel == 0 ) { + + /* + rvec t_xcm, t_vcm, t_avcm; + rvec_MakeZero (t_xcm); + rvec_MakeZero (t_vcm); + rvec_MakeZero (t_avcm); + + rvec_Copy (t_xcm, data->xcm); + rvec_Copy (t_vcm, data->vcm); + rvec_Copy (t_avcm, data->avcm); + */ + + /* compute velocity of the center of mass */ + Cuda_Compute_Center_of_Mass( system, data, out_control->prs ); + //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... \n"); + /* + fprintf (stderr, "center of mass done on the device \n"); + + fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm ); + fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm ); + fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm ); + + if (check_zero (t_xcm, data->xcm) || + check_zero (t_vcm, data->vcm) || + check_zero (t_avcm, data->avcm)){ + fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n"); + exit (0); + } + */ + + //xcm, avcm, + copy_host_device (data->vcm, ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + copy_host_device (data->xcm, ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + copy_host_device (data->avcm, ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + + //fprintf (stderr, "data copied.... 
\n"); + + Update_Atoms_Post_Evolve <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + //fprintf (stderr, " Cuda_Post_Evolve:End \n"); + + } } void Read_System( char *geof, char *ff, char *ctrlf, - reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - output_controls *out_control ) + reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + output_controls *out_control ) { - FILE *ffield, *ctrl; - - ffield = fopen( ff, "r" ); - ctrl = fopen( ctrlf, "r" ); - - /* ffield file */ - Read_Force_Field( ffield, &(system->reaxprm) ); - - /* control file */ - Read_Control_File( ctrl, system, control, out_control ); - - /* geo file */ - if( control->geo_format == XYZ ) { - fprintf( stderr, "xyz input is not implemented yet\n" ); - exit(1); - } - else if( control->geo_format == PDB ) - Read_PDB( geof, system, control, data, workspace ); - else if( control->geo_format == BGF ) - Read_BGF( geof, system, control, data, workspace ); - else if( control->geo_format == ASCII_RESTART ) { - Read_ASCII_Restart( geof, system, control, data, workspace ); - control->restart = 1; - } - else if( control->geo_format == BINARY_RESTART ) { - Read_Binary_Restart( geof, system, control, data, workspace ); - control->restart = 1; - } - else { - fprintf( stderr, "unknown geo file format. 
terminating!\n" ); - exit(1); - } + FILE *ffield, *ctrl; + + ffield = fopen( ff, "r" ); + ctrl = fopen( ctrlf, "r" ); + + /* ffield file */ + Read_Force_Field( ffield, &(system->reaxprm) ); + + /* control file */ + Read_Control_File( ctrl, system, control, out_control ); + + /* geo file */ + if( control->geo_format == XYZ ) { + fprintf( stderr, "xyz input is not implemented yet\n" ); + exit(1); + } + else if( control->geo_format == PDB ) + Read_PDB( geof, system, control, data, workspace ); + else if( control->geo_format == BGF ) + Read_BGF( geof, system, control, data, workspace ); + else if( control->geo_format == ASCII_RESTART ) { + Read_ASCII_Restart( geof, system, control, data, workspace ); + control->restart = 1; + } + else if( control->geo_format == BINARY_RESTART ) { + Read_Binary_Restart( geof, system, control, data, workspace ); + control->restart = 1; + } + else { + fprintf( stderr, "unknown geo file format. terminating!\n" ); + exit(1); + } #if defined(DEBUG_FOCUS) - fprintf( stderr, "input files have been read...\n" ); - Print_Box_Information( &(system->box), stderr ); + fprintf( stderr, "input files have been read...\n" ); + Print_Box_Information( &(system->box), stderr ); #endif } void Init_Data_Structures (simulation_data *data) { - //data->step = 0; - //data->prev_steps = 0; - //data->time = 0; + //data->step = 0; + //data->prev_steps = 0; + //data->time = 0; - memset (data, 0, SIMULATION_DATA_SIZE ); + memset (data, 0, SIMULATION_DATA_SIZE ); } int main(int argc, char* argv[]) { - reax_system system; - control_params control; - simulation_data data; - static_storage workspace; - list *lists; - output_controls out_control; - evolve_function Evolve; - evolve_function Cuda_Evolve; - int steps; + reax_system system; + control_params control; + simulation_data data; + static_storage workspace; + list *lists; + output_controls out_control; + evolve_function Evolve; + evolve_function Cuda_Evolve; + int steps; - real t_start, t_elapsed; - real *results = 
NULL; + real t_start, t_elapsed; + real *results = NULL; - lists = (list*) malloc( sizeof(list) * LIST_N ); + lists = (list*) malloc( sizeof(list) * LIST_N ); - cudaDeviceSetLimit (cudaLimitStackSize, 8192); - cudaDeviceSetCacheConfig (cudaFuncCachePreferL1); - cudaCheckError (); + cudaDeviceSetLimit (cudaLimitStackSize, 8192); + cudaDeviceSetCacheConfig (cudaFuncCachePreferL1); + cudaCheckError (); - cublasCheckError (cublasStatus = cublasCreate (&cublasHandle)); + cublasCheckError (cublasStatus = cublasCreate (&cublasHandle)); - cusparseCheckError (cusparseStatus = cusparseCreate (&cusparseHandle)); - cusparseCheckError (cusparseCreateMatDescr (&matdescriptor)); - cusparseSetMatType (matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase (matdescriptor, CUSPARSE_INDEX_BASE_ZERO); + cusparseCheckError (cusparseStatus = cusparseCreate (&cusparseHandle)); + cusparseCheckError (cusparseCreateMatDescr (&matdescriptor)); + cusparseSetMatType (matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase (matdescriptor, CUSPARSE_INDEX_BASE_ZERO); - dev_lists = (list *) malloc (sizeof (list) * LIST_N ); - dev_workspace = (static_storage *) malloc (STORAGE_SIZE); + dev_lists = (list *) malloc (sizeof (list) * LIST_N ); + dev_workspace = (static_storage *) malloc (STORAGE_SIZE); - //init the nbrs estimate - dev_workspace->realloc.estimate_nbrs = -1; + //init the nbrs estimate + dev_workspace->realloc.estimate_nbrs = -1; - //Cleanup before usage. - Init_Data_Structures (&data); - system.init_thblist = false; + //Cleanup before usage. 
+ Init_Data_Structures (&data); + system.init_thblist = false; - Read_System( argv[1], argv[2], argv[3], &system, &control, - &data, &workspace, &out_control ); + Read_System( argv[1], argv[2], argv[3], &system, &control, + &data, &workspace, &out_control ); - compute_blocks (&BLOCKS, &BLOCK_SIZE, system.N); - compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); + compute_blocks (&BLOCKS, &BLOCK_SIZE, system.N); + compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); - //MATVEC_BLOCKS = system.N; - //MATVEC_BLOCK_SIZE = 32; + //MATVEC_BLOCKS = system.N; + //MATVEC_BLOCK_SIZE = 32; - MATVEC_BLOCKS = (system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) + - ((system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 0 : 1); + MATVEC_BLOCKS = (system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) + + ((system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 0 : 1); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); - fprintf (stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE ); - fprintf (stderr, " Size of far neighbor data %d \n", sizeof (far_neighbor_data)); - fprintf (stderr, " Size of reax_atom %d \n", sizeof (reax_atom)); - fprintf (stderr, " size of sparse matrix entry %d \n", sizeof (sparse_matrix_entry)); - fprintf (stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system.N); + fprintf (stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); + fprintf (stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE ); + fprintf (stderr, " Size of far neighbor data %d \n", sizeof (far_neighbor_data)); + fprintf (stderr, " Size of reax_atom %d \n", sizeof (reax_atom)); + fprintf (stderr, " size of sparse matrix entry %d \n", sizeof (sparse_matrix_entry)); + fprintf (stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system.N); #endif #ifdef __CUDA_MEM__ - 
print_device_mem_usage (); + print_device_mem_usage (); #endif #ifdef __BUILD_DEBUG__ - Initialize( &system, &control, &data, &workspace, &lists, - &out_control, &Evolve ); + Initialize( &system, &control, &data, &workspace, &lists, + &out_control, &Evolve ); #endif - t_start = Get_Time (); - Cuda_Initialize( &system, &control, &data, &workspace, &lists, - &out_control, &Cuda_Evolve); - t_elapsed = Get_Timing_Info (t_start); + t_start = Get_Time (); + Cuda_Initialize( &system, &control, &data, &workspace, &lists, + &out_control, &Cuda_Evolve); + t_elapsed = Get_Timing_Info (t_start); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed ); + fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed ); #endif #ifdef __CUDA_MEM__ - print_device_mem_usage (); + print_device_mem_usage (); #endif #ifdef __BUILD_DEBUG__ - Reset( &system, &control, &data, &workspace, &lists ); + Reset( &system, &control, &data, &workspace, &lists ); #endif - Cuda_Reset( &system, &control, &data, &workspace, &lists ); + Cuda_Reset( &system, &control, &data, &workspace, &lists ); #ifdef __BUILD_DEBUG__ - Generate_Neighbor_Lists ( &system, &control, &data, &workspace, - &lists, &out_control ); + Generate_Neighbor_Lists ( &system, &control, &data, &workspace, + &lists, &out_control ); #endif - /* - dim3 blockspergrid (system.g.ncell[0], system.g.ncell[1], system.g.ncell[2]); - dim3 threadsperblock (system.g.max_atoms); + /* + dim3 blockspergrid (system.g.ncell[0], system.g.ncell[1], system.g.ncell[2]); + dim3 threadsperblock (system.g.max_atoms); - t_start = Get_Time (); - Cuda_Bin_Atoms (&system, &workspace); - Cuda_Bin_Atoms_Sync ( &system ); + t_start = Get_Time (); + Cuda_Bin_Atoms (&system, &workspace); + Cuda_Bin_Atoms_Sync ( &system ); - Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> - (system.d_atoms, system.d_g, system.d_box, - (control_params *)control.d_control, *(dev_lists + FAR_NBRS)); - cudaThreadSynchronize (); - 
cudaCheckError (); - t_elapsed = Get_Timing_Info (t_start); + Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> + (system.d_atoms, system.d_g, system.d_box, + (control_params *)control.d_control, *(dev_lists + FAR_NBRS)); + cudaThreadSynchronize (); + cudaCheckError (); + t_elapsed = Get_Timing_Info (t_start); - d_timing.nbrs += t_elapsed; - */ + d_timing.nbrs += t_elapsed; + */ - Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, false); + Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, false); #ifdef __BUILD_DEBUG__ - Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); + Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); #endif - Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); + Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); #ifdef __BUILD_DEBUG__ - Compute_Kinetic_Energy( &system, &data ); + Compute_Kinetic_Energy( &system, &data ); #endif - Cuda_Compute_Kinetic_Energy (&system, &data); + Cuda_Compute_Kinetic_Energy (&system, &data); #ifndef __BUILD_DEBUG__ - // Here sync the simulation data, because it has been changed. - Prep_Device_For_Output ( &system, &data ); - Output_Results(&system, &control, &data, &workspace, &lists, &out_control); + // Here sync the simulation data, because it has been changed. + Prep_Device_For_Output ( &system, &data ); + Output_Results(&system, &control, &data, &workspace, &lists, &out_control); #endif #ifdef __BUILD_DEBUG__ - if (!validate_device (&system, &data, &workspace, &lists) ) - { - fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step); - exit (1); - } + if (!validate_device (&system, &data, &workspace, &lists) ) + { + fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step); + exit (1); + } #endif #ifdef __DEBUG_CUDA__ - fprintf (stderr, "step -> %d <- done. 
\n", data.step); + fprintf (stderr, "step -> %d <- done. \n", data.step); #endif - ++data.step; + ++data.step; - for( ; data.step <= control.nsteps; data.step++ ) { + for( ; data.step <= control.nsteps; data.step++ ) { - //fprintf (stderr, "Begin ... \n"); - //to Sync step to the device. - //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice ); - copy_host_device (&data.step, &((simulation_data *)data.d_simulation_data)->step, - INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + //fprintf (stderr, "Begin ... \n"); + //to Sync step to the device. + //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice ); + copy_host_device (&data.step, &((simulation_data *)data.d_simulation_data)->step, + INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - //fprintf (stderr, "Synched data .... \n"); - if( control.T_mode ) { - Temperature_Control( &control, &data, &out_control ); - Sync_Host_Device (&control, (control_params *)control.d_control, cudaMemcpyHostToDevice ); - } - //fprintf (stderr, "Temp. Control done ... \n"); + //fprintf (stderr, "Synched data .... \n"); + if( control.T_mode ) { + Temperature_Control( &control, &data, &out_control ); + Sync_Host_Device (&control, (control_params *)control.d_control, cudaMemcpyHostToDevice ); + } + //fprintf (stderr, "Temp. Control done ... 
\n"); #ifdef __BUILD_DEBUG__ - Evolve( &system, &control, &data, &workspace, &lists, &out_control ); + Evolve( &system, &control, &data, &workspace, &lists, &out_control ); #endif - Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); + Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); - //fprintf (stderr, "Evolve done \n"); + //fprintf (stderr, "Evolve done \n"); #ifdef __BUILD_DEBUG__ - Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); + Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); #endif - Cuda_Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); - //fprintf (stderr, "Post Evolve done \n"); + Cuda_Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); + //fprintf (stderr, "Post Evolve done \n"); #ifndef __BUILD_DEBUG__ - Prep_Device_For_Output ( &system, &data ); - Output_Results(&system, &control, &data, &workspace, &lists, &out_control); - - /* - Analysis( &system, &control, &data, &workspace, &lists, &out_control ); - */ - steps = data.step - data.prev_steps; - if( steps && out_control.restart_freq && - steps % out_control.restart_freq == 0 ) - Write_Restart( &system, &control, &data, &workspace, &out_control ); + Prep_Device_For_Output ( &system, &data ); + Output_Results(&system, &control, &data, &workspace, &lists, &out_control); + + /* + Analysis( &system, &control, &data, &workspace, &lists, &out_control ); + */ + steps = data.step - data.prev_steps; + if( steps && out_control.restart_freq && + steps % out_control.restart_freq == 0 ) + Write_Restart( &system, &control, &data, &workspace, &out_control ); #endif #ifdef __BUILD_DEBUG__ - if (!validate_device (&system, &data, &workspace, &lists) ) - { - fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step); - exit (1); - } + if (!validate_device (&system, &data, &workspace, &lists) ) + { + fprintf (stderr, " Results does not 
match between Device and host @ step --> %d \n", data.step); + exit (1); + } #endif - fprintf (stderr, "step -> %d <- done. \n", data.step); - } + fprintf (stderr, "step -> %d <- done. \n", data.step); + } - if( out_control.write_steps > 0 ) { - fclose( out_control.trj ); - //Write_PDB( &system, &control, &data, &workspace, - // &(lists[BONDS]), &out_control ); - } + if( out_control.write_steps > 0 ) { + fclose( out_control.trj ); + //Write_PDB( &system, &control, &data, &workspace, + // &(lists[BONDS]), &out_control ); + } - data.timing.end = Get_Time( ); - data.timing.elapsed = Get_Timing_Info( data.timing.start ); - fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed ); + data.timing.end = Get_Time( ); + data.timing.elapsed = Get_Timing_Info( data.timing.start ); + fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed ); - return 0; + return 0; } diff --git a/PuReMD-GPU/src/three_body_interactions.cu b/PuReMD-GPU/src/three_body_interactions.cu index bc4d73cf..c2eed63b 100644 --- a/PuReMD-GPU/src/three_body_interactions.cu +++ b/PuReMD-GPU/src/three_body_interactions.cu @@ -30,43 +30,43 @@ /* calculates the theta angle between i-j-k */ HOST_DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, - real *theta, real *cos_theta ) + real *theta, real *cos_theta ) { - (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk ); - if( *cos_theta > 1. ) *cos_theta = 1.0; - if( *cos_theta < -1. ) *cos_theta = -1.0; + (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk ); + if( *cos_theta > 1. ) *cos_theta = 1.0; + if( *cos_theta < -1. 
) *cos_theta = -1.0; - (*theta) = ACOS( *cos_theta ); + (*theta) = ACOS( *cos_theta ); } /* calculates the derivative of the cosine of the angle between i-j-k */ HOST_DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, - rvec* dcos_theta_di, rvec* dcos_theta_dj, - rvec* dcos_theta_dk ) + rvec* dcos_theta_di, rvec* dcos_theta_dj, + rvec* dcos_theta_dk ) { - int t; - real sqr_d_ji = SQR(d_ji); - real sqr_d_jk = SQR(d_jk); - real inv_dists = 1.0 / (d_ji * d_jk); - real inv_dists3 = POW( inv_dists, 3 ); - real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 ); - real Cdot_inv3 = dot_dvecs * inv_dists3; - - for( t = 0; t < 3; ++t ) { - (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - - Cdot_inv3 * sqr_d_jk * dvec_ji[t]; - - (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists + - Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] ); - - (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - - Cdot_inv3 * sqr_d_ji * dvec_jk[t]; - } - - /*fprintf( stderr, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - dvec_jk[t] * inv_dists*/ + int t; + real sqr_d_ji = SQR(d_ji); + real sqr_d_jk = SQR(d_jk); + real inv_dists = 1.0 / (d_ji * d_jk); + real inv_dists3 = POW( inv_dists, 3 ); + real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 ); + real Cdot_inv3 = dot_dvecs * inv_dists3; + + for( t = 0; t < 3; ++t ) { + (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - + Cdot_inv3 * sqr_d_jk * dvec_ji[t]; + + (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists + + Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] ); + + (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - + Cdot_inv3 * sqr_d_ji * dvec_jk[t]; + } + + /*fprintf( stderr, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + dvec_jk[t] * inv_dists*/ } @@ -83,508 +83,508 @@ HOST_DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, re /* this is a 3-body interaction in which the main role is played by j which sits in the middle of the 
other two. */ void Three_Body_Interactions( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j, start_pk, end_pk; - int flag, cnt, num_thb_intrs; - - real temp, temp_bo_jt, pBOjt7; - real p_val1, p_val2, p_val3, p_val4, p_val5; - real p_val6, p_val7, p_val8, p_val9, p_val10; - real p_pen1, p_pen2, p_pen3, p_pen4; - real p_coa1, p_coa2, p_coa3, p_coa4; - real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; - real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; - real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; - real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; - real CEpen1, CEpen2, CEpen3; - real e_ang, e_coa, e_pen; - real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; - real Cf7ij, Cf7jk, Cf8j, Cf9j; - real f7_ij, f7_jk, f8_Dj, f9_Dj; - real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; - real r_ij, r_jk; - real BOA_ij, BOA_jk; - real vlpadj; - rvec force, ext_press; - // rtensor temp_rtensor, total_rtensor; - real *total_bo; - three_body_header *thbh; - three_body_parameters *thbp; - three_body_interaction_data *p_ijk, *p_kji; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - bond_order_data *bo_ij, *bo_jk, *bo_jt; - list *bonds, *thb_intrs; - bond_data *bond_list; - three_body_interaction_data *thb_list; - - total_bo = workspace->total_bond_order; - bonds = (*lists) + BONDS; - bond_list = bonds->select.bond_list; - thb_intrs = (*lists) + THREE_BODIES; - thb_list = thb_intrs->select.three_body_list; - - /* global parameters used in these calculations */ - p_val6 = system->reaxprm.gp.l[14]; - p_val8 = system->reaxprm.gp.l[33]; - p_val9 = system->reaxprm.gp.l[16]; - p_val10 = system->reaxprm.gp.l[17]; - num_thb_intrs = 0; - - for( j = 
0; j < system->N; ++j ) { - // fprintf( out_control->eval, "j: %d\n", j ); - type_j = system->atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - p_val3 = system->reaxprm.sbp[ type_j ].p_val3; - p_val5 = system->reaxprm.sbp[ type_j ].p_val5; - - SBOp = 0, prod_SBO = 1; - for( t = start_j; t < end_j; ++t ) { - bo_jt = &(bond_list[t].bo_data); - SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); - temp = SQR( bo_jt->BO ); - temp *= temp; - temp *= temp; - prod_SBO *= EXP( -temp ); - } - - /* modifications to match Adri's code - 09/01/09 */ - if( workspace->vlpex[j] >= 0 ){ - vlpadj = 0; - dSBO2 = prod_SBO - 1; - } - else{ - vlpadj = workspace->nlp[j]; - dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); - } - - SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); - dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); - - if( SBO <= 0 ) - SBO2 = 0, CSBO2 = 0; - else if( SBO > 0 && SBO <= 1 ) { - SBO2 = POW( SBO, p_val9 ); - CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); - } - else if( SBO > 1 && SBO < 2 ) { - SBO2 = 2 - POW( 2-SBO, p_val9 ); - CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); - } - else - SBO2 = 2, CSBO2 = 0; - - expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); - - /* unlike 2-body intrs where we enforce i<j, we cannot put any such - restrictions here. such a restriction would prevent us from producing - all 4-body intrs correctly */ - for( pi = start_j; pi < end_j; ++pi ) { - Set_Start_Index( pi, num_thb_intrs, thb_intrs ); - - pbond_ij = &(bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; - - - if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = system->atoms[i].type; - // fprintf( out_control->eval, "i: %d\n", i ); - - - /* first copy 3-body intrs from previously computed ones where i>k. 
+ int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j, start_pk, end_pk; + int flag, cnt, num_thb_intrs; + + real temp, temp_bo_jt, pBOjt7; + real p_val1, p_val2, p_val3, p_val4, p_val5; + real p_val6, p_val7, p_val8, p_val9, p_val10; + real p_pen1, p_pen2, p_pen3, p_pen4; + real p_coa1, p_coa2, p_coa3, p_coa4; + real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; + real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; + real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; + real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; + real CEpen1, CEpen2, CEpen3; + real e_ang, e_coa, e_pen; + real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; + real Cf7ij, Cf7jk, Cf8j, Cf9j; + real f7_ij, f7_jk, f8_Dj, f9_Dj; + real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; + real r_ij, r_jk; + real BOA_ij, BOA_jk; + real vlpadj; + rvec force, ext_press; + // rtensor temp_rtensor, total_rtensor; + real *total_bo; + three_body_header *thbh; + three_body_parameters *thbp; + three_body_interaction_data *p_ijk, *p_kji; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; + list *bonds, *thb_intrs; + bond_data *bond_list; + three_body_interaction_data *thb_list; + + total_bo = workspace->total_bond_order; + bonds = (*lists) + BONDS; + bond_list = bonds->select.bond_list; + thb_intrs = (*lists) + THREE_BODIES; + thb_list = thb_intrs->select.three_body_list; + + /* global parameters used in these calculations */ + p_val6 = system->reaxprm.gp.l[14]; + p_val8 = system->reaxprm.gp.l[33]; + p_val9 = system->reaxprm.gp.l[16]; + p_val10 = system->reaxprm.gp.l[17]; + num_thb_intrs = 0; + + for( j = 0; j < system->N; ++j ) { + // fprintf( out_control->eval, "j: %d\n", j ); + type_j = system->atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + p_val3 = system->reaxprm.sbp[ type_j ].p_val3; + p_val5 = system->reaxprm.sbp[ type_j ].p_val5; + + SBOp = 0, 
prod_SBO = 1; + for( t = start_j; t < end_j; ++t ) { + bo_jt = &(bond_list[t].bo_data); + SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); + temp = SQR( bo_jt->BO ); + temp *= temp; + temp *= temp; + prod_SBO *= EXP( -temp ); + } + + /* modifications to match Adri's code - 09/01/09 */ + if( workspace->vlpex[j] >= 0 ){ + vlpadj = 0; + dSBO2 = prod_SBO - 1; + } + else{ + vlpadj = workspace->nlp[j]; + dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); + } + + SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); + dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); + + if( SBO <= 0 ) + SBO2 = 0, CSBO2 = 0; + else if( SBO > 0 && SBO <= 1 ) { + SBO2 = POW( SBO, p_val9 ); + CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); + } + else if( SBO > 1 && SBO < 2 ) { + SBO2 = 2 - POW( 2-SBO, p_val9 ); + CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); + } + else + SBO2 = 2, CSBO2 = 0; + + expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); + + /* unlike 2-body intrs where we enforce i<j, we cannot put any such + restrictions here. such a restriction would prevent us from producing + all 4-body intrs correctly */ + for( pi = start_j; pi < end_j; ++pi ) { + Set_Start_Index( pi, num_thb_intrs, thb_intrs ); + + pbond_ij = &(bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; + + + if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = system->atoms[i].type; + // fprintf( out_control->eval, "i: %d\n", i ); + + + /* first copy 3-body intrs from previously computed ones where i>k. IMPORTANT: if it is less costly to compute theta and its derivative, we should definitely re-compute them, instead of copying! 
in the second for-loop below, we compute only new 3-body intrs where i < k */ - for( pk = start_j; pk < pi; ++pk ) { - // fprintf( out_control->eval, "pk: %d\n", pk ); - start_pk = Start_Index( pk, thb_intrs ); - end_pk = End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_list[t].thb == i ) { - p_ijk = &(thb_list[num_thb_intrs]); - p_kji = &(thb_list[t]); - - p_ijk->thb = bond_list[pk].nbr; - p_ijk->pthb = pk; - p_ijk->theta = p_kji->theta; - rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); - rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); - rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); - - //if (j == 12) - //fprintf (stderr, "Adding one for matched atom %d \n", i); - - ++num_thb_intrs; - break; - } - } - - - /* and this is the second for loop mentioned above */ - for( pk = pi+1; pk < end_j; ++pk ) { - pbond_jk = &(bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; - k = pbond_jk->nbr; - type_k = system->atoms[k].type; - p_ijk = &( thb_list[num_thb_intrs] ); - - //TODO - CHANGE ORIGINAL - if (BOA_jk <= 0) continue; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &theta, &cos_theta ); - - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &(p_ijk->dcos_di), &(p_ijk->dcos_dj), - &(p_ijk->dcos_dk) ); - - p_ijk->thb = k; - p_ijk->pthb = pk; - p_ijk->theta = theta; - - //if (j == 12) - //fprintf (stderr, "Adding one for the rest %d \n", k); - - sin_theta = SIN( theta ); - if( sin_theta < 1.0e-5 ) - sin_theta = 1.0e-5; - - ++num_thb_intrs; - - - if( BOA_jk > 0.0 && - (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { - r_jk = pbond_jk->d; - thbh = &( system->reaxprm.thbp[ index_thbp (type_i,type_j,type_k,&system->reaxprm) ] ); - flag = 0; - - /* if( workspace->orig_id[i] < workspace->orig_id[k] ) - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], bo_ij->BO, bo_jk->BO, 
p_ijk->theta ); - else - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[k], workspace->orig_id[j], - workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ - - - for( cnt = 0; cnt < thbh->cnt; ++cnt ) { - // fprintf( out_control->eval, - // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); - - if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { - thbp = &( thbh->prm[cnt] ); - - /* ANGLE ENERGY */ - p_val1 = thbp->p_val1; - p_val2 = thbp->p_val2; - p_val4 = thbp->p_val4; - p_val7 = thbp->p_val7; - theta_00 = thbp->theta_00; - - exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); - f7_ij = 1.0 - exp3ij; - Cf7ij = p_val3 * p_val4 * - POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; - - exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); - f7_jk = 1.0 - exp3jk; - Cf7jk = p_val3 * p_val4 * - POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; - - expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); - trm8 = 1.0 + expval6 + expval7; - f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); - Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * - (p_val6 * expval6 * trm8 - - (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); - - theta_0 = 180.0 - - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); - theta_0 = DEG2RAD( theta_0 ); - - expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); - if( p_val1 >= 0 ) - expval12theta = p_val1 * (1.0 - expval2theta); - else // To avoid linear Me-H-Me angles (6/6/06) - expval12theta = p_val1 * -expval2theta; - - CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; - CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; - CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; - CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * - expval2theta * (theta_0 - theta); - - Ctheta_0 = p_val10 * DEG2RAD(theta_00) * - exp( -p_val10 * (2.0 - SBO2) ); - - CEval5 = -CEval4 * Ctheta_0 * CSBO2; - CEval6 = CEval5 * dSBO1; - CEval7 = CEval5 * dSBO2; - CEval8 = -CEval4 / sin_theta; - - data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; - /* END ANGLE ENERGY*/ - - 
- /* PENALTY ENERGY */ - p_pen1 = thbp->p_pen1; - p_pen2 = system->reaxprm.gp.l[19]; - p_pen3 = system->reaxprm.gp.l[20]; - p_pen4 = system->reaxprm.gp.l[21]; - - exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); - exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); - exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); - exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); - trm_pen34 = 1.0 + exp_pen3 + exp_pen4; - f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; - Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + - p_pen4 * exp_pen4 )) / - SQR( trm_pen34 ); - - data->E_Pen += e_pen = - p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; - - CEpen1 = e_pen * Cf9j / f9_Dj; - temp = -2.0 * p_pen2 * e_pen; - CEpen2 = temp * (BOA_ij - 2.0); - CEpen3 = temp * (BOA_jk - 2.0); - /* END PENALTY ENERGY */ - - - /* COALITION ENERGY */ - p_coa1 = thbp->p_coa1; - p_coa2 = system->reaxprm.gp.l[2]; - p_coa3 = system->reaxprm.gp.l[38]; - p_coa4 = system->reaxprm.gp.l[30]; - - exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); - data->E_Coa += e_coa = - p_coa1 / (1. 
+ exp_coa2) * - EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * - EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * - EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * - EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); - - CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; - CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; - CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); - CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; - CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; - /* END COALITION ENERGY */ - - /* FORCES */ - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)); - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)); - workspace->CdDelta[j] += ((CEval3 + CEval7) + - CEpen1 + CEcoa3); - workspace->CdDelta[i] += CEcoa4; - workspace->CdDelta[k] += CEcoa5; - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - // fprintf( out_control->eval, "%6d%12.8f\n", - // workspace->orig_id[ bond_list[t].nbr ], - // (CEval6 * pBOjt7) ); - - bo_jt->Cdbo += (CEval6 * pBOjt7); - bo_jt->Cdbopi += CEval5; - bo_jt->Cdbopi2 += CEval5; - } - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - - rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk ); - - /* - if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j); - if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j); - */ - } - else { - /* terms not related to bond order derivatives - are added directly into - forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, 
CEval8, p_ijk->dcos_dk ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* This part is for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_di, system->atoms[i].x ); - rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dj, system->atoms[j].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dk, system->atoms[k].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - if( pbond_ij->imaginary || pbond_jk->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, - -1.0, total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } + for( pk = start_j; pk < pi; ++pk ) { + // fprintf( out_control->eval, "pk: %d\n", pk ); + start_pk = Start_Index( pk, thb_intrs ); + end_pk = End_Index( pk, thb_intrs ); + + for( t = start_pk; t < end_pk; ++t ) + if( thb_list[t].thb == i ) { + p_ijk = &(thb_list[num_thb_intrs]); + p_kji = &(thb_list[t]); + + p_ijk->thb = bond_list[pk].nbr; + p_ijk->pthb = pk; + p_ijk->theta = p_kji->theta; + rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); + rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); + rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); + + //if (j == 12) + //fprintf (stderr, "Adding one for matched atom %d \n", i); + + ++num_thb_intrs; + break; + } + } + + + /* and this is the second for loop mentioned above */ + for( pk = pi+1; pk < end_j; ++pk ) { + pbond_jk = &(bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; + k = pbond_jk->nbr; + type_k = system->atoms[k].type; + p_ijk = &( thb_list[num_thb_intrs] ); + + //TODO - CHANGE ORIGINAL + if (BOA_jk <= 0) continue; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &theta, &cos_theta ); + + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, 
pbond_jk->d, + &(p_ijk->dcos_di), &(p_ijk->dcos_dj), + &(p_ijk->dcos_dk) ); + + p_ijk->thb = k; + p_ijk->pthb = pk; + p_ijk->theta = theta; + + //if (j == 12) + //fprintf (stderr, "Adding one for the rest %d \n", k); + + sin_theta = SIN( theta ); + if( sin_theta < 1.0e-5 ) + sin_theta = 1.0e-5; + + ++num_thb_intrs; + + + if( BOA_jk > 0.0 && + (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { + r_jk = pbond_jk->d; + thbh = &( system->reaxprm.thbp[ index_thbp (type_i,type_j,type_k,&system->reaxprm) ] ); + flag = 0; + + /* if( workspace->orig_id[i] < workspace->orig_id[k] ) + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); + else + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[k], workspace->orig_id[j], + workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ + + + for( cnt = 0; cnt < thbh->cnt; ++cnt ) { + // fprintf( out_control->eval, + // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); + + if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { + thbp = &( thbh->prm[cnt] ); + + /* ANGLE ENERGY */ + p_val1 = thbp->p_val1; + p_val2 = thbp->p_val2; + p_val4 = thbp->p_val4; + p_val7 = thbp->p_val7; + theta_00 = thbp->theta_00; + + exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); + f7_ij = 1.0 - exp3ij; + Cf7ij = p_val3 * p_val4 * + POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; + + exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); + f7_jk = 1.0 - exp3jk; + Cf7jk = p_val3 * p_val4 * + POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; + + expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); + trm8 = 1.0 + expval6 + expval7; + f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); + Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * + (p_val6 * expval6 * trm8 - + (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); + + theta_0 = 180.0 - + theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); + theta_0 = DEG2RAD( theta_0 ); + + expval2theta = 
EXP(-p_val2 * SQR(theta_0-theta)); + if( p_val1 >= 0 ) + expval12theta = p_val1 * (1.0 - expval2theta); + else // To avoid linear Me-H-Me angles (6/6/06) + expval12theta = p_val1 * -expval2theta; + + CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; + CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; + CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; + CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * + expval2theta * (theta_0 - theta); + + Ctheta_0 = p_val10 * DEG2RAD(theta_00) * + exp( -p_val10 * (2.0 - SBO2) ); + + CEval5 = -CEval4 * Ctheta_0 * CSBO2; + CEval6 = CEval5 * dSBO1; + CEval7 = CEval5 * dSBO2; + CEval8 = -CEval4 / sin_theta; + + data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; + /* END ANGLE ENERGY*/ + + + /* PENALTY ENERGY */ + p_pen1 = thbp->p_pen1; + p_pen2 = system->reaxprm.gp.l[19]; + p_pen3 = system->reaxprm.gp.l[20]; + p_pen4 = system->reaxprm.gp.l[21]; + + exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); + exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); + exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); + exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); + trm_pen34 = 1.0 + exp_pen3 + exp_pen4; + f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; + Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - + (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + + p_pen4 * exp_pen4 )) / + SQR( trm_pen34 ); + + data->E_Pen += e_pen = + p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; + + CEpen1 = e_pen * Cf9j / f9_Dj; + temp = -2.0 * p_pen2 * e_pen; + CEpen2 = temp * (BOA_ij - 2.0); + CEpen3 = temp * (BOA_jk - 2.0); + /* END PENALTY ENERGY */ + + + /* COALITION ENERGY */ + p_coa1 = thbp->p_coa1; + p_coa2 = system->reaxprm.gp.l[2]; + p_coa3 = system->reaxprm.gp.l[38]; + p_coa4 = system->reaxprm.gp.l[30]; + + exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); + data->E_Coa += e_coa = + p_coa1 / (1. 
+ exp_coa2) * + EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * + EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * + EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * + EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); + + CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; + CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; + CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); + CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; + CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; + /* END COALITION ENERGY */ + + /* FORCES */ + bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)); + bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)); + workspace->CdDelta[j] += ((CEval3 + CEval7) + + CEpen1 + CEcoa3); + workspace->CdDelta[i] += CEcoa4; + workspace->CdDelta[k] += CEcoa5; + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + // fprintf( out_control->eval, "%6d%12.8f\n", + // workspace->orig_id[ bond_list[t].nbr ], + // (CEval6 * pBOjt7) ); + + bo_jt->Cdbo += (CEval6 * pBOjt7); + bo_jt->Cdbopi += CEval5; + bo_jt->Cdbopi2 += CEval5; + } + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + + rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk ); + + /* + if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j); + if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j); + */ + } + else { + /* terms not related to bond order derivatives + are added directly into + forces and pressure vector/tensor */ + rvec_Scale( force, CEval8, p_ijk->dcos_di ); + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); + + rvec_Scale( force, 
CEval8, p_ijk->dcos_dk ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* This part is for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_di, system->atoms[i].x ); + rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dj, system->atoms[j].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dk, system->atoms[k].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + if( pbond_ij->imaginary || pbond_jk->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, + -1.0, total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } #ifdef TEST_ENERGY - fprintf( out_control->eval, - //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", - "%6d%6d%6d%23.15e%23.15e%23.15e\n", - i+1, j+1, k+1, - //workspace->orig_id[i]+1, - //workspace->orig_id[j]+1, - //workspace->orig_id[k]+1, - //workspace->Delta_boc[j], - RAD2DEG(theta), /*BOA_ij, BOA_jk, */ - e_ang, data->E_Ang ); - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - p_val3, p_val4, BOA_ij, BOA_jk ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - f7_ij, f7_jk, f8_Dj, expval12theta ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e\n", - CEval1, CEval2, CEval3, CEval4, CEval5 - //CEval6, CEval7, CEval8 );*/ - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - -p_ijk->dcos_di[0]/sin_theta, - -p_ijk->dcos_di[1]/sin_theta, - -p_ijk->dcos_di[2]/sin_theta, - -p_ijk->dcos_dj[0]/sin_theta, - -p_ijk->dcos_dj[1]/sin_theta, - -p_ijk->dcos_dj[2]/sin_theta, - -p_ijk->dcos_dk[0]/sin_theta, - -p_ijk->dcos_dk[1]/sin_theta, - -p_ijk->dcos_dk[2]/sin_theta );*/ - - /* fprintf( out_control->epen, - "%23.15e%23.15e%23.15e\n", - CEpen1, CEpen2, CEpen3 ); - fprintf( 
out_control->epen, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], RAD2DEG(theta), - BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ - - fprintf( out_control->ecoa, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], - workspace->orig_id[j], - workspace->orig_id[k], - RAD2DEG(theta), BOA_ij, BOA_jk, - e_coa, data->E_Coa ); + fprintf( out_control->eval, + //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", + "%6d%6d%6d%23.15e%23.15e%23.15e\n", + i+1, j+1, k+1, + //workspace->orig_id[i]+1, + //workspace->orig_id[j]+1, + //workspace->orig_id[k]+1, + //workspace->Delta_boc[j], + RAD2DEG(theta), /*BOA_ij, BOA_jk, */ + e_ang, data->E_Ang ); + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + p_val3, p_val4, BOA_ij, BOA_jk ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + f7_ij, f7_jk, f8_Dj, expval12theta ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e\n", + CEval1, CEval2, CEval3, CEval4, CEval5 + //CEval6, CEval7, CEval8 );*/ + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + -p_ijk->dcos_di[0]/sin_theta, + -p_ijk->dcos_di[1]/sin_theta, + -p_ijk->dcos_di[2]/sin_theta, + -p_ijk->dcos_dj[0]/sin_theta, + -p_ijk->dcos_dj[1]/sin_theta, + -p_ijk->dcos_dj[2]/sin_theta, + -p_ijk->dcos_dk[0]/sin_theta, + -p_ijk->dcos_dk[1]/sin_theta, + -p_ijk->dcos_dk[2]/sin_theta );*/ + + /* fprintf( out_control->epen, + "%23.15e%23.15e%23.15e\n", + CEpen1, CEpen2, CEpen3 ); + fprintf( out_control->epen, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], RAD2DEG(theta), + BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ + + fprintf( out_control->ecoa, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], + workspace->orig_id[j], + workspace->orig_id[k], + RAD2DEG(theta), BOA_ij, BOA_jk, + 
e_coa, data->E_Coa ); #endif #ifdef TEST_FORCES /* angle forces */ - Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); - Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); - Add_dDelta( system, lists, - j, CEval3 + CEval7, workspace->f_ang ); - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - Add_dBO( system, lists, j, t, pBOjt7 * CEval6, - workspace->f_ang ); - Add_dBOpinpi2( system, lists, j, t, - CEval5, CEval5, - workspace->f_ang, workspace->f_ang ); - } - - rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); - /* end angle forces */ - - /* penalty forces */ - Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); - Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); - Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); - /* end penalty forces */ - - /* coalition forces */ - Add_dBO( system, lists, - j, pi, CEcoa1-CEcoa4, workspace->f_coa ); - Add_dBO( system, lists, - j, pk, CEcoa2-CEcoa5, workspace->f_coa ); - Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); - Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); - Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); - /* end coalition forces */ + Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); + Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); + Add_dDelta( system, lists, + j, CEval3 + CEval7, workspace->f_ang ); + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + Add_dBO( system, lists, j, t, pBOjt7 * CEval6, + workspace->f_ang ); + Add_dBOpinpi2( system, lists, j, t, + CEval5, CEval5, + 
workspace->f_ang, workspace->f_ang ); + } + + rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); + /* end angle forces */ + + /* penalty forces */ + Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); + Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); + Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); + /* end penalty forces */ + + /* coalition forces */ + Add_dBO( system, lists, + j, pi, CEcoa1-CEcoa4, workspace->f_coa ); + Add_dBO( system, lists, + j, pk, CEcoa2-CEcoa5, workspace->f_coa ); + Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); + Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); + Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); + /* end coalition forces */ #endif - } - } - } - } - } - - Set_End_Index(pi, num_thb_intrs, thb_intrs ); - } - } - - if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { - workspace->realloc.num_3body = num_thb_intrs; - if( num_thb_intrs > thb_intrs->num_intrs ) { - fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", - data->step, num_thb_intrs, thb_intrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - } - - //fprintf( stderr,"%d: Number of angle interactions: %d\n", - // data->step, num_thb_intrs ); + } + } + } + } + } + + Set_End_Index(pi, num_thb_intrs, thb_intrs ); + } + } + + if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { + workspace->realloc.num_3body = num_thb_intrs; + if( num_thb_intrs > thb_intrs->num_intrs ) { + fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", + data->step, num_thb_intrs, thb_intrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + } + + //fprintf( stderr,"%d: Number of angle interactions: %d\n", + // data->step, num_thb_intrs ); #ifdef TEST_ENERGY - fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); + fprintf( 
stderr,"Number of angle interactions: %d\n", num_thb_intrs ); - fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", - data->E_Ang, data->E_Pen, data->E_Coa ); + fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", + data->E_Ang, data->E_Pen, data->E_Coa ); - fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); + fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); #endif } @@ -597,598 +597,598 @@ where i < k */ /* this is a 3-body interaction in which the main role is played by j which sits in the middle of the other two. */ GLOBAL void Three_Body_Interactions( reax_atom *atoms, - single_body_parameters *sbp, - three_body_header *d_thbp, - global_parameters g_params, - control_params *control, - simulation_data *data, - static_storage p_workspace, - list p_bonds, list p_thb_intrs, - int N, int num_atom_types, - real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press ) + single_body_parameters *sbp, + three_body_header *d_thbp, + global_parameters g_params, + control_params *control, + simulation_data *data, + static_storage p_workspace, + list p_bonds, list p_thb_intrs, + int N, int num_atom_types, + real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press ) { - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j, start_pk, end_pk; - int flag, cnt, num_thb_intrs; - - real temp, temp_bo_jt, pBOjt7; - real p_val1, p_val2, p_val3, p_val4, p_val5; - real p_val6, p_val7, p_val8, p_val9, p_val10; - real p_pen1, p_pen2, p_pen3, p_pen4; - real p_coa1, p_coa2, p_coa3, p_coa4; - real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; - real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; - real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; - real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; - 
real CEpen1, CEpen2, CEpen3; - real e_ang, e_coa, e_pen; - real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; - real Cf7ij, Cf7jk, Cf8j, Cf9j; - real f7_ij, f7_jk, f8_Dj, f9_Dj; - real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; - real r_ij, r_jk; - real BOA_ij, BOA_jk; - real vlpadj; - rvec force, ext_press; - // rtensor temp_rtensor, total_rtensor; - real *total_bo; - three_body_header *thbh; - three_body_parameters *thbp; - three_body_interaction_data *p_ijk, *p_kji; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - bond_order_data *bo_ij, *bo_jk, *bo_jt; - list *bonds, *thb_intrs; - bond_data *bond_list; - three_body_interaction_data *thb_list; - static_storage *workspace = &p_workspace; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - - total_bo = workspace->total_bond_order; - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - thb_intrs = &p_thb_intrs; - thb_list = thb_intrs->select.three_body_list; - - /* global parameters used in these calculations */ - p_val6 = g_params.l[14]; - p_val8 = g_params.l[33]; - p_val9 = g_params.l[16]; - p_val10 = g_params.l[17]; - - //TODO check this, initially this was zero, - // I am changing it to the starting index for this atom. 
- //num_thb_intrs = j * MAX_TH_BODY; - - //for( j = 0; j < system->N; ++j ) { - // fprintf( out_control->eval, "j: %d\n", j ); - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - p_val3 = sbp[ type_j ].p_val3; - p_val5 = sbp[ type_j ].p_val5; - - SBOp = 0, prod_SBO = 1; - for( t = start_j; t < end_j; ++t ) { - bo_jt = &(bond_list[t].bo_data); - SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); - temp = SQR( bo_jt->BO ); - temp *= temp; - temp *= temp; - prod_SBO *= EXP( -temp ); - } - - /* modifications to match Adri's code - 09/01/09 */ - if( workspace->vlpex[j] >= 0 ){ - vlpadj = 0; - dSBO2 = prod_SBO - 1; - } - else{ - vlpadj = workspace->nlp[j]; - dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); - } - - SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); - dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); - - if( SBO <= 0 ) - SBO2 = 0, CSBO2 = 0; - else if( SBO > 0 && SBO <= 1 ) { - SBO2 = POW( SBO, p_val9 ); - CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); - } - else if( SBO > 1 && SBO < 2 ) { - SBO2 = 2 - POW( 2-SBO, p_val9 ); - CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); - } - else - SBO2 = 2, CSBO2 = 0; - - expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); - - /* unlike 2-body intrs where we enforce i<j, we cannot put any such - restrictions here. such a restriction would prevent us from producing - all 4-body intrs correctly */ - for( pi = start_j; pi < end_j; ++pi ) { - - //TODO - //num_thb_intrs = pi * MAX_THREE_BODIES; - //TODO - - //Set_Start_Index( pi, num_thb_intrs, thb_intrs ); - num_thb_intrs = Start_Index (pi, thb_intrs); - - pbond_ij = &(bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; - - - if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = atoms[i].type; - // fprintf( out_control->eval, "i: %d\n", i ); - - - /* first copy 3-body intrs from previously computed ones where i>k. 
+ int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j, start_pk, end_pk; + int flag, cnt, num_thb_intrs; + + real temp, temp_bo_jt, pBOjt7; + real p_val1, p_val2, p_val3, p_val4, p_val5; + real p_val6, p_val7, p_val8, p_val9, p_val10; + real p_pen1, p_pen2, p_pen3, p_pen4; + real p_coa1, p_coa2, p_coa3, p_coa4; + real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; + real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; + real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; + real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; + real CEpen1, CEpen2, CEpen3; + real e_ang, e_coa, e_pen; + real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; + real Cf7ij, Cf7jk, Cf8j, Cf9j; + real f7_ij, f7_jk, f8_Dj, f9_Dj; + real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; + real r_ij, r_jk; + real BOA_ij, BOA_jk; + real vlpadj; + rvec force, ext_press; + // rtensor temp_rtensor, total_rtensor; + real *total_bo; + three_body_header *thbh; + three_body_parameters *thbp; + three_body_interaction_data *p_ijk, *p_kji; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; + list *bonds, *thb_intrs; + bond_data *bond_list; + three_body_interaction_data *thb_list; + static_storage *workspace = &p_workspace; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + + total_bo = workspace->total_bond_order; + bonds = &p_bonds; + bond_list = bonds->select.bond_list; + thb_intrs = &p_thb_intrs; + thb_list = thb_intrs->select.three_body_list; + + /* global parameters used in these calculations */ + p_val6 = g_params.l[14]; + p_val8 = g_params.l[33]; + p_val9 = g_params.l[16]; + p_val10 = g_params.l[17]; + + //TODO check this, initially this was zero, + // I am changing it to the starting index for this atom. 
+ //num_thb_intrs = j * MAX_TH_BODY; + + //for( j = 0; j < system->N; ++j ) { + // fprintf( out_control->eval, "j: %d\n", j ); + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + p_val3 = sbp[ type_j ].p_val3; + p_val5 = sbp[ type_j ].p_val5; + + SBOp = 0, prod_SBO = 1; + for( t = start_j; t < end_j; ++t ) { + bo_jt = &(bond_list[t].bo_data); + SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); + temp = SQR( bo_jt->BO ); + temp *= temp; + temp *= temp; + prod_SBO *= EXP( -temp ); + } + + /* modifications to match Adri's code - 09/01/09 */ + if( workspace->vlpex[j] >= 0 ){ + vlpadj = 0; + dSBO2 = prod_SBO - 1; + } + else{ + vlpadj = workspace->nlp[j]; + dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); + } + + SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); + dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); + + if( SBO <= 0 ) + SBO2 = 0, CSBO2 = 0; + else if( SBO > 0 && SBO <= 1 ) { + SBO2 = POW( SBO, p_val9 ); + CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); + } + else if( SBO > 1 && SBO < 2 ) { + SBO2 = 2 - POW( 2-SBO, p_val9 ); + CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); + } + else + SBO2 = 2, CSBO2 = 0; + + expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); + + /* unlike 2-body intrs where we enforce i<j, we cannot put any such + restrictions here. such a restriction would prevent us from producing + all 4-body intrs correctly */ + for( pi = start_j; pi < end_j; ++pi ) { + + //TODO + //num_thb_intrs = pi * MAX_THREE_BODIES; + //TODO + + //Set_Start_Index( pi, num_thb_intrs, thb_intrs ); + num_thb_intrs = Start_Index (pi, thb_intrs); + + pbond_ij = &(bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; + + + if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = atoms[i].type; + // fprintf( out_control->eval, "i: %d\n", i ); + + + /* first copy 3-body intrs from previously computed ones where i>k. 
IMPORTANT: if it is less costly to compute theta and its derivative, we should definitely re-compute them, instead of copying! in the second for-loop below, we compute only new 3-body intrs where i < k */ - for( pk = start_j; pk < pi; ++pk ) { - // fprintf( out_control->eval, "pk: %d\n", pk ); - start_pk = Start_Index( pk, thb_intrs ); - end_pk = End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_list[t].thb == i ) { - p_ijk = &(thb_list[num_thb_intrs]); - p_kji = &(thb_list[t]); - - p_ijk->thb = bond_list[pk].nbr; - p_ijk->pthb = pk; - p_ijk->theta = p_kji->theta; - rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); - rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); - rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); - - ++num_thb_intrs; - break; - } - } - - - /* and this is the second for loop mentioned above */ - for( pk = pi+1; pk < end_j; ++pk ) { - pbond_jk = &(bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; - k = pbond_jk->nbr; - type_k = atoms[k].type; - p_ijk = &( thb_list[num_thb_intrs] ); - - //CHANGE ORIGINAL - if (BOA_jk <= 0) continue; - //CHANGE ORIGINAL - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &theta, &cos_theta ); - - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &(p_ijk->dcos_di), &(p_ijk->dcos_dj), - &(p_ijk->dcos_dk) ); - - p_ijk->thb = k; - p_ijk->pthb = pk; - p_ijk->theta = theta; - - sin_theta = SIN( theta ); - if( sin_theta < 1.0e-5 ) - sin_theta = 1.0e-5; - - ++num_thb_intrs; - - - if( BOA_jk > 0.0 && - (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { - r_jk = pbond_jk->d; - thbh = &( d_thbp[ index_thbp (type_i,type_j,type_k,num_atom_types) ] ); - flag = 0; - - /* if( workspace->orig_id[i] < workspace->orig_id[k] ) - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); - else - fprintf( stdout, 
"%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[k], workspace->orig_id[j], - workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ - - //TODO: - //pbond_jk->scratch = thbh->cnt; - - for( cnt = 0; cnt < thbh->cnt; ++cnt ) { - // fprintf( out_control->eval, - // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); - - if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { - thbp = &( thbh->prm[cnt] ); - - /* ANGLE ENERGY */ - p_val1 = thbp->p_val1; - p_val2 = thbp->p_val2; - p_val4 = thbp->p_val4; - p_val7 = thbp->p_val7; - theta_00 = thbp->theta_00; - - exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); - f7_ij = 1.0 - exp3ij; - Cf7ij = p_val3 * p_val4 * - POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; - - exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); - f7_jk = 1.0 - exp3jk; - Cf7jk = p_val3 * p_val4 * - POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; - - expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); - trm8 = 1.0 + expval6 + expval7; - f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); - Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * - (p_val6 * expval6 * trm8 - - (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); - - theta_0 = 180.0 - - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); - theta_0 = DEG2RAD( theta_0 ); - - expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); - if( p_val1 >= 0 ) - expval12theta = p_val1 * (1.0 - expval2theta); - else // To avoid linear Me-H-Me angles (6/6/06) - expval12theta = p_val1 * -expval2theta; - - CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; - CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; - CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; - CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * - expval2theta * (theta_0 - theta); - - Ctheta_0 = p_val10 * DEG2RAD(theta_00) * - exp( -p_val10 * (2.0 - SBO2) ); - - CEval5 = -CEval4 * Ctheta_0 * CSBO2; - CEval6 = CEval5 * dSBO1; - CEval7 = CEval5 * dSBO2; - CEval8 = -CEval4 / sin_theta; - - e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; - //PERFORMANCE IMPACT - //atomicAdd 
(&data->E_Ang, e_ang); - E_Ang [j] += e_ang; - /* END ANGLE ENERGY*/ - - - /* PENALTY ENERGY */ - p_pen1 = thbp->p_pen1; - p_pen2 = g_params.l[19]; - p_pen3 = g_params.l[20]; - p_pen4 = g_params.l[21]; - - exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); - exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); - exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); - exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); - trm_pen34 = 1.0 + exp_pen3 + exp_pen4; - f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; - Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + - p_pen4 * exp_pen4 )) / - SQR( trm_pen34 ); - - e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; - //PERFORMANCE IMPACT - //atomicAdd (&data->E_Pen, e_pen); - E_Pen [j] += e_pen; - - - CEpen1 = e_pen * Cf9j / f9_Dj; - temp = -2.0 * p_pen2 * e_pen; - CEpen2 = temp * (BOA_ij - 2.0); - CEpen3 = temp * (BOA_jk - 2.0); - /* END PENALTY ENERGY */ - - - /* COALITION ENERGY */ - p_coa1 = thbp->p_coa1; - p_coa2 = g_params.l[2]; - p_coa3 = g_params.l[38]; - p_coa4 = g_params.l[30]; - - exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); - e_coa = - p_coa1 / (1. 
+ exp_coa2) * - EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * - EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * - EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * - EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); - - //PERFORMANCE IMPACT - //atomicAdd (&data->E_Coa, e_coa); - E_Coa [j] += e_coa; - - CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; - CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; - CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); - CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; - CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; - /* END COALITION ENERGY */ - - /* FORCES */ - /* - atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ); - atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ); - atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) ); - atomicAdd (&workspace->CdDelta[i], CEcoa4 ); - atomicAdd (&workspace->CdDelta[k], CEcoa5 ); - */ - - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ; - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ; - workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ; - //atomicAdd (&workspace->CdDelta[i], CEcoa4 ); - pbond_ij->CdDelta_ij += CEcoa4 ; - //atomicAdd (&workspace->CdDelta[k], CEcoa5 ); - pbond_jk->CdDelta_ij += CEcoa5; - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - // fprintf( out_control->eval, "%6d%12.8f\n", - // workspace->orig_id[ bond_list[t].nbr ], - // (CEval6 * pBOjt7) ); - - /* - atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) ); - atomicAdd (&bo_jt->Cdbopi, CEval5 ); - atomicAdd (&bo_jt->Cdbopi2, CEval5 ); - */ - bo_jt->Cdbo += (CEval6 * pBOjt7) ; - bo_jt->Cdbopi += CEval5 ; - bo_jt->Cdbopi2 += CEval5 ; - } - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - /* - atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di ); - atomic_rvecScaledAdd( atoms[j].f, CEval8, 
p_ijk->dcos_dj ); - atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk ); - */ - rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk ); - - - } - else { - /* terms not related to bond order derivatives - are added directly into - forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - //atomic_rvecAdd( atoms[i].f, force ); - rvec_Add( pbond_ij->f, force ); - - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //atomic_rvecAdd( data->ext_press, ext_press ); - rvec_Add( aux_ext_press [j], ext_press ); - - //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, CEval8, p_ijk->dcos_dk ); - //atomic_rvecAdd( atoms[k].f, force ); - rvec_Add( pbond_jk->f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - //atomic_rvecAdd( data->ext_press, ext_press ); - rvec_Add( aux_ext_press [j], ext_press ); - - - /* This part is for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_di, system->atoms[i].x ); - rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dj, system->atoms[j].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dk, system->atoms[k].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - if( pbond_ij->imaginary || pbond_jk->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, - -1.0, total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } + for( pk = start_j; pk < pi; ++pk ) { + // fprintf( out_control->eval, "pk: %d\n", pk ); + start_pk = Start_Index( pk, thb_intrs ); + end_pk = End_Index( pk, thb_intrs ); + + for( t = start_pk; t < end_pk; ++t ) + if( thb_list[t].thb == i ) { + p_ijk = &(thb_list[num_thb_intrs]); + p_kji = 
&(thb_list[t]); + + p_ijk->thb = bond_list[pk].nbr; + p_ijk->pthb = pk; + p_ijk->theta = p_kji->theta; + rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); + rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); + rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); + + ++num_thb_intrs; + break; + } + } + + + /* and this is the second for loop mentioned above */ + for( pk = pi+1; pk < end_j; ++pk ) { + pbond_jk = &(bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; + k = pbond_jk->nbr; + type_k = atoms[k].type; + p_ijk = &( thb_list[num_thb_intrs] ); + + //CHANGE ORIGINAL + if (BOA_jk <= 0) continue; + //CHANGE ORIGINAL + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &theta, &cos_theta ); + + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &(p_ijk->dcos_di), &(p_ijk->dcos_dj), + &(p_ijk->dcos_dk) ); + + p_ijk->thb = k; + p_ijk->pthb = pk; + p_ijk->theta = theta; + + sin_theta = SIN( theta ); + if( sin_theta < 1.0e-5 ) + sin_theta = 1.0e-5; + + ++num_thb_intrs; + + + if( BOA_jk > 0.0 && + (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { + r_jk = pbond_jk->d; + thbh = &( d_thbp[ index_thbp (type_i,type_j,type_k,num_atom_types) ] ); + flag = 0; + + /* if( workspace->orig_id[i] < workspace->orig_id[k] ) + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); + else + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[k], workspace->orig_id[j], + workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ + + //TODO: + //pbond_jk->scratch = thbh->cnt; + + for( cnt = 0; cnt < thbh->cnt; ++cnt ) { + // fprintf( out_control->eval, + // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); + + if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { + thbp = &( thbh->prm[cnt] ); + + /* ANGLE ENERGY */ + p_val1 = thbp->p_val1; + p_val2 = thbp->p_val2; + p_val4 = 
thbp->p_val4; + p_val7 = thbp->p_val7; + theta_00 = thbp->theta_00; + + exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); + f7_ij = 1.0 - exp3ij; + Cf7ij = p_val3 * p_val4 * + POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; + + exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); + f7_jk = 1.0 - exp3jk; + Cf7jk = p_val3 * p_val4 * + POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; + + expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); + trm8 = 1.0 + expval6 + expval7; + f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); + Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * + (p_val6 * expval6 * trm8 - + (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); + + theta_0 = 180.0 - + theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); + theta_0 = DEG2RAD( theta_0 ); + + expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); + if( p_val1 >= 0 ) + expval12theta = p_val1 * (1.0 - expval2theta); + else // To avoid linear Me-H-Me angles (6/6/06) + expval12theta = p_val1 * -expval2theta; + + CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; + CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; + CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; + CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * + expval2theta * (theta_0 - theta); + + Ctheta_0 = p_val10 * DEG2RAD(theta_00) * + exp( -p_val10 * (2.0 - SBO2) ); + + CEval5 = -CEval4 * Ctheta_0 * CSBO2; + CEval6 = CEval5 * dSBO1; + CEval7 = CEval5 * dSBO2; + CEval8 = -CEval4 / sin_theta; + + e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; + //PERFORMANCE IMPACT + //atomicAdd (&data->E_Ang, e_ang); + E_Ang [j] += e_ang; + /* END ANGLE ENERGY*/ + + + /* PENALTY ENERGY */ + p_pen1 = thbp->p_pen1; + p_pen2 = g_params.l[19]; + p_pen3 = g_params.l[20]; + p_pen4 = g_params.l[21]; + + exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); + exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); + exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); + exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); + trm_pen34 = 1.0 + exp_pen3 + exp_pen4; + f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; 
+ Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - + (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + + p_pen4 * exp_pen4 )) / + SQR( trm_pen34 ); + + e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; + //PERFORMANCE IMPACT + //atomicAdd (&data->E_Pen, e_pen); + E_Pen [j] += e_pen; + + + CEpen1 = e_pen * Cf9j / f9_Dj; + temp = -2.0 * p_pen2 * e_pen; + CEpen2 = temp * (BOA_ij - 2.0); + CEpen3 = temp * (BOA_jk - 2.0); + /* END PENALTY ENERGY */ + + + /* COALITION ENERGY */ + p_coa1 = thbp->p_coa1; + p_coa2 = g_params.l[2]; + p_coa3 = g_params.l[38]; + p_coa4 = g_params.l[30]; + + exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); + e_coa = + p_coa1 / (1. + exp_coa2) * + EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * + EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * + EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * + EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); + + //PERFORMANCE IMPACT + //atomicAdd (&data->E_Coa, e_coa); + E_Coa [j] += e_coa; + + CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; + CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; + CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); + CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; + CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; + /* END COALITION ENERGY */ + + /* FORCES */ + /* + atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ); + atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ); + atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) ); + atomicAdd (&workspace->CdDelta[i], CEcoa4 ); + atomicAdd (&workspace->CdDelta[k], CEcoa5 ); + */ + + bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ; + bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ; + workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ; + //atomicAdd (&workspace->CdDelta[i], CEcoa4 ); + pbond_ij->CdDelta_ij += CEcoa4 ; + //atomicAdd (&workspace->CdDelta[k], CEcoa5 ); + pbond_jk->CdDelta_ij += CEcoa5; + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt 
= bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + // fprintf( out_control->eval, "%6d%12.8f\n", + // workspace->orig_id[ bond_list[t].nbr ], + // (CEval6 * pBOjt7) ); + + /* + atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) ); + atomicAdd (&bo_jt->Cdbopi, CEval5 ); + atomicAdd (&bo_jt->Cdbopi2, CEval5 ); + */ + bo_jt->Cdbo += (CEval6 * pBOjt7) ; + bo_jt->Cdbopi += CEval5 ; + bo_jt->Cdbopi2 += CEval5 ; + } + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + /* + atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di ); + atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk ); + */ + rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk ); + + + } + else { + /* terms not related to bond order derivatives + are added directly into + forces and pressure vector/tensor */ + rvec_Scale( force, CEval8, p_ijk->dcos_di ); + //atomic_rvecAdd( atoms[i].f, force ); + rvec_Add( pbond_ij->f, force ); + + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //atomic_rvecAdd( data->ext_press, ext_press ); + rvec_Add( aux_ext_press [j], ext_press ); + + //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + + rvec_Scale( force, CEval8, p_ijk->dcos_dk ); + //atomic_rvecAdd( atoms[k].f, force ); + rvec_Add( pbond_jk->f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + //atomic_rvecAdd( data->ext_press, ext_press ); + rvec_Add( aux_ext_press [j], ext_press ); + + + /* This part is for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_di, system->atoms[i].x ); + rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dj, system->atoms[j].x ); + rtensor_ScaledAdd(total_rtensor, 
CEval8, temp_rtensor); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dk, system->atoms[k].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + if( pbond_ij->imaginary || pbond_jk->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, + -1.0, total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } #ifdef TEST_ENERGY - //TODO -- check this - // fprintf( out_control->eval, - //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", - // "%6d%6d%6d%23.15e%23.15e%23.15e\n", - // i+1, j+1, k+1, - //workspace->orig_id[i]+1, - //workspace->orig_id[j]+1, - //workspace->orig_id[k]+1, - //workspace->Delta_boc[j], - // RAD2DEG(theta), /*BOA_ij, BOA_jk, */ - // e_ang, data->E_Ang ); - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - p_val3, p_val4, BOA_ij, BOA_jk ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - f7_ij, f7_jk, f8_Dj, expval12theta ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e\n", - CEval1, CEval2, CEval3, CEval4, CEval5 - //CEval6, CEval7, CEval8 );*/ - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - -p_ijk->dcos_di[0]/sin_theta, - -p_ijk->dcos_di[1]/sin_theta, - -p_ijk->dcos_di[2]/sin_theta, - -p_ijk->dcos_dj[0]/sin_theta, - -p_ijk->dcos_dj[1]/sin_theta, - -p_ijk->dcos_dj[2]/sin_theta, - -p_ijk->dcos_dk[0]/sin_theta, - -p_ijk->dcos_dk[1]/sin_theta, - -p_ijk->dcos_dk[2]/sin_theta );*/ - - /* fprintf( out_control->epen, - "%23.15e%23.15e%23.15e\n", - CEpen1, CEpen2, CEpen3 ); - fprintf( out_control->epen, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], RAD2DEG(theta), - BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ - - // fprintf( out_control->ecoa, - // "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - // workspace->orig_id[i], - // workspace->orig_id[j], - // workspace->orig_id[k], - // RAD2DEG(theta), BOA_ij, BOA_jk, - // e_coa, 
data->E_Coa ); + //TODO -- check this + // fprintf( out_control->eval, + //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", + // "%6d%6d%6d%23.15e%23.15e%23.15e\n", + // i+1, j+1, k+1, + //workspace->orig_id[i]+1, + //workspace->orig_id[j]+1, + //workspace->orig_id[k]+1, + //workspace->Delta_boc[j], + // RAD2DEG(theta), /*BOA_ij, BOA_jk, */ + // e_ang, data->E_Ang ); + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + p_val3, p_val4, BOA_ij, BOA_jk ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + f7_ij, f7_jk, f8_Dj, expval12theta ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e\n", + CEval1, CEval2, CEval3, CEval4, CEval5 + //CEval6, CEval7, CEval8 );*/ + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + -p_ijk->dcos_di[0]/sin_theta, + -p_ijk->dcos_di[1]/sin_theta, + -p_ijk->dcos_di[2]/sin_theta, + -p_ijk->dcos_dj[0]/sin_theta, + -p_ijk->dcos_dj[1]/sin_theta, + -p_ijk->dcos_dj[2]/sin_theta, + -p_ijk->dcos_dk[0]/sin_theta, + -p_ijk->dcos_dk[1]/sin_theta, + -p_ijk->dcos_dk[2]/sin_theta );*/ + + /* fprintf( out_control->epen, + "%23.15e%23.15e%23.15e\n", + CEpen1, CEpen2, CEpen3 ); + fprintf( out_control->epen, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], RAD2DEG(theta), + BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ + + // fprintf( out_control->ecoa, + // "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + // workspace->orig_id[i], + // workspace->orig_id[j], + // workspace->orig_id[k], + // RAD2DEG(theta), BOA_ij, BOA_jk, + // e_coa, data->E_Coa ); #endif #ifdef TEST_FORCES /* angle forces */ - //TODO -- check this - /* - Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); - Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); - Add_dDelta( system, lists, - j, CEval3 + CEval7, workspace->f_ang ); - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( 
bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - Add_dBO( system, lists, j, t, pBOjt7 * CEval6, - workspace->f_ang ); - Add_dBOpinpi2( system, lists, j, t, - CEval5, CEval5, - workspace->f_ang, workspace->f_ang ); - } - - rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); - // end angle forces - - // penalty forces - Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); - Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); - Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); - // end penalty forces - - // coalition forces - Add_dBO( system, lists, - j, pi, CEcoa1-CEcoa4, workspace->f_coa ); - Add_dBO( system, lists, - j, pk, CEcoa2-CEcoa5, workspace->f_coa ); - Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); - Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); - Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); - // end coalition forces - - */ + //TODO -- check this + /* + Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); + Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); + Add_dDelta( system, lists, + j, CEval3 + CEval7, workspace->f_ang ); + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + Add_dBO( system, lists, j, t, pBOjt7 * CEval6, + workspace->f_ang ); + Add_dBOpinpi2( system, lists, j, t, + CEval5, CEval5, + workspace->f_ang, workspace->f_ang ); + } + + rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); + // end angle forces + + // penalty forces + Add_dDelta( system, lists, 
j, CEpen1, workspace->f_pen ); + Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); + Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); + // end penalty forces + + // coalition forces + Add_dBO( system, lists, + j, pi, CEcoa1-CEcoa4, workspace->f_coa ); + Add_dBO( system, lists, + j, pk, CEcoa2-CEcoa5, workspace->f_coa ); + Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); + Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); + Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); + // end coalition forces + + */ #endif - } - } - } - } - } + } + } + } + } + } - Set_End_Index(pi, num_thb_intrs, thb_intrs ); - } - // } // end of the main for loop here + Set_End_Index(pi, num_thb_intrs, thb_intrs ); + } + // } // end of the main for loop here - //TODO - to be done on the CPU - /* + //TODO - to be done on the CPU + /* - if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { - workspace->realloc.num_3body = num_thb_intrs; - if( num_thb_intrs > thb_intrs->num_intrs ) { - fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", - data->step, num_thb_intrs, thb_intrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - } - */ + if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { + workspace->realloc.num_3body = num_thb_intrs; + if( num_thb_intrs > thb_intrs->num_intrs ) { + fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", + data->step, num_thb_intrs, thb_intrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + } + */ - //fprintf( stderr,"%d: Number of angle interactions: %d\n", - // data->step, num_thb_intrs ); + //fprintf( stderr,"%d: Number of angle interactions: %d\n", + // data->step, num_thb_intrs ); #ifdef TEST_ENERGY - /* - fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); + /* + fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); - fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", - data->E_Ang, data->E_Pen, 
data->E_Coa ); + fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", + data->E_Ang, data->E_Pen, data->E_Coa ); - fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); - */ + fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); + */ #endif } -GLOBAL void Three_Body_Interactions_results ( reax_atom *atoms, control_params *control, - static_storage p_workspace, - list p_bonds, int N ) +GLOBAL void Three_Body_Interactions_results ( reax_atom *atoms, control_params *control, + static_storage p_workspace, + list p_bonds, int N ) { - int i, pj; + int i, pj; - bond_data *pbond; - bond_data *sym_index_bond; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; + bond_data *pbond; + bond_data *sym_index_bond; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; - i = blockIdx.x * blockDim.x + threadIdx.x; + i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + if ( i >= N) return; - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + pbond = &(bonds->select.bond_list[pj]); + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - workspace->CdDelta [i] += sym_index_bond->CdDelta_ij; + workspace->CdDelta [i] += sym_index_bond->CdDelta_ij; - rvec_Add (atoms[i].f, sym_index_bond->f ); - } + rvec_Add (atoms[i].f, sym_index_bond->f ); + } } @@ -1201,78 +1201,78 @@ GLOBAL void Three_Body_Interactions_results ( reax_atom *atoms, control_params /* this is a 3-body interaction in which the main role is played by j which sits in the middle of the other two. 
*/ GLOBAL void Three_Body_Estimate ( reax_atom *atoms, - control_params *control, - list p_bonds, int N, - int *count) + control_params *control, + list p_bonds, int N, + int *count) { - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j ; - int flag, cnt, num_thb_intrs; + int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j ; + int flag, cnt, num_thb_intrs; - real r_ij, r_jk; - real BOA_ij, BOA_jk; - list *bonds; + real r_ij, r_jk; + real BOA_ij, BOA_jk; + list *bonds; - bond_order_data *bo_ij, *bo_jk, *bo_jt; - bond_data *bond_list; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; + bond_data *bond_list; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; - bonds = &p_bonds; - bond_list = bonds->select.bond_list; + bonds = &p_bonds; + bond_list = bonds->select.bond_list; - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); - for( pi = start_j; pi < end_j; ++pi ) { + for( pi = start_j; pi < end_j; ++pi ) { - num_thb_intrs = 0; - count [pi] = 0; + num_thb_intrs = 0; + count [pi] = 0; - pbond_ij = &(bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; + pbond_ij = &(bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; - if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = atoms[i].type; + if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = atoms[i].type; - /* - for( pk = start_j; pk < pi; ++pk ) { - start_pk = Start_Index( pk, thb_intrs ); - end_pk = End_Index( pk, thb_intrs ); + /* + for( pk = start_j; pk < pi; ++pk ) { + start_pk = Start_Index( pk, thb_intrs ); + end_pk = End_Index( pk, 
thb_intrs ); - for( t = start_pk; t < end_pk; ++t ) - if( thb_list[t].thb == i ) { + for( t = start_pk; t < end_pk; ++t ) + if( thb_list[t].thb == i ) { - ++num_thb_intrs; - break; - } - } - */ + ++num_thb_intrs; + break; + } + } + */ - /* and this is the second for loop mentioned above */ - for( pk = start_j; pk < end_j; ++pk ) { - if (pk == pi) continue; + /* and this is the second for loop mentioned above */ + for( pk = start_j; pk < end_j; ++pk ) { + if (pk == pi) continue; - pbond_jk = &(bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; + pbond_jk = &(bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; - if (BOA_jk <= 0) continue; + if (BOA_jk <= 0) continue; - ++num_thb_intrs; - } - } + ++num_thb_intrs; + } + } - count [pi] = num_thb_intrs; - } + count [pi] = num_thb_intrs; + } } @@ -1295,224 +1295,224 @@ GLOBAL void Three_Body_Estimate ( reax_atom *atoms, void Hydrogen_Bonds( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, k, pi, pk, itr, top; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int num_hb_intrs = 0; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - ivec rel_jk; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list; - - bonds = (*lists) + BONDS; - bond_list = bonds->select.bond_list; - - hbonds = (*lists) + HBONDS; - hbond_list = hbonds->select.hbond_list; - - /* loops below discover the Hydrogen 
bonds between i-j-k triplets. - here j is H atom and there has to be some bond between i and j. - Hydrogen bond is between j and k. - so in this function i->X, j->H, k->Z when we map - variables onto the ones in the handout.*/ - for( j = 0; j < system->N; ++j ) - if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H - /*set j's variables */ - type_j = system->atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = system->atoms[i].type; - - if( system->reaxprm.sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) - hblist[top++] = pi; - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( pk = hb_start_j; pk < hb_end_j; ++pk ) { - /* set k's varibles */ - k = hbond_list[pk].nbr; - type_k = system->atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; - rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - - for( itr=0; itr < top; ++itr ) { - pi = hblist[itr]; - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - - if( i != k ) { - bo_ij = &(pbond_ij->bo_data); - type_i = system->atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, &system->reaxprm) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - /* the derivative of cos(theta) */ - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - /* hydrogen bond energy*/ - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - 
exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / hbp->r0_hb - 2.0 ) ); - - data->E_HB += e_hb = - hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - - CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + - 1.0 / hbp->r0_hb); - - /* hydrogen bond forces */ - bo_ij->Cdbo += CEhb1; // dbo term - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { - rvec_ScaledAdd( system->atoms[i].f, - +CEhb2, dcos_theta_di ); //dcos terms - rvec_ScaledAdd( system->atoms[j].f, - +CEhb2, dcos_theta_dj ); - - - - - //TODO - rvec_ScaledAdd( system->atoms[k].f, - +CEhb2, dcos_theta_dk ); - - //dr terms - rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk ); - - - //TODO - rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk ); - } - else - { - /* for pressure coupling, terms that are not related - to bond order derivatives are added directly into - pressure vector/tensor */ - rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); - - rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj ); - - ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - rvec_Scale( force, +CEhb2, dcos_theta_dk ); - - - - //TODO - rvec_Add( system->atoms[k].f, force ); - - + int i, j, k, pi, pk, itr, top; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int num_hb_intrs = 0; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + ivec rel_jk; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + 
far_neighbor_data *nbr_jk; + list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list; + + bonds = (*lists) + BONDS; + bond_list = bonds->select.bond_list; + + hbonds = (*lists) + HBONDS; + hbond_list = hbonds->select.hbond_list; + + /* loops below discover the Hydrogen bonds between i-j-k triplets. + here j is H atom and there has to be some bond between i and j. + Hydrogen bond is between j and k. + so in this function i->X, j->H, k->Z when we map + variables onto the ones in the handout.*/ + for( j = 0; j < system->N; ++j ) + if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H + /*set j's variables */ + type_j = system->atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = system->atoms[i].type; + + if( system->reaxprm.sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) + hblist[top++] = pi; + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( pk = hb_start_j; pk < hb_end_j; ++pk ) { + /* set k's varibles */ + k = hbond_list[pk].nbr; + type_k = system->atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + + for( itr=0; itr < top; ++itr ) { + pi = hblist[itr]; + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + + if( i != k ) { + bo_ij = &(pbond_ij->bo_data); + type_i = system->atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, &system->reaxprm) ]); + ++num_hb_intrs; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + /* the derivative of cos(theta) */ + Calculate_dCos_Theta( 
pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + /* hydrogen bond energy*/ + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + data->E_HB += e_hb = + hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + + CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + + 1.0 / hbp->r0_hb); + + /* hydrogen bond forces */ + bo_ij->Cdbo += CEhb1; // dbo term + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { + rvec_ScaledAdd( system->atoms[i].f, + +CEhb2, dcos_theta_di ); //dcos terms + rvec_ScaledAdd( system->atoms[j].f, + +CEhb2, dcos_theta_dj ); + + + + + //TODO + rvec_ScaledAdd( system->atoms[k].f, + +CEhb2, dcos_theta_dk ); + + //dr terms + rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk ); + + + //TODO + rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk ); + } + else + { + /* for pressure coupling, terms that are not related + to bond order derivatives are added directly into + pressure vector/tensor */ + rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj ); + + ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + rvec_Scale( force, +CEhb2, dcos_theta_dk ); + + + + //TODO + rvec_Add( system->atoms[k].f, force ); + + - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); - //dr 
terms - rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk ); + //dr terms + rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk ); - rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); + rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); - /* This part is intended for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, - dcos_theta_di, system->atoms[i].x ); - rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor ); + /* This part is intended for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, + dcos_theta_di, system->atoms[i].x ); + rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor ); - rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj, - -CEhb3/r_jk, pbond_jk->dvec ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[j].x ); - rtensor_Add( total_rtensor, temp_rtensor ); + rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj, + -CEhb3/r_jk, pbond_jk->dvec ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[j].x ); + rtensor_Add( total_rtensor, temp_rtensor ); - rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk, - +CEhb3/r_jk, pbond_jk->dvec ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[k].x ); - rtensor_Add( total_rtensor, temp_rtensor ); + rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk, + +CEhb3/r_jk, pbond_jk->dvec ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[k].x ); + rtensor_Add( total_rtensor, temp_rtensor ); - if( pbond_ij->imaginary || pbond_jk->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } + if( pbond_ij->imaginary || pbond_jk->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); + else + 
rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } #ifdef TEST_ENERGY - /*fprintf( out_control->ehb, - "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n", - dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], - dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], - dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); - fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n", - CEhb1, CEhb2, CEhb3 ); */ - fprintf( stderr, //out_control->ehb, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], - workspace->orig_id[j], - workspace->orig_id[k], - r_jk, theta, bo_ij->BO, e_hb, data->E_HB ); + /*fprintf( out_control->ehb, + "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n", + dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], + dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], + dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); + fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n", + CEhb1, CEhb2, CEhb3 ); */ + fprintf( stderr, //out_control->ehb, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], + workspace->orig_id[j], + workspace->orig_id[k], + r_jk, theta, bo_ij->BO, e_hb, data->E_HB ); #endif #ifdef TEST_FORCES - // dbo term - Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); - // dcos terms - rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); - rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk ); - // dr terms - rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk ); + // dbo term + Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); + // dcos terms + rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); + rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk ); + // dr terms + rvec_ScaledAdd( workspace->f_hb[j], 
-CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk ); #endif - } - } - } - } + } + } + } + } - /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ + /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ #ifdef TEST_FORCES - fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs ); - fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB ); + fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs ); + fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB ); #endif } @@ -1525,740 +1525,740 @@ void Hydrogen_Bonds( reax_system *system, control_params *control, // Cuda Function //////////////////////////////////////////////////////////////////// -GLOBAL void Hydrogen_Bonds ( reax_atom *atoms, - single_body_parameters *sbp, - hbond_parameters *d_hbp, - control_params *control, - simulation_data *data, - static_storage p_workspace, - list p_bonds, list p_hbonds, - int N, int num_atom_types, - real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) +GLOBAL void Hydrogen_Bonds ( reax_atom *atoms, + single_body_parameters *sbp, + hbond_parameters *d_hbp, + control_params *control, + simulation_data *data, + static_storage p_workspace, + list p_bonds, list p_hbonds, + int N, int num_atom_types, + real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) { - extern __shared__ real t_hb[]; - extern __shared__ real t_f[]; - //extern __shared__ rvec t_cdbo[]; - //extern __shared__ rvec t_hf []; - - real *sh_hb = t_hb; - rvec *sh_atomf = (rvec *)(t_hb + blockDim.x); - //real *sh_cdbo = t_hb + blockDim.x; - //rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); - - int i, j, k, pi, pk, itr, top; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int num_hb_intrs = 0; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, 
sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - ivec rel_jk; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list, *hbond_jk; - static_storage *workspace = &p_workspace; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - //j = blockIdx.x; - - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - - hbonds = &p_hbonds; - hbond_list = hbonds->select.hbond_list; - - // loops below discover the Hydrogen bonds between i-j-k triplets. - // here j is H atom and there has to be some bond between i and j. - // Hydrogen bond is between j and k. - // so in this function i->X, j->H, k->Z when we map - // variables onto the ones in the handout. - - //for( j = 0; j < system->N; ++j ) - sh_hb [threadIdx.x] = 0; - rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - - if( sbp[atoms[j].type].p_hbond==1) {// j must be H - //set j's variables - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - - if( sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) - hblist[top++] = pi; - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( pk = hb_start_j; pk < hb_end_j; ++pk ) - //pk = hb_start_j + threadIdx.x; - //while (pk < hb_end_j) - { - // set k's varibles - //TODO - hbond_jk = &( hbond_list[pk] ); - //TODO - k = hbond_list[pk].nbr; - type_k = atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; 
- rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - - //TODO Double check this Hydrogen Bonds fix - //rvec_MakeZero ( nbr_jk->h_f ); - rvec_MakeZero ( hbond_jk->h_f ); - //TODO Double check this Hydrogen Bonds fix - - //sh_hb [threadIdx.x] = 0; - - - //itr = threadIdx.x; - for( itr=0; itr < top; ++itr ) { - //while (itr < top) { - pi = hblist[itr]; - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - - //TODO - //rvec_MakeZero (sh_hf [threadIdx.x]); - //sh_cdbo [threadIdx.x] = 0; - - //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - - - if( i != k ) { - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - // the derivative of cos(theta) - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - // hydrogen bond energy - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / hbp->r0_hb - 2.0 ) ); - - //PERFORMANCE IMPACT - e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //atomicAdd ( &data->E_HB, e_hb ); - //E_HB [j] += e_hb; - sh_hb [threadIdx.x] += e_hb; - - CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + - 1.0 / hbp->r0_hb); - - //this is the problem here - //TODO - // hydrogen bond forces - bo_ij->Cdbo += CEhb1; // dbo term - //sh_cdbo[threadIdx.x] += CEhb1; - //TODO - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - - //PERFORMANCE IMPACT - /* - atomic_rvecScaledAdd( atoms[i].f, - +CEhb2, dcos_theta_di ); //dcos terms - 
atomic_rvecScaledAdd( atoms[j].f, - +CEhb2, dcos_theta_dj ); - atomic_rvecScaledAdd( atoms[k].f, - +CEhb2, dcos_theta_dk ); - //dr terms - atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); - */ - - //PERFORMANCE IMPACT - rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms - //rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms - - //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); - - //TODO you forgot here - //TODO Hydrogen bonds fix. -- BE VERY CAREFUL ***** - rvec_ScaledAdd( hbond_jk->h_f, - +CEhb2, dcos_theta_dk ); - - //rvec_ScaledAdd( nbr_jk->h_f, - // +CEhb2, dcos_theta_dk ); - - //dr terms - //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); - - //atoms_f [j] ++; - - //TODO you forgot - rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); - //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); - } - else - { - // for pressure coupling, terms that are not related - // to bond order derivatives are added directly into - // pressure vector/tensor - rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - rvec_Add( pbond_ij->h_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); - - rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - - ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - rvec_Scale( force, +CEhb2, dcos_theta_dk ); - - //rvec_Add( nbr_jk->h_f, force ); - rvec_Add( hbond_jk->h_f, force ); - - rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - //dr terms - rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - - rvec_Scale( force, CEhb3/r_jk, 
dvec_jk ); - rvec_Add( hbond_jk->h_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - } - - //do the reduction for the bond_ij here - /* - if (threadIdx.x < 16){ - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); - } - if (threadIdx.x < 8){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); - } - if (threadIdx.x < 4){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); - } - if (threadIdx.x < 2){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); - } - if (threadIdx.x < 1){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); - } - if (threadIdx.x == 0){ - //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; - //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); - - E_HB [j] += sh_hb [threadIdx.x]; - //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - } - */ - - - } // i != k if statement - - - //itr += blockDim.x; - - } //itr for statement - - /* - __syncthreads (); - - for (int x = 1; x < blockDim.x; x++) - sh_hb [0] += sh_hb [x]; - - E_HB 
[j] += sh_hb[0]; - if (threadIdx.x < 16) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; - if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x]; - */ - - - //pk += blockDim.x; - - } // pk for statement - } // main if statment - - //do the reduction for the bond_ij here - /* - if (threadIdx.x < 16){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); - } - if (threadIdx.x < 8){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); - } - if (threadIdx.x < 4){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); - } - if (threadIdx.x < 2){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); - } - if (threadIdx.x < 1){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); - } - if (threadIdx.x == 0){ - E_HB [j] += sh_hb [threadIdx.x]; - //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - } - */ - - E_HB [j] += sh_hb [threadIdx.x]; - rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - - //rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); - } - - - DEVICE void warpReduce(volatile real* sdata, int tid) - { - if (tid < 16) sdata[tid] += sdata[tid + 16]; - if (tid < 8) sdata[tid] += sdata[tid + 8]; - if (tid < 4) sdata[tid] += sdata[tid + 4]; - if (tid < 2) sdata[tid] += sdata[tid + 2]; - if (tid < 1) sdata[tid] += sdata[tid + 1]; - } - - - - - GLOBAL void Hydrogen_Bonds_HB ( reax_atom *atoms, - single_body_parameters *sbp, - hbond_parameters *d_hbp, - control_params 
*control, - simulation_data *data, - static_storage p_workspace, - list p_bonds, list p_hbonds, - int N, int num_atom_types, - real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) - { - extern __shared__ real t_hb[]; - extern __shared__ rvec t__f[]; - extern __shared__ rvec t_cdbo[]; - extern __shared__ rvec t_hf []; - - real *sh_hb = t_hb; - real *sh_cdbo = t_hb + blockDim.x; - rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x); - rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); - - int __THREADS_PER_ATOM__ = HBONDS_THREADS_PER_ATOM; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id >= N ) return; - - - int i, j, k, pi, pk, itr, top; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int num_hb_intrs = 0; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - ivec rel_jk; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list, *hbond_jk; - static_storage *workspace = &p_workspace; - - /* - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - */ - - // j = blockIdx.x; - - j = warp_id; - - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - - hbonds = &p_hbonds; - hbond_list = hbonds->select.hbond_list; - - // loops below discover the Hydrogen bonds between i-j-k triplets. - // here j is H atom and there has to be some bond between i and j. - // Hydrogen bond is between j and k. - // so in this function i->X, j->H, k->Z when we map - // variables onto the ones in the handout. 
- - //for( j = 0; j < system->N; ++j ) - sh_hb [threadIdx.x] = 0; - rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - - if( sbp[atoms[j].type].p_hbond==1) {// j must be H - //set j's variables - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - - if( sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) { - hblist[top++] = pi; - } - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( itr=0; itr < top; ++itr ) { - pi = hblist[itr]; - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - - //TODO - rvec_MakeZero (sh_hf [threadIdx.x]); - sh_cdbo [threadIdx.x] = 0; - - - //for( pk = hb_start_j; pk < hb_end_j; ++pk ) - int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 
0 : 1); - int count = 0; - //jpk = hb_start_j + threadIdx.x; - pk = hb_start_j + lane_id; - //while (pk < hb_end_j) - while (count < loopcount) - { - - if (pk < hb_end_j) - { - // set k's varibles - //TODO - hbond_jk = &( hbond_list[pk] ); - //TODO - k = hbond_list[pk].nbr; - type_k = atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; - rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - } - else k = -1; - - //TODO Double check this Hydrogen Bonds fix - //rvec_MakeZero ( nbr_jk->h_f ); - //rvec_MakeZero ( hbond_jk->h_f ); - //TODO Double check this Hydrogen Bonds fix - - //sh_hb [threadIdx.x] = 0; - //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - //__syncthreads (); - - - if(( i != k ) && (k != -1)) { - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - // the derivative of cos(theta) - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - // hydrogen bond energy - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / hbp->r0_hb - 2.0 ) ); - - //PERFORMANCE IMPACT - e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //atomicAdd ( &data->E_HB, e_hb ); - //E_HB [j] += e_hb; - sh_hb [threadIdx.x] += e_hb; - - CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + - 1.0 / hbp->r0_hb); - - //this is the problem here - //TODO - // hydrogen bond forces - //bo_ij->Cdbo += CEhb1; // dbo term - sh_cdbo[threadIdx.x] += CEhb1; - //TODO - //warpReduce (sh_cdbo, threadIdx.x); - //if 
(threadIdx.x == 0) - // bo_ij->Cdbo += sh_cdbo [0]; - - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { - - //PERFORMANCE IMPACT - /* - atomic_rvecScaledAdd( atoms[i].f, - +CEhb2, dcos_theta_di ); //dcos terms - atomic_rvecScaledAdd( atoms[j].f, - +CEhb2, dcos_theta_dj ); - atomic_rvecScaledAdd( atoms[k].f, - +CEhb2, dcos_theta_dk ); - //dr terms - atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); - */ - - //PERFORMANCE IMPACT - //rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms - rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms - - //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); - - - //TODO you forgot here - //TODO Hydrogen bonds fix. -- BE VERY CAREFUL ***** - rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk ); - - //rvec_ScaledAdd( nbr_jk->h_f, - // +CEhb2, dcos_theta_dk ); - - //dr terms - //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); - - //TODO you forgot - rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); - //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); - } - else - { - // for pressure coupling, terms that are not related - // to bond order derivatives are added directly into - // pressure vector/tensor - //rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - //rvec_Add( pbond_ij->h_f, force ); - //rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); - - //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - - //ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - //rvec_Scale( force, +CEhb2, dcos_theta_dk ); - - //rvec_Add( nbr_jk->h_f, force ); - //rvec_Add( hbond_jk->h_f, force ); - - 
//rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - //dr terms - //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - - //rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - //rvec_Add( hbond_jk->h_f, force ); - //rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - } - - } // i != k if statement - - pk += __THREADS_PER_ATOM__; - count ++; - - } // pk for statement - - //__syncthreads (); - - //at this point done with one bond.... - //do the reduction now - //if (threadIdx.x == 0){ - if (lane_id < 16) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); - } - if (lane_id < 8) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); - } - if (lane_id < 4) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); - } - if (lane_id < 2) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); - } - if (lane_id < 1) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); - - bo_ij->Cdbo += sh_cdbo [threadIdx.x]; - rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); - } - /* - if (lane_id == 0){ - for (i = 1; i < 32; i++) - { - //sh_cdbo [threadIdx.x] += sh_cdbo [i]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [i]); + extern __shared__ real t_hb[]; + extern __shared__ real t_f[]; + //extern __shared__ rvec t_cdbo[]; + //extern __shared__ rvec t_hf []; + + real *sh_hb = t_hb; + rvec *sh_atomf = (rvec *)(t_hb + blockDim.x); + //real *sh_cdbo = t_hb + blockDim.x; + //rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); + + int i, j, k, pi, pk, itr, top; + int type_i, 
type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int num_hb_intrs = 0; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + ivec rel_jk; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list, *hbond_jk; + static_storage *workspace = &p_workspace; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + //j = blockIdx.x; + + bonds = &p_bonds; + bond_list = bonds->select.bond_list; + + hbonds = &p_hbonds; + hbond_list = hbonds->select.hbond_list; + + // loops below discover the Hydrogen bonds between i-j-k triplets. + // here j is H atom and there has to be some bond between i and j. + // Hydrogen bond is between j and k. + // so in this function i->X, j->H, k->Z when we map + // variables onto the ones in the handout. 
+ + //for( j = 0; j < system->N; ++j ) + sh_hb [threadIdx.x] = 0; + rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + + if( sbp[atoms[j].type].p_hbond==1) {// j must be H + //set j's variables + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + + if( sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) + hblist[top++] = pi; + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( pk = hb_start_j; pk < hb_end_j; ++pk ) + //pk = hb_start_j + threadIdx.x; + //while (pk < hb_end_j) + { + // set k's varibles + //TODO + hbond_jk = &( hbond_list[pk] ); + //TODO + k = hbond_list[pk].nbr; + type_k = atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + + //TODO Double check this Hydrogen Bonds fix + //rvec_MakeZero ( nbr_jk->h_f ); + rvec_MakeZero ( hbond_jk->h_f ); + //TODO Double check this Hydrogen Bonds fix + + //sh_hb [threadIdx.x] = 0; + + + //itr = threadIdx.x; + for( itr=0; itr < top; ++itr ) { + //while (itr < top) { + pi = hblist[itr]; + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + + //TODO + //rvec_MakeZero (sh_hf [threadIdx.x]); + //sh_cdbo [threadIdx.x] = 0; + + //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + + + if( i != k ) { + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); + ++num_hb_intrs; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + // the derivative of cos(theta) + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + 
&dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + // hydrogen bond energy + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + //PERFORMANCE IMPACT + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + //atomicAdd ( &data->E_HB, e_hb ); + //E_HB [j] += e_hb; + sh_hb [threadIdx.x] += e_hb; + + CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + + 1.0 / hbp->r0_hb); + + //this is the problem here + //TODO + // hydrogen bond forces + bo_ij->Cdbo += CEhb1; // dbo term + //sh_cdbo[threadIdx.x] += CEhb1; + //TODO + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + + //PERFORMANCE IMPACT + /* + atomic_rvecScaledAdd( atoms[i].f, + +CEhb2, dcos_theta_di ); //dcos terms + atomic_rvecScaledAdd( atoms[j].f, + +CEhb2, dcos_theta_dj ); + atomic_rvecScaledAdd( atoms[k].f, + +CEhb2, dcos_theta_dk ); + //dr terms + atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); + */ + + //PERFORMANCE IMPACT + rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms + //rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms + + //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); + + //TODO you forgot here + //TODO Hydrogen bonds fix. 
-- BE VERY CAREFUL ***** + rvec_ScaledAdd( hbond_jk->h_f, + +CEhb2, dcos_theta_dk ); + + //rvec_ScaledAdd( nbr_jk->h_f, + // +CEhb2, dcos_theta_dk ); + + //dr terms + //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); + + //atoms_f [j] ++; + + //TODO you forgot + rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); + //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); + } + else + { + // for pressure coupling, terms that are not related + // to bond order derivatives are added directly into + // pressure vector/tensor + rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + rvec_Add( pbond_ij->h_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); + + rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + + ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + rvec_Scale( force, +CEhb2, dcos_theta_dk ); + + //rvec_Add( nbr_jk->h_f, force ); + rvec_Add( hbond_jk->h_f, force ); + + rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + //dr terms + rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + + rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + rvec_Add( hbond_jk->h_f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + } + + //do the reduction for the bond_ij here + /* + if (threadIdx.x < 16){ + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (threadIdx.x < 8){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; + 
//rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (threadIdx.x < 4){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (threadIdx.x < 2){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (threadIdx.x < 1){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + } + if (threadIdx.x == 0){ + //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); + + E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } + */ + + + } // i != k if statement + + + //itr += blockDim.x; + + } //itr for statement + + /* + __syncthreads (); + + for (int x = 1; x < blockDim.x; x++) + sh_hb [0] += sh_hb [x]; + + E_HB [j] += sh_hb[0]; + if (threadIdx.x < 16) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x]; + */ + + + //pk += blockDim.x; + + } // pk for statement + } // main if statment + + //do the reduction for the bond_ij here + /* + if (threadIdx.x < 16){ + sh_hb [threadIdx.x] += 
sh_hb [threadIdx.x + 16]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (threadIdx.x < 8){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (threadIdx.x < 4){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (threadIdx.x < 2){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (threadIdx.x < 1){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + } + if (threadIdx.x == 0){ + E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } + */ + + E_HB [j] += sh_hb [threadIdx.x]; + rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + + //rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); + } + + + DEVICE void warpReduce(volatile real* sdata, int tid) + { + if (tid < 16) sdata[tid] += sdata[tid + 16]; + if (tid < 8) sdata[tid] += sdata[tid + 8]; + if (tid < 4) sdata[tid] += sdata[tid + 4]; + if (tid < 2) sdata[tid] += sdata[tid + 2]; + if (tid < 1) sdata[tid] += sdata[tid + 1]; + } + + + + + GLOBAL void Hydrogen_Bonds_HB ( reax_atom *atoms, + single_body_parameters *sbp, + hbond_parameters *d_hbp, + control_params *control, + simulation_data *data, + static_storage p_workspace, + list p_bonds, list p_hbonds, + int N, int num_atom_types, + real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) + { + extern __shared__ real t_hb[]; + extern __shared__ rvec t__f[]; + extern __shared__ rvec t_cdbo[]; + extern __shared__ rvec t_hf []; + + real *sh_hb = t_hb; + real *sh_cdbo = t_hb + blockDim.x; + rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x); + rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); + + int __THREADS_PER_ATOM__ = HBONDS_THREADS_PER_ATOM; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = 
thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N ) return; + + + int i, j, k, pi, pk, itr, top; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int num_hb_intrs = 0; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + ivec rel_jk; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list, *hbond_jk; + static_storage *workspace = &p_workspace; + + /* + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + */ + + // j = blockIdx.x; + + j = warp_id; + + bonds = &p_bonds; + bond_list = bonds->select.bond_list; + + hbonds = &p_hbonds; + hbond_list = hbonds->select.hbond_list; + + // loops below discover the Hydrogen bonds between i-j-k triplets. + // here j is H atom and there has to be some bond between i and j. + // Hydrogen bond is between j and k. + // so in this function i->X, j->H, k->Z when we map + // variables onto the ones in the handout. 
+ + //for( j = 0; j < system->N; ++j ) + sh_hb [threadIdx.x] = 0; + rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + + if( sbp[atoms[j].type].p_hbond==1) {// j must be H + //set j's variables + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + + if( sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) { + hblist[top++] = pi; + } + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( itr=0; itr < top; ++itr ) { + pi = hblist[itr]; + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + + //TODO + rvec_MakeZero (sh_hf [threadIdx.x]); + sh_cdbo [threadIdx.x] = 0; + + + //for( pk = hb_start_j; pk < hb_end_j; ++pk ) + int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 
0 : 1); + int count = 0; + //jpk = hb_start_j + threadIdx.x; + pk = hb_start_j + lane_id; + //while (pk < hb_end_j) + while (count < loopcount) + { + + if (pk < hb_end_j) + { + // set k's varibles + //TODO + hbond_jk = &( hbond_list[pk] ); + //TODO + k = hbond_list[pk].nbr; + type_k = atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + } + else k = -1; + + //TODO Double check this Hydrogen Bonds fix + //rvec_MakeZero ( nbr_jk->h_f ); + //rvec_MakeZero ( hbond_jk->h_f ); + //TODO Double check this Hydrogen Bonds fix + + //sh_hb [threadIdx.x] = 0; + //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + //__syncthreads (); + + + if(( i != k ) && (k != -1)) { + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); + ++num_hb_intrs; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + // the derivative of cos(theta) + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + // hydrogen bond energy + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + //PERFORMANCE IMPACT + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + //atomicAdd ( &data->E_HB, e_hb ); + //E_HB [j] += e_hb; + sh_hb [threadIdx.x] += e_hb; + + CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + + 1.0 / hbp->r0_hb); + + //this is the problem here + //TODO + // hydrogen bond forces + //bo_ij->Cdbo += CEhb1; // dbo term + sh_cdbo[threadIdx.x] += CEhb1; + //TODO + //warpReduce (sh_cdbo, threadIdx.x); + //if 
(threadIdx.x == 0) + // bo_ij->Cdbo += sh_cdbo [0]; + + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { + + //PERFORMANCE IMPACT + /* + atomic_rvecScaledAdd( atoms[i].f, + +CEhb2, dcos_theta_di ); //dcos terms + atomic_rvecScaledAdd( atoms[j].f, + +CEhb2, dcos_theta_dj ); + atomic_rvecScaledAdd( atoms[k].f, + +CEhb2, dcos_theta_dk ); + //dr terms + atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); + */ + + //PERFORMANCE IMPACT + //rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms + rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms + + //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); + + + //TODO you forgot here + //TODO Hydrogen bonds fix. -- BE VERY CAREFUL ***** + rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk ); + + //rvec_ScaledAdd( nbr_jk->h_f, + // +CEhb2, dcos_theta_dk ); + + //dr terms + //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); + + //TODO you forgot + rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); + //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); + } + else + { + // for pressure coupling, terms that are not related + // to bond order derivatives are added directly into + // pressure vector/tensor + //rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + //rvec_Add( pbond_ij->h_f, force ); + //rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); + + //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + + //ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + //rvec_Scale( force, +CEhb2, dcos_theta_dk ); + + //rvec_Add( nbr_jk->h_f, force ); + //rvec_Add( hbond_jk->h_f, force ); + + 
//rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + //dr terms + //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + + //rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + //rvec_Add( hbond_jk->h_f, force ); + //rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + } + + } // i != k if statement + + pk += __THREADS_PER_ATOM__; + count ++; + + } // pk for statement + + //__syncthreads (); + + //at this point done with one bond.... + //do the reduction now + //if (threadIdx.x == 0){ + if (lane_id < 16) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); + } + if (lane_id < 8) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); + } + if (lane_id < 4) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); + } + if (lane_id < 2) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); + } + if (lane_id < 1) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); + + bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); + } + /* + if (lane_id == 0){ + for (i = 1; i < 32; i++) + { + //sh_cdbo [threadIdx.x] += sh_cdbo [i]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [i]); - sh_cdbo [lane_id] += sh_cdbo [lane_id + i]; - rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]); - } + sh_cdbo [lane_id] += sh_cdbo [lane_id + i]; + rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]); + } - //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; - //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); + //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + //rvec_Add 
(pbond_ij->h_f, sh_hf [threadIdx.x]); - bo_ij->Cdbo += sh_cdbo [lane_id]; - rvec_Add (pbond_ij->h_f, sh_hf [lane_id]); - } - */ + bo_ij->Cdbo += sh_cdbo [lane_id]; + rvec_Add (pbond_ij->h_f, sh_hf [lane_id]); + } + */ - } //itr for statement + } //itr for statement - //__syncthreads (); - } // main if statment + //__syncthreads (); + } // main if statment - //__syncthreads (); + //__syncthreads (); - //do the reduction for the bond_ij here - if (lane_id < 16){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); - } - if (lane_id < 8){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); - } - if (lane_id < 4){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); - } - if (lane_id < 2){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); - } - if (lane_id < 1){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + //do the reduction for the bond_ij here + if (lane_id < 16){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (lane_id < 8){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (lane_id < 4){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (lane_id < 2){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (lane_id < 1){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); - E_HB [j] += sh_hb [threadIdx.x]; - rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - } - /* - if (lane 
== 0){ - //E_HB [j] += sh_hb [threadIdx.x]; - rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); - } - */ - //if (threadIdx.x == 0){ - /* - if (lane_id == 0){ - for (i = 1; i < 32; i++) - { - //sh_hb [threadIdx.x] += sh_hb [i]; - //rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]); - sh_hb [lane_id] += sh_hb [lane_id + i]; - rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]); - } + E_HB [j] += sh_hb [threadIdx.x]; + rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } + /* + if (lane == 0){ + //E_HB [j] += sh_hb [threadIdx.x]; + rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); + } + */ + //if (threadIdx.x == 0){ + /* + if (lane_id == 0){ + for (i = 1; i < 32; i++) + { + //sh_hb [threadIdx.x] += sh_hb [i]; + //rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]); + sh_hb [lane_id] += sh_hb [lane_id + i]; + rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]); + } - //E_HB [j] += sh_hb [threadIdx.x]; - //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + //E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - E_HB [j] += sh_hb [lane_id]; - rvec_Add (atoms[j].f, sh_atomf [lane_id]); - //rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]); - } - */ + E_HB [j] += sh_hb [lane_id]; + rvec_Add (atoms[j].f, sh_atomf [lane_id]); + //rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]); + } + */ - //E_HB [j] += sh_hb [threadIdx.x]; - //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - } + //E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } @@ -2309,154 +2309,154 @@ GLOBAL void Hydrogen_Bonds ( reax_atom *atoms, - GLOBAL void Hydrogen_Bonds_Postprocess ( reax_atom *atoms, - single_body_parameters *sbp, - static_storage p_workspace, - list p_bonds, list p_hbonds, list p_far_nbrs, int N, - real *e_hb) - { + GLOBAL void Hydrogen_Bonds_Postprocess ( reax_atom *atoms, + single_body_parameters *sbp, + static_storage p_workspace, + list 
p_bonds, list p_hbonds, list p_far_nbrs, int N, + real *e_hb) + { - int i, pj, hj, nbr, k, j; - int start, end; + int i, pj, hj, nbr, k, j; + int start, end; - bond_data *pbond; - bond_data *sym_index_bond; - far_neighbor_data *nbr_pj, *sym_index_nbr; + bond_data *pbond; + bond_data *sym_index_bond; + far_neighbor_data *nbr_pj, *sym_index_nbr; - list *bonds = &p_bonds; - list *far_nbrs = &p_far_nbrs; + list *bonds = &p_bonds; + list *far_nbrs = &p_far_nbrs; - i = blockIdx.x * blockDim.x + threadIdx.x; + i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N) return; + if ( i >= N) return; - // For processing ij information - start = Start_Index(i, bonds); - end = End_Index(i, bonds); + // For processing ij information + start = Start_Index(i, bonds); + end = End_Index(i, bonds); - //rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f); + //rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f); - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + pbond = &(bonds->select.bond_list[pj]); + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - rvec_Add (atoms[i].f, sym_index_bond->h_f ); - } + rvec_Add (atoms[i].f, sym_index_bond->h_f ); + } - /* - for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++) - { - // check if the neighbor is of h_type - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; + /* + for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++) + { + // check if the neighbor is of h_type + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; - sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); - rvec_Add (atoms[i].f, sym_index_nbr->h_f ); - } - */ + sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); + rvec_Add (atoms[i].f, sym_index_nbr->h_f ); + } + 
*/ - // if (workspace->hbond_index [j] != -1) - // { - // hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - // hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + // if (workspace->hbond_index [j] != -1) + // { + // hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + // hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - // for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) - // { - // h_bond_data = &( hbonds->select.hbond_list [hj] ); - // nbr = h_bond_data->nbr; + // for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) + // { + // h_bond_data = &( hbonds->select.hbond_list [hj] ); + // nbr = h_bond_data->nbr; - // if (nbr == i) { - // rvec_Add (atoms[i].f, h_bond_data->h_f ); - // } - // } - // } - } + // if (nbr == i) { + // rvec_Add (atoms[i].f, h_bond_data->h_f ); + // } + // } + // } + } - GLOBAL void Hydrogen_Bonds_Far_Nbrs ( reax_atom *atoms, - single_body_parameters *sbp, - static_storage p_workspace, - list p_bonds, list p_hbonds, list p_far_nbrs, int N ) - { + GLOBAL void Hydrogen_Bonds_Far_Nbrs ( reax_atom *atoms, + single_body_parameters *sbp, + static_storage p_workspace, + list p_bonds, list p_hbonds, list p_far_nbrs, int N ) + { - extern __shared__ rvec __f[]; - int i, pj,j; - int start, end; + extern __shared__ rvec __f[]; + int i, pj,j; + int start, end; - far_neighbor_data *nbr_pj, *sym_index_nbr; - list *far_nbrs = &p_far_nbrs; + far_neighbor_data *nbr_pj, *sym_index_nbr; + list *far_nbrs = &p_far_nbrs; - i = blockIdx.x; + i = blockIdx.x; - start = Start_Index (i, far_nbrs); - end = End_Index (i, far_nbrs); - pj = start + threadIdx.x; + start = Start_Index (i, far_nbrs); + end = End_Index (i, far_nbrs); + pj = start + threadIdx.x; - rvec_MakeZero (__f[threadIdx.x]); + rvec_MakeZero (__f[threadIdx.x]); - while (pj < end) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; + while (pj < end) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; - //sym_index_nbr = & 
(far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); - // - //rvec_Add (atoms[i].f, sym_index_nbr->h_f ); - // - //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); + //sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); + // + //rvec_Add (atoms[i].f, sym_index_nbr->h_f ); + // + //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); - pj += blockDim.x; - } + pj += blockDim.x; + } - if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); - if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); - if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); - if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); - if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); + if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); + if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); + if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); + if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); + if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); - if (threadIdx.x == 0) - rvec_Add (atoms[i].f, __f[0]); - } + if (threadIdx.x == 0) + rvec_Add (atoms[i].f, __f[0]); + } - GLOBAL void Hydrogen_Bonds_HNbrs ( reax_atom *atoms, - single_body_parameters *sbp, - static_storage p_workspace, - list p_bonds, list p_hbonds, list p_far_nbrs, int N ) - { + GLOBAL void Hydrogen_Bonds_HNbrs ( reax_atom *atoms, + single_body_parameters *sbp, + static_storage p_workspace, + list p_bonds, list p_hbonds, list p_far_nbrs, int N ) + { - extern __shared__ rvec __f[]; - int i, pj,j; - int start, end; + extern __shared__ rvec __f[]; + int i, pj,j; + int start, end; - hbond_data *nbr_pj, *sym_index_nbr; - list *hbonds = &p_hbonds; - - i = blockIdx.x; + hbond_data *nbr_pj, *sym_index_nbr; + list *hbonds = &p_hbonds; + + i = blockIdx.x; - start = Start_Index (i, hbonds); - end = End_Index (i, hbonds); - pj = 
start + threadIdx.x; - - rvec_MakeZero (__f[threadIdx.x]); - - while (pj < end) - { - nbr_pj = &( hbonds->select.hbond_list[pj] ); - j = nbr_pj->nbr; - - sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); - rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); - - pj += blockDim.x; - } - - if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); - if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); - if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); - if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); - if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); - - if (threadIdx.x == 0) - rvec_Add (atoms[i].f, __f[0]); - } + start = Start_Index (i, hbonds); + end = End_Index (i, hbonds); + pj = start + threadIdx.x; + + rvec_MakeZero (__f[threadIdx.x]); + + while (pj < end) + { + nbr_pj = &( hbonds->select.hbond_list[pj] ); + j = nbr_pj->nbr; + + sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); + rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); + + pj += blockDim.x; + } + + if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); + if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); + if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); + if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); + if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); + + if (threadIdx.x == 0) + rvec_Add (atoms[i].f, __f[0]); + } diff --git a/PuReMD-GPU/src/traj.cu b/PuReMD-GPU/src/traj.cu index 5575c61a..97496e7f 100644 --- a/PuReMD-GPU/src/traj.cu +++ b/PuReMD-GPU/src/traj.cu @@ -27,418 +27,418 @@ /************************************************/ int Write_Custom_Header(reax_system *system, control_params *control, - static_storage *workspace, output_controls *out_control) + static_storage *workspace, output_controls *out_control) { - int i, header_len, 
control_block_len, frame_format_len; - // char buffer[2048]; - char control_block[2048]; - char frame_format[2048]; - char atom_format[100], bond_format[100], angle_format[100]; - - sprintf( control_block, CONTROL_BLOCK, - system->N, - control->restart, - control->restart_from, - control->random_vel, - out_control->restart_freq, - control->ensemble, - control->nsteps, - control->dt, - control->reposition_atoms, - control->restrict_bonds, - control->tabulate, - control->nbr_cut, - control->r_cut, - control->bg_cut, - control->bo_cut, - control->thb_cut, - control->hb_cut, - control->q_err, - control->T_init, - control->T_final, - control->Tau_T, - control->T_mode, - control->T_rate, - control->T_freq, - control->P[0], control->P[1], control->P[2], - control->Tau_P[0], control->Tau_P[1], control->Tau_P[2], - control->compressibility, - control->press_mode, - control->remove_CoM_vel, - out_control->write_steps, - out_control->traj_compress, - out_control->traj_format, - out_control->atom_format, - out_control->bond_info, - out_control->angle_info, - out_control->energy_update_freq, - control->molec_anal, - control->freq_molec_anal ); - - control_block_len = strlen( control_block ); - - - sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", - NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES ); - - atom_format[0] = OPT_NOATOM; - switch( out_control->atom_format ) - { - case OPT_ATOM_BASIC: sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC ); - break; - case OPT_ATOM_wF: sprintf( atom_format, "Atom_wF: %s", ATOM_wF ); - break; - case OPT_ATOM_wV: sprintf( atom_format, "Atom_wV: %s", ATOM_wV ); - break; - case OPT_ATOM_FULL: sprintf( atom_format, "Atom_Full: %s", ATOM_FULL ); - break; - } - strcat( frame_format, atom_format ); - - bond_format[0] = OPT_NOBOND; - if( out_control->bond_info == OPT_BOND_BASIC ) - sprintf( bond_format, "Bond_Line: %s", BOND_BASIC ); - else if( out_control->bond_info == OPT_BOND_FULL ) - sprintf( bond_format, "Bond_Line_Full: %s", 
BOND_FULL ); - strcat( frame_format, bond_format ); - - angle_format[0] = OPT_NOANGLE; - if( out_control->angle_info == OPT_ANGLE_BASIC ) - sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC ); - strcat( frame_format, angle_format ); - - frame_format_len = strlen( frame_format ); - - - header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2)+ - (frame_format_len + SIZE_INFO_LEN2) + - (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2); - - out_control->write( out_control->trj, HEADER_INIT, - header_len, HEADER_INIT_LEN, out_control->traj_title ); - - out_control->write( out_control->trj, SIZE_INFO_LINE2, - control_block_len + (frame_format_len + SIZE_INFO_LEN2) + - (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), - control_block_len ); - out_control->write( out_control->trj, "%s", control_block ); - - out_control->write( out_control->trj, SIZE_INFO_LINE2, - frame_format_len + - (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), - frame_format_len ); - out_control->write( out_control->trj, "%s", frame_format ); - - out_control->write( out_control->trj, SIZE_INFO_LINE2, - ATOM_MAPPING_LEN * system->N, - ATOM_MAPPING_LEN * system->N ); - - for( i = 0; i < system->N; ++i ) - out_control->write( out_control->trj, ATOM_MAPPING, - workspace->orig_id[i], - system->atoms[i].type, - system->atoms[i].name, - system->reaxprm.sbp[ system->atoms[i].type ].mass ); - - fflush( out_control->trj ); - - return 0; + int i, header_len, control_block_len, frame_format_len; + // char buffer[2048]; + char control_block[2048]; + char frame_format[2048]; + char atom_format[100], bond_format[100], angle_format[100]; + + sprintf( control_block, CONTROL_BLOCK, + system->N, + control->restart, + control->restart_from, + control->random_vel, + out_control->restart_freq, + control->ensemble, + control->nsteps, + control->dt, + control->reposition_atoms, + control->restrict_bonds, + control->tabulate, + control->nbr_cut, + control->r_cut, + control->bg_cut, + control->bo_cut, + 
control->thb_cut, + control->hb_cut, + control->q_err, + control->T_init, + control->T_final, + control->Tau_T, + control->T_mode, + control->T_rate, + control->T_freq, + control->P[0], control->P[1], control->P[2], + control->Tau_P[0], control->Tau_P[1], control->Tau_P[2], + control->compressibility, + control->press_mode, + control->remove_CoM_vel, + out_control->write_steps, + out_control->traj_compress, + out_control->traj_format, + out_control->atom_format, + out_control->bond_info, + out_control->angle_info, + out_control->energy_update_freq, + control->molec_anal, + control->freq_molec_anal ); + + control_block_len = strlen( control_block ); + + + sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", + NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES ); + + atom_format[0] = OPT_NOATOM; + switch( out_control->atom_format ) + { + case OPT_ATOM_BASIC: sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC ); + break; + case OPT_ATOM_wF: sprintf( atom_format, "Atom_wF: %s", ATOM_wF ); + break; + case OPT_ATOM_wV: sprintf( atom_format, "Atom_wV: %s", ATOM_wV ); + break; + case OPT_ATOM_FULL: sprintf( atom_format, "Atom_Full: %s", ATOM_FULL ); + break; + } + strcat( frame_format, atom_format ); + + bond_format[0] = OPT_NOBOND; + if( out_control->bond_info == OPT_BOND_BASIC ) + sprintf( bond_format, "Bond_Line: %s", BOND_BASIC ); + else if( out_control->bond_info == OPT_BOND_FULL ) + sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL ); + strcat( frame_format, bond_format ); + + angle_format[0] = OPT_NOANGLE; + if( out_control->angle_info == OPT_ANGLE_BASIC ) + sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC ); + strcat( frame_format, angle_format ); + + frame_format_len = strlen( frame_format ); + + + header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2)+ + (frame_format_len + SIZE_INFO_LEN2) + + (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2); + + out_control->write( out_control->trj, HEADER_INIT, + header_len, HEADER_INIT_LEN, 
out_control->traj_title ); + + out_control->write( out_control->trj, SIZE_INFO_LINE2, + control_block_len + (frame_format_len + SIZE_INFO_LEN2) + + (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), + control_block_len ); + out_control->write( out_control->trj, "%s", control_block ); + + out_control->write( out_control->trj, SIZE_INFO_LINE2, + frame_format_len + + (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), + frame_format_len ); + out_control->write( out_control->trj, "%s", frame_format ); + + out_control->write( out_control->trj, SIZE_INFO_LINE2, + ATOM_MAPPING_LEN * system->N, + ATOM_MAPPING_LEN * system->N ); + + for( i = 0; i < system->N; ++i ) + out_control->write( out_control->trj, ATOM_MAPPING, + workspace->orig_id[i], + system->atoms[i].type, + system->atoms[i].name, + system->reaxprm.sbp[ system->atoms[i].type ].mass ); + + fflush( out_control->trj ); + + return 0; } int Append_Custom_Frame( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, pi, pk, pk_j; - int write_atoms, write_bonds, write_angles; - int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len; - int frame_globals_len, num_bonds, num_thb_intrs; - real P; - char buffer[2048]; - list *bonds = (*lists) + BONDS; - list *thb_intrs = (*lists) + THREE_BODIES; - bond_data *bo_ij; - - - /* IMPORTANT: This whole part will go to init_trj after finalized! 
*/ - switch( out_control->atom_format ) - { - case OPT_ATOM_BASIC: - atom_line_len = ATOM_BASIC_LEN; - write_atoms = 1; - break; - case OPT_ATOM_wF: - atom_line_len = ATOM_wF_LEN; - write_atoms = 1; - break; - case OPT_ATOM_wV: - atom_line_len = ATOM_wV_LEN; - write_atoms = 1; - break; - case OPT_ATOM_FULL: - atom_line_len = ATOM_FULL_LEN; - write_atoms = 1; - break; - default: - atom_line_len = 0; - write_atoms = 0; - } - - - /* bond preparations */ - bond_line_len = write_bonds = 0; - if( out_control->bond_info == OPT_BOND_BASIC ) - { - bond_line_len = BOND_BASIC_LEN; - write_bonds = 1; - } - else if( out_control->bond_info == OPT_BOND_FULL ) - { - bond_line_len = BOND_FULL_LEN; - write_bonds = 1; - } + int i, j, pi, pk, pk_j; + int write_atoms, write_bonds, write_angles; + int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len; + int frame_globals_len, num_bonds, num_thb_intrs; + real P; + char buffer[2048]; + list *bonds = (*lists) + BONDS; + list *thb_intrs = (*lists) + THREE_BODIES; + bond_data *bo_ij; + + + /* IMPORTANT: This whole part will go to init_trj after finalized! 
*/ + switch( out_control->atom_format ) + { + case OPT_ATOM_BASIC: + atom_line_len = ATOM_BASIC_LEN; + write_atoms = 1; + break; + case OPT_ATOM_wF: + atom_line_len = ATOM_wF_LEN; + write_atoms = 1; + break; + case OPT_ATOM_wV: + atom_line_len = ATOM_wV_LEN; + write_atoms = 1; + break; + case OPT_ATOM_FULL: + atom_line_len = ATOM_FULL_LEN; + write_atoms = 1; + break; + default: + atom_line_len = 0; + write_atoms = 0; + } + + + /* bond preparations */ + bond_line_len = write_bonds = 0; + if( out_control->bond_info == OPT_BOND_BASIC ) + { + bond_line_len = BOND_BASIC_LEN; + write_bonds = 1; + } + else if( out_control->bond_info == OPT_BOND_FULL ) + { + bond_line_len = BOND_FULL_LEN; + write_bonds = 1; + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds); + fprintf (stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds); #endif - num_bonds = 0; - if( write_bonds ) - { + num_bonds = 0; + if( write_bonds ) + { #ifndef __PRINT_CPU_RESULTS__ - //fprintf (stderr, "Synching bonds from device for printing ....\n"); - Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); + //fprintf (stderr, "Synching bonds from device for printing ....\n"); + Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); #endif - for( i = 0; i < system->N; ++i ) - for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j ) - if( i < bonds->select.bond_list[j].nbr && - bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) - ++num_bonds; - } - - - /* angle preparations */ - if( out_control->angle_info == OPT_ANGLE_BASIC ) - { - angle_line_len = ANGLE_BASIC_LEN; - write_angles = 1; - } - else - { - angle_line_len = 0; - write_angles = 0; - } + for( i = 0; i < system->N; ++i ) + for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j ) + if( i < bonds->select.bond_list[j].nbr && + bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) + ++num_bonds; + } + + + /* angle preparations */ + if( 
out_control->angle_info == OPT_ANGLE_BASIC ) + { + angle_line_len = ANGLE_BASIC_LEN; + write_angles = 1; + } + else + { + angle_line_len = 0; + write_angles = 0; + } #ifdef __DEBUG_CUDA__ - fprintf (stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles ); + fprintf (stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles ); #endif - num_thb_intrs = 0; - if( write_angles ) { + num_thb_intrs = 0; + if( write_angles ) { #ifndef __PRINT_CPU_RESULTS__ - //fprintf (stderr, "Synching three bodies from deivce for printing ... \n"); - Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY ); - if ( !write_bonds) { - //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n"); - Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); - } + //fprintf (stderr, "Synching three bodies from deivce for printing ... \n"); + Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY ); + if ( !write_bonds) { + //fprintf (stderr, "Synching bonds for three bodies from device for printing ... 
\n"); + Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); + } #endif - for( j = 0; j < system->N; ++j ) - for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi ) - if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) - // physical j&i bond - for( pk = Start_Index( pi, thb_intrs ); - pk < End_Index( pi, thb_intrs ); ++pk ) - if( bonds->select.bond_list[pi].nbr < - thb_intrs->select.three_body_list[pk].thb ) { - // get k's pointer on j's bond list - pk_j = thb_intrs->select.three_body_list[pk].pthb; - - if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) - // physical j&k bond - ++num_thb_intrs; - } - } - - - - /* get correct pressure */ - if( control->ensemble == NPT || control->ensemble == sNPT ) - P = data->flex_bar.P_scalar; - else if( control->ensemble == iNPT ) - P = data->iso_bar.P; - else P = 0; - - - /* calculate total frame length*/ - sprintf( buffer, FRAME_GLOBALS, - data->step, data->time, - data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T, - P, system->box.volume, - system->box.box_norms[0], - system->box.box_norms[1], - system->box.box_norms[2], - 90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes! 
- data->E_BE, - data->E_Ov, data->E_Un, data->E_Lp, - data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB, - data->E_Tor, data->E_Con, - data->E_vdW, data->E_Ele, data->E_Pol ); - frame_globals_len = strlen( buffer ); - - frame_len = frame_globals_len + - write_atoms * SIZE_INFO_LEN3 + system->N * atom_line_len + - write_bonds * SIZE_INFO_LEN3 + num_bonds * bond_line_len + - write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len; - - - /* write size info & frame globals */ - out_control->write( out_control->trj, SIZE_INFO_LINE2, - frame_len, frame_globals_len ); - out_control->write( out_control->trj, "%s", buffer ); - - - /* write size info & atom lines */ - if( write_atoms ) - { - rest_of_frame_len = system->N * atom_line_len + - write_bonds * SIZE_INFO_LEN3 + num_bonds * bond_line_len + - write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len; - - out_control->write( out_control->trj, SIZE_INFO_LINE3, - rest_of_frame_len, system->N * atom_line_len, - system->N ); - } - - switch( out_control->atom_format ) - { - case 4: - for( i = 0; i < system->N; ++i ) - out_control->write( out_control->trj, ATOM_BASIC, - workspace->orig_id[i], - system->atoms[i].x[0], - system->atoms[i].x[1], - system->atoms[i].x[2], - system->atoms[i].q ); - break; - case 5: - for( i = 0; i < system->N; ++i ) - out_control->write( out_control->trj, ATOM_wF, - workspace->orig_id[i], - system->atoms[i].x[0], - system->atoms[i].x[1], - system->atoms[i].x[2], - system->atoms[i].f[0], - system->atoms[i].f[1], - system->atoms[i].f[2], - system->atoms[i].q ); - break; - case 6: - for( i = 0; i < system->N; ++i ) - out_control->write( out_control->trj, ATOM_wV, - workspace->orig_id[i], - system->atoms[i].x[0], - system->atoms[i].x[1], - system->atoms[i].x[2], - system->atoms[i].v[0], - system->atoms[i].v[1], - system->atoms[i].v[2], - system->atoms[i].q ); - break; - case 7: - for( i = 0; i < system->N; ++i ) - out_control->write( out_control->trj, ATOM_FULL, - workspace->orig_id[i], - 
system->atoms[i].x[0], - system->atoms[i].x[1], - system->atoms[i].x[2], - system->atoms[i].v[0], - system->atoms[i].v[1], - system->atoms[i].v[2], - system->atoms[i].f[0], - system->atoms[i].f[1], - system->atoms[i].f[2], - system->atoms[i].q ); - break; - } - fflush( out_control->trj ); - - - /* write size info & bond lines */ - if( write_bonds ) - { - rest_of_frame_len = num_bonds * bond_line_len + - write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len; - - out_control->write( out_control->trj, SIZE_INFO_LINE3, - rest_of_frame_len, num_bonds * bond_line_len, - num_bonds ); - } - - if( out_control->bond_info == 1 ) { - for( i = 0; i < system->N; ++i ) - for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j ) - if( i < bonds->select.bond_list[j].nbr && - bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) { - bo_ij = &( bonds->select.bond_list[j] ); - out_control->write( out_control->trj, BOND_BASIC, - workspace->orig_id[i], - workspace->orig_id[bo_ij->nbr], - bo_ij->d, bo_ij->bo_data.BO ); - } - } - else if( out_control->bond_info == 2 ) { - for( i = 0; i < system->N; ++i ) - for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j ) - if( i < bonds->select.bond_list[j].nbr && - bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) { - bo_ij = &( bonds->select.bond_list[j] ); - out_control->write( out_control->trj, BOND_FULL, - workspace->orig_id[i], - workspace->orig_id[bo_ij->nbr], - bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s, - bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 ); - } - } - - fflush( out_control->trj ); - - - /* write size info & angle lines */ - if( out_control->angle_info ) { - out_control->write( out_control->trj, SIZE_INFO_LINE3, - num_thb_intrs * angle_line_len, - num_thb_intrs * angle_line_len, num_thb_intrs ); - - for( j = 0; j < system->N; ++j ) - for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi ) - if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) - // physical j&i 
bond - for( pk = Start_Index( pi, thb_intrs ); - pk < End_Index( pi, thb_intrs ); ++pk ) - if( bonds->select.bond_list[pi].nbr < - thb_intrs->select.three_body_list[pk].thb ) { - pk_j = thb_intrs->select.three_body_list[pk].pthb; - // get k's pointer on j's bond list - - if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) - // physical j&k bond - out_control->write( out_control->trj, ANGLE_BASIC, - workspace->orig_id[bonds->select.bond_list[pi].nbr], - workspace->orig_id[j], - workspace->orig_id[thb_intrs->select.three_body_list[pk].thb], - RAD2DEG(thb_intrs->select.three_body_list[pk].theta) ); - } - } - - fflush( out_control->trj ); - - return 0; + for( j = 0; j < system->N; ++j ) + for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi ) + if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) + // physical j&i bond + for( pk = Start_Index( pi, thb_intrs ); + pk < End_Index( pi, thb_intrs ); ++pk ) + if( bonds->select.bond_list[pi].nbr < + thb_intrs->select.three_body_list[pk].thb ) { + // get k's pointer on j's bond list + pk_j = thb_intrs->select.three_body_list[pk].pthb; + + if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) + // physical j&k bond + ++num_thb_intrs; + } + } + + + + /* get correct pressure */ + if( control->ensemble == NPT || control->ensemble == sNPT ) + P = data->flex_bar.P_scalar; + else if( control->ensemble == iNPT ) + P = data->iso_bar.P; + else P = 0; + + + /* calculate total frame length*/ + sprintf( buffer, FRAME_GLOBALS, + data->step, data->time, + data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T, + P, system->box.volume, + system->box.box_norms[0], + system->box.box_norms[1], + system->box.box_norms[2], + 90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes! 
+ data->E_BE, + data->E_Ov, data->E_Un, data->E_Lp, + data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB, + data->E_Tor, data->E_Con, + data->E_vdW, data->E_Ele, data->E_Pol ); + frame_globals_len = strlen( buffer ); + + frame_len = frame_globals_len + + write_atoms * SIZE_INFO_LEN3 + system->N * atom_line_len + + write_bonds * SIZE_INFO_LEN3 + num_bonds * bond_line_len + + write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len; + + + /* write size info & frame globals */ + out_control->write( out_control->trj, SIZE_INFO_LINE2, + frame_len, frame_globals_len ); + out_control->write( out_control->trj, "%s", buffer ); + + + /* write size info & atom lines */ + if( write_atoms ) + { + rest_of_frame_len = system->N * atom_line_len + + write_bonds * SIZE_INFO_LEN3 + num_bonds * bond_line_len + + write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len; + + out_control->write( out_control->trj, SIZE_INFO_LINE3, + rest_of_frame_len, system->N * atom_line_len, + system->N ); + } + + switch( out_control->atom_format ) + { + case 4: + for( i = 0; i < system->N; ++i ) + out_control->write( out_control->trj, ATOM_BASIC, + workspace->orig_id[i], + system->atoms[i].x[0], + system->atoms[i].x[1], + system->atoms[i].x[2], + system->atoms[i].q ); + break; + case 5: + for( i = 0; i < system->N; ++i ) + out_control->write( out_control->trj, ATOM_wF, + workspace->orig_id[i], + system->atoms[i].x[0], + system->atoms[i].x[1], + system->atoms[i].x[2], + system->atoms[i].f[0], + system->atoms[i].f[1], + system->atoms[i].f[2], + system->atoms[i].q ); + break; + case 6: + for( i = 0; i < system->N; ++i ) + out_control->write( out_control->trj, ATOM_wV, + workspace->orig_id[i], + system->atoms[i].x[0], + system->atoms[i].x[1], + system->atoms[i].x[2], + system->atoms[i].v[0], + system->atoms[i].v[1], + system->atoms[i].v[2], + system->atoms[i].q ); + break; + case 7: + for( i = 0; i < system->N; ++i ) + out_control->write( out_control->trj, ATOM_FULL, + workspace->orig_id[i], + 
system->atoms[i].x[0], + system->atoms[i].x[1], + system->atoms[i].x[2], + system->atoms[i].v[0], + system->atoms[i].v[1], + system->atoms[i].v[2], + system->atoms[i].f[0], + system->atoms[i].f[1], + system->atoms[i].f[2], + system->atoms[i].q ); + break; + } + fflush( out_control->trj ); + + + /* write size info & bond lines */ + if( write_bonds ) + { + rest_of_frame_len = num_bonds * bond_line_len + + write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len; + + out_control->write( out_control->trj, SIZE_INFO_LINE3, + rest_of_frame_len, num_bonds * bond_line_len, + num_bonds ); + } + + if( out_control->bond_info == 1 ) { + for( i = 0; i < system->N; ++i ) + for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j ) + if( i < bonds->select.bond_list[j].nbr && + bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) { + bo_ij = &( bonds->select.bond_list[j] ); + out_control->write( out_control->trj, BOND_BASIC, + workspace->orig_id[i], + workspace->orig_id[bo_ij->nbr], + bo_ij->d, bo_ij->bo_data.BO ); + } + } + else if( out_control->bond_info == 2 ) { + for( i = 0; i < system->N; ++i ) + for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j ) + if( i < bonds->select.bond_list[j].nbr && + bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) { + bo_ij = &( bonds->select.bond_list[j] ); + out_control->write( out_control->trj, BOND_FULL, + workspace->orig_id[i], + workspace->orig_id[bo_ij->nbr], + bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s, + bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 ); + } + } + + fflush( out_control->trj ); + + + /* write size info & angle lines */ + if( out_control->angle_info ) { + out_control->write( out_control->trj, SIZE_INFO_LINE3, + num_thb_intrs * angle_line_len, + num_thb_intrs * angle_line_len, num_thb_intrs ); + + for( j = 0; j < system->N; ++j ) + for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi ) + if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) + // physical j&i 
bond + for( pk = Start_Index( pi, thb_intrs ); + pk < End_Index( pi, thb_intrs ); ++pk ) + if( bonds->select.bond_list[pi].nbr < + thb_intrs->select.three_body_list[pk].thb ) { + pk_j = thb_intrs->select.three_body_list[pk].pthb; + // get k's pointer on j's bond list + + if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) + // physical j&k bond + out_control->write( out_control->trj, ANGLE_BASIC, + workspace->orig_id[bonds->select.bond_list[pi].nbr], + workspace->orig_id[j], + workspace->orig_id[thb_intrs->select.three_body_list[pk].thb], + RAD2DEG(thb_intrs->select.three_body_list[pk].theta) ); + } + } + + fflush( out_control->trj ); + + return 0; } /* @@ -480,35 +480,35 @@ gzclose( out_control->trj ); /********************************************************/ int Write_xyz_Header( reax_system *system, control_params *control, - static_storage* workspace, output_controls *out_control ) + static_storage* workspace, output_controls *out_control ) { - fflush( out_control->trj ); + fflush( out_control->trj ); - return 1; + return 1; } int Append_xyz_Frame( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i; + int i; - out_control->write( out_control->trj, "%d\n", system->N ); + out_control->write( out_control->trj, "%d\n", system->N ); - out_control->write( out_control->trj, "%d\t%8.3f\t%8.3f\t%8.3f\t%8.3f\n", - data->step, - data->E_Tot, data->E_Pot, - E_CONV*data->E_Kin, data->therm.T ); + out_control->write( out_control->trj, "%d\t%8.3f\t%8.3f\t%8.3f\t%8.3f\n", + data->step, + data->E_Tot, data->E_Pot, + E_CONV*data->E_Kin, data->therm.T ); - for( i = 0; i < system->N; ++i ) - out_control->write( out_control->trj, "%3s %10.5f %10.5f %10.5f\n", - system->reaxprm.sbp[ system->atoms[i].type ].name, - system->atoms[i].x[0], - system->atoms[i].x[1], - 
system->atoms[i].x[2] ); + for( i = 0; i < system->N; ++i ) + out_control->write( out_control->trj, "%3s %10.5f %10.5f %10.5f\n", + system->reaxprm.sbp[ system->atoms[i].type ].name, + system->atoms[i].x[0], + system->atoms[i].x[1], + system->atoms[i].x[2] ); - fflush( out_control->trj ); + fflush( out_control->trj ); - return 1; + return 1; } diff --git a/PuReMD-GPU/src/two_body_interactions.cu b/PuReMD-GPU/src/two_body_interactions.cu index f1f5a18c..f53b0cfb 100644 --- a/PuReMD-GPU/src/two_body_interactions.cu +++ b/PuReMD-GPU/src/two_body_interactions.cu @@ -29,126 +29,126 @@ void Bond_Energy( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - real ebond, pow_BOs_be2, exp_be12, CEbo; - real gp3, gp4, gp7, gp10, gp37; - real exphu, exphua1, exphub1, exphuov, hulpov, estriph; - real decobdbo, decobdboua, decobdboub; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_order_data *bo_ij; - list *bonds; - - bonds = (*lists) + BONDS; - gp3 = system->reaxprm.gp.l[3]; - gp4 = system->reaxprm.gp.l[4]; - gp7 = system->reaxprm.gp.l[7]; - gp10 = system->reaxprm.gp.l[10]; - gp37 = (int) system->reaxprm.gp.l[37]; - - for( i=0; i < system->N; ++i ) { - start_i = Start_Index(i, bonds); - end_i = End_Index(i, bonds); - //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); - for( pj = start_i; pj < end_i; ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - /* set the pointers */ - j = bonds->select.bond_list[pj].nbr; - type_i = system->atoms[i].type; - type_j = system->atoms[j].type; - sbp_i = &( system->reaxprm.sbp[type_i] ); - sbp_j = &( system->reaxprm.sbp[type_j] ); - twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] ); - bo_ij = &( 
bonds->select.bond_list[pj].bo_data ); - - /* calculate the constants */ - pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); - exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); - CEbo = -twbp->De_s * exp_be12 * - ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); - - /* calculate the Bond Energy */ - ebond = - -twbp->De_s * bo_ij->BO_s * exp_be12 - -twbp->De_p * bo_ij->BO_pi - -twbp->De_pp * bo_ij->BO_pi2; - - data->E_BE += ebond; - - /* calculate derivatives of Bond Orders */ - bo_ij->Cdbo += CEbo; - bo_ij->Cdbopi -= (CEbo + twbp->De_p); - bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + real ebond, pow_BOs_be2, exp_be12, CEbo; + real gp3, gp4, gp7, gp10, gp37; + real exphu, exphua1, exphub1, exphuov, hulpov, estriph; + real decobdbo, decobdboua, decobdboub; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_order_data *bo_ij; + list *bonds; + + bonds = (*lists) + BONDS; + gp3 = system->reaxprm.gp.l[3]; + gp4 = system->reaxprm.gp.l[4]; + gp7 = system->reaxprm.gp.l[7]; + gp10 = system->reaxprm.gp.l[10]; + gp37 = (int) system->reaxprm.gp.l[37]; + + for( i=0; i < system->N; ++i ) { + start_i = Start_Index(i, bonds); + end_i = End_Index(i, bonds); + //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); + for( pj = start_i; pj < end_i; ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + /* set the pointers */ + j = bonds->select.bond_list[pj].nbr; + type_i = system->atoms[i].type; + type_j = system->atoms[j].type; + sbp_i = &( system->reaxprm.sbp[type_i] ); + sbp_j = &( system->reaxprm.sbp[type_j] ); + twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] ); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + + /* calculate the constants */ + pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); + exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); + CEbo = -twbp->De_s * exp_be12 * + ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); + + /* calculate the 
Bond Energy */ + ebond = + -twbp->De_s * bo_ij->BO_s * exp_be12 + -twbp->De_p * bo_ij->BO_pi + -twbp->De_pp * bo_ij->BO_pi2; + + data->E_BE += ebond; + + /* calculate derivatives of Bond Orders */ + bo_ij->Cdbo += CEbo; + bo_ij->Cdbopi -= (CEbo + twbp->De_p); + bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); #ifdef TEST_ENERGY - fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - // i+1, j+1, - bo_ij->BO, ebond/*, data->E_BE*/ ); - /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", - workspace->orig_id[i], workspace->orig_id[j], - CEbo, -twbp->De_p, -twbp->De_pp );*/ + fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + // i+1, j+1, + bo_ij->BO, ebond/*, data->E_BE*/ ); + /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", + workspace->orig_id[i], workspace->orig_id[j], + CEbo, -twbp->De_p, -twbp->De_pp );*/ #endif #ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); - Add_dBOpinpi2( system, lists, i, pj, - -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), - workspace->f_be, workspace->f_be ); + Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); + Add_dBOpinpi2( system, lists, i, pj, + -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), + workspace->f_be, workspace->f_be ); #endif - /* Stabilisation terminal triple bond */ - if( bo_ij->BO >= 1.00 ) { - if( gp37 == 2 || - (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || - (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { - // ba = SQR(bo_ij->BO - 2.50); - exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); - //oboa=abo(j1)-boa; - //obob=abo(j2)-boa; - exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); - exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); - //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); - exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); - hulpov = 1.0 / (1.0 + 25.0 * exphuov); - - estriph = gp10 * exphu * hulpov * (exphua1 + 
exphub1); - //estrain(j1) = estrain(j1) + 0.50*estriph; - //estrain(j2) = estrain(j2) + 0.50*estriph; - data->E_BE += estriph; - - decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * - ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); - decobdboua = -gp10 * exphu * hulpov * - (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - decobdboub = -gp10 * exphu * hulpov * - (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - - bo_ij->Cdbo += decobdbo; - workspace->CdDelta[i] += decobdboua; - workspace->CdDelta[j] += decobdboub; - //loop_j ++; - //fprintf (stderr, "incrementing loopj %d \n", loop_j); + /* Stabilisation terminal triple bond */ + if( bo_ij->BO >= 1.00 ) { + if( gp37 == 2 || + (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || + (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { + // ba = SQR(bo_ij->BO - 2.50); + exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); + //oboa=abo(j1)-boa; + //obob=abo(j2)-boa; + exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); + exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); + //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); + exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); + hulpov = 1.0 / (1.0 + 25.0 * exphuov); + + estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); + //estrain(j1) = estrain(j1) + 0.50*estriph; + //estrain(j2) = estrain(j2) + 0.50*estriph; + data->E_BE += estriph; + + decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * + ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); + decobdboua = -gp10 * exphu * hulpov * + (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + decobdboub = -gp10 * exphu * hulpov * + (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + + bo_ij->Cdbo += decobdbo; + workspace->CdDelta[i] += decobdboua; + workspace->CdDelta[j] += decobdboub; + //loop_j ++; + //fprintf (stderr, "incrementing loopj %d \n", loop_j); #ifdef TEST_ENERGY - fprintf( out_control->ebond, - "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - 
workspace->orig_id[i], workspace->orig_id[j], - //i+1, j+1, - estriph, decobdbo, decobdboua, decobdboub ); + fprintf( out_control->ebond, + "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + //i+1, j+1, + estriph, decobdbo, decobdboua, decobdboub ); #endif #ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); - Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); - Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); + Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); + Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); + Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); #endif - } - } - } - } + } + } + } + } } @@ -158,361 +158,361 @@ void Bond_Energy( reax_system *system, control_params *control, GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, - single_body_parameters *sbp, two_body_parameters *tbp, - simulation_data *data, - static_storage p_workspace, list p_bonds, - int N, int num_atom_types, real *E_BE) + single_body_parameters *sbp, two_body_parameters *tbp, + simulation_data *data, + static_storage p_workspace, list p_bonds, + int N, int num_atom_types, real *E_BE) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - real ebond, pow_BOs_be2, exp_be12, CEbo; - real gp3, gp4, gp7, gp10, gp37; - real exphu, exphua1, exphub1, exphuov, hulpov, estriph; - real decobdbo, decobdboua, decobdboub; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_order_data *bo_ij; - list *bonds; - static_storage *workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - bonds = &p_bonds; - workspace = &p_workspace; - - gp3 = g_params.l[3]; - gp4 = g_params.l[4]; - gp7 = g_params.l[7]; - gp10 = g_params.l[10]; - gp37 = (int) g_params.l[37]; - - //for( i=0; i < system->N; ++i ) - start_i = Start_Index(i, bonds); - end_i = End_Index(i, bonds); - //fprintf( stderr, "i=%d start=%d 
end=%d\n", i, start_i, end_i ); - for( pj = start_i; pj < end_i; ++pj ) - { - //TODO - //if( i < bonds->select.bond_list[pj].nbr ) - if( i < bonds->select.bond_list[pj].nbr ) - { - //TODO - /* set the pointers */ - j = bonds->select.bond_list[pj].nbr; - type_i = atoms[i].type; - type_j = atoms[j].type; - sbp_i = &( sbp[type_i] ); - sbp_j = &( sbp[type_j] ); - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - - /* calculate the constants */ - pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); - exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); - CEbo = -twbp->De_s * exp_be12 * - ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); - - /* calculate the Bond Energy */ - ebond = - -twbp->De_s * bo_ij->BO_s * exp_be12 - -twbp->De_p * bo_ij->BO_pi - -twbp->De_pp * bo_ij->BO_pi2; - - //PERFORMANCE IMAPCT - //atomicAdd (&data->E_BE, ebond); - //TODO - //E_BE [ i ] += ebond/2.0; - E_BE [ i ] += ebond; - //data->E_BE += ebond; - - /* calculate derivatives of Bond Orders */ - bo_ij->Cdbo += CEbo; - bo_ij->Cdbopi -= (CEbo + twbp->De_p); - bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + real ebond, pow_BOs_be2, exp_be12, CEbo; + real gp3, gp4, gp7, gp10, gp37; + real exphu, exphua1, exphub1, exphuov, hulpov, estriph; + real decobdbo, decobdboua, decobdboub; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_order_data *bo_ij; + list *bonds; + static_storage *workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + bonds = &p_bonds; + workspace = &p_workspace; + + gp3 = g_params.l[3]; + gp4 = g_params.l[4]; + gp7 = g_params.l[7]; + gp10 = g_params.l[10]; + gp37 = (int) g_params.l[37]; + + //for( i=0; i < system->N; ++i ) + start_i = Start_Index(i, bonds); + end_i = End_Index(i, bonds); + //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); + for( pj = start_i; pj < end_i; ++pj ) + { + //TODO + 
//if( i < bonds->select.bond_list[pj].nbr ) + if( i < bonds->select.bond_list[pj].nbr ) + { + //TODO + /* set the pointers */ + j = bonds->select.bond_list[pj].nbr; + type_i = atoms[i].type; + type_j = atoms[j].type; + sbp_i = &( sbp[type_i] ); + sbp_j = &( sbp[type_j] ); + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + + /* calculate the constants */ + pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); + exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); + CEbo = -twbp->De_s * exp_be12 * + ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); + + /* calculate the Bond Energy */ + ebond = + -twbp->De_s * bo_ij->BO_s * exp_be12 + -twbp->De_p * bo_ij->BO_pi + -twbp->De_pp * bo_ij->BO_pi2; + + //PERFORMANCE IMAPCT + //atomicAdd (&data->E_BE, ebond); + //TODO + //E_BE [ i ] += ebond/2.0; + E_BE [ i ] += ebond; + //data->E_BE += ebond; + + /* calculate derivatives of Bond Orders */ + bo_ij->Cdbo += CEbo; + bo_ij->Cdbopi -= (CEbo + twbp->De_p); + bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); #ifdef TEST_ENERGY - //TODO - //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", - // workspace->orig_id[i], workspace->orig_id[j], - // i+1, j+1, - // bo_ij->BO, ebond/*, data->E_BE*/ ); - /* - fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", - workspace->orig_id[i], workspace->orig_id[j], - CEbo, -twbp->De_p, -twbp->De_pp );*/ + //TODO + //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", + // workspace->orig_id[i], workspace->orig_id[j], + // i+1, j+1, + // bo_ij->BO, ebond/*, data->E_BE*/ ); + /* + fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", + workspace->orig_id[i], workspace->orig_id[j], + CEbo, -twbp->De_p, -twbp->De_pp );*/ #endif #ifdef TEST_FORCES - //TODO - /* - Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); - Add_dBOpinpi2( system, lists, i, pj, - -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), - workspace->f_be, workspace->f_be ); - */ - //TODO + //TODO + /* + Add_dBO( 
system, lists, i, pj, CEbo, workspace->f_be ); + Add_dBOpinpi2( system, lists, i, pj, + -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), + workspace->f_be, workspace->f_be ); + */ + //TODO #endif - /* Stabilisation terminal triple bond */ - if( bo_ij->BO >= 1.00 ) { - if( gp37 == 2 || - (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || - (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { - // ba = SQR(bo_ij->BO - 2.50); - exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); - //oboa=abo(j1)-boa; - //obob=abo(j2)-boa; - exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); - exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); - //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); - exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); - hulpov = 1.0 / (1.0 + 25.0 * exphuov); - - estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); - //estrain(j1) = estrain(j1) + 0.50*estriph; - //estrain(j2) = estrain(j2) + 0.50*estriph; - - //PERFORMANCE IMPACT - //atomicAdd (&data->E_BE, estriph); - E_BE [ i] += estriph; - //data->E_BE += estriph; - - decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * - ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); - decobdboua = -gp10 * exphu * hulpov * - (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - decobdboub = -gp10 * exphu * hulpov * - (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - - bo_ij->Cdbo += decobdbo; - - //PERFORMANCE IMAPCT - workspace->CdDelta[i] += decobdboua; - //atomicAdd (&workspace->CdDelta[j], decobdboub); - //CdDelta [ i * N + i ] += decobdboua; - //CdDelta [ i * N + j ] += decobdboua; - //workspace->CdDelta [i] += decobdboua; - //workspace->CdDelta [j] += decobdboub; + /* Stabilisation terminal triple bond */ + if( bo_ij->BO >= 1.00 ) { + if( gp37 == 2 || + (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || + (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { + // ba = SQR(bo_ij->BO - 2.50); + exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); + //oboa=abo(j1)-boa; + 
//obob=abo(j2)-boa; + exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); + exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); + //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); + exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); + hulpov = 1.0 / (1.0 + 25.0 * exphuov); + + estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); + //estrain(j1) = estrain(j1) + 0.50*estriph; + //estrain(j2) = estrain(j2) + 0.50*estriph; + + //PERFORMANCE IMPACT + //atomicAdd (&data->E_BE, estriph); + E_BE [ i] += estriph; + //data->E_BE += estriph; + + decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * + ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); + decobdboua = -gp10 * exphu * hulpov * + (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + decobdboub = -gp10 * exphu * hulpov * + (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + + bo_ij->Cdbo += decobdbo; + + //PERFORMANCE IMAPCT + workspace->CdDelta[i] += decobdboua; + //atomicAdd (&workspace->CdDelta[j], decobdboub); + //CdDelta [ i * N + i ] += decobdboua; + //CdDelta [ i * N + j ] += decobdboua; + //workspace->CdDelta [i] += decobdboua; + //workspace->CdDelta [j] += decobdboub; #ifdef TEST_ENERGY - /* - fprintf( out_control->ebond, - "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - //i+1, j+1, - estriph, decobdbo, decobdboua, decobdboub ); - */ + /* + fprintf( out_control->ebond, + "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + //i+1, j+1, + estriph, decobdbo, decobdboua, decobdboub ); + */ #endif #ifdef TEST_FORCES - /* - Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); - Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); - Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); - */ + /* + Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); + Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); + Add_dDelta( system, lists, j, decobdboub, 
workspace->f_be ); + */ #endif - } - } - } - } //TODO commented out the if statement for processing i < j. - // we process all teh bonds and add only half the energy + } + } + } + } //TODO commented out the if statement for processing i < j. + // we process all teh bonds and add only half the energy } void vdW_Coulomb_Energy( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { - int i, j, pj; - int start_i, end_i; - real self_coef; - real p_vdW1, p_vdW1i; - real powr_vdW1, powgi_vdW1; - real tmp, r_ij, fn13, exp1, exp2; - real Tap, dTap, dfn13, CEvd, CEclmb; - real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core, de_core; - rvec temp, ext_press; - // rtensor temp_rtensor, total_rtensor; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - list *far_nbrs; - - p_vdW1 = system->reaxprm.gp.l[28]; - p_vdW1i = 1.0 / p_vdW1; - far_nbrs = (*lists) + FAR_NBRS; - e_ele = 0; - e_vdW = 0; - e_core = 0; - de_core = 0; - - for( i = 0; i < system->N; ++i ) { - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - // fprintf( stderr, "i: %d, start: %d, end: %d\n", - // i, start_i, end_i ); - - for( pj = start_i; pj < end_i; ++pj ) - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - r_ij = nbr_pj->d; - twbp = &(system->reaxprm.tbp[ index_tbp (system->atoms[i].type, system->atoms[j].type, &system->reaxprm) ]); - self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! 
- - /* Calculate Taper and its derivative */ - // Tap = nbr_pj->Tap; -- precomputed during compte_H - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; - dTap = dTap * r_ij + 5*control->Tap5; - dTap = dTap * r_ij + 4*control->Tap4; - dTap = dTap * r_ij + 3*control->Tap3; - dTap = dTap * r_ij + 2*control->Tap2; - dTap += control->Tap1/r_ij; - - /*vdWaals Calculations*/ - if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - data->E_vdW += e_vdW = - self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) * dfn13 ); - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - data->E_vdW += e_vdW = - self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) ); - } - - if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - e_vdW += self_coef * Tap * e_core; - data->E_vdW += self_coef * Tap * e_core; - - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += self_coef * ( dTap * e_core + Tap * de_core ); - } - - 
/*Coulomb Calculations*/ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - tmp = Tap / dr3gamij_3; - //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H - data->E_Ele += e_ele = - self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp; - - - CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * - ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* - ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/ - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - rvec_ScaledAdd( system->atoms[i].f, - -(CEvd+CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( system->atoms[j].f, - +(CEvd+CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - /* for pressure coupling, terms not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - rvec_ScaledAdd( system->atoms[i].f, -1., temp ); - rvec_Add( system->atoms[j].f, temp ); - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - - /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", - i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] ); - - fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] ); - - fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ - - /* This part is intended for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, - system->atoms[i].x ); - rtensor_Scale( total_rtensor, - F_C * -(CEvd + CEclmb), temp_rtensor ); - rvec_OuterProduct( temp_rtensor, - nbr_pj->dvec, system->atoms[j].x ); - rtensor_ScaledAdd( total_rtensor, - F_C * +(CEvd + CEclmb), temp_rtensor ); - - if( nbr_pj->imaginary ) - // This is an external force due to an imaginary nbr - 
rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); - else - // This interaction is completely internal - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } + int i, j, pj; + int start_i, end_i; + real self_coef; + real p_vdW1, p_vdW1i; + real powr_vdW1, powgi_vdW1; + real tmp, r_ij, fn13, exp1, exp2; + real Tap, dTap, dfn13, CEvd, CEclmb; + real dr3gamij_1, dr3gamij_3; + real e_ele, e_vdW, e_core, de_core; + rvec temp, ext_press; + // rtensor temp_rtensor, total_rtensor; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + list *far_nbrs; + + p_vdW1 = system->reaxprm.gp.l[28]; + p_vdW1i = 1.0 / p_vdW1; + far_nbrs = (*lists) + FAR_NBRS; + e_ele = 0; + e_vdW = 0; + e_core = 0; + de_core = 0; + + for( i = 0; i < system->N; ++i ) { + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + // fprintf( stderr, "i: %d, start: %d, end: %d\n", + // i, start_i, end_i ); + + for( pj = start_i; pj < end_i; ++pj ) + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + r_ij = nbr_pj->d; + twbp = &(system->reaxprm.tbp[ index_tbp (system->atoms[i].type, system->atoms[j].type, &system->reaxprm) ]); + self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! 
+ + /* Calculate Taper and its derivative */ + // Tap = nbr_pj->Tap; -- precomputed during compte_H + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; + dTap = dTap * r_ij + 5*control->Tap5; + dTap = dTap * r_ij + 4*control->Tap4; + dTap = dTap * r_ij + 3*control->Tap3; + dTap = dTap * r_ij + 2*control->Tap2; + dTap += control->Tap1/r_ij; + + /*vdWaals Calculations*/ + if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) { + // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + data->E_vdW += e_vdW = + self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 - 2.0); + + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) * dfn13 ); + } + else{ // no shielding + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + + data->E_vdW += e_vdW = + self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) ); + } + + if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) { + // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); + e_vdW += self_coef * Tap * e_core; + data->E_vdW += self_coef * Tap * e_core; + + de_core = -(twbp->acore/twbp->rcore) * e_core; + CEvd += self_coef * ( dTap * e_core + Tap * de_core ); + } + + 
/*Coulomb Calculations*/ + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + tmp = Tap / dr3gamij_3; + //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H + data->E_Ele += e_ele = + self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp; + + + CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * + ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; + /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* + ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/ + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + rvec_ScaledAdd( system->atoms[i].f, + -(CEvd+CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( system->atoms[j].f, + +(CEvd+CEclmb), nbr_pj->dvec ); + } + else { // NPT, iNPT or sNPT + /* for pressure coupling, terms not related to bond order + derivatives are added directly into pressure vector/tensor */ + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + + rvec_ScaledAdd( system->atoms[i].f, -1., temp ); + rvec_Add( system->atoms[j].f, temp ); + + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + + /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", + i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] ); + + fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] ); + + fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ + + /* This part is intended for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, + system->atoms[i].x ); + rtensor_Scale( total_rtensor, + F_C * -(CEvd + CEclmb), temp_rtensor ); + rvec_OuterProduct( temp_rtensor, + nbr_pj->dvec, system->atoms[j].x ); + rtensor_ScaledAdd( total_rtensor, + F_C * +(CEvd + CEclmb), temp_rtensor ); + + if( nbr_pj->imaginary ) + // This is an external force due to an imaginary nbr + 
rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); + else + // This interaction is completely internal + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } #ifdef TEST_ENERGY - rvec_MakeZero( temp ); - rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec ); - fprintf( out_control->evdw, - "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - //i+1, j+1, - MIN( workspace->orig_id[i], workspace->orig_id[j] ), - MAX( workspace->orig_id[i], workspace->orig_id[j] ), - r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ ); - - fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - MIN( workspace->orig_id[i], workspace->orig_id[j] ), - MAX( workspace->orig_id[i], workspace->orig_id[j] ), - r_ij, system->atoms[i].q, system->atoms[j].q, - e_ele/*, data->E_Ele*/ ); + rvec_MakeZero( temp ); + rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec ); + fprintf( out_control->evdw, + "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + //i+1, j+1, + MIN( workspace->orig_id[i], workspace->orig_id[j] ), + MAX( workspace->orig_id[i], workspace->orig_id[j] ), + r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ ); + + fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + MIN( workspace->orig_id[i], workspace->orig_id[j] ), + MAX( workspace->orig_id[i], workspace->orig_id[j] ), + r_ij, system->atoms[i].q, system->atoms[j].q, + e_ele/*, data->E_Ele*/ ); #endif #ifdef TEST_FORCES - rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); #endif - } - } + } + } - // fclose( fout ); + // fclose( fout ); - // fprintf( stderr, 
"nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", - // data->ext_press[0], data->ext_press[1], data->ext_press[2] ); + // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", + // data->ext_press[0], data->ext_press[1], data->ext_press[2] ); } /* - GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, + GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, two_body_parameters *tbp, global_parameters g_p, control_params *control, @@ -583,47 +583,47 @@ dTap += control->Tap1/r_ij; //vdWaals Calculations if(g_p.vdw_type==1 || g_p.vdw_type==3) { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - E_vdW [i] += e_vdW / 2.0; + e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + E_vdW [i] += e_vdW / 2.0; - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 - 2.0); - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) * dfn13 ); + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) * dfn13 ); } else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - e_vdW = 
self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - E_vdW [i] += e_vdW / 2.0; + e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + E_vdW [i] += e_vdW / 2.0; - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) ); + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) ); } if(g_p.vdw_type==2 || g_p.vdw_type==3) { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - e_vdW = self_coef * Tap * e_core; + // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); + e_vdW = self_coef * Tap * e_core; - //TODO check this - E_vdW [i] += e_vdW / 2.0; - //TODO check this + //TODO check this + E_vdW [i] += e_vdW / 2.0; + //TODO check this - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += self_coef * ( dTap * e_core + Tap * de_core ); + de_core = -(twbp->acore/twbp->rcore) * e_core; + CEvd += self_coef * ( dTap * e_core + Tap * de_core ); } //Coulomb Calculations @@ -642,27 +642,27 @@ CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q * // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3; if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - if (i >= j) - rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); - else - rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); + if (i >= j) + rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); + else + rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); } else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + // for pressure coupling, terms not related to bond order + // derivatives are added directly into pressure vector/tensor + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - if ( i 
>= j) - rvec_ScaledAdd( atoms[i].f, -1., temp ); - else - rvec_Add( atoms[i].f, temp ); + if ( i >= j) + rvec_ScaledAdd( atoms[i].f, -1., temp ); + else + rvec_Add( atoms[i].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - //rvec_Add( data->ext_press, ext_press ); - rvec_Copy (aux_ext_press[i], ext_press); + //rvec_Add( data->ext_press, ext_press ); + rvec_Copy (aux_ext_press[i], ext_press); - //TODO CHECK THIS calculation here, it should be divided by two somehow. + //TODO CHECK THIS calculation here, it should be divided by two somehow. } } //} @@ -673,921 +673,921 @@ else { // NPT, iNPT or sNPT -GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, - two_body_parameters *tbp, - global_parameters g_p, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - int num_atom_types, int N ) +GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, + two_body_parameters *tbp, + global_parameters g_p, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + int num_atom_types, int N ) { - extern __shared__ real _vdw[]; - extern __shared__ real _ele[]; - extern __shared__ rvec _force []; - - real *sh_vdw; - real *sh_ele; - rvec *sh_force; - - int i, j, pj; - int start_i, end_i; - real self_coef; - real p_vdW1, p_vdW1i; - real powr_vdW1, powgi_vdW1; - real tmp, r_ij, fn13, exp1, exp2; - real Tap, dTap, dfn13, CEvd, CEclmb; - real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core, de_core; - rvec temp, ext_press; - // rtensor temp_rtensor, total_rtensor; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - list *far_nbrs = &p_far_nbrs; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warpid = thread_id / VDW_THREADS_PER_ATOM; - int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); - - i = warpid; - - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - 
sh_force = (rvec *)( _vdw + 2*blockDim.x); - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); - - if (i < N) - { - - p_vdW1 = g_p.l[28]; - p_vdW1i = 1.0 / p_vdW1; - e_ele = 0; - e_vdW = 0; - e_core = 0; - de_core = 0; - - //for( i = 0; i < system->N; ++i ) { - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - // fprintf( stderr, "i: %d, start: %d, end: %d\n", - // i, start_i, end_i ); - - pj = start_i + laneid; - //for( pj = start_i; pj < end_i; ++pj ) - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - r_ij = nbr_pj->d; - twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]); - self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! - - //CHANGE ORIGINAL - //if (i <= j) continue; - //CHANGE ORIGINAL - - // Calculate Taper and its derivative - // Tap = nbr_pj->Tap; -- precomputed during compte_H - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; - dTap = dTap * r_ij + 5*control->Tap5; - dTap = dTap * r_ij + 4*control->Tap4; - dTap = dTap * r_ij + 3*control->Tap3; - dTap = dTap * r_ij + 2*control->Tap2; - dTap += control->Tap1/r_ij; - - //vdWaals Calculations - if(g_p.vdw_type==1 || g_p.vdw_type==3) { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - - //E_vdW [i] += e_vdW / 2.0; - sh_vdw [threadIdx.x] += e_vdW/2.0; - 
- dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) * dfn13 ); - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - - //E_vdW [i] += e_vdW / 2.0; - sh_vdw [threadIdx.x] += e_vdW/2.0; - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) ); - } - - if(g_p.vdw_type==2 || g_p.vdw_type==3) { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - e_vdW = self_coef * Tap * e_core; - - //TODO check this - //E_vdW [i] += e_vdW / 2.0; - sh_vdw [threadIdx.x] += e_vdW / 2.0; - //TODO check this - - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += self_coef * ( dTap * e_core + Tap * de_core ); - } - - //Coulomb Calculations - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - tmp = Tap / dr3gamij_3; - //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H - e_ele = - self_coef * C_ele * atoms[i].q * atoms[j].q * tmp; - - //E_Ele [i] += e_ele / 2.0; - sh_ele [threadIdx.x] += e_ele / 2.0; - - CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q * - ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - //CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* - // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - if (i >= j){ - //rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec ); - } - else - { - //rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( 
sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec ); - } - } - else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - if ( i >= j) - { - //rvec_ScaledAdd( atoms[i].f, -1., temp ); - rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp ); - } - else - { - //rvec_Add( atoms[i].f, temp ); - rvec_Add( sh_force[threadIdx.x], temp ); - } - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - - //rvec_Add( data->ext_press, ext_press ); - rvec_Copy (aux_ext_press[i], ext_press); - - //TODO CHECK THIS calculation here, it should be divided by two somehow. - } - } // if condition for far neighbors - - - pj += VDW_THREADS_PER_ATOM; - - } // end of while loop for pj < end_i condition - } // if (i < N ) condition - //} - - __syncthreads (); - - if (laneid < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - __syncthreads (); - if (laneid < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - __syncthreads (); - if (laneid < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - __syncthreads (); - if (laneid < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - __syncthreads (); - if (laneid < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - __syncthreads (); - if (laneid == 0) { - E_vdW [i] += 
sh_vdw[threadIdx.x]; - E_Ele [i] += sh_ele[threadIdx.x]; - rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); - } + extern __shared__ real _vdw[]; + extern __shared__ real _ele[]; + extern __shared__ rvec _force []; + + real *sh_vdw; + real *sh_ele; + rvec *sh_force; + + int i, j, pj; + int start_i, end_i; + real self_coef; + real p_vdW1, p_vdW1i; + real powr_vdW1, powgi_vdW1; + real tmp, r_ij, fn13, exp1, exp2; + real Tap, dTap, dfn13, CEvd, CEclmb; + real dr3gamij_1, dr3gamij_3; + real e_ele, e_vdW, e_core, de_core; + rvec temp, ext_press; + // rtensor temp_rtensor, total_rtensor; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + list *far_nbrs = &p_far_nbrs; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warpid = thread_id / VDW_THREADS_PER_ATOM; + int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); + + i = warpid; + + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + sh_force = (rvec *)( _vdw + 2*blockDim.x); + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); + + if (i < N) + { + + p_vdW1 = g_p.l[28]; + p_vdW1i = 1.0 / p_vdW1; + e_ele = 0; + e_vdW = 0; + e_core = 0; + de_core = 0; + + //for( i = 0; i < system->N; ++i ) { + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + // fprintf( stderr, "i: %d, start: %d, end: %d\n", + // i, start_i, end_i ); + + pj = start_i + laneid; + //for( pj = start_i; pj < end_i; ++pj ) + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + r_ij = nbr_pj->d; + twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]); + self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! 
+ + //CHANGE ORIGINAL + //if (i <= j) continue; + //CHANGE ORIGINAL + + // Calculate Taper and its derivative + // Tap = nbr_pj->Tap; -- precomputed during compte_H + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; + dTap = dTap * r_ij + 5*control->Tap5; + dTap = dTap * r_ij + 4*control->Tap4; + dTap = dTap * r_ij + 3*control->Tap3; + dTap = dTap * r_ij + 2*control->Tap2; + dTap += control->Tap1/r_ij; + + //vdWaals Calculations + if(g_p.vdw_type==1 || g_p.vdw_type==3) { + // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + + + //E_vdW [i] += e_vdW / 2.0; + sh_vdw [threadIdx.x] += e_vdW/2.0; + + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 - 2.0); + + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) * dfn13 ); + } + else{ // no shielding + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + + e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + + + //E_vdW [i] += e_vdW / 2.0; + sh_vdw [threadIdx.x] += e_vdW/2.0; + + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) ); + } + + if(g_p.vdw_type==2 || g_p.vdw_type==3) { + // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); + e_vdW = self_coef * Tap * e_core; + + //TODO check this + //E_vdW [i] += e_vdW / 
2.0; + sh_vdw [threadIdx.x] += e_vdW / 2.0; + //TODO check this + + de_core = -(twbp->acore/twbp->rcore) * e_core; + CEvd += self_coef * ( dTap * e_core + Tap * de_core ); + } + + //Coulomb Calculations + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + tmp = Tap / dr3gamij_3; + //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H + e_ele = + self_coef * C_ele * atoms[i].q * atoms[j].q * tmp; + + //E_Ele [i] += e_ele / 2.0; + sh_ele [threadIdx.x] += e_ele / 2.0; + + CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q * + ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; + //CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* + // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3; + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + if (i >= j){ + //rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec ); + } + else + { + //rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec ); + } + } + else { // NPT, iNPT or sNPT + // for pressure coupling, terms not related to bond order + // derivatives are added directly into pressure vector/tensor + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + + if ( i >= j) + { + //rvec_ScaledAdd( atoms[i].f, -1., temp ); + rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp ); + } + else + { + //rvec_Add( atoms[i].f, temp ); + rvec_Add( sh_force[threadIdx.x], temp ); + } + + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + + //rvec_Add( data->ext_press, ext_press ); + rvec_Copy (aux_ext_press[i], ext_press); + + //TODO CHECK THIS calculation here, it should be divided by two somehow. 
+ } + } // if condition for far neighbors + + + pj += VDW_THREADS_PER_ATOM; + + } // end of while loop for pj < end_i condition + } // if (i < N ) condition + //} + + __syncthreads (); + + if (laneid < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + __syncthreads (); + if (laneid < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + __syncthreads (); + if (laneid < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + __syncthreads (); + if (laneid < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + __syncthreads (); + if (laneid < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + __syncthreads (); + if (laneid == 0) { + E_vdW [i] += sh_vdw[threadIdx.x]; + E_Ele [i] += sh_ele[threadIdx.x]; + rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); + } } void LR_vdW_Coulomb( reax_system *system, control_params *control, - int i, int j, real r_ij, LR_data *lr ) + int i, int j, real r_ij, LR_data *lr ) { - real p_vdW1 = system->reaxprm.gp.l[28]; - real p_vdW1i = 1.0 / p_vdW1; - real powr_vdW1, powgi_vdW1; - real tmp, fn13, exp1, exp2; - real Tap, dTap, dfn13; - real dr3gamij_1, dr3gamij_3; - real e_core, de_core; - two_body_parameters *twbp; - - twbp = &(system->reaxprm.tbp[ index_tbp (i,j,&system->reaxprm) ]); - e_core = 0; - de_core = 0; - - /* calculate taper and its derivative */ - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + 
control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; - dTap = dTap * r_ij + 5*control->Tap5; - dTap = dTap * r_ij + 4*control->Tap4; - dTap = dTap * r_ij + 3*control->Tap3; - dTap = dTap * r_ij + 2*control->Tap2; - dTap += control->Tap1/r_ij; - - - /* vdWaals calculations */ - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ + real p_vdW1 = system->reaxprm.gp.l[28]; + real p_vdW1i = 1.0 / p_vdW1; + real powr_vdW1, powgi_vdW1; + real tmp, fn13, exp1, exp2; + real Tap, dTap, dfn13; + real dr3gamij_1, dr3gamij_3; + real e_core, de_core; + two_body_parameters *twbp; + + twbp = &(system->reaxprm.tbp[ index_tbp (i,j,&system->reaxprm) ]); + e_core = 0; + de_core = 0; + + /* calculate taper and its derivative */ + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; + dTap = dTap * r_ij + 5*control->Tap5; + dTap = dTap * r_ij + 4*control->Tap4; + dTap = dTap * r_ij + 3*control->Tap3; + dTap = dTap * r_ij + 2*control->Tap2; + dTap += control->Tap1/r_ij; + + + /* vdWaals calculations */ + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * 
(1.0 - fn13 / twbp->r_vdW) ); + + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n", Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */ - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); - lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; + lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - /*vdWaals Calculations*/ - if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) - { // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + /*vdWaals Calculations*/ + if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) + { // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 - 2.0); - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / 
twbp->r_vdW) ); + lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; + } + else{ // no shielding + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); - } + lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); + } - if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) - { // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - lr->e_vdW += Tap * e_core; + if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) + { // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); + lr->e_vdW += Tap * e_core; - de_core = -(twbp->acore/twbp->rcore) * e_core; - lr->CEvd += dTap * e_core + Tap * de_core; - } + de_core = -(twbp->acore/twbp->rcore) * e_core; + lr->CEvd += dTap * e_core + Tap * de_core; + } - /* Coulomb calculations */ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + /* Coulomb calculations */ + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - tmp = Tap / dr3gamij_3; - lr->H = EV_to_KCALpMOL * tmp; - lr->e_ele = C_ele * tmp; - /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\ + tmp = Tap / dr3gamij_3; + lr->H = EV_to_KCALpMOL * tmp; + lr->e_ele = C_ele * tmp; + /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\ Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n", i, system->atoms[i].type, j, system->atoms[j].type, twbp->gamma, Tap, dr3gamij_3, system->atoms[i].q, system->atoms[j].q ); */ - lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - /* 
fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n", - i+1, j+1, r_ij, e_vdW, CEvd * r_ij, - system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */ + lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; + /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n", + i+1, j+1, r_ij, e_vdW, CEvd * r_ij, + system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */ - /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n", - i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */ + /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n", + i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */ } void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control, - simulation_data *data, - static_storage *workspace, list **lists, - output_controls *out_control ) + simulation_data *data, + static_storage *workspace, list **lists, + output_controls *out_control ) { - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - list *far_nbrs = (*lists) + FAR_NBRS; - LR_lookup_table *t; - - steps = data->step - data->prev_steps; - update_freq = out_control->energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - for( i = 0; i < system->N; ++i ) { - type_i = system->atoms[i].type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - for( pj = start_i; pj < end_i; ++pj ) - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - type_j = system->atoms[j].type; - r_ij = nbr_pj->d; - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); - - if( update_energies ) { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q; - - data->E_vdW += e_vdW; - data->E_Ele += e_ele; - } - - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - CEvd *= self_coef; - //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b; - - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - /* for pressure coupling, terms not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( system->atoms[i].f, -1., temp ); - rvec_Add( system->atoms[j].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - } + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + list *far_nbrs = (*lists) + FAR_NBRS; + LR_lookup_table *t; + + steps = data->step - 
data->prev_steps; + update_freq = out_control->energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + + for( i = 0; i < system->N; ++i ) { + type_i = system->atoms[i].type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); + + for( pj = start_i; pj < end_i; ++pj ) + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + type_j = system->atoms[j].type; + r_ij = nbr_pj->d; + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); + + if( update_energies ) { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + e_vdW *= self_coef; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q; + + data->E_vdW += e_vdW; + data->E_Ele += e_ele; + } + + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + CEvd *= self_coef; + //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b; + + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q; + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec ); + } + else { // NPT, iNPT or sNPT + /* for pressure coupling, terms not related to bond order + derivatives are added directly into pressure vector/tensor */ + rvec_Scale( temp, CEvd + CEclmb, 
nbr_pj->dvec ); + rvec_ScaledAdd( system->atoms[i].f, -1., temp ); + rvec_Add( system->atoms[j].f, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + } #ifdef TEST_ENERGY - fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - r_ij, e_vdW, data->E_vdW ); - fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - r_ij, system->atoms[i].q, system->atoms[j].q, - e_ele, data->E_Ele ); + fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + r_ij, e_vdW, data->E_vdW ); + fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + r_ij, system->atoms[i].q, system->atoms[j].q, + e_ele, data->E_Ele ); #endif #ifdef TEST_FORCES - rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); #endif - } - } + } + } } -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *atoms, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - LR_lookup_table *d_LR, - int num_atom_types, - int energy_update_freq, - int N ) +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *atoms, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + LR_lookup_table *d_LR, + int num_atom_types, + int energy_update_freq, + int N ) { - extern 
__shared__ real _vdw[]; - extern __shared__ real _ele[]; - extern __shared__ rvec _force []; - - real *sh_vdw; - real *sh_ele; - rvec *sh_force; - - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - list *far_nbrs = &p_far_nbrs; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warpid = thread_id / VDW_THREADS_PER_ATOM; - int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); - - i = warpid; - - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - sh_force = (rvec *)( _vdw + 2*blockDim.x); - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); - - if ( i < N ) - { - - reax_atom local_atom ; - local_atom.q = atoms[i].q; - //local_atom.q = d_far_data.q[i]; - local_atom.type = atoms[i].type; - //local_atom.type = d_far_data.type[i]; - - /* - sh_vdw = _vdw; - sh_ele = _vdw + warpid; - sh_force = (rvec *)( _vdw + 2*warpid); - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); - */ - - - steps = data->step - data->prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - //for( i = 0; i < system->N; ++i ) { - type_i = local_atom.type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - pj = start_i + laneid; - - //for( pj = start_i; pj < end_i; ++pj ) - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) - //if( d_far_data.d[pj] <= control->r_cut ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - //j = d_far_data.nbrs[pj]; - type_j = atoms[j].type; - //type_j = d_far_data.type[j]; - r_ij = nbr_pj->d; - //r_ij = d_far_data.d[pj]; - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); - - //TODO - //CHANGE ORIGINAL - //if (i <= j) { pj += blockDim.x; continue; } - //CHANGE ORIGINAL - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - - if(( update_energies )) - { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a; - e_ele *= self_coef * local_atom.q * atoms[j].q; - - - //data->E_vdW += e_vdW; - //TODO - //E_vdW [i] += e_vdW / 2.0; - //E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0); - sh_vdw [threadIdx.x] += e_vdW/2.0; - //E_vdW [i] += e_vdW; - - //TODO - //data->E_Ele += e_ele; - //E_Ele [i] += e_ele / 2.0; - //E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0); - sh_ele [threadIdx.x] += e_ele/2.0; - //E_Ele [i] += e_ele; - } - - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - CEvd *= self_coef; - - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= self_coef * local_atom.q * atoms[j].q; - //CEclmb *= self_coef * local_atom.q * d_far_data.q[j]; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - if ( i >= j) - //rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); - //rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] ); - else - //rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); - //rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] ); - } - else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are 
added directly into pressure vector/tensor / - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - if (i >= j) - rvec_ScaledAdd( atoms[i].f, -1., temp ); - else - rvec_Add( atoms[i].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - - //rvec_Add( data->ext_press, ext_press ); - rvec_Copy (aux_ext_press [i], ext_press ); - - //TODO CHECK THIS - } - - - - } - - pj += VDW_THREADS_PER_ATOM; - } - - }// if i < n condition - - __syncthreads (); - - if (laneid < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - __syncthreads (); - if (laneid < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - __syncthreads (); - if (laneid < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - __syncthreads (); - if (laneid < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - __syncthreads (); - if (laneid < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - __syncthreads (); - if (laneid == 0) { - E_vdW [i] += sh_vdw[threadIdx.x]; - E_Ele [i] += sh_ele[threadIdx.x]; - rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); - } - - - } - - - - - GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *atoms, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - LR_lookup_table *d_LR, - int num_atom_types, - int energy_update_freq, - int N ) - { - - extern __shared__ real _vdw[]; - extern 
__shared__ real _ele[]; - - real *sh_vdw; - real *sh_ele; - - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - list *far_nbrs = &p_far_nbrs; - - i = blockIdx.x; - - reax_atom local_atom; - local_atom.q = atoms[i].q; - local_atom.type = atoms[i].type; - - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - - - steps = data->step - data->prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - type_i = local_atom.type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - pj = start_i + threadIdx.x; - - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - type_j = atoms[j].type; - r_ij = nbr_pj->d; - self_coef = (i == j) ? 0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - - if(( update_energies )) - { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - e_ele *= self_coef * local_atom.q * atoms[j].q; - - sh_vdw [threadIdx.x] += e_vdW/2.0; - sh_ele [threadIdx.x] += e_ele/2.0; - } - } - - pj += blockDim.x; - } - - // now do a reduce inside the warp for E_vdW, E_Ele and force. 
- if (threadIdx.x < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - } - if (threadIdx.x < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - } - if (threadIdx.x < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - } - if (threadIdx.x < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - } - if (threadIdx.x < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - E_vdW [i] += sh_vdw[0]; - E_Ele [i] += sh_ele[0]; - } - - } - - - - - - - GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *atoms, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - LR_lookup_table *d_LR, - int num_atom_types, - int energy_update_freq, - int N ) - { - - extern __shared__ rvec _force []; - - rvec *sh_force; - - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - list *far_nbrs = &p_far_nbrs; - - i = blockIdx.x; - - reax_atom local_atom; - local_atom.q = atoms[i].q; - local_atom.type = atoms[i].type; - - sh_force = _force; - rvec_MakeZero ( sh_force [threadIdx.x] ); - + extern __shared__ real _vdw[]; + extern __shared__ real _ele[]; + extern __shared__ rvec _force []; + + real *sh_vdw; + real *sh_ele; + rvec *sh_force; + + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + list *far_nbrs = 
&p_far_nbrs; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warpid = thread_id / VDW_THREADS_PER_ATOM; + int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); + + i = warpid; + + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + sh_force = (rvec *)( _vdw + 2*blockDim.x); + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); + + if ( i < N ) + { + + reax_atom local_atom ; + local_atom.q = atoms[i].q; + //local_atom.q = d_far_data.q[i]; + local_atom.type = atoms[i].type; + //local_atom.type = d_far_data.type[i]; + + /* + sh_vdw = _vdw; + sh_ele = _vdw + warpid; + sh_force = (rvec *)( _vdw + 2*warpid); + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); + */ + + + steps = data->step - data->prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + + //for( i = 0; i < system->N; ++i ) { + type_i = local_atom.type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); + + pj = start_i + laneid; + + //for( pj = start_i; pj < end_i; ++pj ) + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) + //if( d_far_data.d[pj] <= control->r_cut ) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + //j = d_far_data.nbrs[pj]; + type_j = atoms[j].type; + //type_j = d_far_data.type[j]; + r_ij = nbr_pj->d; + //r_ij = d_far_data.d[pj]; + self_coef = (i == j) ? 
0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); + + //TODO + //CHANGE ORIGINAL + //if (i <= j) { pj += blockDim.x; continue; } + //CHANGE ORIGINAL + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + + if(( update_energies )) + { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + e_vdW *= self_coef; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a; + e_ele *= self_coef * local_atom.q * atoms[j].q; + + + //data->E_vdW += e_vdW; + //TODO + //E_vdW [i] += e_vdW / 2.0; + //E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0); + sh_vdw [threadIdx.x] += e_vdW/2.0; + //E_vdW [i] += e_vdW; + + //TODO + //data->E_Ele += e_ele; + //E_Ele [i] += e_ele / 2.0; + //E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0); + sh_ele [threadIdx.x] += e_ele/2.0; + //E_Ele [i] += e_ele; + } + + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + CEvd *= self_coef; + + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= self_coef * local_atom.q * atoms[j].q; + //CEclmb *= self_coef * local_atom.q * d_far_data.q[j]; + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + if ( i >= j) + //rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); + //rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] ); + else + //rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); + //rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] ); + } + else { // NPT, iNPT or sNPT + // for pressure coupling, terms not related to bond order + // derivatives are 
added directly into pressure vector/tensor / + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + if (i >= j) + rvec_ScaledAdd( atoms[i].f, -1., temp ); + else + rvec_Add( atoms[i].f, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + + //rvec_Add( data->ext_press, ext_press ); + rvec_Copy (aux_ext_press [i], ext_press ); + + //TODO CHECK THIS + } + + + + } + + pj += VDW_THREADS_PER_ATOM; + } + + }// if i < n condition + + __syncthreads (); + + if (laneid < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + __syncthreads (); + if (laneid < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + __syncthreads (); + if (laneid < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + __syncthreads (); + if (laneid < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + __syncthreads (); + if (laneid < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + __syncthreads (); + if (laneid == 0) { + E_vdW [i] += sh_vdw[threadIdx.x]; + E_Ele [i] += sh_ele[threadIdx.x]; + rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); + } + + + } + + + + + GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *atoms, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + LR_lookup_table *d_LR, + int num_atom_types, + int energy_update_freq, + int N ) + { + + extern __shared__ real _vdw[]; + extern 
__shared__ real _ele[]; + + real *sh_vdw; + real *sh_ele; + + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + list *far_nbrs = &p_far_nbrs; + + i = blockIdx.x; + + reax_atom local_atom; + local_atom.q = atoms[i].q; + local_atom.type = atoms[i].type; + + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + + + steps = data->step - data->prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + + type_i = local_atom.type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); + + pj = start_i + threadIdx.x; + + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + type_j = atoms[j].type; + r_ij = nbr_pj->d; + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + + if(( update_energies )) + { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + e_vdW *= self_coef; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + e_ele *= self_coef * local_atom.q * atoms[j].q; + + sh_vdw [threadIdx.x] += e_vdW/2.0; + sh_ele [threadIdx.x] += e_ele/2.0; + } + } + + pj += blockDim.x; + } + + // now do a reduce inside the warp for E_vdW, E_Ele and force. 
+ if (threadIdx.x < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + } + if (threadIdx.x < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + } + if (threadIdx.x < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + } + if (threadIdx.x < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; + } + if (threadIdx.x < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + E_vdW [i] += sh_vdw[0]; + E_Ele [i] += sh_ele[0]; + } + + } + + + + + + + GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *atoms, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + LR_lookup_table *d_LR, + int num_atom_types, + int energy_update_freq, + int N ) + { + + extern __shared__ rvec _force []; + + rvec *sh_force; + + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + list *far_nbrs = &p_far_nbrs; + + i = blockIdx.x; + + reax_atom local_atom; + local_atom.q = atoms[i].q; + local_atom.type = atoms[i].type; + + sh_force = _force; + rvec_MakeZero ( sh_force [threadIdx.x] ); + - steps = data->step - data->prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; + steps = data->step - data->prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; - //for( i = 0; i < system->N; ++i ) { - type_i = local_atom.type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); + //for( i = 0; i < 
system->N; ++i ) { + type_i = local_atom.type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); - pj = start_i + threadIdx.x; + pj = start_i + threadIdx.x; - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - type_j = atoms[j].type; - r_ij = nbr_pj->d; - self_coef = (i == j) ? 0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + type_j = atoms[j].type; + r_ij = nbr_pj->d; + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - CEvd *= self_coef; + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + CEvd *= self_coef; - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= self_coef * local_atom.q * atoms[j].q; + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= self_coef * local_atom.q * atoms[j].q; - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { - if ( i >= j) - rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); - else - rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - // for pressure 
coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor / - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - if (i >= j) - rvec_ScaledAdd( atoms[i].f, -1., temp ); - else - rvec_Add( atoms[i].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { + if ( i >= j) + rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); + else + rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); + } + else { // NPT, iNPT or sNPT + // for pressure coupling, terms not related to bond order + // derivatives are added directly into pressure vector/tensor / + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + if (i >= j) + rvec_ScaledAdd( atoms[i].f, -1., temp ); + else + rvec_Add( atoms[i].f, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Copy (aux_ext_press [i], ext_press ); - } - } + rvec_Copy (aux_ext_press [i], ext_press ); + } + } - pj += blockDim.x; - } + pj += blockDim.x; + } - // now do a reduce inside the warp for E_vdW, E_Ele and force. - if (threadIdx.x < 16) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - if (threadIdx.x < 8) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - if (threadIdx.x < 4) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - if (threadIdx.x < 2) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - if (threadIdx.x < 1) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - if (threadIdx.x == 0) { - rvec_Add (atoms[i].f, sh_force [ 0 ]); - } + // now do a reduce inside the warp for E_vdW, E_Ele and force. 
+ if (threadIdx.x < 16) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + if (threadIdx.x < 8) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + if (threadIdx.x < 4) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + if (threadIdx.x < 2) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + if (threadIdx.x < 1) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + if (threadIdx.x == 0) { + rvec_Add (atoms[i].f, sh_force [ 0 ]); + } - } + } @@ -1613,18 +1613,18 @@ GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *atoms, #if defined(OLD) - /* Linear extrapolation */ - /*p = (r_ij * t->inv_dx; - r = (int) p; - prev = &( t->y[r] ); - next = &( t->y[r+1] ); - - tmp = p - r; - e_vdW = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW )); - CEvd = self_coef * (prev->CEvd + tmp*(next->CEvd - prev->CEvd )); - - e_ele = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele )); - e_ele = e_ele * system->atoms[i].q * system->atoms[j].q; - CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb)); - CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/ + /* Linear extrapolation */ + /*p = (r_ij * t->inv_dx; + r = (int) p; + prev = &( t->y[r] ); + next = &( t->y[r+1] ); + + tmp = p - r; + e_vdW = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW )); + CEvd = self_coef * (prev->CEvd + tmp*(next->CEvd - prev->CEvd )); + + e_ele = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele )); + e_ele = e_ele * system->atoms[i].q * system->atoms[j].q; + CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb)); + CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/ #endif diff --git a/PuReMD-GPU/src/validation.cu b/PuReMD-GPU/src/validation.cu index c5497977..f8261555 100644 --- a/PuReMD-GPU/src/validation.cu +++ b/PuReMD-GPU/src/validation.cu @@ -29,1931 +29,1931 @@ bool check_zero (real p1, real p2) { - if 
(abs (p1 - p2) >= GPU_TOLERANCE) - return true; - else - return false; + if (abs (p1 - p2) >= GPU_TOLERANCE) + return true; + else + return false; } bool check_zero (rvec p1, rvec p2) { - if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || - ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) || - ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE )) - return true; - else return false; + if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || + ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) || + ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE )) + return true; + else return false; } bool check_same (ivec p1, ivec p2) { - if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) ) - return true; - else - return false; + if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) ) + return true; + else + return false; } bool validate_box (simulation_box *host, simulation_box *dev) { - simulation_box test; + simulation_box test; - copy_host_device (&test, dev, SIMULATION_BOX_SIZE, cudaMemcpyDeviceToHost, RES_SYSTEM_SIMULATION_BOX ); + copy_host_device (&test, dev, SIMULATION_BOX_SIZE, cudaMemcpyDeviceToHost, RES_SYSTEM_SIMULATION_BOX ); - if (memcmp (&test, host, SIMULATION_BOX_SIZE)) { - fprintf (stderr, " Simulation box is not in synch between host and device \n"); - return false; - } + if (memcmp (&test, host, SIMULATION_BOX_SIZE)) { + fprintf (stderr, " Simulation box is not in synch between host and device \n"); + return false; + } - fprintf (stderr, " Simulation box is in **synch** between host and device \n"); - return true; + fprintf (stderr, " Simulation box is in **synch** between host and device \n"); + return true; } bool validate_atoms (reax_system *system, list **lists) { - int start, end, index, count, miscount; - reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N); - copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); - - /* - int *d_start, *d_end; - bond_data *d_bond_data; - list *d_bonds = dev_lists + BONDS; - list 
*bonds = *lists + BONDS; - - d_end = (int *)malloc (sizeof (int) * system->N); - d_start = (int *) malloc (sizeof (int) * system->N ); - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); - - copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - - count = 0; - miscount = 0; - for (int i = 0; i < 1; i++) { - - for (int j = d_start[i]; j < d_end[i]; j++) { - bond_data *src, *tgt; - src = &d_bond_data[j]; - tgt = &d_bond_data[ src->dbond_index ]; - - fprintf (stderr, "Atom %d f neighbor %d vector (%e %e %e) thbh count %d \n", i, src->nbr, tgt->f[0], tgt->f[1], tgt->f[2], src->scratch ); - } - } - exit (-1); - */ - - //if (memcmp (test, system->atoms, REAX_ATOM_SIZE * system->N)) { - count = miscount = 0; - for (int i = 0; i < system->N; i++) - { - if (test[i].type != system->atoms[i].type) { - fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->atoms[i].type, test[i].type, i); - exit (-1); - } - - if ( check_zero (test[i].x, system->atoms[i].x) ) - { - fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i, - system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], - test[i].x[0], test[i].x[1], test[i].x[2] ); - miscount ++; - exit (-1); - } - if ( check_zero (test[i].v, system->atoms[i].v) ) - { - fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, - system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], - test[i].v[0], test[i].v[1], test[i].v[2] ); - miscount ++; - exit (-1); - } - if ( check_zero (test[i].f, system->atoms[i].f) ) - { - fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, - 
system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2], - test[i].f[0], test[i].f[1], test[i].f[2] ); - miscount ++; - exit (-1); - } - - if ( check_zero (test[i].q, system->atoms[i].q) ) - { - fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i, - system->atoms[i].q, test[i].q ); - miscount ++; - exit (-1); - } - - count ++; - } - - //fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount); - - free (test); - return true; + int start, end, index, count, miscount; + reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N); + copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); + + /* + int *d_start, *d_end; + bond_data *d_bond_data; + list *d_bonds = dev_lists + BONDS; + list *bonds = *lists + BONDS; + + d_end = (int *)malloc (sizeof (int) * system->N); + d_start = (int *) malloc (sizeof (int) * system->N ); + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + + copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + + count = 0; + miscount = 0; + for (int i = 0; i < 1; i++) { + + for (int j = d_start[i]; j < d_end[i]; j++) { + bond_data *src, *tgt; + src = &d_bond_data[j]; + tgt = &d_bond_data[ src->dbond_index ]; + + fprintf (stderr, "Atom %d f neighbor %d vector (%e %e %e) thbh count %d \n", i, src->nbr, tgt->f[0], tgt->f[1], tgt->f[2], src->scratch ); + } + } + exit (-1); + */ + + //if (memcmp (test, system->atoms, REAX_ATOM_SIZE * system->N)) { + count = miscount = 0; + for (int i = 0; i < system->N; i++) + { + if (test[i].type != system->atoms[i].type) { + fprintf (stderr, " Type does not match (%d %d) @ 
index %d \n", system->atoms[i].type, test[i].type, i); + exit (-1); + } + + if ( check_zero (test[i].x, system->atoms[i].x) ) + { + fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i, + system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], + test[i].x[0], test[i].x[1], test[i].x[2] ); + miscount ++; + exit (-1); + } + if ( check_zero (test[i].v, system->atoms[i].v) ) + { + fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, + system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], + test[i].v[0], test[i].v[1], test[i].v[2] ); + miscount ++; + exit (-1); + } + if ( check_zero (test[i].f, system->atoms[i].f) ) + { + fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i, + system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2], + test[i].f[0], test[i].f[1], test[i].f[2] ); + miscount ++; + exit (-1); + } + + if ( check_zero (test[i].q, system->atoms[i].q) ) + { + fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i, + system->atoms[i].q, test[i].q ); + miscount ++; + exit (-1); + } + + count ++; + } + + //fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount); + + free (test); + return true; } void Print_Matrix( sparse_matrix *A ) { - int i, j; - for( i = 0; i < 10; ++i ) { - fprintf( stderr, "i:%d j(val):", i ); + int i, j; + for( i = 0; i < 10; ++i ) { + fprintf( stderr, "i:%d j(val):", i ); - for( j = A->start[i]; j < A->end[i]; ++j ) - fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val ); + for( j = A->start[i]; j < A->end[i]; ++j ) + fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val ); - fprintf( stderr, "\n" ); - } + fprintf( stderr, "\n" ); + } } void Print_Matrix_L( sparse_matrix *A ) { - int i, j; - for( i = 0; i < 10; ++i ) { - fprintf( stderr, "i:%d j(val):", i ); + int i, j; + for( i = 0; i < 10; ++i ) { + 
fprintf( stderr, "i:%d j(val):", i ); - for( j = A->start[i]; j < A->start[i+1]; ++j ) - fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val ); + for( j = A->start[i]; j < A->start[i+1]; ++j ) + fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val ); - fprintf( stderr, "\n" ); - } + fprintf( stderr, "\n" ); + } } bool validate_sort_matrix (reax_system *system, static_storage *workspace) { - sparse_matrix test; - int index, count; - test.start = (int *) malloc (INT_SIZE * (system->N + 1)); - test.end = (int *) malloc (INT_SIZE * (system->N + 1)); - - test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries)); - memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries); - - copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, - cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); - - //Print_Matrix ( &test ); - - for (int i = 0; i < system->N; i++) - { - int start = test.start[i]; - int end = test.end [i]; - - //d_quick_sort ( & (test.entries[start]), 0, end - start - 1 ); - for (int x = start; x < end-1; x++) - if (test.entries[x].j > test.entries[x+1].j) { - fprintf (stderr, "Matrix is not sorted for the entri %d \n", i ); - exit (-1); - } - } - fprintf (stderr, " Done sorting with all the entries in the sparse matrix \n"); - - free (test.start); - free (test.end); - free (test.entries); + sparse_matrix test; + int index, count; + test.start = (int *) malloc (INT_SIZE * (system->N + 1)); + test.end = (int *) malloc (INT_SIZE * (system->N + 1)); + + test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * 
system->max_sparse_matrix_entries)); + memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries); + + copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, + cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); + + //Print_Matrix ( &test ); + + for (int i = 0; i < system->N; i++) + { + int start = test.start[i]; + int end = test.end [i]; + + //d_quick_sort ( & (test.entries[start]), 0, end - start - 1 ); + for (int x = start; x < end-1; x++) + if (test.entries[x].j > test.entries[x+1].j) { + fprintf (stderr, "Matrix is not sorted for the entri %d \n", i ); + exit (-1); + } + } + fprintf (stderr, " Done sorting with all the entries in the sparse matrix \n"); + + free (test.start); + free (test.end); + free (test.entries); } bool validate_sparse_matrix( reax_system *system, static_storage *workspace ) { - sparse_matrix test; - int index, count; - test.start = (int *) malloc (INT_SIZE * (system->N + 1)); - test.end = (int *) malloc (INT_SIZE * (system->N + 1)); - - test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries)); - - memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries); - copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, - cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); - - /* - for (int i = 0 ; i < 
system->N; i++) { - if ((test.end[i] - test.start[i]) != (workspace->H.start[i+1] - workspace->H.start[i])){ - //if ((test.end[i] - test.start[i]) < 32 ){ - fprintf (stderr, "Sparse Matrix gpu (%d %d) cpu (%d %d)\n", - test.start[i], test.end[i], - workspace->H.start[i], workspace->H.start[i+1]); - exit (-1); - } - } - */ - //fprintf (stderr, "Sparse Matrix COUNT matches between HOST and DEVICE \n"); - - count = 0; - for (int i = 0; i < system->N; i++) { - for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) { - sparse_matrix_entry *src = &workspace->H.entries[j]; - - for (int k = test.start[i]; k < test.end[i]; k++) { - sparse_matrix_entry *tgt = &test.entries [k]; - if (src->j == tgt->j){ - if ( check_zero (src->val, tgt->val)) { - index = test.start [i]; - /* - fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", - test.start[i-1], test.end[i-1], - workspace->H.start[i-1], workspace->H.start[i]); - fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", - i, k, test.start[i], test.end[i], - workspace->H.start[i], workspace->H.start[i+1]); - for (int x = workspace->H.start[i]; x < workspace->H.start[i+1]; x ++) - { - src = &workspace->H.entries[x]; - tgt = &test.entries [index]; - fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index); - index ++; - } - */ - fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n"); - exit (-1); - count++; - } else break; - } - } - } - } - - //fprintf (stderr, "Sparse Matrix mismatch count %d \n", count); - free (test.start); - free (test.end); - free (test.entries); - return true; + sparse_matrix test; + int index, count; + test.start = (int *) malloc (INT_SIZE * (system->N + 1)); + test.end = (int *) malloc (INT_SIZE * (system->N + 1)); + + test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries)); + + memset (test.entries, 0xFF, 
SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries); + copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, + cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); + + /* + for (int i = 0 ; i < system->N; i++) { + if ((test.end[i] - test.start[i]) != (workspace->H.start[i+1] - workspace->H.start[i])){ + //if ((test.end[i] - test.start[i]) < 32 ){ + fprintf (stderr, "Sparse Matrix gpu (%d %d) cpu (%d %d)\n", + test.start[i], test.end[i], + workspace->H.start[i], workspace->H.start[i+1]); + exit (-1); + } + } + */ + //fprintf (stderr, "Sparse Matrix COUNT matches between HOST and DEVICE \n"); + + count = 0; + for (int i = 0; i < system->N; i++) { + for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) { + sparse_matrix_entry *src = &workspace->H.entries[j]; + + for (int k = test.start[i]; k < test.end[i]; k++) { + sparse_matrix_entry *tgt = &test.entries [k]; + if (src->j == tgt->j){ + if ( check_zero (src->val, tgt->val)) { + index = test.start [i]; + /* + fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", + test.start[i-1], test.end[i-1], + workspace->H.start[i-1], workspace->H.start[i]); + fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", + i, k, test.start[i], test.end[i], + workspace->H.start[i], workspace->H.start[i+1]); + for (int x = workspace->H.start[i]; x < workspace->H.start[i+1]; x ++) + { + src = &workspace->H.entries[x]; + tgt = &test.entries [index]; + fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index); + index ++; + } + */ + fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n"); + exit (-1); + 
count++; + } else break; + } + } + } + } + + //fprintf (stderr, "Sparse Matrix mismatch count %d \n", count); + free (test.start); + free (test.end); + free (test.entries); + return true; } bool validate_lu (static_storage *workspace) { - sparse_matrix test; - int index, count; - - test.start = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1)); - test.end = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1)); - test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->L.m)); - - memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m); - copy_host_device ( test.entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.start, dev_workspace->L.start, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.end , dev_workspace->L.end, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); - - count = 0; - for (int i = 0; i < workspace->L.n; i ++) - { - if (workspace->L.start[i] != test.start[i]){ - fprintf (stderr, "L -- Count does not match for index %d \n", i); - exit (-1); - } - - for (int j = workspace->L.start[i]; j < workspace->L.start[i+1]; j++) - { - if (check_zero (workspace->L.entries [j].val, test.entries[j].val) || - workspace->L.entries[j].j != test.entries [j].j) - { - fprintf (stderr, "L -- J or value does not match for the index %d \n", i); - count ++; - exit (-1); - } - } - } - - test.start = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1)); - test.end = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1)); - test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->U.m)); - - memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m); - copy_host_device ( test.entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( 
test.start, dev_workspace->U.start, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.end , dev_workspace->U.end, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); - - count = 0; - for (int i = 0; i < workspace->U.n; i ++) - { - if (workspace->U.start[i] != test.start[i]){ - fprintf (stderr, "U -- Count does not match for index %d \n", i); - exit (-1); - } - - for (int j = workspace->U.start[i]; j < workspace->U.start[i+1]; j++) - { - if (check_zero (workspace->U.entries [j].val, test.entries[j].val) || - workspace->U.entries[j].j != test.entries [j].j) - { - fprintf (stderr, "U -- J or value does not match for the index %d \n", i); - count ++; - exit (-1); - } - } - } - - //fprintf (stderr, "L and U match on device and host \n"); - return true; + sparse_matrix test; + int index, count; + + test.start = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1)); + test.end = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1)); + test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->L.m)); + + memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m); + copy_host_device ( test.entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.start, dev_workspace->L.start, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.end , dev_workspace->L.end, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); + + count = 0; + for (int i = 0; i < workspace->L.n; i ++) + { + if (workspace->L.start[i] != test.start[i]){ + fprintf (stderr, "L -- Count does not match for index %d \n", i); + exit (-1); + } + + for (int j = workspace->L.start[i]; j < workspace->L.start[i+1]; j++) + { + if (check_zero (workspace->L.entries [j].val, test.entries[j].val) || + workspace->L.entries[j].j != test.entries [j].j) + { + fprintf 
(stderr, "L -- J or value does not match for the index %d \n", i); + count ++; + exit (-1); + } + } + } + + test.start = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1)); + test.end = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1)); + test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->U.m)); + + memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m); + copy_host_device ( test.entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.start, dev_workspace->U.start, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.end , dev_workspace->U.end, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ ); + + count = 0; + for (int i = 0; i < workspace->U.n; i ++) + { + if (workspace->U.start[i] != test.start[i]){ + fprintf (stderr, "U -- Count does not match for index %d \n", i); + exit (-1); + } + + for (int j = workspace->U.start[i]; j < workspace->U.start[i+1]; j++) + { + if (check_zero (workspace->U.entries [j].val, test.entries[j].val) || + workspace->U.entries[j].j != test.entries [j].j) + { + fprintf (stderr, "U -- J or value does not match for the index %d \n", i); + count ++; + exit (-1); + } + } + } + + //fprintf (stderr, "L and U match on device and host \n"); + return true; } void print_sparse_matrix (reax_system *system, static_storage *workspace) { - sparse_matrix test; - int index, count; - - test.start = (int *) malloc (INT_SIZE * (system->N + 1)); - test.end = (int *) malloc (INT_SIZE * (system->N + 1)); - - test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries)); - memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries); - - test.j = (int *) malloc (INT_SIZE * (system->N * system->max_sparse_matrix_entries)); - test.val = (real *) 
malloc (REAL_SIZE * (system->N * system->max_sparse_matrix_entries)); - - copy_host_device ( test.entries, dev_workspace->H.entries, - SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); - - copy_host_device ( test.j , dev_workspace->H.j, INT_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device ( test.val , dev_workspace->H.val, REAL_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ ); - - count = 0; - for (int i = 0; i < 1; i++) { - //for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) { - // sparse_matrix_entry *src = &workspace->H.entries[j]; - // fprintf (stderr, " cpu (%d %f) \n", src->j, src->val); - //} - //fprintf (stderr, " start: %d -- end: %d ------- count %d\n", test.start[i], test.end[i], test.end[i] - test.start[i]); - for (int j = test.start[i]; j < test.end[i]; j++) { - //sparse_matrix_entry *src = &test.entries[j]; - //fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val); - fprintf (stderr, "Row:%d:%d:%f\n", i, test.j[j], test.val[j]); - } - - //if (test.end[i] - test.start[i] > 500 ) - // fprintf (stderr, " Row -- %d, count %d \n", i, test.end[i] - test.start[i] ); - } - fprintf (stderr, "--------------- "); - - free (test.start); - free (test.end); - free (test.entries); - free (test.j); - free (test.val); + sparse_matrix test; + int index, count; + + test.start = (int *) malloc (INT_SIZE * (system->N + 1)); + test.end = (int *) malloc (INT_SIZE * (system->N + 1)); + + test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries)); + memset (test.entries, 0xFF, 
SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries); + + test.j = (int *) malloc (INT_SIZE * (system->N * system->max_sparse_matrix_entries)); + test.val = (real *) malloc (REAL_SIZE * (system->N * system->max_sparse_matrix_entries)); + + copy_host_device ( test.entries, dev_workspace->H.entries, + SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ ); + + copy_host_device ( test.j , dev_workspace->H.j, INT_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device ( test.val , dev_workspace->H.val, REAL_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ ); + + count = 0; + for (int i = 0; i < 1; i++) { + //for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) { + // sparse_matrix_entry *src = &workspace->H.entries[j]; + // fprintf (stderr, " cpu (%d %f) \n", src->j, src->val); + //} + //fprintf (stderr, " start: %d -- end: %d ------- count %d\n", test.start[i], test.end[i], test.end[i] - test.start[i]); + for (int j = test.start[i]; j < test.end[i]; j++) { + //sparse_matrix_entry *src = &test.entries[j]; + //fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val); + fprintf (stderr, "Row:%d:%d:%f\n", i, test.j[j], test.val[j]); + } + + //if (test.end[i] - test.start[i] > 500 ) + // fprintf (stderr, " Row -- %d, count %d \n", i, test.end[i] - test.start[i] ); + } + fprintf (stderr, "--------------- "); + + free (test.start); + free (test.end); + free (test.entries); + free (test.j); + free (test.val); } bool validate_bonds (reax_system *system, static_storage *workspace, list **lists) { - int start, end, index, count, miscount; - int *d_start, *d_end; 
- bond_data *d_bond_data; - list *d_bonds = dev_lists + BONDS; - list *bonds = *lists + BONDS; - - d_end = (int *)malloc (sizeof (int) * system->N); - d_start = (int *) malloc (sizeof (int) * system->N ); - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); - //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); - - copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < system->N; i++) { - start = Start_Index (i, bonds); - end = End_Index (i, bonds); - - count += end - start; - if ((end-start) != (d_end[i]-d_start[i])){ - fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n", - i, start, end, d_start[i], d_end[i]); - exit (-1); - } - - } - fprintf (stderr, "BOND LIST COUNT match on device and host count %d \n", count); - - for (int i = 0; i < system->N-1; i++) { - if ( d_end[i] >= d_start[i+1] ){ - fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i); - exit (-1); - } - } - //fprintf (stderr, " BOND LIST Overwrite *PASSED* \n"); - - count = 0; - miscount = 0; - for (int i = 0; i < system->N; i++) { - - for (int j = d_start[i]; j < d_end[i]; j++) { - bond_data *src, *tgt; - src = &d_bond_data[j]; - bond_data *src_sym = & d_bond_data[ src->sym_index ]; - - //Previously this was commented out. Thats why it was working. 
- //if (i >= src->nbr) continue; - - int k = 0; - for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) { - tgt = & (bonds->select.bond_list[k]); - bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]); - - if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) && - !check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) { - - bond_order_data *s, *t; - s = &(src->bo_data); - t = &(tgt->bo_data); - - /* - if (i == 45){ - fprintf (stderr, " Host %e for %d\n", t->BO, tgt->nbr); - fprintf (stderr, " Device %e for %d\n", s->BO, src->nbr); - } - */ - - if ( !check_zero (s->BO,t->BO) && - !check_zero (s->BO_s,t->BO_s) && - !check_zero(s->BO_pi,t->BO_pi) && - !check_zero (s->BO_pi2,t->BO_pi2) && - !check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) && - !check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) && - !check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) && - !check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) && - !check_zero (s->dln_BOp_s, t->dln_BOp_s ) && - !check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) && - !check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) && - !check_zero (s->dBOp, t->dBOp )) { - count ++; - - //Check the sym index and dbond index here for double checking - // bond_ij on both device and hosts are matched now. 
- bond_order_data *ss, *ts; - ss = & (src_sym->bo_data ); - ts = & (tgt_sym->bo_data ); - - if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) || - check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box) - || check_zero (ss->Cdbo, ts->Cdbo)){ - - fprintf (stderr, " Sym Index information does not match for atom %d \n", i); - fprintf (stderr, " atom --> %d \n", i); - fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr ); - fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d ); - fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr ); - fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", - src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2], - tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] ); - fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", - src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2], - tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] ); - - fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo ); - exit (-1); - } - - break; - } - fprintf (stderr, " d --> %f %f \n", src->d, tgt->d ); - fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", - src->dvec[0], src->dvec[1], src->dvec[2], - tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] ); - fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", - src->rel_box[0], src->rel_box[1], src->rel_box[2], - tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] ); - - fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i, - src->nbr, tgt->nbr, - s->BO, t->BO, - s->BO_s, t->BO_s, - s->BO_pi, t->BO_pi, - s->BO_pi2, t->BO_pi2 - ); - fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2], - t->dBOp[0], t->dBOp[1], t->dBOp[2] ); - - fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo ); - fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi ); - fprintf (stderr, " Cdbopi2 (%e 
%e) \n", s->Cdbopi2,t->Cdbopi2 ); - fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo ); - fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi); - fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2); - fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n", - s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2], - t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] ); - fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n", - s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2], - t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] ); - fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n", - s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2], - t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] ); - - //exit (-1); - } - } - - if (k >= End_Index (i, bonds)) { - miscount ++; - fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j); - exit (-1); - } - } - } - - fprintf (stderr, " Total bond order matched count %d miscount %d (%d) \n", count, miscount, (count+miscount)); - - /* - for (int i = 5423; i < 5424; i++) { - start = Start_Index (i, bonds); - end = End_Index (i, bonds); - - index = d_start[i]; - - fprintf (stderr, "Bond Count %d \n", end-start); - for (int j = start; j < end; j++) - { - bond_data src, tgt; - src = bonds->select.bond_list[j]; - tgt = d_bond_data[index]; - index ++; - - //compare here - if ((src.nbr != tgt.nbr) || (src.d != tgt.d) || - memcmp (src.rel_box, tgt.rel_box, IVEC_SIZE) || - memcmp (src.dvec, tgt.dvec, RVEC_SIZE) ) { - fprintf (stderr, "Entries does not MATCH with bond data at atom %d index %d \r\n src ( %d %f (%d %d %d) (%f %f %f) ) tgt (%d %f (%d %d %d) (%f %f %f))\n", - i, j, - src.nbr, src.d, src.rel_box[0], src.rel_box[1], 
src.rel_box[2], - src.dvec[0], src.dvec[1], src.dvec[2], - tgt.nbr, tgt.d, tgt.rel_box[0], tgt.rel_box[1], tgt.rel_box[2], - tgt.dvec[0], tgt.dvec[1], tgt.dvec[2] ); - } - } - } - */ - - //fprintf (stderr, "BOND LIST match on device and host \n"); - - free (d_start); - free (d_end); - free (d_bond_data); - return true; + int start, end, index, count, miscount; + int *d_start, *d_end; + bond_data *d_bond_data; + list *d_bonds = dev_lists + BONDS; + list *bonds = *lists + BONDS; + + d_end = (int *)malloc (sizeof (int) * system->N); + d_start = (int *) malloc (sizeof (int) * system->N ); + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); + + copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++) { + start = Start_Index (i, bonds); + end = End_Index (i, bonds); + + count += end - start; + if ((end-start) != (d_end[i]-d_start[i])){ + fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n", + i, start, end, d_start[i], d_end[i]); + exit (-1); + } + + } + fprintf (stderr, "BOND LIST COUNT match on device and host count %d \n", count); + + for (int i = 0; i < system->N-1; i++) { + if ( d_end[i] >= d_start[i+1] ){ + fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i); + exit (-1); + } + } + //fprintf (stderr, " BOND LIST Overwrite *PASSED* \n"); + + count = 0; + miscount = 0; + for (int i = 0; i < system->N; i++) { + + for (int j = d_start[i]; j < d_end[i]; j++) { + bond_data *src, *tgt; + src = &d_bond_data[j]; + bond_data *src_sym = & d_bond_data[ src->sym_index ]; + + //Previously this 
was commented out. Thats why it was working. + //if (i >= src->nbr) continue; + + int k = 0; + for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) { + tgt = & (bonds->select.bond_list[k]); + bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]); + + if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) && + !check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) { + + bond_order_data *s, *t; + s = &(src->bo_data); + t = &(tgt->bo_data); + + /* + if (i == 45){ + fprintf (stderr, " Host %e for %d\n", t->BO, tgt->nbr); + fprintf (stderr, " Device %e for %d\n", s->BO, src->nbr); + } + */ + + if ( !check_zero (s->BO,t->BO) && + !check_zero (s->BO_s,t->BO_s) && + !check_zero(s->BO_pi,t->BO_pi) && + !check_zero (s->BO_pi2,t->BO_pi2) && + !check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) && + !check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) && + !check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) && + !check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) && + !check_zero (s->dln_BOp_s, t->dln_BOp_s ) && + !check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) && + !check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) && + !check_zero (s->dBOp, t->dBOp )) { + count ++; + + //Check the sym index and dbond index here for double checking + // bond_ij on both device and hosts are matched now. 
+ bond_order_data *ss, *ts; + ss = & (src_sym->bo_data ); + ts = & (tgt_sym->bo_data ); + + if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) || + check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box) + || check_zero (ss->Cdbo, ts->Cdbo)){ + + fprintf (stderr, " Sym Index information does not match for atom %d \n", i); + fprintf (stderr, " atom --> %d \n", i); + fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr ); + fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d ); + fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr ); + fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", + src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2], + tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] ); + fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", + src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2], + tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] ); + + fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo ); + exit (-1); + } + + break; + } + fprintf (stderr, " d --> %f %f \n", src->d, tgt->d ); + fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", + src->dvec[0], src->dvec[1], src->dvec[2], + tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] ); + fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", + src->rel_box[0], src->rel_box[1], src->rel_box[2], + tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] ); + + fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i, + src->nbr, tgt->nbr, + s->BO, t->BO, + s->BO_s, t->BO_s, + s->BO_pi, t->BO_pi, + s->BO_pi2, t->BO_pi2 + ); + fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2], + t->dBOp[0], t->dBOp[1], t->dBOp[2] ); + + fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo ); + fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi ); + fprintf (stderr, " Cdbopi2 (%e 
%e) \n", s->Cdbopi2,t->Cdbopi2 ); + fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo ); + fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi); + fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2); + fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n", + s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2], + t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] ); + fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n", + s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2], + t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] ); + fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n", + s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2], + t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] ); + + //exit (-1); + } + } + + if (k >= End_Index (i, bonds)) { + miscount ++; + fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j); + exit (-1); + } + } + } + + fprintf (stderr, " Total bond order matched count %d miscount %d (%d) \n", count, miscount, (count+miscount)); + + /* + for (int i = 5423; i < 5424; i++) { + start = Start_Index (i, bonds); + end = End_Index (i, bonds); + + index = d_start[i]; + + fprintf (stderr, "Bond Count %d \n", end-start); + for (int j = start; j < end; j++) + { + bond_data src, tgt; + src = bonds->select.bond_list[j]; + tgt = d_bond_data[index]; + index ++; + + //compare here + if ((src.nbr != tgt.nbr) || (src.d != tgt.d) || + memcmp (src.rel_box, tgt.rel_box, IVEC_SIZE) || + memcmp (src.dvec, tgt.dvec, RVEC_SIZE) ) { + fprintf (stderr, "Entries does not MATCH with bond data at atom %d index %d \r\n src ( %d %f (%d %d %d) (%f %f %f) ) tgt (%d %f (%d %d %d) (%f %f %f))\n", + i, j, + src.nbr, src.d, src.rel_box[0], src.rel_box[1], 
src.rel_box[2], + src.dvec[0], src.dvec[1], src.dvec[2], + tgt.nbr, tgt.d, tgt.rel_box[0], tgt.rel_box[1], tgt.rel_box[2], + tgt.dvec[0], tgt.dvec[1], tgt.dvec[2] ); + } + } + } + */ + + //fprintf (stderr, "BOND LIST match on device and host \n"); + + free (d_start); + free (d_end); + free (d_bond_data); + return true; } bool validate_sym_dbond_indices (reax_system *system, static_storage *workspace, list **lists) { - int start, end, index, count, miscount; - int *d_start, *d_end; - bond_data *d_bond_data; - list *d_bonds = dev_lists + BONDS; - list *bonds = *lists + BONDS; - - d_end = (int *)malloc (sizeof (int) * system->N); - d_start = (int *) malloc (sizeof (int) * system->N ); - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); - //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); - - copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - miscount = 0; - for (int i = 0; i < system->N; i++) { - - for (int j = d_start[i]; j < d_end[i]; j++) { - bond_data *src, *tgt; - src = &d_bond_data[j]; - - tgt = &d_bond_data[ src->sym_index ]; - - if ((src->dbond_index == tgt->dbond_index) ) - count ++; - else - miscount ++; - } - } - fprintf (stderr, "Sym and dbond indexes done count(device) --> %d (%d)\n", count, miscount); - - count = 0; - miscount = 0; - for (int i = 0; i < system->N; i++) { - - for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) { - bond_data *src, *tgt; - src = &bonds->select.bond_list [j]; - - tgt = &bonds->select.bond_list [ src->sym_index ]; - - if ((src->dbond_index == tgt->dbond_index) ) - count ++; - else - miscount ++; - } - } - fprintf (stderr, "Sym and dbond indexes done 
count (host) --> %d (%d)\n", count, miscount); - - free (d_start); - free (d_end); - free (d_bond_data); - return true; + int start, end, index, count, miscount; + int *d_start, *d_end; + bond_data *d_bond_data; + list *d_bonds = dev_lists + BONDS; + list *bonds = *lists + BONDS; + + d_end = (int *)malloc (sizeof (int) * system->N); + d_start = (int *) malloc (sizeof (int) * system->N ); + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds ); + + copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + miscount = 0; + for (int i = 0; i < system->N; i++) { + + for (int j = d_start[i]; j < d_end[i]; j++) { + bond_data *src, *tgt; + src = &d_bond_data[j]; + + tgt = &d_bond_data[ src->sym_index ]; + + if ((src->dbond_index == tgt->dbond_index) ) + count ++; + else + miscount ++; + } + } + fprintf (stderr, "Sym and dbond indexes done count(device) --> %d (%d)\n", count, miscount); + + count = 0; + miscount = 0; + for (int i = 0; i < system->N; i++) { + + for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) { + bond_data *src, *tgt; + src = &bonds->select.bond_list [j]; + + tgt = &bonds->select.bond_list [ src->sym_index ]; + + if ((src->dbond_index == tgt->dbond_index) ) + count ++; + else + miscount ++; + } + } + fprintf (stderr, "Sym and dbond indexes done count (host) --> %d (%d)\n", count, miscount); + + free (d_start); + free (d_end); + free (d_bond_data); + return true; } bool analyze_hbonds (reax_system *system, static_storage *workspace, list **lists) { - int hindex, nbr_hindex; - int pj, hj, hb_start_j, hb_end_j, j, nbr; - far_neighbor_data 
*nbr_pj; - - list *far_nbrs = *lists + FAR_NBRS; - list *hbonds = *lists + HBONDS; - hbond_data *src, *tgt, *h_bond_data; - int i, k, l; - - for (i = 0; i < system->N; i ++) - for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++) - { - // check if the neighbor is of h_type - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - - if (workspace->hbond_index [j] != -1) - { - hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - if (hb_start_j == hb_end_j) fprintf (stderr, "start == end \n"); - - for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) - { - h_bond_data = &( hbonds->select.hbond_list [hj] ); - nbr = h_bond_data->nbr; - - if (nbr == i) - fprintf (stderr, "found it for atom %d and neighbor %d neighbor %d \n", i, j , nbr); - if (Start_Index (workspace->hbond_index [nbr], hbonds) == End_Index (workspace->hbond_index [nbr], hbonds)) - fprintf (stderr, " neighbor start == end \n"); - - for ( k = Start_Index (workspace->hbond_index [nbr], hbonds); - k < End_Index (workspace->hbond_index [nbr], hbonds); - k ++) - { - if (hbonds->select.hbond_list [k].nbr == i) { - fprintf (stderr, "found it for atom %d and neighbor %d \n", i, j); - } - } - } - } - else fprintf (stderr, "hbond index in workspace is -1\n"); - } - - - for (i = 0; i < system->N; i++) - { - hindex = workspace->hbond_index [i]; - if (hindex != -1) - { - for (j = Start_Index ( hindex, hbonds ); j < End_Index ( hindex, hbonds ); j ++) - { - src = &hbonds->select.hbond_list [j]; - - nbr_hindex = workspace->hbond_index [src->nbr]; - if (nbr_hindex == -1) { - fprintf (stderr, " HBonds are NOT symmetric atom %d, neighbor %d\n", i, src->nbr); - exit (-1); - } - - for (k = Start_Index ( nbr_hindex, hbonds ); k < End_Index ( nbr_hindex, hbonds ); k++) - { - tgt = &hbonds->select.hbond_list [k]; - if ((tgt->nbr == i) && (src->scl == tgt->scl)) - { - break; - } - } - - if ( k >= End_Index (nbr_hindex, hbonds)) 
{ - fprintf (stderr, " Could not find the other half of the hbonds \n"); - exit (-1); - } - } - } - } - - fprintf (stderr, "HBONDS list is symmetric \n"); + int hindex, nbr_hindex; + int pj, hj, hb_start_j, hb_end_j, j, nbr; + far_neighbor_data *nbr_pj; + + list *far_nbrs = *lists + FAR_NBRS; + list *hbonds = *lists + HBONDS; + hbond_data *src, *tgt, *h_bond_data; + int i, k, l; + + for (i = 0; i < system->N; i ++) + for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++) + { + // check if the neighbor is of h_type + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + + if (workspace->hbond_index [j] != -1) + { + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + if (hb_start_j == hb_end_j) fprintf (stderr, "start == end \n"); + + for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) + { + h_bond_data = &( hbonds->select.hbond_list [hj] ); + nbr = h_bond_data->nbr; + + if (nbr == i) + fprintf (stderr, "found it for atom %d and neighbor %d neighbor %d \n", i, j , nbr); + if (Start_Index (workspace->hbond_index [nbr], hbonds) == End_Index (workspace->hbond_index [nbr], hbonds)) + fprintf (stderr, " neighbor start == end \n"); + + for ( k = Start_Index (workspace->hbond_index [nbr], hbonds); + k < End_Index (workspace->hbond_index [nbr], hbonds); + k ++) + { + if (hbonds->select.hbond_list [k].nbr == i) { + fprintf (stderr, "found it for atom %d and neighbor %d \n", i, j); + } + } + } + } + else fprintf (stderr, "hbond index in workspace is -1\n"); + } + + + for (i = 0; i < system->N; i++) + { + hindex = workspace->hbond_index [i]; + if (hindex != -1) + { + for (j = Start_Index ( hindex, hbonds ); j < End_Index ( hindex, hbonds ); j ++) + { + src = &hbonds->select.hbond_list [j]; + + nbr_hindex = workspace->hbond_index [src->nbr]; + if (nbr_hindex == -1) { + fprintf (stderr, " HBonds are NOT symmetric atom %d, neighbor %d\n", i, src->nbr); + exit (-1); + } + + for 
(k = Start_Index ( nbr_hindex, hbonds ); k < End_Index ( nbr_hindex, hbonds ); k++) + { + tgt = &hbonds->select.hbond_list [k]; + if ((tgt->nbr == i) && (src->scl == tgt->scl)) + { + break; + } + } + + if ( k >= End_Index (nbr_hindex, hbonds)) { + fprintf (stderr, " Could not find the other half of the hbonds \n"); + exit (-1); + } + } + } + } + + fprintf (stderr, "HBONDS list is symmetric \n"); } bool validate_hbonds (reax_system *system, static_storage *workspace, list **lists) { - int *hbond_index, count; - int *d_start, *d_end, index, d_index; - hbond_data *data, src, tgt; - list *d_hbonds = dev_lists + HBONDS; - list *hbonds = *lists + HBONDS; - - hbond_index = (int *) malloc (INT_SIZE * system->N); - copy_host_device (hbond_index, dev_workspace->hbond_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - - d_end = (int *)malloc (INT_SIZE * system->N); - d_start = (int *) malloc (INT_SIZE * system->N ); - - copy_host_device (d_start, d_hbonds->index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_end, d_hbonds->end_index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__); - - //fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds); - data = (hbond_data *) malloc (HBOND_DATA_SIZE * system->num_hbonds); - copy_host_device (data, d_hbonds->select.hbond_list, HBOND_DATA_SIZE * system->num_hbonds, cudaMemcpyDeviceToHost, __LINE__); - - /* - Now the hbonds list is symmetric. 
will not work any longer - - for (int i = 0; i < system->N; i++) - if (hbond_index[i] != workspace->hbond_index[i]) { - fprintf (stderr, "hbond index does not match for atom %d (%d %d)\n", - i, workspace->hbond_index[i], hbond_index[i]); - exit (-1); - } - - */ - - //fprintf (stderr, "hbond_index match between host and device \n"); - - for (int i = 0; i < system->N; i++) { - - if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) - { - if (hbond_index[i] >= 0) { - if ((d_end[ hbond_index[i]] - d_start[hbond_index[i]]) != - (End_Index (workspace->hbond_index[i], hbonds) - Start_Index (workspace->hbond_index[i], hbonds))) { - fprintf (stderr, "%d %d - d(%d %d) c(%d %d) \n",hbond_index[i], workspace->hbond_index[i], - d_start[hbond_index[i]], d_end[ hbond_index[i]], - Start_Index (workspace->hbond_index[i], hbonds), - End_Index (workspace->hbond_index[i], hbonds) ); - exit (-1); - } - } - } - } - //fprintf (stderr, "hbonds count match between host and device \n"); - - count = 0; - for (int i = 0; i < system->N; i++) { - - int d = workspace->hbond_index[i]; - if (d == -1) continue; - - d_index = hbond_index[i]; - /* - fprintf (stderr, " Count cpu %d gpu %d \n", - End_Index (workspace->hbond_index[i], hbonds) - index, - d_end[d_index] - d_start[d_index]); - */ - for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) - { - tgt = data[j]; - - int k = 0; - for (k = Start_Index (workspace->hbond_index[i], hbonds); - k < End_Index (workspace->hbond_index[i], hbonds); k++) { - src = hbonds->select.hbond_list[k]; - - if ((src.nbr == tgt.nbr) || (src.scl == tgt.scl)) { - /* - fprintf (stderr, "Mismatch at atom %d index %d (%d %d) -- (%d %d) \n", i, k, - src.nbr, src.scl, - tgt.nbr, tgt.scl); - */ - count ++; - break; - } - } - - /* - if ( ((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && - (k >= End_Index (workspace->hbond_index[i], hbonds))) { - fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index ); - exit (-1); 
- } - */ - - if ( k >= (End_Index (workspace->hbond_index[i], hbonds) )){ - fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j); - exit (-1); - } - } - - if ((End_Index (workspace->hbond_index[i], hbonds)- Start_Index(workspace->hbond_index[i], hbonds)) != (d_end[d_index] - d_start[d_index])){ - fprintf (stderr, "End index does not match between device and host \n"); - exit (-1); - } - } - - //fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count); - - free (d_start); - free (d_end); - free (data); - return true; + int *hbond_index, count; + int *d_start, *d_end, index, d_index; + hbond_data *data, src, tgt; + list *d_hbonds = dev_lists + HBONDS; + list *hbonds = *lists + HBONDS; + + hbond_index = (int *) malloc (INT_SIZE * system->N); + copy_host_device (hbond_index, dev_workspace->hbond_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + + d_end = (int *)malloc (INT_SIZE * system->N); + d_start = (int *) malloc (INT_SIZE * system->N ); + + copy_host_device (d_start, d_hbonds->index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_end, d_hbonds->end_index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__); + + //fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds); + data = (hbond_data *) malloc (HBOND_DATA_SIZE * system->num_hbonds); + copy_host_device (data, d_hbonds->select.hbond_list, HBOND_DATA_SIZE * system->num_hbonds, cudaMemcpyDeviceToHost, __LINE__); + + /* + Now the hbonds list is symmetric. 
will not work any longer + + for (int i = 0; i < system->N; i++) + if (hbond_index[i] != workspace->hbond_index[i]) { + fprintf (stderr, "hbond index does not match for atom %d (%d %d)\n", + i, workspace->hbond_index[i], hbond_index[i]); + exit (-1); + } + + */ + + //fprintf (stderr, "hbond_index match between host and device \n"); + + for (int i = 0; i < system->N; i++) { + + if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) + { + if (hbond_index[i] >= 0) { + if ((d_end[ hbond_index[i]] - d_start[hbond_index[i]]) != + (End_Index (workspace->hbond_index[i], hbonds) - Start_Index (workspace->hbond_index[i], hbonds))) { + fprintf (stderr, "%d %d - d(%d %d) c(%d %d) \n",hbond_index[i], workspace->hbond_index[i], + d_start[hbond_index[i]], d_end[ hbond_index[i]], + Start_Index (workspace->hbond_index[i], hbonds), + End_Index (workspace->hbond_index[i], hbonds) ); + exit (-1); + } + } + } + } + //fprintf (stderr, "hbonds count match between host and device \n"); + + count = 0; + for (int i = 0; i < system->N; i++) { + + int d = workspace->hbond_index[i]; + if (d == -1) continue; + + d_index = hbond_index[i]; + /* + fprintf (stderr, " Count cpu %d gpu %d \n", + End_Index (workspace->hbond_index[i], hbonds) - index, + d_end[d_index] - d_start[d_index]); + */ + for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) + { + tgt = data[j]; + + int k = 0; + for (k = Start_Index (workspace->hbond_index[i], hbonds); + k < End_Index (workspace->hbond_index[i], hbonds); k++) { + src = hbonds->select.hbond_list[k]; + + if ((src.nbr == tgt.nbr) || (src.scl == tgt.scl)) { + /* + fprintf (stderr, "Mismatch at atom %d index %d (%d %d) -- (%d %d) \n", i, k, + src.nbr, src.scl, + tgt.nbr, tgt.scl); + */ + count ++; + break; + } + } + + /* + if ( ((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && + (k >= End_Index (workspace->hbond_index[i], hbonds))) { + fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index ); + exit (-1); 
+ } + */ + + if ( k >= (End_Index (workspace->hbond_index[i], hbonds) )){ + fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j); + exit (-1); + } + } + + if ((End_Index (workspace->hbond_index[i], hbonds)- Start_Index(workspace->hbond_index[i], hbonds)) != (d_end[d_index] - d_start[d_index])){ + fprintf (stderr, "End index does not match between device and host \n"); + exit (-1); + } + } + + //fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count); + + free (d_start); + free (d_end); + free (data); + return true; } bool validate_neighbors (reax_system *system, list **lists) { - list *far_nbrs = *lists + FAR_NBRS; - list *d_nbrs = dev_lists + FAR_NBRS; - far_neighbor_data gpu, cpu; - int index, count, jicount; - - int *end = (int *)malloc (sizeof (int) * system->N); - int *start = (int *) malloc (sizeof (int) * system->N ); - - //fprintf (stderr, "numnbrs %d \n", system->num_nbrs); - - copy_host_device (start, d_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 1); - copy_host_device (end, d_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 2); - - far_neighbor_data *data = (far_neighbor_data *) malloc (FAR_NEIGHBOR_SIZE * system->num_nbrs); - copy_host_device (data, d_nbrs->select.far_nbr_list, FAR_NEIGHBOR_SIZE * system->num_nbrs, cudaMemcpyDeviceToHost, 3); - - int cpu_count = 0; - int gpu_count = 0; - - for (int i = 0; i < system->N; i++){ - cpu_count += Num_Entries (i, far_nbrs); - gpu_count += end[i] - start[i]; - } - - //fprintf (stderr, " Nbrs count cpu: %d -- gpu: %d \n", cpu_count, gpu_count ); - for (int i = 0; i < system->N-1; i++){ - if (end [i] > start [i+1]) - { - fprintf (stderr, " Far Neighbors index over write @ index %d\n", i); - exit (-1); - } - } - - - - for (int i = 0; i < system->N; i++){ - index = Start_Index (i, far_nbrs); - - for (int j = start[i]; j < end[i]; j++){ - gpu = data[j]; - - if (i < data[j].nbr) { - int src = data[j].nbr; - int dest = i; - int x; - - - for (x = 
start[src]; x < end[src]; x++) { - if (data[x].nbr != dest) continue; - - gpu = data[x]; - cpu = data[j]; - - if ( (gpu.d != cpu.d) || - (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, - data[j].d, - data[j].rel_box[0], - data[j].rel_box[1], - data[j].rel_box[2], - data[j].dvec[0], - data[j].dvec[1], - data[j].dvec[2] - ); - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr, - data[x].d, - data[x].rel_box[0], - data[x].rel_box[1], - data[x].rel_box[2], - data[x].dvec[0], - data[x].dvec[1], - data[x].dvec[2] - ); - jicount++; - - fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n"); - exit (-1); - } - break; - } - - if (x >= end[src]) { - fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); - exit (-1); - } - - continue; - } - - cpu = far_nbrs->select.far_nbr_list[index]; - //if ( (gpu.d != cpu.d) || (gpu.nbr != cpu.nbr) || - // (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - // (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE)) { - if ( check_zero (gpu.d, cpu.d) || - (gpu.nbr != cpu.nbr) || - check_zero (cpu.dvec, gpu.dvec) || - !check_same (cpu.rel_box, gpu.rel_box)) { - - fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j ); - fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index); - fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); - fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); - fprintf (stdout, "d %f , %f \n", cpu.d, 
data[j].d); - fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", - cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], - gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); - - fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n", - cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], - gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); - - fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host **** \n"); - exit (-1); - count ++; - } - index ++; - } - - if (index != End_Index (i, far_nbrs)) - { - fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs), - start[i], end[i]); - exit (10); - } - } - - //fprintf (stderr, "FAR Neighbors match between device and host \n"); - free (start); - free (end); - free (data); - return true; - } - - bool validate_workspace (reax_system *system, static_storage *workspace, list **lists) - { - real *total_bond_order; - int count, tcount; - - total_bond_order = (real *) malloc ( system->N * REAL_SIZE ); - copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < system->N; i++) { - - //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){ - if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){ - fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->total_bond_order[i], total_bond_order[i]); - exit (-1); - count ++; - } - } - free (total_bond_order); - //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count); - - - rvec *dDeltap_self; - dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE); - copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i])) - { - fprintf 
(stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, - workspace->dDeltap_self[i][0], - workspace->dDeltap_self[i][1], - workspace->dDeltap_self[i][2], - dDeltap_self[3*i+0], - dDeltap_self[3*i+1], - dDeltap_self[3*i+2] ); - exit (-1); - count ++; - } - } - free (dDeltap_self); - //fprintf (stderr, "dDeltap_self mismatch count %d\n", count); - - //exit for init_forces - - real *test; - test = (real *) malloc (system->N * REAL_SIZE); - - copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->Deltap[i], test[i])) - { - fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Deltap mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->Deltap_boc[i], test[i])) - { - fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta[i], test[i])) { - fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta_e[i], test[i])) { - fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]); - 
exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta_e mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->vlpex[i], test[i])) { - fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "vlpex mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->nlp[i], test[i])) { - fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "nlp mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta_lp[i], test[i])) { - fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta_lp mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Clp[i], test[i])) { - fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Clp mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->dDelta_lp[i], test[i])) { - fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, 
workspace->dDelta_lp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "dDelta_lp mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->nlp_temp[i], test[i])) { - fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "nlp_temp mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta_lp_temp[i], test[i])) { - fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count); - - copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->dDelta_lp_temp[i], test[i])) { - fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count); - - //exit for Bond order calculations - - - copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->CdDelta[i], test[i])) { - fprintf (stderr, " CdDelta does NOT match (%f %f) for atom %d \n", workspace->CdDelta[i], test[i], i); - exit (-1); - count ++; - } - } - //fprintf (stderr, "CdDelta mismatch count %d\n", count); - //exit for Bond Energy calculations - - /* - copy_host_device (test, dev_workspace->droptol, system->N * 
REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->droptol[i], test[i])) { - fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "droptol mismatch count %d\n", count); - */ - - - //exit for QEa calculations - /* - real *t_s; - - t_s = (real *) malloc (REAL_SIZE * (system->N * 2) ); - copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (system->N * 2); i++ ) { - if (check_zero (workspace->b_prm[i], t_s[i])) { - fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "b_prm mismatch count %d\n", count); - - t_s = (real *) malloc (REAL_SIZE * 5 * system->N); - copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < 5*system->N; i++ ) { - if (check_zero (workspace->s[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index %d \n", workspace->s[i], t_s[i], i); - count ++; - } - } - fprintf (stderr, "s mismatch count %d\n", count); - - - t_s = (real *) malloc (REAL_SIZE * 5 * system->N); - copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < 5*system->N; i++ ) { - if (check_zero (workspace->t[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i); - count ++; - } - } - fprintf (stderr, "t mismatch count %d\n", count); - - - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N); - copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (RESTART + 1)*system->N; i++ ) { - if (check_zero (workspace->v[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index %d 
\n", workspace->v[i], t_s[i], i); - count ++; - } - } - fprintf (stderr, "v mismatch count %d\n", count); - - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->y[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "y mismatch count %d\n", count); - - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->hc[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "hc mismatch count %d\n", count); - - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->hs[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "hs mismatch count %d\n", count); - - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) { - if (check_zero (workspace->h[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "h mismatch count %d\n", count); - - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->g[i], t_s[i])) 
{ - //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i); - count ++; - } - } - fprintf (stderr, "g mismatch count %d\n", count); - */ - - rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N ); - copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->v_const[i], r_s[i])) { - fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", - workspace->v_const[i][0], - workspace->v_const[i][1], - workspace->v_const[i][2], - r_s[i][0], - r_s[i][1], - r_s[i][2], - i); - exit (-1); - count ++; - } - } - //fprintf (stderr, "v_const mismatch count %d\n", count); - - free (test); - free (r_s); - return true; - } - - bool validate_data (reax_system *system, simulation_data *host) - { - simulation_data device; - - copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - if (check_zero (host->E_BE, device.E_BE)){ - fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE); - exit (-1); - } - - if (check_zero (host->E_Lp, device.E_Lp)){ - fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp); - exit (-1); - } - - if (check_zero (host->E_Ov, device.E_Ov)){ - fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov); - exit (-1); - } - - if (check_zero (host->E_Un, device.E_Un)){ - fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un); - exit (-1); - } - - if (check_zero (host->E_Tor, device.E_Tor)) { - fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor); - exit (-1); - } - - if (check_zero (host->E_Con, device.E_Con)) { - fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con); - exit (-1); - } - - if (check_zero (host->ext_press, device.ext_press)) { - fprintf (stderr, "ext_press does not match 
(%4.10e %4.10e) \n", host->ext_press, device.ext_press); - exit (-1); - } - - if (check_zero (host->E_HB, device.E_HB)) { - fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB); - exit (-1); - } - - if (check_zero (host->E_Ang, device.E_Ang)) { - fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang); - exit (-1); - } - - if (check_zero (host->E_Pen, device.E_Pen)) { - fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen); - exit (-1); - } - - if (check_zero (host->E_Coa, device.E_Coa)) { - fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa); - exit (-1); - } - - if (check_zero (host->E_vdW, device.E_vdW)) { - fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW); - exit (-1); - } - - if (check_zero (host->E_Ele, device.E_Ele)) { - fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele); - exit (-1); - } - - if (check_zero (host->E_Pol, device.E_Pol)) { - fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol); - exit (-1); - } - - - //fprintf (stderr, "Simulation Data match between host and device \n"); - return true; - } - - void print_bond_data (bond_order_data *s) - { - /* - fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", - s->BO, - s->BO_s, - s->BO_pi, - s->BO_pi2 ); - */ - fprintf (stderr, " Cdbo (%e) ", s->Cdbo ); - fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi ); - fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 ); - } - - void print_bond_list (reax_system *system, static_storage *workspace, list **lists) - { - list *bonds = *lists + BONDS; - - for (int i = 1; i < 2; i++) - { - fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i); - for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) - { - bond_data *data = &bonds->select.bond_list [j]; - fprintf (stderr, " %d, ", data->nbr ); - 
print_bond_data (&data->bo_data); - fprintf (stderr, ")\n"); - } - } - - int *b_start = (int *) malloc (INT_SIZE * system->N); - int *b_end = (int *) malloc (INT_SIZE * system->N); - list *d_bonds = dev_lists + BONDS; - bond_data *d_bond_data; - - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); - - copy_host_device ( b_start, d_bonds->index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( b_end, d_bonds->end_index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 0; i < 2; i++) - { - fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i); - for (int j = b_start[i]; j < b_end[i]; j ++) { - bond_data *src = &d_bond_data[j]; - fprintf (stderr, " %d, ", src->nbr ); - print_bond_data (&src->bo_data); - fprintf (stderr, ")\n"); - } - } - } - - - - void count_three_bodies (reax_system *system, static_storage *workspace, list **lists) - { - list *three = *lists + THREE_BODIES; - list *bonds = *lists + BONDS; - - list *d_three = dev_lists + THREE_BODIES; - list *d_bonds = dev_lists + BONDS; - bond_data *d_bond_data; - real *test; - - three_body_interaction_data *data = (three_body_interaction_data *) - malloc ( sizeof (three_body_interaction_data) * system->num_thbodies); - int *start = (int *) malloc (INT_SIZE * system->num_bonds); - int *end = (int *) malloc (INT_SIZE * system->num_bonds); - - int *b_start = (int *) malloc (INT_SIZE * system->N); - int *b_end = (int *) malloc (INT_SIZE * system->N); - int count; - int hcount, dcount; - - copy_host_device ( start, d_three->index, - INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( end, d_three->end_index, - INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( data, d_three->select.three_body_list, - sizeof (three_body_interaction_data) * 
system->num_thbodies, - cudaMemcpyDeviceToHost, __LINE__); - - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); - - copy_host_device ( b_start, d_bonds->index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( b_end, d_bonds->end_index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - hcount = dcount = 0; - for (int i = 0; i < system->N; i++) - { - for (int j = b_start[i]; j < b_end[i]; j ++) { - dcount += end[j] - start[j]; - } - } - - fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount); - - free (data); - free (start); - free (end); - free (b_start); - free (b_end); - free (d_bond_data); - } - - - - bool validate_three_bodies (reax_system *system, static_storage *workspace, list **lists) - { - list *three = *lists + THREE_BODIES; - list *bonds = *lists + BONDS; - - list *d_three = dev_lists + THREE_BODIES; - list *d_bonds = dev_lists + BONDS; - bond_data *d_bond_data; - real *test; - - three_body_interaction_data *data = (three_body_interaction_data *) - malloc ( sizeof (three_body_interaction_data) * system->num_thbodies); - int *start = (int *) malloc (INT_SIZE * system->num_bonds); - int *end = (int *) malloc (INT_SIZE * system->num_bonds); - - int *b_start = (int *) malloc (INT_SIZE * system->N); - int *b_end = (int *) malloc (INT_SIZE * system->N); - int count; - int hcount, dcount; - - - - copy_host_device ( start, d_three->index, - INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( end, d_three->end_index, - INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( data, d_three->select.three_body_list, - sizeof (three_body_interaction_data) * system->num_thbodies, - cudaMemcpyDeviceToHost, __LINE__); - - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * 
system->num_bonds ); - - copy_host_device ( b_start, d_bonds->index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( b_end, d_bonds->end_index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - //test = (real *) malloc (REAL_SIZE * system->num_bonds); - //memset (test, 0, REAL_SIZE * system->num_bonds); - //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < system->N; i++) - { - //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++) - - hcount = dcount = 0; - for (int j = b_start[i]; j < b_end[i]; j ++) { - dcount += end[j] - start[j]; - hcount += Num_Entries (j, three); - - /* - if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three))) - { - fprintf (stderr, " Three body count does not match between host and device\n"); - fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three)); - fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]); - } - */ - } - - - if ((dcount != hcount)) { - - fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); - - for (int j = b_start[i]; j < b_end[i]; j ++) { - bond_order_data *src = &d_bond_data[j].bo_data; - dcount = end[j] - start[j]; - hcount = Num_Entries (j, three); - fprintf (stderr, "device \n"); - print_bond_data (src); - - fprintf (stderr, "\n"); - src = &bonds->select.bond_list[j].bo_data; - fprintf (stderr, "host \n"); - print_bond_data (src); - fprintf (stderr, "\n"); - - //fprintf (stderr, "--- Device bo is %f \n", test[j]); - fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i], - Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds)); - fprintf (stderr, "Host 
%d Device %d -- atom %d index %d \n", hcount, dcount, i, j); - fprintf (stderr, "------\n"); - } - fprintf (stderr, " Three Bodies count does not match between host and device \n"); - exit (-1); - } - } - - //fprintf (stderr, "Three body count on DEVICE %d HOST %d \n", dcount, hcount); - - count = 0; - for (int i = 0; i < system->N; i++) - { - int x, y, z; - for (x = b_start[i]; x < b_end[i]; x++) - { - int t_start = start[x]; - int t_end = end[x]; - - bond_data *dev_bond = &d_bond_data [x]; - bond_data *host_bond; - for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++) - { - host_bond = &bonds->select.bond_list [z]; - if ((dev_bond->nbr == host_bond->nbr) && - check_same (dev_bond->rel_box, host_bond->rel_box) && - !check_zero (dev_bond->dvec, host_bond->dvec) && - !check_zero (dev_bond->d, host_bond->d) ) - { - break; - } - } - if (z >= End_Index (i, bonds)){ - fprintf (stderr, "Could not find the matching bond on host and device \n"); - exit (-1); - } - - //find this bond in the bonds on the host side. 
- - for (y = t_start; y < t_end; y++) - { - - three_body_interaction_data *device = data + y; - three_body_interaction_data *host; - - //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb); - - int xx; - for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++) - { - host = &three->select.three_body_list [xx]; - //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb); - //if ((host->thb == device->thb) && (host->pthb == device->pthb)) - if ((host->thb == device->thb) && !check_zero (host->theta, device->theta)) - { - count ++; - break; - } - } - - if ( xx >= End_Index (z, three) ) { - fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, - Start_Index (z, three), End_Index (z, three), start[x], end[x] ); - exit (-1); - }// else fprintf (stderr, "----------------- \n"); - } - } - } - free (data); - free (start); - free (end); - free (b_start); - free (b_end); - free (d_bond_data); - - //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count); - return true; - } - - bool bin_three_bodies (reax_system *system, static_storage *workspace, list **lists) - { - list *d_three = dev_lists + THREE_BODIES; - list *d_bonds = dev_lists + BONDS; - list *three = *lists + THREE_BODIES; - list *bonds = *lists + BONDS; - bond_data *d_bond_data; - - three_body_interaction_data *data = (three_body_interaction_data *) - malloc ( sizeof (three_body_interaction_data) * system->num_thbodies); - int *start = (int *) malloc (INT_SIZE * system->num_bonds); - int *end = (int *) malloc (INT_SIZE * system->num_bonds); - - int *b_start = (int *) malloc (INT_SIZE * system->N); - int *b_end = (int *) malloc (INT_SIZE * system->N); - - int *a = (int *) malloc (2 * INT_SIZE * system->N ); - int *b = (int *) malloc (2 * INT_SIZE * system->N ); - int *c = (int *) malloc (2 * INT_SIZE * system->N ); - int *d = (int *) malloc (2 * INT_SIZE * system->N ); - - for (int i = 0; i < 2 * 
system->N; i++) - a[i] = b[i] = c[i] = d[i] = -1; - - int count; - int hcount, dcount; - int index_a, index_b, index_c, index_d; - index_a = index_b = index_c = index_d = 0; - - copy_host_device ( start, d_three->index, - INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( end, d_three->end_index, - INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( data, d_three->select.three_body_list, - sizeof (three_body_interaction_data) * system->num_thbodies, - cudaMemcpyDeviceToHost, __LINE__); - - d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); - - copy_host_device ( b_start, d_bonds->index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device ( b_end, d_bonds->end_index, - INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - hcount = dcount = 0; - - /* - for (int i = 0; i < 20; i++) - { - for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) - { - for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++) - { - three_body_interaction_data *host = &three->select.three_body_list [k]; - fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d)\n", - i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb ); - - } - } - } - exit (-1); - */ - - count = 0; - for (int i = 0; i < system->N; i++) - { - for (int j = b_start[i]; j < b_end[i]; j ++) { - - /* - bond_data *src; - src = &d_bond_data[j]; - fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr ); - */ - - for (int x = start[j]; x < end[j]; x ++) - { - three_body_interaction_data *device = data + x; - - int center = device->j; - int d_i = device->i; - int d_k = device->k; - - - //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n", - //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, 
device->pthb); - - if ((a[system->N + center] != -1)) { - a[d_i] = a[d_k] = 1; - continue; - } else if ((b[system->N + center] != -1)) { - b[d_i] = b[d_k] = 1; - continue; - } else if ((c[system->N + center] != -1)) { - c[d_i] = c[d_k] = 1; - continue; - } else if ((d[system->N + center] != -1)) { - d[d_i] = d[d_k] = 1; - continue; - } - - if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) { - a[center] = a[d_i] = a[d_k] = 1; - a[system->N + center] = 1; - } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) { - b[center] = b[d_i] = b[d_k] = 1; - b[system->N + center] = 1; - } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) { - c[center] = c[d_i] = c[d_k] = 1; - c[system->N + center] = 1; - } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) { - d[center] = d[d_i] = d[d_k] = 1; - d[system->N + center]= 1; - } - else { - count ++; - break; - fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n", - i, b_start[i], b_end[i], j, center, d_i, d_k); - fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n", - a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]); - fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n", - b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]); - fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n", - c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]); - fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n", - d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]); - - } - } - } - } - fprintf (stderr, "Miscount is %d \n", count); - exit (-1); - - count = 0; - for (int i = 0; i < system->N; i++) - { - if (a[system->N + i] != -1) count ++; - if (b[system->N + i] != -1) count ++; - if (c[system->N + i] != -1) count ++; - if (d[system->N + i] != -1) count ++; - } - - fprintf (stderr, "binned so many atoms --> 
%d \n", count ); - } - - bool validate_grid (reax_system *system) - { - int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2]; - int count = 0; - - int *dtop = (int *) malloc (INT_SIZE * total ); - copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__); - - for (int i = 0; i < total; i++){ - if (system->g.top[i] != dtop[i]){ - fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i ); - exit (-1); - } - } - free (dtop); - - int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms); - copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 0; i < total*system->d_g.max_atoms; i++){ - if (system->g.atoms[i] != datoms[i]){ - fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i ); - exit (-1); - } - } - free (datoms); - - ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs); - copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 0; i < total*system->d_g.max_nbrs; i++){ - if (!check_same (system->g.nbrs[i], dnbrs[i])){ - fprintf (stderr, " nbrs count does not match @ index %d \n", i ); - exit (-1); - } - } - free (dnbrs); - - rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs); - copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 0; i < total*system->d_g.max_nbrs; i++){ - if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){ - fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i ); - exit (-1); - } - } - free (dnbrs_cp); - - //fprintf (stderr, " Grid match between device and host \n"); - return true; - } - - void print_atoms (reax_system *system) - { - int start, end, index; - - reax_atom *test = (reax_atom 
*) malloc (REAX_ATOM_SIZE * system->N); - copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); - - //for (int i = 0; i < system->N; i++) - for (int i = 0; i < 10; i++) - { - fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type); - fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] ); - fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] ); - fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] ); - fprintf (stderr, " q(%6.10f) \n", test[i].q ); - } - } - - void print_sys_atoms (reax_system *system) - { - for (int i = 0; i < 10; i++) - { - fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type); - fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] ); - fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); - fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] ); - fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q ); - } - } - - - void print_grid (reax_system *system) - { - int i, j, k, x; - grid *g = &system->g; - - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ){ - fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k); - for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){ - fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]); - } - fprintf (stderr, ")\n"); - } - } + list *far_nbrs = *lists + FAR_NBRS; + list *d_nbrs = dev_lists + FAR_NBRS; + far_neighbor_data gpu, cpu; + int index, count, jicount; + + int *end = (int *)malloc (sizeof (int) * system->N); + int *start = (int *) malloc (sizeof (int) * system->N ); + + //fprintf (stderr, "numnbrs %d \n", system->num_nbrs); + + copy_host_device (start, d_nbrs->index, INT_SIZE * system->N, 
cudaMemcpyDeviceToHost, 1); + copy_host_device (end, d_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 2); + + far_neighbor_data *data = (far_neighbor_data *) malloc (FAR_NEIGHBOR_SIZE * system->num_nbrs); + copy_host_device (data, d_nbrs->select.far_nbr_list, FAR_NEIGHBOR_SIZE * system->num_nbrs, cudaMemcpyDeviceToHost, 3); + + int cpu_count = 0; + int gpu_count = 0; + + for (int i = 0; i < system->N; i++){ + cpu_count += Num_Entries (i, far_nbrs); + gpu_count += end[i] - start[i]; + } + + //fprintf (stderr, " Nbrs count cpu: %d -- gpu: %d \n", cpu_count, gpu_count ); + for (int i = 0; i < system->N-1; i++){ + if (end [i] > start [i+1]) + { + fprintf (stderr, " Far Neighbors index over write @ index %d\n", i); + exit (-1); + } + } + + + + for (int i = 0; i < system->N; i++){ + index = Start_Index (i, far_nbrs); + + for (int j = start[i]; j < end[i]; j++){ + gpu = data[j]; + + if (i < data[j].nbr) { + int src = data[j].nbr; + int dest = i; + int x; + + + for (x = start[src]; x < end[src]; x++) { + if (data[x].nbr != dest) continue; + + gpu = data[x]; + cpu = data[j]; + + if ( (gpu.d != cpu.d) || + (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, + data[j].d, + data[j].rel_box[0], + data[j].rel_box[1], + data[j].rel_box[2], + data[j].dvec[0], + data[j].dvec[1], + data[j].dvec[2] + ); + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr, + data[x].d, + data[x].rel_box[0], + data[x].rel_box[1], + data[x].rel_box[2], + data[x].dvec[0], + data[x].dvec[1], + data[x].dvec[2] + ); + jicount++; + + fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n"); + exit (-1); + } + break; + } + + if (x >= end[src]) { + fprintf (stderr, "could not find 
the neighbor duplicate data for ij (%d %d)\n", i, src ); + exit (-1); + } + + continue; + } + + cpu = far_nbrs->select.far_nbr_list[index]; + //if ( (gpu.d != cpu.d) || (gpu.nbr != cpu.nbr) || + // (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + // (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE)) { + if ( check_zero (gpu.d, cpu.d) || + (gpu.nbr != cpu.nbr) || + check_zero (cpu.dvec, gpu.dvec) || + !check_same (cpu.rel_box, gpu.rel_box)) { + + fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j ); + fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index); + fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); + fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); + fprintf (stdout, "d %f , %f \n", cpu.d, data[j].d); + fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", + cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], + gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); + + fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n", + cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], + gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); + + fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host **** \n"); + exit (-1); + count ++; + } + index ++; + } + + if (index != End_Index (i, far_nbrs)) + { + fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs), + start[i], end[i]); + exit (10); + } + } + + //fprintf (stderr, "FAR Neighbors match between device and host \n"); + free (start); + free (end); + free (data); + return true; + } + + bool validate_workspace (reax_system *system, static_storage *workspace, list **lists) + { + real *total_bond_order; + int count, tcount; + + total_bond_order = (real 
*) malloc ( system->N * REAL_SIZE ); + copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++) { + + //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){ + if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){ + fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->total_bond_order[i], total_bond_order[i]); + exit (-1); + count ++; + } + } + free (total_bond_order); + //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count); + + + rvec *dDeltap_self; + dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE); + copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++ ) + { + if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i])) + { + fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, + workspace->dDeltap_self[i][0], + workspace->dDeltap_self[i][1], + workspace->dDeltap_self[i][2], + dDeltap_self[3*i+0], + dDeltap_self[3*i+1], + dDeltap_self[3*i+2] ); + exit (-1); + count ++; + } + } + free (dDeltap_self); + //fprintf (stderr, "dDeltap_self mismatch count %d\n", count); + + //exit for init_forces + + real *test; + test = (real *) malloc (system->N * REAL_SIZE); + + copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) + { + if (check_zero (workspace->Deltap[i], test[i])) + { + fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Deltap mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < 
system->N; i++ ) + { + if (check_zero (workspace->Deltap_boc[i], test[i])) + { + fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta[i], test[i])) { + fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta_e[i], test[i])) { + fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta_e mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->vlpex[i], test[i])) { + fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "vlpex mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->nlp[i], test[i])) { + fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "nlp mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + 
count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta_lp[i], test[i])) { + fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta_lp mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Clp[i], test[i])) { + fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Clp mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->dDelta_lp[i], test[i])) { + fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "dDelta_lp mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->nlp_temp[i], test[i])) { + fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "nlp_temp mismatch count %d\n", count); + + copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta_lp_temp[i], test[i])) { + fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count); + + copy_host_device (test, 
dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->dDelta_lp_temp[i], test[i])) { + fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count); + + //exit for Bond order calculations + + + copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->CdDelta[i], test[i])) { + fprintf (stderr, " CdDelta does NOT match (%f %f) for atom %d \n", workspace->CdDelta[i], test[i], i); + exit (-1); + count ++; + } + } + //fprintf (stderr, "CdDelta mismatch count %d\n", count); + //exit for Bond Energy calculations + + /* + copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->droptol[i], test[i])) { + fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "droptol mismatch count %d\n", count); + */ + + + //exit for QEa calculations + /* + real *t_s; + + t_s = (real *) malloc (REAL_SIZE * (system->N * 2) ); + copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (system->N * 2); i++ ) { + if (check_zero (workspace->b_prm[i], t_s[i])) { + fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "b_prm mismatch count %d\n", count); + + t_s = (real *) malloc (REAL_SIZE * 5 * system->N); + copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 
0; i < 5*system->N; i++ ) { + if (check_zero (workspace->s[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index %d \n", workspace->s[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "s mismatch count %d\n", count); + + + t_s = (real *) malloc (REAL_SIZE * 5 * system->N); + copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < 5*system->N; i++ ) { + if (check_zero (workspace->t[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "t mismatch count %d\n", count); + + + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N); + copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (RESTART + 1)*system->N; i++ ) { + if (check_zero (workspace->v[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "v mismatch count %d\n", count); + + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->y[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]); + count ++; + } + } + fprintf (stderr, "y mismatch count %d\n", count); + + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->hc[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]); + count ++; + } + } + fprintf (stderr, "hc mismatch count %d\n", count); + + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * 
(RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->hs[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]); + count ++; + } + } + fprintf (stderr, "hs mismatch count %d\n", count); + + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) { + if (check_zero (workspace->h[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]); + count ++; + } + } + fprintf (stderr, "h mismatch count %d\n", count); + + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->g[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "g mismatch count %d\n", count); + */ + + rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N ); + copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->v_const[i], r_s[i])) { + fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", + workspace->v_const[i][0], + workspace->v_const[i][1], + workspace->v_const[i][2], + r_s[i][0], + r_s[i][1], + r_s[i][2], + i); + exit (-1); + count ++; + } + } + //fprintf (stderr, "v_const mismatch count %d\n", count); + + free (test); + free (r_s); + return true; + } + + bool validate_data (reax_system *system, simulation_data *host) + { + simulation_data device; + + copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + if (check_zero (host->E_BE, 
device.E_BE)){ + fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE); + exit (-1); + } + + if (check_zero (host->E_Lp, device.E_Lp)){ + fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp); + exit (-1); + } + + if (check_zero (host->E_Ov, device.E_Ov)){ + fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov); + exit (-1); + } + + if (check_zero (host->E_Un, device.E_Un)){ + fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un); + exit (-1); + } + + if (check_zero (host->E_Tor, device.E_Tor)) { + fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor); + exit (-1); + } + + if (check_zero (host->E_Con, device.E_Con)) { + fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con); + exit (-1); + } + + if (check_zero (host->ext_press, device.ext_press)) { + fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press); + exit (-1); + } + + if (check_zero (host->E_HB, device.E_HB)) { + fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB); + exit (-1); + } + + if (check_zero (host->E_Ang, device.E_Ang)) { + fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang); + exit (-1); + } + + if (check_zero (host->E_Pen, device.E_Pen)) { + fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen); + exit (-1); + } + + if (check_zero (host->E_Coa, device.E_Coa)) { + fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa); + exit (-1); + } + + if (check_zero (host->E_vdW, device.E_vdW)) { + fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW); + exit (-1); + } + + if (check_zero (host->E_Ele, device.E_Ele)) { + fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele); + exit (-1); + } 
+ + if (check_zero (host->E_Pol, device.E_Pol)) { + fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol); + exit (-1); + } + + + //fprintf (stderr, "Simulation Data match between host and device \n"); + return true; + } + + void print_bond_data (bond_order_data *s) + { + /* + fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", + s->BO, + s->BO_s, + s->BO_pi, + s->BO_pi2 ); + */ + fprintf (stderr, " Cdbo (%e) ", s->Cdbo ); + fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi ); + fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 ); + } + + void print_bond_list (reax_system *system, static_storage *workspace, list **lists) + { + list *bonds = *lists + BONDS; + + for (int i = 1; i < 2; i++) + { + fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i); + for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) + { + bond_data *data = &bonds->select.bond_list [j]; + fprintf (stderr, " %d, ", data->nbr ); + print_bond_data (&data->bo_data); + fprintf (stderr, ")\n"); + } + } + + int *b_start = (int *) malloc (INT_SIZE * system->N); + int *b_end = (int *) malloc (INT_SIZE * system->N); + list *d_bonds = dev_lists + BONDS; + bond_data *d_bond_data; + + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + + copy_host_device ( b_start, d_bonds->index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( b_end, d_bonds->end_index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + for (int i = 0; i < 2; i++) + { + fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i); + for (int j = b_start[i]; j < b_end[i]; j ++) { + bond_data *src = &d_bond_data[j]; + fprintf (stderr, " %d, ", src->nbr ); + print_bond_data (&src->bo_data); + fprintf (stderr, ")\n"); + } + } + } + + + + void count_three_bodies (reax_system *system, static_storage 
*workspace, list **lists) + { + list *three = *lists + THREE_BODIES; + list *bonds = *lists + BONDS; + + list *d_three = dev_lists + THREE_BODIES; + list *d_bonds = dev_lists + BONDS; + bond_data *d_bond_data; + real *test; + + three_body_interaction_data *data = (three_body_interaction_data *) + malloc ( sizeof (three_body_interaction_data) * system->num_thbodies); + int *start = (int *) malloc (INT_SIZE * system->num_bonds); + int *end = (int *) malloc (INT_SIZE * system->num_bonds); + + int *b_start = (int *) malloc (INT_SIZE * system->N); + int *b_end = (int *) malloc (INT_SIZE * system->N); + int count; + int hcount, dcount; + + copy_host_device ( start, d_three->index, + INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( end, d_three->end_index, + INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( data, d_three->select.three_body_list, + sizeof (three_body_interaction_data) * system->num_thbodies, + cudaMemcpyDeviceToHost, __LINE__); + + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + + copy_host_device ( b_start, d_bonds->index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( b_end, d_bonds->end_index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + hcount = dcount = 0; + for (int i = 0; i < system->N; i++) + { + for (int j = b_start[i]; j < b_end[i]; j ++) { + dcount += end[j] - start[j]; + } + } + + fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount); + + free (data); + free (start); + free (end); + free (b_start); + free (b_end); + free (d_bond_data); + } + + + + bool validate_three_bodies (reax_system *system, static_storage *workspace, list **lists) + { + list *three = *lists + THREE_BODIES; + list *bonds = *lists + BONDS; + + list *d_three = dev_lists 
+ THREE_BODIES; + list *d_bonds = dev_lists + BONDS; + bond_data *d_bond_data; + real *test; + + three_body_interaction_data *data = (three_body_interaction_data *) + malloc ( sizeof (three_body_interaction_data) * system->num_thbodies); + int *start = (int *) malloc (INT_SIZE * system->num_bonds); + int *end = (int *) malloc (INT_SIZE * system->num_bonds); + + int *b_start = (int *) malloc (INT_SIZE * system->N); + int *b_end = (int *) malloc (INT_SIZE * system->N); + int count; + int hcount, dcount; + + + + copy_host_device ( start, d_three->index, + INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( end, d_three->end_index, + INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( data, d_three->select.three_body_list, + sizeof (three_body_interaction_data) * system->num_thbodies, + cudaMemcpyDeviceToHost, __LINE__); + + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + + copy_host_device ( b_start, d_bonds->index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( b_end, d_bonds->end_index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + //test = (real *) malloc (REAL_SIZE * system->num_bonds); + //memset (test, 0, REAL_SIZE * system->num_bonds); + //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++) + { + //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++) + + hcount = dcount = 0; + for (int j = b_start[i]; j < b_end[i]; j ++) { + dcount += end[j] - start[j]; + hcount += Num_Entries (j, three); + + /* + if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three))) + { + fprintf (stderr, " Three body count does not match between host and device\n"); + fprintf (stderr, " Host 
count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three)); + fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]); + } + */ + } + + + if ((dcount != hcount)) { + + fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); + + for (int j = b_start[i]; j < b_end[i]; j ++) { + bond_order_data *src = &d_bond_data[j].bo_data; + dcount = end[j] - start[j]; + hcount = Num_Entries (j, three); + fprintf (stderr, "device \n"); + print_bond_data (src); + + fprintf (stderr, "\n"); + src = &bonds->select.bond_list[j].bo_data; + fprintf (stderr, "host \n"); + print_bond_data (src); + fprintf (stderr, "\n"); + + //fprintf (stderr, "--- Device bo is %f \n", test[j]); + fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i], + Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds)); + fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j); + fprintf (stderr, "------\n"); + } + fprintf (stderr, " Three Bodies count does not match between host and device \n"); + exit (-1); + } + } + + //fprintf (stderr, "Three body count on DEVICE %d HOST %d \n", dcount, hcount); + + count = 0; + for (int i = 0; i < system->N; i++) + { + int x, y, z; + for (x = b_start[i]; x < b_end[i]; x++) + { + int t_start = start[x]; + int t_end = end[x]; + + bond_data *dev_bond = &d_bond_data [x]; + bond_data *host_bond; + for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++) + { + host_bond = &bonds->select.bond_list [z]; + if ((dev_bond->nbr == host_bond->nbr) && + check_same (dev_bond->rel_box, host_bond->rel_box) && + !check_zero (dev_bond->dvec, host_bond->dvec) && + !check_zero (dev_bond->d, host_bond->d) ) + { + break; + } + } + if (z >= End_Index (i, bonds)){ + fprintf (stderr, "Could not find the matching bond on host and device \n"); + exit (-1); + } + + //find this bond in the bonds on the host side. 
+ + for (y = t_start; y < t_end; y++) + { + + three_body_interaction_data *device = data + y; + three_body_interaction_data *host; + + //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb); + + int xx; + for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++) + { + host = &three->select.three_body_list [xx]; + //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb); + //if ((host->thb == device->thb) && (host->pthb == device->pthb)) + if ((host->thb == device->thb) && !check_zero (host->theta, device->theta)) + { + count ++; + break; + } + } + + if ( xx >= End_Index (z, three) ) { + fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, + Start_Index (z, three), End_Index (z, three), start[x], end[x] ); + exit (-1); + }// else fprintf (stderr, "----------------- \n"); + } + } + } + free (data); + free (start); + free (end); + free (b_start); + free (b_end); + free (d_bond_data); + + //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count); + return true; + } + + bool bin_three_bodies (reax_system *system, static_storage *workspace, list **lists) + { + list *d_three = dev_lists + THREE_BODIES; + list *d_bonds = dev_lists + BONDS; + list *three = *lists + THREE_BODIES; + list *bonds = *lists + BONDS; + bond_data *d_bond_data; + + three_body_interaction_data *data = (three_body_interaction_data *) + malloc ( sizeof (three_body_interaction_data) * system->num_thbodies); + int *start = (int *) malloc (INT_SIZE * system->num_bonds); + int *end = (int *) malloc (INT_SIZE * system->num_bonds); + + int *b_start = (int *) malloc (INT_SIZE * system->N); + int *b_end = (int *) malloc (INT_SIZE * system->N); + + int *a = (int *) malloc (2 * INT_SIZE * system->N ); + int *b = (int *) malloc (2 * INT_SIZE * system->N ); + int *c = (int *) malloc (2 * INT_SIZE * system->N ); + int *d = (int *) malloc (2 * INT_SIZE * system->N ); + + for (int i = 0; i < 2 * 
system->N; i++) + a[i] = b[i] = c[i] = d[i] = -1; + + int count; + int hcount, dcount; + int index_a, index_b, index_c, index_d; + index_a = index_b = index_c = index_d = 0; + + copy_host_device ( start, d_three->index, + INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( end, d_three->end_index, + INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( data, d_three->select.three_body_list, + sizeof (three_body_interaction_data) * system->num_thbodies, + cudaMemcpyDeviceToHost, __LINE__); + + d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds ); + + copy_host_device ( b_start, d_bonds->index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device ( b_end, d_bonds->end_index, + INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + hcount = dcount = 0; + + /* + for (int i = 0; i < 20; i++) + { + for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) + { + for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++) + { + three_body_interaction_data *host = &three->select.three_body_list [k]; + fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d)\n", + i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb ); + + } + } + } + exit (-1); + */ + + count = 0; + for (int i = 0; i < system->N; i++) + { + for (int j = b_start[i]; j < b_end[i]; j ++) { + + /* + bond_data *src; + src = &d_bond_data[j]; + fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr ); + */ + + for (int x = start[j]; x < end[j]; x ++) + { + three_body_interaction_data *device = data + x; + + int center = device->j; + int d_i = device->i; + int d_k = device->k; + + + //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n", + //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, 
device->pthb); + + if ((a[system->N + center] != -1)) { + a[d_i] = a[d_k] = 1; + continue; + } else if ((b[system->N + center] != -1)) { + b[d_i] = b[d_k] = 1; + continue; + } else if ((c[system->N + center] != -1)) { + c[d_i] = c[d_k] = 1; + continue; + } else if ((d[system->N + center] != -1)) { + d[d_i] = d[d_k] = 1; + continue; + } + + if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) { + a[center] = a[d_i] = a[d_k] = 1; + a[system->N + center] = 1; + } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) { + b[center] = b[d_i] = b[d_k] = 1; + b[system->N + center] = 1; + } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) { + c[center] = c[d_i] = c[d_k] = 1; + c[system->N + center] = 1; + } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) { + d[center] = d[d_i] = d[d_k] = 1; + d[system->N + center]= 1; + } + else { + count ++; + break; + fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n", + i, b_start[i], b_end[i], j, center, d_i, d_k); + fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n", + a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]); + fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n", + b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]); + fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n", + c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]); + fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n", + d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]); + + } + } + } + } + fprintf (stderr, "Miscount is %d \n", count); + exit (-1); + + count = 0; + for (int i = 0; i < system->N; i++) + { + if (a[system->N + i] != -1) count ++; + if (b[system->N + i] != -1) count ++; + if (c[system->N + i] != -1) count ++; + if (d[system->N + i] != -1) count ++; + } + + fprintf (stderr, "binned so many atoms --> 
%d \n", count ); + } + + bool validate_grid (reax_system *system) + { + int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2]; + int count = 0; + + int *dtop = (int *) malloc (INT_SIZE * total ); + copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__); + + for (int i = 0; i < total; i++){ + if (system->g.top[i] != dtop[i]){ + fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i ); + exit (-1); + } + } + free (dtop); + + int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms); + copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__); + for (int i = 0; i < total*system->d_g.max_atoms; i++){ + if (system->g.atoms[i] != datoms[i]){ + fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i ); + exit (-1); + } + } + free (datoms); + + ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs); + copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__); + for (int i = 0; i < total*system->d_g.max_nbrs; i++){ + if (!check_same (system->g.nbrs[i], dnbrs[i])){ + fprintf (stderr, " nbrs count does not match @ index %d \n", i ); + exit (-1); + } + } + free (dnbrs); + + rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs); + copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__); + for (int i = 0; i < total*system->d_g.max_nbrs; i++){ + if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){ + fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i ); + exit (-1); + } + } + free (dnbrs_cp); + + //fprintf (stderr, " Grid match between device and host \n"); + return true; + } + + void print_atoms (reax_system *system) + { + int start, end, index; + + reax_atom *test = (reax_atom 
*) malloc (REAX_ATOM_SIZE * system->N); + copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); + + //for (int i = 0; i < system->N; i++) + for (int i = 0; i < 10; i++) + { + fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type); + fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] ); + fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] ); + fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] ); + fprintf (stderr, " q(%6.10f) \n", test[i].q ); + } + } + + void print_sys_atoms (reax_system *system) + { + for (int i = 0; i < 10; i++) + { + fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type); + fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] ); + fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); + fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] ); + fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q ); + } + } + + + void print_grid (reax_system *system) + { + int i, j, k, x; + grid *g = &system->g; + + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ){ + fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k); + for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){ + fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]); + } + fprintf (stderr, ")\n"); + } + } diff --git a/PuReMD-GPU/src/vector.cu b/PuReMD-GPU/src/vector.cu index 9da80d03..7cf06eb8 100644 --- a/PuReMD-GPU/src/vector.cu +++ b/PuReMD-GPU/src/vector.cu @@ -23,316 +23,316 @@ int Vector_isZero( real* v, int k ) { - for( --k; k>=0; --k ) - if( fabs( v[k] ) > ALMOST_ZERO ) - return 0; + for( --k; k>=0; --k ) + if( fabs( v[k] ) > ALMOST_ZERO ) + return 0; - return 1; + return 1; } 
void Vector_MakeZero( real *v, int k ) { - for( --k; k>=0; --k ) - v[k] = 0; + for( --k; k>=0; --k ) + v[k] = 0; } void Vector_Copy( real* dest, real* v, int k ) { - for( --k; k>=0; --k ) - dest[k] = v[k]; + for( --k; k>=0; --k ) + dest[k] = v[k]; } void Vector_Print( FILE *fout, char *vname, real *v, int k ) { - int i; + int i; - fprintf( fout, "%s:\n", vname ); - for( i = 0; i < k; ++i ) - fprintf( fout, "%24.15e\n", v[i] ); - fprintf( fout, "\n" ); + fprintf( fout, "%s:\n", vname ); + for( i = 0; i < k; ++i ) + fprintf( fout, "%24.15e\n", v[i] ); + fprintf( fout, "\n" ); } real Norm( real* v1, int k ) { - real ret = 0; + real ret = 0; - for( --k; k>=0; --k ) - ret += SQR( v1[k] ); + for( --k; k>=0; --k ) + ret += SQR( v1[k] ); - return SQRT( ret ); + return SQRT( ret ); } void rvec_Sum( rvec ret, rvec v1 ,rvec v2 ) { - ret[0] = v1[0] + v2[0]; - ret[1] = v1[1] + v2[1]; - ret[2] = v1[2] + v2[2]; + ret[0] = v1[0] + v2[0]; + ret[1] = v1[1] + v2[1]; + ret[2] = v1[2] + v2[2]; } real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 ) { - return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]); + return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]); } void rvec_Multiply( rvec r, rvec v1, rvec v2 ) { - r[0] = v1[0] * v2[0]; - r[1] = v1[1] * v2[1]; - r[2] = v1[2] * v2[2]; + r[0] = v1[0] * v2[0]; + r[1] = v1[1] * v2[1]; + r[2] = v1[2] * v2[2]; } void rvec_Divide( rvec r, rvec v1, rvec v2 ) { - r[0] = v1[0] / v2[0]; - r[1] = v1[1] / v2[1]; - r[2] = v1[2] / v2[2]; + r[0] = v1[0] / v2[0]; + r[1] = v1[1] / v2[1]; + r[2] = v1[2] / v2[2]; } void rvec_iDivide( rvec r, rvec v1, ivec v2 ) { - r[0] = v1[0] / v2[0]; - r[1] = v1[1] / v2[1]; - r[2] = v1[2] / v2[2]; + r[0] = v1[0] / v2[0]; + r[1] = v1[1] / v2[1]; + r[2] = v1[2] / v2[2]; } void rvec_Invert( rvec r, rvec v ) { - r[0] = 1. / v[0]; - r[1] = 1. / v[1]; - r[2] = 1. / v[2]; + r[0] = 1. / v[0]; + r[1] = 1. / v[1]; + r[2] = 1. 
/ v[2]; } void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - r[i][j] = v1[i] * v2[j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + r[i][j] = v1[i] * v2[j]; } int rvec_isZero( rvec v ) { - if( fabs(v[0]) > ALMOST_ZERO || - fabs(v[1]) > ALMOST_ZERO || - fabs(v[2]) > ALMOST_ZERO ) - return 0; - return 1; + if( fabs(v[0]) > ALMOST_ZERO || + fabs(v[1]) > ALMOST_ZERO || + fabs(v[2]) > ALMOST_ZERO ) + return 0; + return 1; } void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 ) { - int i, j, k; - rtensor temp; - - // check if the result matrix is the same as one of m1, m2. - // if so, we cannot modify the contents of m1 or m2, so - // we have to use a temp matrix. - if( ret == m1 || ret == m2 ) - { - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - { - temp[i][j] = 0; - for( k = 0; k < 3; ++k ) - temp[i][j] += m1[i][k] * m2[k][j]; - } - - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = temp[i][j]; - } - else - { - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - { - ret[i][j] = 0; - for( k = 0; k < 3; ++k ) - ret[i][j] += m1[i][k] * m2[k][j]; - } - } + int i, j, k; + rtensor temp; + + // check if the result matrix is the same as one of m1, m2. + // if so, we cannot modify the contents of m1 or m2, so + // we have to use a temp matrix. + if( ret == m1 || ret == m2 ) + { + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + { + temp[i][j] = 0; + for( k = 0; k < 3; ++k ) + temp[i][j] += m1[i][k] * m2[k][j]; + } + + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = temp[i][j]; + } + else + { + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + { + ret[i][j] = 0; + for( k = 0; k < 3; ++k ) + ret[i][j] += m1[i][k] * m2[k][j]; + } + } } void rtensor_MatVec( rvec ret, rtensor m, rvec v ) { - int i; - rvec temp; - - // if ret is the same vector as v, we cannot modify the - // contents of v until all computation is finished. 
- if( ret == v ) - { - for( i = 0; i < 3; ++i ) - temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; - - for( i = 0; i < 3; ++i ) - ret[i] = temp[i]; - } - else - { - for( i = 0; i < 3; ++i ) - ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; - } + int i; + rvec temp; + + // if ret is the same vector as v, we cannot modify the + // contents of v until all computation is finished. + if( ret == v ) + { + for( i = 0; i < 3; ++i ) + temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; + + for( i = 0; i < 3; ++i ) + ret[i] = temp[i]; + } + else + { + for( i = 0; i < 3; ++i ) + ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2]; + } } void rtensor_Scale( rtensor ret, real c, rtensor m ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = c * m[i][j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = c * m[i][j]; } void rtensor_Add( rtensor ret, rtensor t ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] += t[i][j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] += t[i][j]; } void rtensor_ScaledAdd( rtensor ret, real c, rtensor t ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] += c * t[i][j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] += c * t[i][j]; } void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = t1[i][j] + t2[i][j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = t1[i][j] + t2[i][j]; } void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, - real c2, rtensor t2 ) + real c2, rtensor t2 ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j]; } void 
rtensor_Copy( rtensor ret, rtensor t ) { - int i, j; + int i, j; - for( i = 0; i < 3; ++i ) - for( j = 0; j < 3; ++j ) - ret[i][j] = t[i][j]; + for( i = 0; i < 3; ++i ) + for( j = 0; j < 3; ++j ) + ret[i][j] = t[i][j]; } void rtensor_Identity( rtensor t ) { - t[0][0] = t[1][1] = t[2][2] = 1; - t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = ZERO; + t[0][0] = t[1][1] = t[2][2] = 1; + t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = ZERO; } void rtensor_MakeZero( rtensor t ) { - t[0][0] = t[0][1] = t[0][2] = ZERO; - t[1][0] = t[1][1] = t[1][2] = ZERO; - t[2][0] = t[2][1] = t[2][2] = ZERO; + t[0][0] = t[0][1] = t[0][2] = ZERO; + t[1][0] = t[1][1] = t[1][2] = ZERO; + t[2][0] = t[2][1] = t[2][2] = ZERO; } void rtensor_Transpose( rtensor ret, rtensor t ) { - ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2]; - ret[0][1] = t[1][0], ret[0][2] = t[2][0]; - ret[1][0] = t[0][1], ret[1][2] = t[2][1]; - ret[2][0] = t[0][2], ret[2][1] = t[1][2]; + ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2]; + ret[0][1] = t[1][0], ret[0][2] = t[2][0]; + ret[1][0] = t[0][1], ret[1][2] = t[2][1]; + ret[2][0] = t[0][2], ret[2][1] = t[1][2]; } real rtensor_Det( rtensor t ) { - return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) + - t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) + - t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) ); + return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) + + t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) + + t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) ); } real rtensor_Trace( rtensor t ) { - return (t[0][0] + t[1][1] + t[2][2]); + return (t[0][0] + t[1][1] + t[2][2]); } void Print_rTensor(FILE* fp, rtensor t) { - int i, j; - - for (i=0; i < 3; i++) - { - fprintf(fp,"["); - for (j=0; j < 3; j++) - fprintf(fp,"%8.3f,\t",t[i][j]); - fprintf(fp,"]\n"); - } + int i, j; + + for (i=0; i < 3; i++) + { + fprintf(fp,"["); + for (j=0; j < 3; j++) + fprintf(fp,"%8.3f,\t",t[i][j]); + 
fprintf(fp,"]\n"); + } } void ivec_MakeZero( ivec v ) { - v[0] = v[1] = v[2] = 0; + v[0] = v[1] = v[2] = 0; } void ivec_rScale( ivec dest, real C, rvec src ) { - dest[0] = (int)(C * src[0]); - dest[1] = (int)(C * src[1]); - dest[2] = (int)(C * src[2]); + dest[0] = (int)(C * src[0]); + dest[1] = (int)(C * src[1]); + dest[2] = (int)(C * src[2]); } int ivec_isZero( ivec v ) { - if( v[0]==0 && v[1]==0 && v[2]==0 ) - return 1; - return 0; + if( v[0]==0 && v[1]==0 && v[2]==0 ) + return 1; + return 0; } int ivec_isEqual( ivec v1, ivec v2 ) { - if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] ) - return 1; + if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] ) + return 1; - return 0; + return 0; } -- GitLab